Get Started

Installation

Install the development version from GitHub:

# Install devtools if not already installed
if (!require("devtools")) install.packages("devtools")

# Install contentanalysis
devtools::install_github("massimoaria/contentanalysis")

AI-Enhanced PDF Import (Optional)

For improved text extraction from complex PDFs, you can enable AI support:

  1. Get a free API key from Google AI Studio
  2. Set the environment variable:
# In R
Sys.setenv(GEMINI_API_KEY = "your-api-key-here")

# Or in your .Renviron file
GEMINI_API_KEY=your-api-key-here

Load the package:

library(contentanalysis)
library(dplyr)

Your First Analysis

Step 1: Download an Example Paper

We'll use an open-access paper on Machine Learning:

# Download example paper
paper_url <- "https://raw.githubusercontent.com/massimoaria/contentanalysis/master/inst/examples/example_paper.pdf"
download.file(paper_url, destfile = "example_paper.pdf", mode = "wb")

Step 2: Import the PDF

Import with automatic section detection:

# Import PDF with automatic section detection
doc <- pdf2txt_auto("example_paper.pdf", n_columns = 2)

# Check detected sections
names(doc)

Expected output:

[1] "Full_text"    "Abstract"     "Introduction" "Methods"     
[5] "Results"      "Discussion"   "References"

Step 3: Analyze the Content

Perform comprehensive analysis with CrossRef and OpenAlex integration:

analysis <- analyze_scientific_content(
  text = doc,
  doi = "10.1016/j.mlwa.2021.100094",
  mailto = "your@email.com",
  window_size = 10,
  remove_stopwords = TRUE,
  ngram_range = c(1, 3)
)

Enhanced Metadata Integration 🆕

The package now automatically enriches references with metadata from both CrossRef and OpenAlex:

  • CrossRef: Retrieves structured reference data including authors, years, journals, and DOIs
  • OpenAlex: Fills gaps and provides comprehensive bibliographic information
  • Improved matching: Enhanced algorithms for connecting citations to references with confidence scoring

This dual integration significantly improves citation-reference matching accuracy!

Step 4: Explore the Results

View summary statistics:

analysis$summary

Example output:

$total_words
[1] 5234

$citations_extracted
[1] 42

$narrative_citations
[1] 18

$parenthetical_citations
[1] 24

$references_matched
[1] 38

$lexical_diversity
[1] 0.421

$citation_density
[1] 8.03

Common Workflows

Workflow 1: Citation Analysis

Extract and analyze citations:

# View all citations
head(analysis$citations)

# Count by type
table(analysis$citations$citation_type)

# Find citations in specific section
intro_citations <- analysis$citations %>%
  filter(section == "Introduction")

nrow(intro_citations)

Workflow 2: Text Analysis

Analyze word usage:

# Top words
head(analysis$word_frequencies, 20)

# Bigrams
head(analysis$ngrams$`2gram`, 10)

# Track specific terms
terms <- c("machine learning", "random forest", "accuracy")

dist <- calculate_word_distribution(
  text = doc,
  selected_words = terms,
  use_sections = TRUE
)

# Visualize
plot_word_distribution(dist, plot_type = "line")

Workflow 3: Network Visualization

Create citation networks:

# Create network
network <- create_citation_network(
  analysis,
  max_distance = 800,
  min_connections = 2,
  show_labels = TRUE
)

# Display
network

# View statistics
stats <- attr(network, "stats")
print(stats$section_distribution)

Workflow 4: Readability Assessment

Calculate readability metrics:

# Full document readability
readability <- calculate_readability_indices(
  doc$Full_text,
  detailed = TRUE
)

print(readability)

# Compare sections
sections <- c("Abstract", "Introduction", "Methods", "Discussion")
readability_by_section <- lapply(sections, function(s) {
  calculate_readability_indices(doc[[s]], detailed = FALSE)
})
names(readability_by_section) <- sections

do.call(rbind, readability_by_section)

Workflow 5: AI-Enhanced PDF Import 🆕

For complex PDFs with difficult layouts, use AI-enhanced extraction:

# Set your Gemini API key (if not in .Renviron)
# Sys.setenv(GEMINI_API_KEY = "your-api-key-here")

# Use AI-enhanced extraction for complex PDFs
doc_enhanced <- pdf2txt_auto(
  "complex_paper.pdf",
  n_columns = 2,
  use_ai = TRUE,           # Enable AI processing
  ai_model = "2.0-flash"   # Use Gemini 2.0 Flash
)

# Process large PDFs in chunks
large_doc <- process_large_pdf(
  "large_paper.pdf",
  chunk_pages = 10,        # Process 10 pages at a time
  ai_model = "2.0-flash"
)

# Direct AI content analysis
result <- gemini_content_ai(
  docs = "paper.pdf",
  prompt = "Extract and structure all citations from this document",
  outputSize = "large"
)

Export Results

Save your analysis results:

# Export citations
write.csv(analysis$citations, 
          "citations.csv", 
          row.names = FALSE)

# Export matched references
write.csv(analysis$citation_references_mapping, 
          "matched_citations.csv", 
          row.names = FALSE)

# Export word frequencies
write.csv(analysis$word_frequencies, 
          "word_frequencies.csv", 
          row.names = FALSE)

Processing Multiple Papers

Batch process multiple documents:

# List of papers and DOIs
papers <- c("paper1.pdf", "paper2.pdf", "paper3.pdf")
dois <- c("10.xxxx/1", "10.xxxx/2", "10.xxxx/3")

# Process all papers
results <- lapply(seq_along(papers), function(i) {
  doc <- pdf2txt_auto(papers[i], n_columns = 2)
  analyze_scientific_content(
    doc, 
    doi = dois[i],
    mailto = "your@email.com"
  )
})

# Extract citation counts
citation_counts <- sapply(results, function(x) {
  x$summary$citations_extracted
})
names(citation_counts) <- papers

print(citation_counts)

Next Steps

Now that you're familiar with the basics, explore the additional articles and the function reference on the package website.

Troubleshooting

Common Issues

PDF won't import

  • Ensure the PDF is text-based, not scanned images
  • Try different n_columns values (1, 2, or 3)
  • Check that the file path is correct

Citations not detected

  • Verify the paper uses standard citation formats
  • Check if sections are properly detected with names(doc)
  • Try adjusting window_size parameter

Low reference matching

  • Provide a DOI for CrossRef integration
  • Ensure your email is valid for CrossRef API
  • Check that the References section was properly extracted

Network won't display

  • Ensure there are enough citations (min_connections)
  • Try adjusting max_distance parameter
  • Check that citations were successfully extracted

Need Help?

If you encounter issues not covered here, please open an issue on GitHub.

Additional Resources