# Install devtools if not already installed
if (!require("devtools")) install.packages("devtools")
# Install contentanalysis
devtools::install_github("massimoaria/contentanalysis")

Get Started
Installation
Install the development version from GitHub:
For improved text extraction from complex PDFs, you can enable AI support:
- Get a free API key from Google AI Studio
- Set the environment variable:
# In R
Sys.setenv(GEMINI_API_KEY = "your-api-key-here")
# Or in your .Renviron file
GEMINI_API_KEY=your-api-key-here

Load the package:
library(contentanalysis)
library(dplyr)

Your First Analysis
Step 1: Download an Example Paper
We'll use an open-access paper on Machine Learning:
# Download example paper
paper_url <- "https://raw.githubusercontent.com/massimoaria/contentanalysis/master/inst/examples/example_paper.pdf"
download.file(paper_url, destfile = "example_paper.pdf", mode = "wb")

Step 2: Import the PDF
Import with automatic section detection:
# Import PDF with automatic section detection
doc <- pdf2txt_auto("example_paper.pdf", n_columns = 2)
# Check detected sections
names(doc)

Expected output:
[1] "Full_text" "Abstract" "Introduction" "Methods"
[5] "Results" "Discussion" "References"
Step 3: Analyze the Content
Perform comprehensive analysis with CrossRef and OpenAlex integration:
analysis <- analyze_scientific_content(
text = doc,
doi = "10.1016/j.mlwa.2021.100094",
mailto = "your@email.com",
window_size = 10,
remove_stopwords = TRUE,
ngram_range = c(1, 3)
)

The package now automatically enriches references with metadata from both CrossRef and OpenAlex:
- CrossRef: Retrieves structured reference data including authors, years, journals, and DOIs
- OpenAlex: Fills gaps and provides comprehensive bibliographic information
- Improved matching: Enhanced algorithms for connecting citations to references with confidence scoring
This dual integration significantly improves citation-reference matching accuracy!
Step 4: Explore the Results
View summary statistics:
analysis$summary

Example output:
$total_words
[1] 5234
$citations_extracted
[1] 42
$narrative_citations
[1] 18
$parenthetical_citations
[1] 24
$references_matched
[1] 38
$lexical_diversity
[1] 0.421
$citation_density
[1] 8.03
Common Workflows
Workflow 1: Citation Analysis
Extract and analyze citations:
# View all citations
head(analysis$citations)
# Count by type
table(analysis$citations$citation_type)
# Find citations in specific section
intro_citations <- analysis$citations %>%
filter(section == "Introduction")
nrow(intro_citations)

Workflow 2: Text Analysis
Analyze word usage:
# Top words
head(analysis$word_frequencies, 20)
# Bigrams
head(analysis$ngrams$`2gram`, 10)
# Track specific terms
terms <- c("machine learning", "random forest", "accuracy")
dist <- calculate_word_distribution(
text = doc,
selected_words = terms,
use_sections = TRUE
)
# Visualize
plot_word_distribution(dist, plot_type = "line")

Workflow 3: Network Visualization
Create citation networks:
# Create network
network <- create_citation_network(
analysis,
max_distance = 800,
min_connections = 2,
show_labels = TRUE
)
# Display
network
# View statistics
stats <- attr(network, "stats")
print(stats$section_distribution)

Workflow 4: Readability Assessment
Calculate readability metrics:
# Full document readability
readability <- calculate_readability_indices(
doc$Full_text,
detailed = TRUE
)
print(readability)
# Compare sections
sections <- c("Abstract", "Introduction", "Methods", "Discussion")
readability_by_section <- lapply(sections, function(s) {
calculate_readability_indices(doc[[s]], detailed = FALSE)
})
names(readability_by_section) <- sections
do.call(rbind, readability_by_section)

Workflow 5: AI-Enhanced PDF Import
For complex PDFs with difficult layouts, use AI-enhanced extraction:
# Set your Gemini API key (if not in .Renviron)
# Sys.setenv(GEMINI_API_KEY = "your-api-key-here")
# Use AI-enhanced extraction for complex PDFs
doc_enhanced <- pdf2txt_auto(
"complex_paper.pdf",
n_columns = 2,
use_ai = TRUE, # Enable AI processing
ai_model = "2.0-flash" # Use Gemini 2.0 Flash
)
# Process large PDFs in chunks
large_doc <- process_large_pdf(
"large_paper.pdf",
chunk_pages = 10, # Process 10 pages at a time
ai_model = "2.0-flash"
)
# Direct AI content analysis
result <- gemini_content_ai(
docs = "paper.pdf",
prompt = "Extract and structure all citations from this document",
outputSize = "large"
)

Export Results
Save your analysis results:
# Export citations
write.csv(analysis$citations,
"citations.csv",
row.names = FALSE)
# Export matched references
write.csv(analysis$citation_references_mapping,
"matched_citations.csv",
row.names = FALSE)
# Export word frequencies
write.csv(analysis$word_frequencies,
"word_frequencies.csv",
row.names = FALSE)

Processing Multiple Papers
Batch process multiple documents:
# List of papers and DOIs
papers <- c("paper1.pdf", "paper2.pdf", "paper3.pdf")
dois <- c("10.xxxx/1", "10.xxxx/2", "10.xxxx/3")
# Process all papers
results <- lapply(seq_along(papers), function(i) {
doc <- pdf2txt_auto(papers[i], n_columns = 2)
analyze_scientific_content(
doc,
doi = dois[i],
mailto = "your@email.com"
)
})
# Extract citation counts
citation_counts <- sapply(results, function(x) {
x$summary$citations_extracted
})
names(citation_counts) <- papers
print(citation_counts)

Next Steps
Now that you're familiar with the basics, explore:
- Reference Documentation for detailed function descriptions
- Tutorial for complete workflow examples
- Citation Analysis for advanced citation techniques
- Network Visualization for network analysis
Troubleshooting
Common Issues
PDF won't import
- Ensure the PDF is text-based, not scanned images
- Try different `n_columns` values (1, 2, or 3)
- Check that the file path is correct
Citations not detected
- Verify the paper uses standard citation formats
- Check if sections are properly detected with `names(doc)`
- Try adjusting the `window_size` parameter
Low reference matching
- Provide a DOI for CrossRef integration
- Ensure your email is valid for CrossRef API
- Check that the References section was properly extracted
Network won't display
- Ensure there are enough citations (`min_connections`)
- Try adjusting the `max_distance` parameter
- Check that citations were successfully extracted
If you encounter issues not covered here, please open an issue on GitHub.
Additional Resources
- Package Vignette: Detailed examples
- GitHub Repository: Source code
- Issue Tracker: Report bugs