Extracts content and metadata from local documents or websites. Supports:
Local files: PDF, DOCX, PPTX, TXT, HTML
Websites: crawled breadth-first, with an optional crawl depth limit
Details
The returned data frame includes structured columns such as: `source`, `title`, `author`, `publishedDate`, `description`, `content`, `url`, and `source_type`.
## Required Packages
install.packages(c("pdftools", "officer", "rvest", "xml2", "dplyr", "stringi", "curl", "httr", "jsonlite", "magrittr"))
Examples
if (FALSE) { # \dontrun{
local_files <- c("tests/testthat/test-data/sprint.pdf",
"tests/testthat/test-data/introduction.pptx",
"tests/testthat/test-data/overview.txt")
website_urls <- c("https://www.r-project.org")
crawl_depth <- 1
response <- fetch_data(
local_paths = local_files,
website_urls = website_urls,
crawl_depth = crawl_depth
)
} # }