# MCP-1st-Birthday_Hackathon / tool_fetch_documents_texts.py
# Author: RCaz — "modification for production" (commit bcfa0c8)
# NOTE(review): these header lines were web-page chrome pasted into the file
# and would have been a SyntaxError; converted to comments.
# PDF parsing
from pypdf import PdfReader
from io import BytesIO
# HTTP requests
import requests
# XML parsing (PubMed FTP metadata)
import xml.etree.ElementTree as ET
# FTP download
from ftplib import FTP
from urllib.parse import urlparse
# ArXiv retrieval
import arxiv
from langchain_community.retrievers import ArxivRetriever
# PubMed → PDF resolution
from metapub import FindIt
# SerpAPI DOI search
import serpapi
import os
from dotenv import load_dotenv
load_dotenv()
def parse_pdf_file(path: str) -> str:
    """
    Extract the plain text of a PDF given a local path or a URL.

    Supports http(s) downloads via requests, and ftp:// URLs via ftplib —
    requests does not implement the FTP scheme, so the original code would
    have failed on ftp:// inputs. Anything else is treated as a local path.

    Args:
        path: Local filesystem path or http(s)/ftp URL of a PDF.

    Returns:
        Concatenated text of every page; pages with no extractable text
        contribute the empty string.
    """
    if path.startswith(("http://", "https://")):
        response = requests.get(path, timeout=30)
        response.raise_for_status()  # Ensure download succeeded
        reader = PdfReader(BytesIO(response.content))
    elif path.startswith("ftp://"):
        # requests cannot fetch ftp:// URLs; use ftplib directly.
        parsed = urlparse(path)
        buffer = BytesIO()
        with FTP(parsed.netloc) as ftp:
            ftp.login()  # anonymous login
            ftp.retrbinary(f"RETR {parsed.path}", buffer.write)
        buffer.seek(0)
        reader = PdfReader(buffer)
    else:
        reader = PdfReader(path)
    # join() avoids quadratic string concatenation across many pages.
    return "".join(page.extract_text() or "" for page in reader.pages)
def get_paper_from_arxiv_id(doi: str):
    """
    Retrieve the full text of a paper from arXiv.

    Args:
        doi: Query string passed to the arXiv search API (typically an
            arXiv ID; despite the parameter name it is used as a free-text
            query).

    Returns:
        The extracted PDF text of the first search hit.

    Raises:
        ValueError: If the search returns no results (the original code
            raised a bare StopIteration from ``next()`` in that case).
    """
    client = arxiv.Client()
    search = arxiv.Search(query=doi, max_results=1)
    results = client.results(search)
    first = next(results, None)
    if first is None:
        raise ValueError(f"No arXiv result found for query: {doi!r}")
    text = parse_pdf_file(first.pdf_url)
    return text
def get_paper_from_arxiv_id_langchain(arxiv_id: str):
    """
    Retrieve a paper from arXiv using its arXiv ID.

    Args:
        arxiv_id: The arXiv identifier to fetch (e.g. "2304.07814").

    Returns:
        A list of LangChain Documents containing the paper's full text.
    """
    # Bug fix: the query was previously hard-coded to "2304.07814",
    # silently ignoring the caller-supplied arxiv_id.
    retriever = ArxivRetriever(
        load_max_docs=2,
        get_full_documents=True,
    )
    docs = retriever.invoke(arxiv_id)
    return docs
def get_paper_from_pmid(pmid: str):
    """
    Resolve a PubMed ID to an open-access PDF and return its extracted text.

    Uses metapub's FindIt to locate a PDF URL for the article. When no URL
    can be found, the reason reported by FindIt is printed and None is
    returned.
    """
    source = FindIt(pmid)
    if not source.url:
        # FindIt explains why no open-access PDF could be located.
        print(source.reason)
        return None
    return parse_pdf_file(source.url)
def download_pdf_via_ftp(url: str) -> BytesIO:
    """
    Download a file from an FTP URL into an in-memory buffer.

    Args:
        url: An ftp:// URL pointing at the file to download.

    Returns:
        A BytesIO rewound to the start of the downloaded content, suitable
        for passing directly to PdfReader. (The original annotation claimed
        ``bytes``, but a BytesIO was — and still is — returned; the caller
        relies on the file-like interface.)
    """
    parsed_url = urlparse(url)
    ftp_host = parsed_url.netloc
    ftp_path = parsed_url.path
    file_buffer = BytesIO()
    with FTP(ftp_host) as ftp:
        ftp.login()  # anonymous login
        ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)
    # Bug fix: a stray file_buffer.getvalue() discarded its result here
    # (dead code); rewinding is all that is needed before reading.
    file_buffer.seek(0)
    return file_buffer
def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
    """
    Download and parse a PDF from the PubMed Central OA service by PMID.

    Queries the PMC OA web service, extracts the FTP link to the PDF,
    downloads it, and returns the extracted text. Best-effort: on any
    failure the error is printed and None is returned.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail fast on HTTP errors
        cleaned_string = response.content.decode('utf-8').strip()
        root = ET.fromstring(cleaned_string)
        pdf_link_element = root.find(".//link[@format='pdf']")
        if pdf_link_element is None:
            # Previously this fell through to an opaque AttributeError on
            # .get(); report the real cause instead.
            print(f"No open-access PDF link in OA response for {pmid}")
            return None
        ftp_url = pdf_link_element.get('href')
        file_byte = download_pdf_via_ftp(ftp_url)
        reader = PdfReader(file_byte)
        # join() avoids quadratic string concatenation across many pages.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        print(f"got {pmid} via ftp download")
        return text
    except Exception as e:
        # Best-effort helper: report and return None rather than crash.
        print(e)
        return None
def download_pdf_from_url(url):
    """
    Download a PDF from an HTTP(S) URL and return its extracted text.

    Args:
        url: Direct link expected to serve a PDF document.

    Returns:
        The concatenated text of all pages.

    Raises:
        Exception: If the response is neither declared nor sniffed as PDF.
        requests.HTTPError: On a non-2xx HTTP response.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # Accept the response if either the content-type header or the magic
    # bytes identify it as a PDF.
    content_type = response.headers.get('content-type', '').lower()
    if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
        raise Exception(f"URL did not return a PDF (got {content_type})")
    reader = PdfReader(BytesIO(response.content))
    text = ""
    for page in reader.pages:
        # Bug fix: extract_text() may return None for image-only pages; the
        # `or ""` fallback had been commented out, causing a TypeError.
        text += page.extract_text() or ""
    return text
def download_paper_from_doi(doi):
    """
    Attempt to download the full text of a paper from its DOI.

    Currently tries the Unpaywall API (free, legal open-access lookup).

    Args:
        doi: A DOI, optionally prefixed with an https://doi.org/ resolver.

    Returns:
        The extracted PDF text, or None when no open-access PDF could be
        found or the download failed.
    """
    # Normalize: strip any resolver prefix so only the bare DOI remains.
    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
    # Unpaywall requires a contact email; read it from the environment
    # instead of shipping the placeholder (which remains the default for
    # backward compatibility — set UNPAYWALL_EMAIL in production).
    email = os.getenv("UNPAYWALL_EMAIL", "your@email.com")
    try:
        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
        response = requests.get(unpaywall_url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            best = data.get('best_oa_location')
            if best and best.get('url_for_pdf'):
                pdf_url = best['url_for_pdf']
                text = download_pdf_from_url(pdf_url)
                print(f"Found PDF via Unpaywall: {pdf_url}")
                return text
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to None.
        print(f"Unpaywall failed: {e}")
    return None
def get_pdf_content_serpapi(doi: str) -> str:
    """
    Locate a paper via Google Scholar (SerpAPI) using its DOI and return
    the parsed text of the first result's link.

    Args:
        doi: The DOI (or any query string) to search on Google Scholar.

    Returns:
        The extracted text of the PDF found at the first organic result.

    Raises:
        ValueError: If the search yields no organic results (previously an
            opaque KeyError/IndexError).
    """
    client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
    results = client.search({
        'engine': 'google_scholar',
        'q': doi,
    })
    try:
        pdf_path = results["organic_results"][0]["link"]
    except (KeyError, IndexError):
        raise ValueError(f"No Google Scholar results for DOI: {doi!r}")
    pdf_text = parse_pdf_file(pdf_path)
    return pdf_text