# MCP-1st-Birthday_Hackathon / tool_fetch_documents_texts.py
# Author: RCaz — "modification for production" (commit bcfa0c8)
# NOTE(review): these header lines were web-page chrome pasted into the file
# and would have been a SyntaxError; converted to comments.
# PDF parsing
from pypdf import PdfReader
from io import BytesIO
# HTTP requests
import requests
# XML parsing (PubMed FTP metadata)
import xml.etree.ElementTree as ET
# FTP download
from ftplib import FTP
from urllib.parse import urlparse
# ArXiv retrieval
import arxiv
from langchain_community.retrievers import ArxivRetriever
# PubMed → PDF resolution
from metapub import FindIt
# SerpAPI DOI search
import serpapi
import os
from dotenv import load_dotenv
load_dotenv()
def parse_pdf_file(path: str) -> str:
    """
    Extract the plain text of a PDF given a local path or a URL.

    Supports http(s) downloads via requests, and ftp:// URLs via ftplib —
    requests does not implement the FTP scheme, so the original code would
    have failed on ftp:// inputs. Anything else is treated as a local path.

    Args:
        path: Local filesystem path or http(s)/ftp URL of a PDF.

    Returns:
        Concatenated text of every page; pages with no extractable text
        contribute the empty string.
    """
    if path.startswith(("http://", "https://")):
        response = requests.get(path, timeout=30)
        response.raise_for_status()  # Ensure download succeeded
        reader = PdfReader(BytesIO(response.content))
    elif path.startswith("ftp://"):
        # requests cannot fetch ftp:// URLs; use ftplib directly.
        parsed = urlparse(path)
        buffer = BytesIO()
        with FTP(parsed.netloc) as ftp:
            ftp.login()  # anonymous login
            ftp.retrbinary(f"RETR {parsed.path}", buffer.write)
        buffer.seek(0)
        reader = PdfReader(buffer)
    else:
        reader = PdfReader(path)
    # join() avoids quadratic string concatenation across many pages.
    return "".join(page.extract_text() or "" for page in reader.pages)
def get_paper_from_arxiv_id(doi: str):
    """
    Retrieve the full text of a paper from arXiv.

    Args:
        doi: Query string passed to the arXiv search API (typically an
            arXiv ID; despite the parameter name it is used as a free-text
            query).

    Returns:
        The extracted PDF text of the first search hit.

    Raises:
        ValueError: If the search returns no results (the original code
            raised a bare StopIteration from ``next()`` in that case).
    """
    client = arxiv.Client()
    search = arxiv.Search(query=doi, max_results=1)
    results = client.results(search)
    first = next(results, None)
    if first is None:
        raise ValueError(f"No arXiv result found for query: {doi!r}")
    text = parse_pdf_file(first.pdf_url)
    return text
def get_paper_from_arxiv_id_langchain(arxiv_id: str):
    """
    Retrieve a paper from arXiv using its arXiv ID.

    Args:
        arxiv_id: The arXiv identifier to fetch (e.g. "2304.07814").

    Returns:
        A list of LangChain Documents containing the paper's full text.
    """
    # Bug fix: the query was previously hard-coded to "2304.07814",
    # silently ignoring the caller-supplied arxiv_id.
    retriever = ArxivRetriever(
        load_max_docs=2,
        get_full_documents=True,
    )
    docs = retriever.invoke(arxiv_id)
    return docs
def get_paper_from_pmid(pmid: str):
    """
    Resolve a PubMed ID to an open-access PDF and return its extracted text.

    Uses metapub's FindIt to locate a PDF URL for the article. When no URL
    can be found, the reason reported by FindIt is printed and None is
    returned.
    """
    source = FindIt(pmid)
    if not source.url:
        # FindIt explains why no open-access PDF could be located.
        print(source.reason)
        return None
    return parse_pdf_file(source.url)
def download_pdf_via_ftp(url: str) -> BytesIO:
    """
    Download a file from an FTP URL into an in-memory buffer.

    Args:
        url: An ftp:// URL pointing at the file to download.

    Returns:
        A BytesIO rewound to the start of the downloaded content, suitable
        for passing directly to PdfReader. (The original annotation claimed
        ``bytes``, but a BytesIO was — and still is — returned; the caller
        relies on the file-like interface.)
    """
    parsed_url = urlparse(url)
    ftp_host = parsed_url.netloc
    ftp_path = parsed_url.path
    file_buffer = BytesIO()
    with FTP(ftp_host) as ftp:
        ftp.login()  # anonymous login
        ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)
    # Bug fix: a stray file_buffer.getvalue() discarded its result here
    # (dead code); rewinding is all that is needed before reading.
    file_buffer.seek(0)
    return file_buffer
def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
    """
    Download and parse a PDF from the PubMed Central OA service by PMID.

    Queries the PMC OA web service, extracts the FTP link to the PDF,
    downloads it, and returns the extracted text. Best-effort: on any
    failure the error is printed and None is returned.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail fast on HTTP errors
        cleaned_string = response.content.decode('utf-8').strip()
        root = ET.fromstring(cleaned_string)
        pdf_link_element = root.find(".//link[@format='pdf']")
        if pdf_link_element is None:
            # Previously this fell through to an opaque AttributeError on
            # .get(); report the real cause instead.
            print(f"No open-access PDF link in OA response for {pmid}")
            return None
        ftp_url = pdf_link_element.get('href')
        file_byte = download_pdf_via_ftp(ftp_url)
        reader = PdfReader(file_byte)
        # join() avoids quadratic string concatenation across many pages.
        text = "".join(page.extract_text() or "" for page in reader.pages)
        print(f"got {pmid} via ftp download")
        return text
    except Exception as e:
        # Best-effort helper: report and return None rather than crash.
        print(e)
        return None
def download_pdf_from_url(url):
    """
    Download a PDF from an HTTP(S) URL and return its extracted text.

    Args:
        url: Direct link expected to serve a PDF document.

    Returns:
        The concatenated text of all pages.

    Raises:
        Exception: If the response is neither declared nor sniffed as PDF.
        requests.HTTPError: On a non-2xx HTTP response.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # Accept the response if either the content-type header or the magic
    # bytes identify it as a PDF.
    content_type = response.headers.get('content-type', '').lower()
    if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
        raise Exception(f"URL did not return a PDF (got {content_type})")
    reader = PdfReader(BytesIO(response.content))
    text = ""
    for page in reader.pages:
        # Bug fix: extract_text() may return None for image-only pages; the
        # `or ""` fallback had been commented out, causing a TypeError.
        text += page.extract_text() or ""
    return text
def download_paper_from_doi(doi):
    """
    Attempt to download the full text of a paper from its DOI.

    Currently tries the Unpaywall API (free, legal open-access lookup).

    Args:
        doi: A DOI, optionally prefixed with an https://doi.org/ resolver.

    Returns:
        The extracted PDF text, or None when no open-access PDF could be
        found or the download failed.
    """
    # Normalize: strip any resolver prefix so only the bare DOI remains.
    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
    # Unpaywall requires a contact email; read it from the environment
    # instead of shipping the placeholder (which remains the default for
    # backward compatibility — set UNPAYWALL_EMAIL in production).
    email = os.getenv("UNPAYWALL_EMAIL", "your@email.com")
    try:
        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
        response = requests.get(unpaywall_url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            best = data.get('best_oa_location')
            if best and best.get('url_for_pdf'):
                pdf_url = best['url_for_pdf']
                text = download_pdf_from_url(pdf_url)
                print(f"Found PDF via Unpaywall: {pdf_url}")
                return text
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to None.
        print(f"Unpaywall failed: {e}")
    return None
def get_pdf_content_serpapi(doi: str) -> str:
    """
    Locate a paper via Google Scholar (SerpAPI) using its DOI and return
    the parsed text of the first result's link.

    Args:
        doi: The DOI (or any query string) to search on Google Scholar.

    Returns:
        The extracted text of the PDF found at the first organic result.

    Raises:
        ValueError: If the search yields no organic results (previously an
            opaque KeyError/IndexError).
    """
    client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
    results = client.search({
        'engine': 'google_scholar',
        'q': doi,
    })
    try:
        pdf_path = results["organic_results"][0]["link"]
    except (KeyError, IndexError):
        raise ValueError(f"No Google Scholar results for DOI: {doi!r}")
    pdf_text = parse_pdf_file(pdf_path)
    return pdf_text