Spaces:
Running
Running
| # PDF parsing | |
| from pypdf import PdfReader | |
| from io import BytesIO | |
| # HTTP requests | |
| import requests | |
| # XML parsing (PubMed FTP metadata) | |
| import xml.etree.ElementTree as ET | |
| # FTP download | |
| from ftplib import FTP | |
| from urllib.parse import urlparse | |
| # ArXiv retrieval | |
| import arxiv | |
| from langchain_community.retrievers import ArxivRetriever | |
| # PubMed → PDF resolution | |
| from metapub import FindIt | |
| # SerpAPI DOI search | |
| import serpapi | |
| import os | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
| def parse_pdf_file(path:str) -> str: | |
| if path.startswith("http://") or path.startswith("https://") or path.startswith("ftp://"): | |
| response = requests.get(path) | |
| response.raise_for_status() # Ensure download succeeded | |
| reader = PdfReader(BytesIO(response.content)) | |
| else: | |
| reader = PdfReader(path) | |
| text = "" | |
| for page in reader.pages: | |
| text += page.extract_text() or "" | |
| return text | |
| def get_paper_from_arxiv_id(doi: str): | |
| """ | |
| Retrieve paper from arXiv using its arXiv ID. | |
| """ | |
| client = arxiv.Client() | |
| search = arxiv.Search(query=doi, max_results=1) | |
| results = client.results(search) | |
| pdf_url = next(results).pdf_url | |
| text = parse_pdf_file(pdf_url) | |
| return text | |
| def get_paper_from_arxiv_id_langchain(arxiv_id: str): | |
| """ | |
| Retrieve paper from arXiv using its arXiv ID. ==> returns a Langchain Document | |
| """ | |
| search = "2304.07814" | |
| retriever = ArxivRetriever( | |
| load_max_docs=2, | |
| get_full_documents=True, | |
| ) | |
| docs = retriever.invoke(search) | |
| return docs | |
| def get_paper_from_pmid(pmid:str): | |
| src = FindIt(pmid) | |
| if src.url: | |
| pdf_text = parse_pdf_file(src.url) | |
| return pdf_text | |
| else: | |
| print(src.reason) | |
| def download_pdf_via_ftp(url: str) -> bytes: | |
| """ | |
| Download a PDF file from an FTP URL and return its content as bytes. | |
| """ | |
| parsed_url = urlparse(url) | |
| ftp_host = parsed_url.netloc | |
| ftp_path = parsed_url.path | |
| file_buffer = BytesIO() | |
| with FTP(ftp_host) as ftp: | |
| ftp.login() | |
| ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write) | |
| file_buffer.getvalue() | |
| file_buffer.seek(0) | |
| return file_buffer | |
| def parse_pdf_from_pubmed_pmid(pmid: str) -> str: | |
| """ | |
| Download and parse a PDF from PubMed using its PMID. | |
| """ | |
| url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}" | |
| response = requests.get(url) | |
| cleaned_string = response.content.decode('utf-8').strip() | |
| try: | |
| root = ET.fromstring(cleaned_string) | |
| pdf_link_element = root.find(".//link[@format='pdf']") | |
| ftp_url = pdf_link_element.get('href') | |
| file_byte = download_pdf_via_ftp(ftp_url) | |
| reader = PdfReader(file_byte) | |
| text = "" | |
| for page in reader.pages: | |
| text += page.extract_text() or "" | |
| print(f"got {pmid} via ftp download") | |
| return text | |
| except Exception as e: | |
| print(e) | |
| def download_pdf_from_url(url): | |
| """ | |
| Download and extract text from a PDF URL | |
| """ | |
| headers = { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' | |
| } | |
| response = requests.get(url, headers=headers, timeout=30) | |
| response.raise_for_status() | |
| content_type = response.headers.get('content-type', '').lower() | |
| if 'pdf' not in content_type and not response.content.startswith(b'%PDF'): | |
| raise Exception(f"URL did not return a PDF (got {content_type})") | |
| reader = PdfReader(BytesIO(response.content)) | |
| text = "" | |
| for page in reader.pages: | |
| text += page.extract_text() #or "" | |
| return text | |
| def download_paper_from_doi(doi): | |
| """ | |
| Attempt to download paper from DOI with multiple fallback methods | |
| """ | |
| # Clean DOI if it has prefix | |
| doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '') | |
| # Method 1: Try Unpaywall API (free, legal access) | |
| try: | |
| unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com" | |
| response = requests.get(unpaywall_url, timeout=10) | |
| if response.status_code == 200: | |
| data = response.json() | |
| if data.get('best_oa_location') and data['best_oa_location'].get('url_for_pdf'): | |
| pdf_url = data['best_oa_location']['url_for_pdf'] | |
| text = download_pdf_from_url(pdf_url) | |
| print(f"Found PDF via Unpaywall: {pdf_url}") | |
| return text | |
| except Exception as e: | |
| print(f"Unpaywall failed: {e}") | |
| def get_pdf_content_serpapi(doi: str) -> str: | |
| """ | |
| Get the link to the paper from its DOI using SerpAPI Google Scholar search. | |
| """ | |
| client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY")) | |
| results = client.search({ | |
| 'engine': 'google_scholar', | |
| 'q': doi, | |
| }) | |
| pdf_path = results["organic_results"][0]["link"] | |
| pdf_text = parse_pdf_file(pdf_path) | |
| return pdf_text | |