modification for production
Files changed:
- DESCRIPTION.md +14 -1
- agent.py +0 -33
- tool_create_FAISS_vector.py +28 -162
- tool_fetch_documents_DOI.py +0 -0
- tool_fetch_documents_texts.py +172 -0
DESCRIPTION.md
CHANGED
@@ -234,4 +234,17 @@ Medical question answering
 
 Literature reviews
 
-Automated extraction pipelines
+Automated extraction pipelines
+
+## Git branches
+
+main: main branch to merge development
+dev: auxiliary branches to add components
+production: branch to push to the Hugging Face Space [specific remote branch]
+
+Changes for production include:
+- Guard function to ensure the clinical-trials topic (sketched below)
+- Patch OpenInference
+- Disable tqdm
+- patch
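The production changes above name a topic guard and a tqdm switch without showing them. Below is a minimal sketch of what such a guard might look like, assuming a simple keyword heuristic; the function name and keyword list are illustrative, not the Space's actual implementation:

# Hypothetical topic guard; heuristic and names are assumptions, not the production code.
CLINICAL_TRIAL_KEYWORDS = {"clinical trial", "randomized", "placebo", "cohort",
                           "endpoint", "enrollment", "intervention"}

def guard_clinical_trials_topic(query: str) -> str:
    """Reject queries that are not about clinical trials before they reach the agent."""
    lowered = query.lower()
    if not any(keyword in lowered for keyword in CLINICAL_TRIAL_KEYWORDS):
        raise ValueError(
            "This Space only answers questions about clinical trials; please rephrase."
        )
    return query

Disabling tqdm in a headless Space can be as simple as constructing each bar with `tqdm(iterable, disable=True)`; how this commit actually patches it is not shown in the diff.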
agent.py
CHANGED
@@ -101,42 +101,9 @@ def parse_pdf(pdf_path:str)->list[str]:
         text.append(page.extract_text())
     return text
 
-# @tool
-# def make_rag_ressource(paths :list(str)) -> list(str):
-#     """
-#     Use extracted text to build a RAG tool and retreive documents to use to answer request
-
-#     Args:
-#         paths: The list of path where the file are stored
-
-#     Returns:
-#         A list of strings, where each string is the extracted text content
-#         from the retreiver
-#     """
-
-#     pdf_files=[]
-#     for path in paths:
-
-
-#     pdf_documents = []
-#     for pdf_file in pdf_files:
-#         loader = PyPDFLoader(pdf_file)
-#         pdf_documents.extend(loader.load())
-#     embeddings_model = OpenAIEmbeddings()
-#     pdf_texts = [doc.page_content for doc in pdf_documents]
-#     return ""
-
-
-# # Initialize the model
-# model = InferenceClientModel(
-#     model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
-#     provider="nebius"
-# )
-
 
 
 # Create clinical trial search agent
-
 clinical_agent = CodeAgent(
     name="clinical_agent",
     description=(
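The removed block initialized the model separately from the agent. For orientation, a sketch of how the two pieces fit together in smolagents, reusing the model ID from the removed comments; the description string and empty tool list are placeholders, not the Space's real configuration:

from smolagents import CodeAgent, InferenceClientModel

# Model block as it appeared in the removed comments
model = InferenceClientModel(
    model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
    provider="nebius",
)

clinical_agent = CodeAgent(
    name="clinical_agent",
    description="Searches and summarizes clinical trial literature.",  # placeholder
    model=model,
    tools=[],  # the Space's actual tools are defined elsewhere in agent.py
)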
tool_create_FAISS_vector.py
CHANGED
@@ -1,184 +1,50 @@
+# PDF parsing
 from pypdf import PdfReader
-import requests
 from io import BytesIO
-
+
+# HTTP requests
+import requests
+
+# Environment
 import os
 from dotenv import load_dotenv
 load_dotenv()
 
-
+# SerpAPI DOI lookup
+import serpapi
+
+# PubMed / Metapub
 from metapub import FindIt
-import requests
 import xml.etree.ElementTree as ET
 
+# FTP download
 from ftplib import FTP
 from urllib.parse import urlparse
-from io import BytesIO
-
-from langchain_community.retrievers import ArxivRetriever
 
+# ArXiv
 import arxiv
+from langchain_community.retrievers import ArxivRetriever
-import
-from io import BytesIO
-from pypdf import PdfReader
-import re
-
-from langchain_community.vectorstores.utils import DistanceStrategy
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from transformers import AutoTokenizer
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from tqdm import tqdm
 
+# Regex
 import re
-from typing import List, Dict, Tuple
-
-
-def parse_pdf_file(path:str) -> str:
-
-    if path.startswith("http://") or path.startswith("https://") or path.startswith("ftp://"):
-        response = requests.get(path)
-        response.raise_for_status()  # Ensure download succeeded
-        reader = PdfReader(BytesIO(response.content))
-    else:
-        reader = PdfReader(path)
-
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text() or ""
-
-    return text
-
-def get_paper_from_arxiv_id(doi: str):
-    """
-    Retrieve paper from arXiv using its arXiv ID.
-    """
-    client = arxiv.Client()
-    search = arxiv.Search(query=doi, max_results=1)
-    results = client.results(search)
-    pdf_url = next(results).pdf_url
-    text = parse_pdf_file(pdf_url)
-    return text
-
-def get_paper_from_arxiv_id_langchain(arxiv_id: str):
-    """
-    Retrieve paper from arXiv using its arXiv ID. ==> returns a Langchain Document
-    """
-    search = "2304.07814"
-    retriever = ArxivRetriever(
-        load_max_docs=2,
-        get_full_documents=True,
-    )
-    docs = retriever.invoke(search)
-    return docs
-
-def get_paper_from_pmid(pmid:str):
-    src = FindIt(pmid)
-    if src.url:
-        pdf_text = parse_pdf_file(src.url)
-        return pdf_text
-    else:
-        print(src.reason)
-
-
-def download_pdf_via_ftp(url: str) -> bytes:
-    """
-    Download a PDF file from an FTP URL and return its content as bytes.
-    """
-    parsed_url = urlparse(url)
-    ftp_host = parsed_url.netloc
-    ftp_path = parsed_url.path
-
-    file_buffer = BytesIO()
-
-    with FTP(ftp_host) as ftp:
-        ftp.login()
-        ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)
-
-    file_buffer.getvalue()
-    file_buffer.seek(0)
-    return file_buffer
-
-
-def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
-    """
-    Download and parse a PDF from PubMed using its PMID.
-    """
-    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
-    response = requests.get(url)
-    cleaned_string = response.content.decode('utf-8').strip()
-    try:
-        root = ET.fromstring(cleaned_string)
-        pdf_link_element = root.find(".//link[@format='pdf']")
-        ftp_url = pdf_link_element.get('href')
-        file_byte = download_pdf_via_ftp(ftp_url)
-
-        for page in reader.pages:
-            text += page.extract_text() or ""
-        print(f"got {pmid} via ftp download")
-        return text
-    except Exception as e:
-        print(e)
-
-def download_pdf_from_url(url):
-    """
-    Download and extract text from a PDF URL
-    """
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-    }
-    response = requests.get(url, headers=headers, timeout=30)
-    response.raise_for_status()
-    content_type = response.headers.get('content-type', '').lower()
-    if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
-        raise Exception(f"URL did not return a PDF (got {content_type})")
-
-    reader = PdfReader(BytesIO(response.content))
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text() #or ""
-    return text
-
-
-def download_paper_from_doi(doi):
-    """
-    Attempt to download paper from DOI with multiple fallback methods
-    """
-    # Clean DOI if it has prefix
-    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
-
-    # Method 1: Try Unpaywall API (free, legal access)
-    try:
-        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com"
-        response = requests.get(unpaywall_url, timeout=10)
-        if response.status_code == 200:
-            data = response.json()
-            if data.get('best_oa_location') and data['best_oa_location'].get('url_for_pdf'):
-                pdf_url = data['best_oa_location']['url_for_pdf']
-                text = download_pdf_from_url(pdf_url)
-                print(f"Found PDF via Unpaywall: {pdf_url}")
-                return text
-    except Exception as e:
-        print(f"Unpaywall failed: {e}")
-
-def get_pdf_content_serpapi(doi: str) -> str:
-    """
-    Get the link to the paper from its DOI using SerpAPI Google Scholar search.
-    """
-    client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
-    results = client.search({
-        'engine': 'google_scholar',
-        'q': doi,
-    })
-
-
-    return pdf_text
 
+# LangChain document
+from langchain_core.documents import Document as LangchainDocument
 
+# Document fetching tools (moved to tool_fetch_documents_texts.py)
+from tool_fetch_documents_texts import *
 
+# Torch device detection
+import torch
 
+# Embeddings & vector store dependencies
+from langchain_community.vectorstores.utils import DistanceStrategy
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from transformers import AutoTokenizer
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 
+# Progress bar
+from tqdm import tqdm
 
 class ReferenceExtractor:
     """Extract and classify references from LLM outputs."""

@@ -339,7 +205,7 @@ def create_vector_store_from_list_of_doi(refs :str, VECTOR_DB_PATH:str) -> str:
     # define embedding
     device = get_device()
 
-    embedding_name="BAAI/bge-
+    embedding_name = "BAAI/bge-small-en-v1.5"
     embedding_model = HuggingFaceEmbeddings(model_name=embedding_name,
                                             model_kwargs={"device": device},  # set device according to availability
                                             encode_kwargs={"normalize_embeddings": True},)
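For context, a minimal sketch of how the embedding, splitter, and FAISS pieces imported above typically combine; the chunking values and save path are assumptions, since the body of `create_vector_store_from_list_of_doi` is not shown in this diff:

from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument

def build_faiss_store(texts: list[str], db_path: str) -> FAISS:
    # Same embedding configuration as the hunk above
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        encode_kwargs={"normalize_embeddings": True},
    )
    # Chunk size and overlap are illustrative values, not the Space's settings
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(
        [LangchainDocument(page_content=t) for t in texts]
    )
    store = FAISS.from_documents(
        chunks, embedding_model, distance_strategy=DistanceStrategy.COSINE
    )
    store.save_local(db_path)  # persisted under VECTOR_DB_PATH in the real tool
    return store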
tool_fetch_documents_DOI.py
DELETED
File without changes
tool_fetch_documents_texts.py
ADDED
@@ -0,0 +1,172 @@
+# PDF parsing
+from pypdf import PdfReader
+from io import BytesIO
+
+# HTTP requests
+import requests
+
+# XML parsing (PubMed FTP metadata)
+import xml.etree.ElementTree as ET
+
+# FTP download
+from ftplib import FTP
+from urllib.parse import urlparse
+
+# ArXiv retrieval
+import arxiv
+from langchain_community.retrievers import ArxivRetriever
+
+# PubMed -> PDF resolution
+from metapub import FindIt
+
+# SerpAPI DOI search
+import serpapi
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def parse_pdf_file(path: str) -> str:
+    # http(s) URLs are downloaded first; anything else is opened as a local file.
+    # (requests does not handle ftp:// URLs; those go through download_pdf_via_ftp below.)
+    if path.startswith("http://") or path.startswith("https://"):
+        response = requests.get(path)
+        response.raise_for_status()  # Ensure download succeeded
+        reader = PdfReader(BytesIO(response.content))
+    else:
+        reader = PdfReader(path)
+
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() or ""
+
+    return text
+
+def get_paper_from_arxiv_id(arxiv_id: str):
+    """
+    Retrieve a paper from arXiv using its arXiv ID.
+    """
+    client = arxiv.Client()
+    search = arxiv.Search(query=arxiv_id, max_results=1)
+    results = client.results(search)
+    pdf_url = next(results).pdf_url
+    text = parse_pdf_file(pdf_url)
+    return text
+
+def get_paper_from_arxiv_id_langchain(arxiv_id: str):
+    """
+    Retrieve a paper from arXiv using its arXiv ID; returns LangChain Documents.
+    """
+    retriever = ArxivRetriever(
+        load_max_docs=2,
+        get_full_documents=True,
+    )
+    docs = retriever.invoke(arxiv_id)  # was a hardcoded test ID ("2304.07814")
+    return docs
+
+def get_paper_from_pmid(pmid: str):
+    src = FindIt(pmid)
+    if src.url:
+        pdf_text = parse_pdf_file(src.url)
+        return pdf_text
+    else:
+        print(src.reason)
+
+
+def download_pdf_via_ftp(url: str) -> BytesIO:
+    """
+    Download a PDF file from an FTP URL and return its content as a BytesIO buffer.
+    """
+    parsed_url = urlparse(url)
+    ftp_host = parsed_url.netloc
+    ftp_path = parsed_url.path
+
+    file_buffer = BytesIO()
+
+    with FTP(ftp_host) as ftp:
+        ftp.login()
+        ftp.retrbinary(f'RETR {ftp_path}', file_buffer.write)
+
+    file_buffer.seek(0)  # rewind so the caller can read from the start
+    return file_buffer
+
+
+def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
+    """
+    Download and parse a PDF from PubMed using its PMID.
+    """
+    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
+    response = requests.get(url)
+    cleaned_string = response.content.decode('utf-8').strip()
+    try:
+        root = ET.fromstring(cleaned_string)
+        pdf_link_element = root.find(".//link[@format='pdf']")
+        ftp_url = pdf_link_element.get('href')
+        file_byte = download_pdf_via_ftp(ftp_url)
+
+        reader = PdfReader(file_byte)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text() or ""
+        print(f"got {pmid} via ftp download")
+        return text
+    except Exception as e:
+        print(e)
+
+def download_pdf_from_url(url):
+    """
+    Download and extract text from a PDF URL
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+    }
+    response = requests.get(url, headers=headers, timeout=30)
+    response.raise_for_status()
+    content_type = response.headers.get('content-type', '').lower()
+    if 'pdf' not in content_type and not response.content.startswith(b'%PDF'):
+        raise Exception(f"URL did not return a PDF (got {content_type})")
+
+    reader = PdfReader(BytesIO(response.content))
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() or ""  # guard against pages with no extractable text
+    return text
+
+
+def download_paper_from_doi(doi):
+    """
+    Attempt to download paper from DOI with multiple fallback methods
+    """
+    # Clean DOI if it has prefix
+    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
+
+    # Method 1: Try Unpaywall API (free, legal access)
+    try:
+        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com"
+        response = requests.get(unpaywall_url, timeout=10)
+        if response.status_code == 200:
+            data = response.json()
+            if data.get('best_oa_location') and data['best_oa_location'].get('url_for_pdf'):
+                pdf_url = data['best_oa_location']['url_for_pdf']
+                text = download_pdf_from_url(pdf_url)
+                print(f"Found PDF via Unpaywall: {pdf_url}")
+                return text
+    except Exception as e:
+        print(f"Unpaywall failed: {e}")
+    # Falls through to None if Unpaywall yields nothing.
+
+def get_pdf_content_serpapi(doi: str) -> str:
+    """
+    Get the link to the paper from its DOI using SerpAPI Google Scholar search.
+    """
+    client = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
+    results = client.search({
+        'engine': 'google_scholar',
+        'q': doi,
+    })
+
+    pdf_path = results["organic_results"][0]["link"]
+    pdf_text = parse_pdf_file(pdf_path)
+    return pdf_text
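A short usage sketch of the new module; the DOI and PMID below are illustrative inputs, and `download_paper_from_doi` returns None when Unpaywall finds no open-access PDF:

from tool_fetch_documents_texts import (
    download_paper_from_doi,
    get_paper_from_pmid,
)

text = download_paper_from_doi("10.1000/example.doi")  # illustrative DOI, Unpaywall route
if not text:
    text = get_paper_from_pmid("12345678")  # illustrative PMID, Metapub/FindIt route
print(text[:500] if text else "No open-access PDF found")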