RCaz commited on
Commit
c64c1d6
·
1 Parent(s): 2ea0c95

added tools and agent

Browse files
agent.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import requests
4
+ from dotenv import load_dotenv
5
+ from markdownify import markdownify
6
+ from requests.exceptions import RequestException
7
+ from smolagents import (
8
+ LiteLLMModel,
9
+ CodeAgent,
10
+ ToolCallingAgent,
11
+ InferenceClientModel,
12
+ WebSearchTool,
13
+ tool,
14
+ FinalAnswerTool,
15
+ WikipediaSearchTool,
16
+ VisitWebpageTool,
17
+ DuckDuckGoSearchTool
18
+ )
19
+
20
+ load_dotenv()
21
+
22
+ from langfuse import get_client
23
+ langfuse = get_client()
24
+ if langfuse.auth_check():
25
+ print("Langfuse client is authenticated and ready!")
26
+ else:
27
+ print("Authentication failed. Please check your credentials and host.")
28
+
29
+
30
+ from openinference.instrumentation.smolagents import SmolagentsInstrumentor
31
+ SmolagentsInstrumentor().instrument()
32
+
33
+ model = LiteLLMModel(
34
+ model_id="openai/Qwen/Qwen3-Coder-480B-A35B-Instruct",
35
+ api_key=os.environ.get("NEBIUS_API_KEY"),
36
+ api_base="https://api.tokenfactory.nebius.com/v1/"
37
+ )
38
+
39
+ from tool_clinical_trial import ClinicalTrialsSearchTool
40
+
41
+
42
@tool
def search_pubmed(topic: str, author: str) -> list[str]:
    """
    Searches the PubMed database for articles related to a specific topic.

    Args:
        topic: The topic or keywords to search for (e.g., "CRISPR gene editing").
        author: The name of the author to search for (e.g., "Albert Einstein").

    Returns:
        A list of PubMed IDs (strings) for up to the top 1000 articles found
        (NCBI esearch `retmax`). Empty list when nothing matches.

    Raises:
        requests.exceptions.HTTPError: If the API request fails.
        requests.exceptions.Timeout: If NCBI does not answer within 30 s.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    # Build the esearch term: topic AND author, skipping whichever is empty.
    terms = []
    if topic:
        terms.append(topic)
    if author:
        terms.append(f"{author}[Author]")

    query = " AND ".join(terms)
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": 1000,
    }
    # Timeout keeps an unresponsive NCBI endpoint from hanging the agent step.
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    return data["esearchresult"]["idlist"]
77
+
78
@tool
def parse_pdf(pdf_path: str) -> list[str]:
    """
    Reads a PDF file from a specified path and extracts the text content
    from every page.

    Args:
        pdf_path: The local file path (string) to the PDF document to be parsed.
            **NOTE**: In a remote agent environment, this path must be
            accessible by the executing process (e.g., a path to an
            uploaded file).

    Returns:
        A list of strings, where each string is the extracted text content
        from a single page of the PDF.
    """
    # Imported lazily so the tool only needs pypdf when actually invoked.
    from pypdf import PdfReader

    reader = PdfReader(pdf_path)
    # reader.pages is already iterable page-by-page; no index bookkeeping needed.
    return [page.extract_text() for page in reader.pages]
103
+
104
+ # @tool
105
+ # def make_rag_ressource(paths :list(str)) -> list(str):
106
+ # """
107
+ # Use extracted text to build a RAG tool and retreive documents to use to answer request
108
+
109
+ # Args:
110
+ # paths: The list of path where the file are stored
111
+
112
+ # Returns:
113
+ # A list of strings, where each string is the extracted text content
114
+ # from the retreiver
115
+ # """
116
+
117
+ # pdf_files=[]
118
+ # for path in paths:
119
+
120
+
121
+ # pdf_documents = []
122
+ # for pdf_file in pdf_files:
123
+ # loader = PyPDFLoader(pdf_file)
124
+ # pdf_documents.extend(loader.load())
125
+ # embeddings_model = OpenAIEmbeddings()
126
+ # pdf_texts = [doc.page_content for doc in pdf_documents]
127
+ # return ""
128
+
129
+
130
+ # # Initialize the model
131
+ # model = InferenceClientModel(
132
+ # model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
133
+ # provider="nebius"
134
+ # )
135
+
136
+
137
+
138
+ # Create clinical trial search agent
139
+
140
# Agent dedicated to clinical-study retrieval. The original description was a
# copy-paste mixing in the online-search agent's text (Wikipedia/DuckDuckGo),
# and promised search_pubmed/parse_pdf without providing them as tools.
clinical_agent = CodeAgent(
    name="clinical_agent",
    description=(
        "Retrieve and parse clinical study data for a given disease. "
        "Use ClinicalTrialsSearchTool for trials, search_pubmed for authors, and parse_pdf for full-text analysis. "
        "Return structured tables or summaries as requested."
    ),
    # Tools now match the description: trials search plus PubMed and PDF parsing.
    tools=[ClinicalTrialsSearchTool(), search_pubmed, parse_pdf],
    additional_authorized_imports=["time", "numpy", "pandas"],
    # executor_type="blaxel", #executor_type="modal",
    use_structured_outputs_internally=True,
    return_full_result=True,
    planning_interval=3,  # V3 add structure
    model=model,
    max_steps=6,
    verbosity_level=2,
)
160
+
161
+ search_online_info = CodeAgent(
162
+ name="search_online_info",
163
+ description=(
164
+ "Gather general or recent information from online sources. "
165
+ "Use Wikipedia for overviews, DuckDuckGo for recent data, and VisitWebpageTool for specific URLs. "
166
+ "Return structured summaries with sources."
167
+ ),
168
+ tools=[WikipediaSearchTool(),VisitWebpageTool(max_output_length=10000),DuckDuckGoSearchTool(max_results=5),search_pubmed,parse_pdf],
169
+ additional_authorized_imports=["time", "numpy", "pandas"],
170
+ # use_structured_outputs_internally=True,
171
+ # executor_type="modal",
172
+ planning_interval=2,
173
+ model=model,
174
+ max_steps=4,
175
+ verbosity_level=2
176
+ )
177
+
178
+
179
+
180
+ manager_agent = CodeAgent(
181
+ name="manager_agent",
182
+ description=(
183
+ "Most important task is to provide a complete answer to user questions based on clinical trial data and online information. "
184
+ "Orchestrate workflow between clinical and online agents. "
185
+ "Validate outputs, resolve conflicts, and ensure the final answer is complete and accurate."
186
+ ),
187
+ tools=[FinalAnswerTool()],
188
+ model=model,
189
+ managed_agents=[clinical_agent,search_online_info],
190
+ # executor_type="modal",
191
+ provide_run_summary=True,
192
+ additional_authorized_imports=["time", "numpy", "pandas"],
193
+ use_structured_outputs_internally=True,
194
+ verbosity_level=2,
195
+ planning_interval=3,
196
+ max_steps=6,
197
+ )
app.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agent import manager_agent
2
+ import gradio as gr
3
+ from smolagents import stream_to_gradio
4
+ import smolagents
5
+ import json
6
+ import re
7
+ import ast
8
+
9
+ agent = manager_agent
10
+
11
+
12
+ import logging
13
+ logging.info("Processing request")
14
+
15
+
16
# --- PATCH OpenTelemetry detach bug (generator-safe) ---
# Gradio streams this app's agent output through Python generators. OpenTelemetry
# context tokens attached inside a generator frame can be detached from a
# different context, which raises on detach. We wrap the runtime context's
# detach() so those boundary errors are swallowed instead of crashing the stream.
from opentelemetry.context import _RUNTIME_CONTEXT

# Keep a reference to the real detach so the wrapper can delegate to it.
_orig_detach = _RUNTIME_CONTEXT.detach


def _safe_detach(token):
    """Delegate to the original detach, ignoring any error it raises.

    NOTE(review): this monkey-patches a private OpenTelemetry attribute
    (_RUNTIME_CONTEXT) — verify it still exists when upgrading the SDK.
    """
    try:
        _orig_detach(token)
    except Exception:
        # Suppress context-var boundary errors caused by streamed generators
        pass


_RUNTIME_CONTEXT.detach = _safe_detach
# --- PATCH OpenTelemetry detach bug (generator-safe) ---
27
+
28
+
29
def answer_question(question):
    """Use a smolagents CodeAgent with tools to answer a question.

    The agent streams its thought process (planning steps, tool calls and
    action outputs) and the final answer.

    Args:
        question (str): The question to be answered by the agent.

    Yields:
        tuple(str, str): A tuple containing the current 'thoughts'
        (planning/intermediate steps) and the current 'final_answer'.
        'final_answer' stays empty until a FinalAnswerStep is produced.
    """
    thoughts = ""
    final_answer = ""
    try:
        logging.info(f"Received question: {question}")
        for st in manager_agent.run(question, stream=True, return_full_result=True):
            if isinstance(st, smolagents.memory.PlanningStep):
                # Show only the plan body after the "## 2." heading; guard
                # against a None content from the model.
                content = st.model_output_message.content or ""
                plan = content.split("## 2.")[-1]
                for m in plan.split("\n"):
                    thoughts += "\n" + m
                    yield thoughts, final_answer

            elif isinstance(st, smolagents.memory.ToolCall):
                thoughts += f"\nTool called: {st.dict()['function']['name']}\n"
                for m in st.dict()['function']['arguments'].split("\n"):
                    thoughts += "\n" + m
                    yield thoughts, final_answer

            elif isinstance(st, smolagents.agents.ActionOutput):
                if st.output:
                    thoughts += "\n" + str(st.output) + "\n"
                    yield thoughts, final_answer
                else:
                    thoughts += "\n****************\nNo output from action.\n****************\n"
                    yield thoughts, final_answer

            elif isinstance(st, smolagents.memory.ActionStep):
                # Stream the model output line by line, then a per-step footer
                # with token usage and timing. Guard against None content.
                content = st.model_output_message.content or ""
                for m in content.split("\n"):
                    thoughts += m
                    yield thoughts, final_answer

                thoughts += "\n********** End of Step " + str(st.step_number) + " : *********\n " + str(st.token_usage) + "\nStep duration" + str(st.timing) + "\n\n"
                yield thoughts, final_answer

            elif isinstance(st, smolagents.memory.FinalAnswerStep):
                final_answer = st.output
                yield thoughts, final_answer
    except GeneratorExit:
        # The Gradio client disconnected mid-stream; stop quietly.
        print("Stream closed cleanly.")
        return "", ""
79
+
80
+
81
+
82
+ # def create_rag_files(refs :list[str], VECTOR_DB_PATH:str)-> str:
83
+ # from tool_create_FAISS_vector import create_vector_store_from_list_of_doi
84
+
85
+ # FAISS_VECTOR_PATH = create_vector_store_from_list_of_doi(refs,VECTOR_DB_PATH)
86
+ # return FAISS_VECTOR_PATH
87
+
88
def tool_clinical_trial(query_cond: str = None, query_term: str = None, query_lead: str = None, max_results: int = 5000) -> list:
    """
    Search the ClinicalTrials.gov v2 database for trials with 4 arguments.

    Args:
        query_cond (str): Disease or condition (e.g., 'lung cancer', 'diabetes')
        query_term (str): Other terms (e.g., 'AREA[LastUpdatePostDate]RANGE[2023-01-15,MAX]').
        query_lead (str): Searches the LeadSponsorName
        max_results (int): Number of trials to return (max: 1000, the API's pageSize limit)

    Returns:
        list(str): each string being a structured representation of a trial,
        or a single-element list carrying an error message on failure.
    """
    # Local import: app.py does not import requests at module level, so the
    # original code raised NameError the first time this tool was called.
    import requests
    from tool_TOON_formater import TOON_formater

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        # Non-numeric input (e.g. empty Textbox) -> sensible default.
        max_results = 500

    params = {
        "query.cond": query_cond,
        "query.term": query_term,
        "query.lead": query_lead,
        # The v2 API caps pageSize at 1000; the old cap of 5000 was silently
        # rejected by the server (and contradicted this docstring).
        "pageSize": min(max_results, 1000),
        "format": "json"
    }
    # Drop unset filters so they are not sent as literal "None" strings.
    params = {k: v for k, v in params.items() if v is not None}
    try:
        response = requests.get(
            "https://clinicaltrials.gov/api/v2/studies",
            params=params,
            timeout=30
        )
        response.raise_for_status()
        studies = response.json().get("studies", [])

        # Serialize every study to its compact TOON representation.
        return [TOON_formater(study) for study in studies]

    except Exception as e:
        # Best-effort tool: surface the failure as data instead of raising,
        # so the calling agent/UI can display it.
        return [f"Error searching clinical trials: {str(e)}"]
133
+
134
+
135
+
136
def create_rag(refs :str, VECTOR_DB_PATH:str)-> str:
    """Create a RAG (Retrieval-Augmented Generation) vector store from references.

    Thin wrapper around the project-local FAISS builder; imported lazily so the
    heavy vector-store dependencies load only when this tool is used.

    Args:
        refs (str): A comma-separated string of references
            (DOIs, PMIDs, arXiv IDs, ... — see tool_create_FAISS_vector).
        VECTOR_DB_PATH (str): The local path where the FAISS vector store
            should be saved.

    Returns:
        str: The path to the newly created FAISS vector store.
    """
    from tool_create_FAISS_vector import create_vector_store_from_list_of_doi
    FAISS_VECTOR_PATH = create_vector_store_from_list_of_doi(refs,VECTOR_DB_PATH)
    return FAISS_VECTOR_PATH
147
+
148
+
149
+
150
def use_rag(query: str, store_name: str, top_k: int = 5) -> str:
    """Retrieve context from a FAISS vector store based on a query.

    Thin wrapper around the project-local retriever; imported lazily so the
    FAISS dependencies load only when this tool is used.

    Args:
        query (str): The question or query string to use for retrieval.
        store_name (str): The path to the FAISS vector store to query.
        top_k (int): The number of top-k most relevant context documents to
            retrieve (default: 5).

    Returns:
        str: A JSON string containing the retrieved context, including the
        content and source (DOI), pretty-printed with indent=2.
    """
    from tool_query_FAISS_vector import query_vector_store
    context_as_dict = query_vector_store(query, store_name, top_k)
    return json.dumps(context_as_dict, indent=2)
162
+
163
+ from PIL import Image
164
+
165
def describe_figure(figure : Image.Image) -> str:
    """Provide a detailed, thorough description of an image figure.

    Thin wrapper around the project-local vision helper; imported lazily so it
    loads only when this tool is used.

    Args:
        figure (Image.Image): The PIL image object to be described.

    Returns:
        str: A detailed textual description of the figure's content.
    """
    from tool_describe_figure import thourough_picture_description
    description = thourough_picture_description(figure)
    return description
175
+
176
+
177
+
178
+ # Create neat interface - Question Analyzer as a Blocks component
179
# Question Analyzer as a Blocks component: streams agent thoughts and answer.
with gr.Blocks() as interface2:
    gr.Markdown("# Question Analyzer")
    gr.Markdown("""Enter a question to analyze. Examples:
    - Find the name of the sponsor that did the most studies on Alzheimer's disease in the last 10 years.
    - Provide a summary of recent clinical trials on diabetes and list 3 relevant research articles from PubMed.
    - What are the scientific paper linked to the clinical study referenced as NCT04516746?
    - How many clinical studies on cancer were completed in the last 5 years?
    - Find recent phase 3 trials for lung cancer sponsored by Pfizer
    """)

    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(
                label="Question",
                placeholder="Enter your question here...",
                lines=3,
            )
            submit_btn = gr.Button("Submit", variant="primary")
            response_output = gr.Textbox(
                label="Final Answer",
                interactive=False,
                lines=8
            )
        with gr.Column():
            thoughts_output = gr.Textbox(
                label="LLM Thoughts/Reasoning",
                interactive=False,
                lines=8
            )

    chat_history = gr.State([])

    # answer_question is a generator, so queue=True streams partial updates.
    submit_btn.click(
        fn=answer_question,
        inputs=[question_input],
        outputs=[thoughts_output, response_output],
        queue=True
    )


# Combine interfaces into a single tabbed interface.
# Fixes vs original: the tab-label list was missing a comma (two labels were
# silently concatenated, leaving 4 labels for 5 tabs), two interfaces shared
# the same api_name, and several user-facing typos are corrected.
demo = gr.TabbedInterface(
    [interface2,
     gr.Interface(
         fn=create_rag,
         inputs=[gr.Textbox("list of references to include in vector store", lines=2, info="(can be DOIs, PMIDs, arxivs, ... and a mix of it)"),
                 gr.Textbox("Name of the vector store", lines=2, placeholder="My_Diabetes_vector")],
         outputs=gr.Textbox("path of the vector store"),
         api_name="create_vector_store_for_rag"),

     gr.Interface(
         fn=use_rag,
         inputs=[gr.Textbox("question that needs context to answer"),
                 gr.Textbox("Name of the vector store to use", placeholder="Diabetes, Sickle_cell_anemia, Prostate_cancer, ..")],
         outputs=gr.Textbox("Answer with Rag"),
         api_name="use_vector_store_to_create_context"),
     gr.Interface(
         fn=tool_clinical_trial,
         inputs=[gr.Textbox("Disease or condition (e.g., 'lung cancer', 'diabetes')"),
                 gr.Textbox("Other terms (e.g., 'AREA[LastUpdatePostDate]RANGE[2023-01-15,MAX]'"),
                 gr.Textbox("Searches the LeadSponsorName"),
                 gr.Textbox("max results")],
         outputs=gr.Textbox("TOON formatted response"),
         # Unique api_name (was a duplicate of the RAG-query endpoint).
         api_name="query_clinical_trials_database"),
     gr.Interface(
         describe_figure,
         gr.Image(type="pil"),
         gr.Textbox(),
         api_name="figure_description"),
     ],
    ["Use a code agent with sandbox execution equipped with clinical trial tool",
     "Create RAG tool with FAISS vector store",
     "Query RAG tool",
     "Query clinical trial database",
     "Thorough figure description"]
)

if __name__ == "__main__":
    # mcp_server=True exposes the interfaces as MCP tools as well.
    demo.queue().launch(mcp_server=True)
tool_TOON_formater.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def TOON_formater(api_response):
    """
    Extract core partner identification information from a ClinicalTrials.gov
    API study record and serialize it as TOON text.

    Args:
        api_response (dict): Raw study record from the ClinicalTrials.gov API
            (one element of the v2 "studies" array).

    Returns:
        str: TOON (Token-Oriented Object Notation) formatted string with 41
        core fields; missing values are rendered as empty strings.
    """

    # Helper function to safely navigate nested dicts.
    def safe_get(data, *keys, default=None):
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        # An empty dict means some key along the path was missing.
        return data if data != {} else default

    # Helper function to format a scalar for TOON: None -> '', bools lowercase.
    def format_value(val):
        if val is None:
            return ''
        elif isinstance(val, bool):
            return str(val).lower()
        else:
            return str(val)

    # Helper function to format a list for TOON: comma-joined, items containing
    # commas/newlines wrapped in double quotes (embedded quotes not escaped).
    def format_list(lst):
        if not lst:
            return ''
        formatted_items = []
        for item in lst:
            item_str = format_value(item)
            if ',' in item_str or '\n' in item_str:
                item_str = f'"{item_str}"'
            formatted_items.append(item_str)
        return ','.join(formatted_items)

    protocol = api_response.get('protocolSection', {})

    # Extract basic identification
    identification = protocol.get('identificationModule', {})
    nct_id = identification.get('nctId')
    brief_title = identification.get('briefTitle')
    official_title = identification.get('officialTitle')
    org_full_name = safe_get(identification, 'organization', 'fullName')

    # Extract status information
    status = protocol.get('statusModule', {})
    overall_status = status.get('overallStatus')
    last_update_post_date = safe_get(status, 'lastUpdatePostDateStruct', 'date')
    recruitment_status = overall_status
    start_date = safe_get(status, 'startDateStruct', 'date')
    primary_completion_date = safe_get(status, 'primaryCompletionDateStruct', 'date')
    completion_date = safe_get(status, 'completionDateStruct', 'date')
    study_first_post_date = safe_get(status, 'studyFirstPostDateStruct', 'date')

    # Extract sponsor/collaborator information
    sponsors = protocol.get('sponsorCollaboratorsModule', {})
    lead_sponsor = sponsors.get('leadSponsor', {})
    lead_sponsor_name = lead_sponsor.get('name')
    lead_sponsor_class = lead_sponsor.get('class')

    # Extract collaborators (list)
    collaborators = sponsors.get('collaborators', [])
    collaborator_names = [c.get('name') for c in collaborators if c.get('name')]
    collaborator_classes = [c.get('class') for c in collaborators if c.get('class')]
    num_collaborators = len(collaborators)
    num_collaborators_plus_lead = (num_collaborators + 1) if lead_sponsor_name else num_collaborators

    # Extract responsible party
    responsible_party = sponsors.get('responsibleParty', {})
    responsible_party_investigator_full_name = responsible_party.get('investigatorFullName')
    responsible_party_investigator_affiliation = responsible_party.get('investigatorAffiliation')

    # Extract overall officials
    contacts_locations = protocol.get('contactsLocationsModule', {})
    overall_officials = contacts_locations.get('overallOfficials', [])

    overall_official_names = [o.get('name') for o in overall_officials if o.get('name')]
    overall_official_affiliations = [o.get('affiliation') for o in overall_officials if o.get('affiliation')]
    overall_official_roles = [o.get('role') for o in overall_officials if o.get('role')]

    # Extract conditions and interventions
    conditions_module = protocol.get('conditionsModule', {})
    conditions = conditions_module.get('conditions', [])

    arms_interventions = protocol.get('armsInterventionsModule', {})
    interventions = arms_interventions.get('interventions', [])
    intervention_names = [i.get('name') for i in interventions if i.get('name')]
    intervention_types = [i.get('type') for i in interventions if i.get('type')]

    # Extract design information
    design = protocol.get('designModule', {})
    study_type = design.get('studyType')
    phases = design.get('phases', [])
    primary_purpose = safe_get(design, 'designInfo', 'primaryPurpose')

    # Extract enrollment
    enrollment_info = design.get('enrollmentInfo', {})
    enrollment_count = enrollment_info.get('count')

    # Extract primary outcome
    outcomes = protocol.get('outcomesModule', {})
    primary_outcomes = outcomes.get('primaryOutcomes', [])
    primary_outcome_measures = [p.get('measure') for p in primary_outcomes if p.get('measure')]

    # Extract locations
    locations = contacts_locations.get('locations', [])
    num_locations = len(locations)

    location_facilities = [loc.get('facility') for loc in locations if loc.get('facility')]
    location_cities = [loc.get('city') for loc in locations if loc.get('city')]
    location_states = [loc.get('state') for loc in locations if loc.get('state')]
    location_countries = [loc.get('country') for loc in locations if loc.get('country')]
    location_statuses = [loc.get('status') for loc in locations if loc.get('status')]

    # Extract geopoints
    geopoints = [loc.get('geoPoint') for loc in locations if loc.get('geoPoint')]

    # Extract MeSH terms
    derived = api_response.get('derivedSection', {})
    condition_browse = derived.get('conditionBrowseModule', {})
    condition_mesh_terms = [m.get('term') for m in condition_browse.get('meshes', []) if m.get('term')]

    intervention_browse = derived.get('interventionBrowseModule', {})
    intervention_mesh_terms = [m.get('term') for m in intervention_browse.get('meshes', []) if m.get('term')]

    # Extract has results
    has_results = api_response.get('hasResults', False)

    # Extract oversight
    oversight = protocol.get('oversightModule', {})
    oversight_has_dmc = oversight.get('oversightHasDmc')
    is_fda_regulated_drug = oversight.get('isFdaRegulatedDrug')
    is_fda_regulated_device = oversight.get('isFdaRegulatedDevice')

    # Extract references/citations. Only keep present values so the reported
    # list lengths reflect real entries (consistent with every other list
    # extraction above — the original appended None for missing fields).
    references_module = protocol.get('referencesModule', {})
    references = references_module.get('references', [])
    citations = [ref.get('citation') for ref in references if ref.get('citation')]
    pmids = [ref.get('pmid') for ref in references if ref.get('pmid')]

    # Build TOON formatted output
    toon_lines = []

    # Basic identification
    toon_lines.append(f"nct_id: {format_value(nct_id)}")
    toon_lines.append(f"brief_title: {format_value(brief_title)}")
    toon_lines.append(f"official_title: {format_value(official_title)}")
    toon_lines.append(f"overall_status: {format_value(overall_status)}")

    # Organization & Sponsor
    toon_lines.append(f"lead_sponsor_name: {format_value(lead_sponsor_name)}")
    toon_lines.append(f"lead_sponsor_class: {format_value(lead_sponsor_class)}")
    toon_lines.append(f"collaborator_names[{len(collaborator_names)}]: {format_list(collaborator_names)}")
    toon_lines.append(f"collaborator_classes[{len(collaborator_classes)}]: {format_list(collaborator_classes)}")
    toon_lines.append(f"org_full_name: {format_value(org_full_name)}")

    # Key personnel
    toon_lines.append(f"overall_official_names[{len(overall_official_names)}]: {format_list(overall_official_names)}")
    toon_lines.append(f"overall_official_affiliations[{len(overall_official_affiliations)}]: {format_list(overall_official_affiliations)}")
    toon_lines.append(f"overall_official_roles[{len(overall_official_roles)}]: {format_list(overall_official_roles)}")
    toon_lines.append(f"responsible_party_investigator_full_name: {format_value(responsible_party_investigator_full_name)}")
    toon_lines.append(f"responsible_party_investigator_affiliation: {format_value(responsible_party_investigator_affiliation)}")
    toon_lines.append(f"num_collaborators: {format_value(num_collaborators)}")

    # Scientific focus
    toon_lines.append(f"conditions[{len(conditions)}]: {format_list(conditions)}")
    toon_lines.append(f"intervention_names[{len(intervention_names)}]: {format_list(intervention_names)}")
    toon_lines.append(f"intervention_types[{len(intervention_types)}]: {format_list(intervention_types)}")
    toon_lines.append(f"phases[{len(phases)}]: {format_list(phases)}")
    toon_lines.append(f"primary_outcome_measures[{len(primary_outcome_measures)}]: {format_list(primary_outcome_measures)}")

    # Study scope & capacity
    toon_lines.append(f"enrollment_count: {format_value(enrollment_count)}")
    toon_lines.append(f"study_type: {format_value(study_type)}")
    toon_lines.append(f"num_locations: {format_value(num_locations)}")
    toon_lines.append(f"location_facilities[{len(location_facilities)}]: {format_list(location_facilities)}")
    toon_lines.append(f"location_cities[{len(location_cities)}]: {format_list(location_cities)}")
    toon_lines.append(f"location_states[{len(location_states)}]: {format_list(location_states)}")
    toon_lines.append(f"location_countries[{len(location_countries)}]: {format_list(location_countries)}")

    # Experience & track record
    toon_lines.append(f"study_first_post_date: {format_value(study_first_post_date)}")
    toon_lines.append(f"completion_date: {format_value(completion_date)}")
    toon_lines.append(f"has_results: {format_value(has_results)}")
    toon_lines.append(f"num_collaborators_plus_lead: {format_value(num_collaborators_plus_lead)}")

    # Therapeutic area expertise
    toon_lines.append(f"condition_mesh_terms[{len(condition_mesh_terms)}]: {format_list(condition_mesh_terms)}")
    toon_lines.append(f"intervention_mesh_terms[{len(intervention_mesh_terms)}]: {format_list(intervention_mesh_terms)}")
    toon_lines.append(f"primary_purpose: {format_value(primary_purpose)}")

    # Current activity status
    toon_lines.append(f"last_update_post_date: {format_value(last_update_post_date)}")
    toon_lines.append(f"recruitment_status: {format_value(recruitment_status)}")
    toon_lines.append(f"start_date: {format_value(start_date)}")
    toon_lines.append(f"primary_completion_date: {format_value(primary_completion_date)}")

    # Secondary fields
    toon_lines.append(f"oversight_has_dmc: {format_value(oversight_has_dmc)}")
    toon_lines.append(f"is_fda_regulated_drug: {format_value(is_fda_regulated_drug)}")
    toon_lines.append(f"is_fda_regulated_device: {format_value(is_fda_regulated_device)}")
    toon_lines.append(f"location_statuses[{len(location_statuses)}]: {format_list(location_statuses)}")

    # Additional fields
    toon_lines.append(f"citations[{len(citations)}]: {format_list(citations)}")
    toon_lines.append(f"pmids[{len(pmids)}]: {format_list(pmids)}")

    # Geopoints (structured data - format as array of objects)
    if geopoints:
        geo_keys = set()
        for gp in geopoints:
            if gp:
                geo_keys.update(gp.keys())

        if geo_keys:
            geo_keys_sorted = sorted(geo_keys)
            toon_lines.append(f"geopoints[{len(geopoints)}]{{{','.join(geo_keys_sorted)}}}:")
            for gp in geopoints:
                if gp:
                    values = [format_value(gp.get(k)) for k in geo_keys_sorted]
                    toon_lines.append(f"  {','.join(values)}")
                else:
                    toon_lines.append(f"  {','.join(['' for _ in geo_keys_sorted])}")
    else:
        toon_lines.append(f"geopoints[0]:")

    return '\n'.join(toon_lines)
tool_create_FAISS_vector.py ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ import requests
3
+ from io import BytesIO
4
+ import serpapi
5
+ import os
6
+ from dotenv import load_dotenv
7
+ load_dotenv()
8
+
9
+ from langchain_core.documents import Document as LangchainDocument
10
+ from metapub import FindIt
11
+ import requests
12
+ import xml.etree.ElementTree as ET
13
+
14
+ from ftplib import FTP
15
+ from urllib.parse import urlparse
16
+ from io import BytesIO
17
+
18
+ from langchain_community.retrievers import ArxivRetriever
19
+
20
+ import arxiv
21
+ import requests
22
+ from io import BytesIO
23
+ from pypdf import PdfReader
24
+ import re
25
+
26
+ from langchain_community.vectorstores.utils import DistanceStrategy
27
+ from langchain_community.embeddings import HuggingFaceEmbeddings
28
+ from transformers import AutoTokenizer
29
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
30
+ from tqdm import tqdm
31
+
32
+ import re
33
+ from typing import List, Dict, Tuple
34
+
35
+
36
+
37
def process_ref(extr_ref: tuple[str, str]) -> str:
    """Fetch the full text for one classified reference, best-effort.

    Args:
        extr_ref: A (reference_value, reference_type) pair as produced by
            ReferenceExtractor, with type one of 'arxiv', 'pmid', 'doi',
            'pmcid'.

    Returns:
        The retrieved text from the first tool that succeeds, or None when
        every tool fails or the reference type is unknown.
    """
    ref_value, ref_type = extr_ref

    # Ordered fallback chain per reference type. The pmcid branch now uses the
    # same try/skip behavior as the others (it previously let exceptions
    # propagate while every other branch swallowed them).
    if ref_type == "arxiv":
        retrievers = [get_paper_from_arxiv_id, get_paper_from_arxiv_id_langchain]
    elif ref_type == "pmid":
        retrievers = [get_paper_from_pmid, parse_pdf_from_pubmed_pmid]
    elif ref_type == "doi":
        retrievers = [download_paper_from_doi, get_pdf_content_serpapi]
    elif ref_type == "pmcid":
        retrievers = [get_paper_from_pmid]
    else:
        return None

    for retriever in retrievers:
        try:
            return retriever(ref_value)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; any retrieval failure tries the next tool.
            continue
    return None
58
+
59
+
60
class ReferenceExtractor:
    """Extract and classify references (DOI / PMID / arXiv / PMCID) from LLM outputs."""

    # Regex patterns for identification (substring matches, hence the \b anchors).
    DOI_PATTERN = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
    DOI_LOOSE = r"10\.\d{4,9}/[A-Za-z0-9.\-_/]+"
    PMID_PATTERN = r"\b\d{7,8}\b"
    ARXIV_NEW = r"\b\d{4}\.\d{4,5}(?:v\d+)?\b"   # new-style id, e.g. 2304.07814v2
    ARXIV_OLD = r"\b[a-z\-]+/\d{7}\b"            # old-style id, e.g. hep-th/9901001
    PMCID_PATTERN = r"\bPMC\d+\b"

    def __init__(self):
        """Compile the identification patterns once, keyed by reference type."""
        self.patterns = {
            'doi': re.compile(self.DOI_PATTERN, re.IGNORECASE),
            'pmid': re.compile(self.PMID_PATTERN),
            'arxiv': re.compile(f"({self.ARXIV_NEW})|({self.ARXIV_OLD})", re.IGNORECASE),
            'pmcid': re.compile(self.PMCID_PATTERN, re.IGNORECASE)
        }

    def extract_references(self, text: str) -> List[Tuple[str, str]]:
        """
        Extract all references from text and classify them.

        A list-like input ("a,b,c" or '["a","b"]') is split item-by-item;
        otherwise the whole text is scanned with the compiled patterns.

        Args:
            text: Input string that may contain references in various formats

        Returns:
            List of tuples: (reference_value, reference_type), deduplicated,
            in order of first appearance.
        """
        references = []
        seen = set()

        # First, try to parse as a list-like string
        list_refs = self._extract_from_list_format(text)
        if list_refs:
            for ref in list_refs:
                ref_type = self._classify_single_ref(ref)
                if ref not in seen:
                    references.append((ref, ref_type))
                    seen.add(ref)
            return references

        # If not a list format, extract using regex patterns.
        # NOTE(review): pattern order matters — a digit run inside a DOI can
        # also satisfy PMID_PATTERN and be emitted as a separate (pmid) hit.
        for ref_type, pattern in self.patterns.items():
            matches = pattern.finditer(text)
            for match in matches:
                ref_value = match.group(0).strip()
                if ref_value not in seen:
                    references.append((ref_value, ref_type))
                    seen.add(ref_value)

        return references

    def _extract_from_list_format(self, text: str) -> List[str]:
        """
        Extract references from list-like formats.
        Handles: "id1,id2,id3" and '["id1","id2"]' and "['id1', 'id2']"

        Returns an empty list when the text does not look like a simple list.
        """
        text = text.strip()

        # Try parsing as Python list string
        if text.startswith('[') and text.endswith(']'):
            try:
                # Remove brackets and quotes, split by comma
                cleaned = text[1:-1]
                # Handle both single and double quotes
                items = re.findall(r'["\']([^"\']+)["\']', cleaned)
                if items:
                    return [item.strip() for item in items]
            except Exception:  # narrowed from bare except:
                pass

        # Try comma-separated format (no brackets)
        if ',' in text and not any(char in text for char in ['\n', '(', ')']):
            # Check if it looks like a simple list (short, at least one comma)
            if text.count(',') >= 1 and len(text) < 200:
                items = [item.strip().strip('"\'') for item in text.split(',')]
                # Filter out empty strings
                return [item for item in items if item]

        return []

    def _classify_single_ref(self, ref: str) -> str:
        """Classify a single extracted reference string.

        Returns one of "doi", "pmcid", "arxiv", "pmid" or "unknown";
        checks run in that priority order with fully-anchored patterns.
        """
        ref = ref.strip().strip('"\'')

        if re.match(r"^10\.\d{4,9}/[A-Za-z0-9.\-_/:()]+$", ref, re.IGNORECASE):
            return "doi"

        if re.match(r"^PMC\d+$", ref, re.IGNORECASE):
            return "pmcid"

        if re.match(r"^\d{4}\.\d{4,5}(?:v\d+)?$", ref):
            return "arxiv"

        if re.match(r"^[a-z\-]+/\d{7}$", ref, re.IGNORECASE):
            return "arxiv"

        if re.match(r"^\d{7,8}$", ref):
            return "pmid"

        return "unknown"
164
+
165
+
166
def download_paper_from_doi(doi):
    """
    Attempt to download a paper's full text from its DOI, trying several
    sources in order: Unpaywall (legal OA lookup), arXiv, then Sci-Hub.

    Args:
        doi: A DOI string, optionally prefixed with an https://doi.org/ resolver.

    Returns:
        The extracted PDF text on success, or None when every source fails.
    """
    # Normalise: strip a resolver prefix if the caller passed a full URL.
    doi = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')

    # Method 1: Try Unpaywall API (free, legal access)
    try:
        # TODO(review): Unpaywall requires a real contact email; replace the placeholder.
        unpaywall_url = f"https://api.unpaywall.org/v2/{doi}?email=your@email.com"
        response = requests.get(unpaywall_url, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('best_oa_location') and data['best_oa_location'].get('url_for_pdf'):
                pdf_url = data['best_oa_location']['url_for_pdf']
                text = download_pdf_from_url(pdf_url)
                print(f"Found PDF via Unpaywall: {pdf_url}")
                return text
    except Exception as e:
        print(f"Unpaywall failed: {e}")

    # Method 2: Try arXiv if it's an arXiv paper.
    # NOTE(review): the startswith('2') heuristic also matches many non-arXiv DOIs.
    if 'arxiv' in doi.lower() or doi.startswith('2'):
        try:
            # Extract arXiv ID (last path segment of the DOI, if any)
            arxiv_id = doi.split('/')[-1] if '/' in doi else doi
            arxiv_pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
            text = download_pdf_from_url(arxiv_pdf_url)
            print(f"Trying arXiv: {arxiv_pdf_url}")
            return text
        except Exception as e:
            print(f"arXiv failed: {e}")

    # Method 3: Try Sci-Hub (use with caution - legality varies by jurisdiction)
    try:
        scihub_url = f"https://sci-hub.se/{doi}"
        print(f"Trying Sci-Hub: {scihub_url}")
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(scihub_url, headers=headers, timeout=15)

        if response.status_code == 200:
            # Look for the first PDF link in the returned HTML
            pdf_match = re.search(r'(https?://[^"]+\.pdf[^"]*)', response.text)
            if pdf_match:
                pdf_url = pdf_match.group(1)
                text = download_pdf_from_url(pdf_url)
                print(f"got {doi} by chance")
                return text
    except Exception as e:
        print(f"Sci-Hub failed: {e}")

    # Explicit instead of implicit: every source failed.
    return None
216
+
217
+
218
+
219
def download_pdf_from_url(url):
    """Download a PDF from *url* and return its extracted text.

    Raises on HTTP errors and when the response is not a PDF.
    """
    resp = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        },
        timeout=30,
    )
    resp.raise_for_status()

    # Accept the response if either the header or the magic bytes say "PDF".
    content_type = resp.headers.get('content-type', '').lower()
    looks_like_pdf = 'pdf' in content_type or resp.content.startswith(b'%PDF')
    if not looks_like_pdf:
        raise Exception(f"URL did not return a PDF (got {content_type})")

    pdf = PdfReader(BytesIO(resp.content))
    return "".join(page.extract_text() or "" for page in pdf.pages)
238
+
239
+
240
def get_paper_from_arxiv_id(doi: str):
    """
    Retrieve paper from arXiv using its arXiv ID.

    Searches arXiv for the given identifier, takes the first hit and
    returns the parsed text of its PDF.
    """
    first_hit = next(
        arxiv.Client().results(arxiv.Search(query=doi, max_results=1))
    )
    return parse_pdf_file(first_hit.pdf_url)
250
+
251
def get_paper_from_arxiv_id_langchain(arxiv_id: str):
    """
    Retrieve a paper from arXiv via LangChain's ArxivRetriever.

    Args:
        arxiv_id: The arXiv identifier to look up (e.g. "2304.07814").

    Returns:
        A list of LangChain Documents containing the paper's full text.
    """
    retriever = ArxivRetriever(
        load_max_docs=2,
        get_full_documents=True,
    )
    # Bug fix: the original queried a hard-coded id ("2304.07814") and
    # ignored the arxiv_id argument entirely.
    return retriever.invoke(arxiv_id)
262
+
263
+
264
def parse_pdf_file(path: str) -> str:
    """
    Extract text from a PDF given a local path or an http(s)/ftp URL.

    Args:
        path: Local file path, http(s) URL, or ftp URL of a PDF.

    Returns:
        The concatenated text of every page (pages with no extractable
        text contribute "").
    """
    if path.startswith("ftp://"):
        # Bug fix: requests does not support the FTP scheme, so ftp URLs
        # must go through the dedicated FTP downloader (returns a BytesIO).
        reader = PdfReader(download_pdf_via_ftp(path))
    elif path.startswith("http://") or path.startswith("https://"):
        response = requests.get(path)
        response.raise_for_status()  # Ensure download succeeded
        reader = PdfReader(BytesIO(response.content))
    else:
        reader = PdfReader(path)

    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""

    return text
278
+
279
+
280
def get_pdf_content_serpapi(doi: str) -> str:
    """
    Get the link to the paper from its DOI using SerpAPI Google Scholar
    search, then download and parse the PDF behind the top result.
    """
    scholar = serpapi.Client(api_key=os.getenv("SERPAPI_API_KEY"))
    hits = scholar.search({
        'engine': 'google_scholar',
        'q': doi,
    })

    # Take the first organic result's link and hand it to the PDF parser.
    top_link = hits["organic_results"][0]["link"]
    return parse_pdf_file(top_link)
293
+
294
+
295
+
296
+
297
def get_paper_from_pmid(pmid: str):
    """Resolve a PMID to a full-text URL via metapub's FindIt and parse the PDF.

    Returns the parsed text, or None (after printing FindIt's reason) when
    no full-text URL is available.
    """
    located = FindIt(pmid)
    if not located.url:
        # FindIt exposes why no URL could be resolved (paywall, embargo, ...).
        print(located.reason)
        return None
    return parse_pdf_file(located.url)
304
+
305
+
306
+
307
+
308
def download_pdf_via_ftp(url: str) -> BytesIO:
    """
    Download a file from an FTP URL.

    Args:
        url: An ftp:// URL pointing at the file to fetch.

    Returns:
        A BytesIO rewound to position 0 with the downloaded content
        (ready to hand to e.g. pypdf's PdfReader). The original ``-> bytes``
        annotation was wrong: a file-like object is returned, not bytes.
    """
    parsed_url = urlparse(url)

    file_buffer = BytesIO()

    # Anonymous login (ftplib's login() default when no credentials are given).
    with FTP(parsed_url.netloc) as ftp:
        ftp.login()
        ftp.retrbinary(f'RETR {parsed_url.path}', file_buffer.write)

    # Removed a dead `file_buffer.getvalue()` call whose result was discarded.
    file_buffer.seek(0)  # rewind so callers read from the beginning
    return file_buffer
325
+
326
+
327
def parse_pdf_from_pubmed_pmid(pmid: str) -> str:
    """
    Download and parse a PDF from PubMed using its PMID.

    Queries the NCBI PMC Open Access web service (oa.fcgi) for a PDF link,
    downloads the file over FTP, and extracts its text.

    Args:
        pmid: Identifier passed as the ``id`` query parameter.
            NOTE(review): oa.fcgi is documented around PMCIDs — confirm that
            a bare PMID is actually accepted here.

    Returns:
        The extracted text on success; None when the service response is not
        parseable XML. Other failures (no pdf link element, FTP errors)
        propagate as exceptions, which the callers' retry loops rely on.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmid}"
    response = requests.get(url)
    cleaned_string = response.content.decode('utf-8').strip()
    try:
        root = ET.fromstring(cleaned_string)
        # First <link format='pdf'> element; find() yields None if absent,
        # making the .get() below raise AttributeError (caught by callers).
        pdf_link_element = root.find(".//link[@format='pdf']")
        ftp_url = pdf_link_element.get('href')
        file_byte = download_pdf_via_ftp(ftp_url)  # BytesIO, file-like for PdfReader

        reader = PdfReader(file_byte)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
        print(f"got {pmid} via ftp download")
        return text
    except ET.ParseError as e:
        # Service returned non-XML (e.g. an error page) — treat as not found.
        pass
348
+
349
def safe_parse_of_ref_list(refs : list[str]) -> list[str]:
    """Placeholder for validating/normalising a list of reference strings.

    NOTE(review): unimplemented stub — it ignores ``refs`` and returns
    ``None`` despite the ``list[str]`` annotation. Implement or remove
    before anything calls it.
    """

    # Bare return -> None; see the note above.
    return
353
+
354
+
355
+
356
+
357
def classify_ref(ref: str) -> str:
    """Classify a reference string as "doi", "pmid", "arxiv" or "unknown".

    Patterns are checked in priority order: DOI (strict then loose, the
    loose form supporting ids like 'NEJMoa2307100'), then PMID, then arXiv
    (new 'NNNN.NNNNN[vN]' style and old 'hep-th/NNNNNNN' style).
    """
    candidate = ref.strip()

    # kind -> list of (pattern, flags); insertion order preserves priority.
    pattern_map = {
        "doi": [
            (r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+", re.IGNORECASE),
            (r"^10\.\d{4,9}/?[A-Za-z0-9.\-_/]+$", re.IGNORECASE),
        ],
        "pmid": [
            (r"^\d{7,8}$", 0),
        ],
        "arxiv": [
            (r"^\d{4}\.\d{4,5}(v\d+)?$", re.IGNORECASE),
            (r"^[a-z\-]+/\d{7}$", re.IGNORECASE),
        ],
    }

    for kind, patterns in pattern_map.items():
        if any(re.match(pattern, candidate, flags) for pattern, flags in patterns):
            return kind
    return "unknown"
372
+
373
+
374
def process_ref(ref: str):
    """Try to download the full text for a single reference string.

    The reference is classified first (doi / pmid / arxiv), and each kind
    has two download strategies attempted in order ("we try twice").

    Args:
        ref: A raw reference identifier (DOI, PMID or arXiv id).

    Returns:
        The paper text on success, otherwise None (after printing a skip
        message — note the message also fires for valid refs whose
        downloads all failed, preserving the original behavior).

    NOTE(review): this re-defines the earlier ``process_ref(extr_ref)``;
    only this version survives at import time.
    """
    kind = classify_ref(ref)
    # Two strategies per kind, tried in order; first success wins.
    strategies = {
        "doi": [download_paper_from_doi, get_pdf_content_serpapi],
        "pmid": [get_paper_from_pmid, parse_pdf_from_pubmed_pmid],
        "arxiv": [get_paper_from_arxiv_id, get_pdf_content_serpapi],
    }
    for tool in strategies.get(kind, []):
        try:
            return tool(ref)
        except Exception:  # narrowed from bare except: (keeps Ctrl-C working)
            continue

    print(f"Skipping invalid ref: {ref}")
    return None
398
+
399
+
400
+ from langchain_community.vectorstores.utils import DistanceStrategy
401
+ from langchain_community.embeddings import HuggingFaceEmbeddings
402
+ from transformers import AutoTokenizer
403
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
404
+ from tqdm import tqdm
405
+
406
def create_vector_store_from_list_of_doi(refs: list[str], VECTOR_DB_PATH: str) -> str:
    """Build (or extend) a FAISS vector store from a collection of references.

    Args:
        refs: Raw text containing reference identifiers (DOIs, PMIDs,
            arXiv ids, PMCIDs); parsed with ReferenceExtractor.
        VECTOR_DB_PATH: Folder where the FAISS index is loaded from and
            saved to.

    Returns:
        VECTOR_DB_PATH when new documents were indexed, otherwise a message
        saying everything was already in the store.
    """
    from langchain_community.vectorstores import FAISS

    # Embedding model — must match the one the existing store was built with.
    embedding_name = "BAAI/bge-large-en-v1.5"
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_name,
        model_kwargs={"device": "mps"},  # NOTE(review): Apple-GPU specific — confirm target hardware
        encode_kwargs={"normalize_embeddings": True},
    )

    try:
        # Load the vector database from the folder
        print(f"try to load vector store from {VECTOR_DB_PATH}")
        KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
            VECTOR_DB_PATH,
            embedding_model,
            allow_dangerous_deserialization=True  # required by newer LangChain; only load trusted indexes
        )
        existing_reference = [
            doc.metadata.get("source")
            for doc in KNOWLEDGE_VECTOR_DATABASE.docstore._dict.values()
        ]
        print("vector store loaded")  # typo fix: was "vectro store loaded"
    except Exception as e:
        print("FAISS load error:", e)
        KNOWLEDGE_VECTOR_DATABASE = None
        existing_reference = []
        print("no vector store found, creating a new one...")

    # Fetch documents for references not already present in the store.
    extractor = ReferenceExtractor()
    REFS = extractor.extract_references(refs)
    already_indexed = set(existing_reference)  # hoisted: build the set once, not per iteration
    raw_docs = []
    for ref in tqdm(REFS):
        # ref is an (identifier, kind) tuple. Bug fixes:
        #  * the store's metadata holds the identifier string, so compare
        #    ref[0] (the old tuple-vs-string check never matched and every
        #    paper was re-downloaded);
        #  * the active process_ref expects a string, not the tuple.
        if ref[0] not in already_indexed:
            text = process_ref(ref[0])
            if text:
                raw_docs.append(
                    LangchainDocument(page_content=text, metadata={'source': ref[0]})
                )

    if REFS:  # guard: the original divided by zero on empty input
        recover_yield = f" *** -> {round(100 * len(raw_docs) / len(REFS))}% papers downloaded"
        print(recover_yield)

    # Split texts into chunks sized in tokens of the embedding model.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(embedding_name),
        chunk_size=3000,
        chunk_overlap=int(3000 / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["."]  # a list of separators is expected; a bare "." only worked by accident
    )

    if raw_docs:
        docs_processed = text_splitter.split_documents(raw_docs)
        print("creating the vector store...")

        # Create the vector store from the freshly downloaded documents.
        NEW_KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )

        if KNOWLEDGE_VECTOR_DATABASE:
            print("merge vector store")
            KNOWLEDGE_VECTOR_DATABASE.merge_from(NEW_KNOWLEDGE_VECTOR_DATABASE)
            KNOWLEDGE_VECTOR_DATABASE.save_local(VECTOR_DB_PATH)
        else:
            NEW_KNOWLEDGE_VECTOR_DATABASE.save_local(VECTOR_DB_PATH)

        return VECTOR_DB_PATH

    return f"all the data already in vector store {VECTOR_DB_PATH}"
tool_describe_figure.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ from openai import OpenAI
4
+ # The OpenAI library handles the API key and base URL automatically
5
+ # after instantiation.
6
+
7
def thorough_picture_description(figure: str) -> str:
    """
    Generates a thorough description for a given image URL using
    the Nebius Token Factory endpoint.

    Args:
        figure: The URL of the image to describe.

    Returns:
        The generated text description of the image, or an error string
        when client construction or the API call fails.
    """
    try:
        vision_client = OpenAI(
            base_url="https://api.tokenfactory.nebius.com/v1/",
            api_key=os.environ.get("NEBIUS_API_KEY"),
        )
    except Exception as e:
        return f"Error initializing OpenAI client: {e}"

    # Single multimodal user turn: the instruction plus the image URL.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Provide a very detailed, thorough, and descriptive analysis of this image."},
            {"type": "image_url", "image_url": {"url": figure}},
        ],
    }

    try:
        reply = vision_client.chat.completions.create(
            model="gemini-2.5-flash",
            messages=[user_turn],
            max_tokens=2048,
        )

        first_choice = reply.choices[0] if reply.choices else None
        if first_choice is not None and first_choice.message.content:
            return first_choice.message.content
        return "Could not retrieve a description from the API."

    except Exception as e:
        return f"An error occurred during the API call: {e}"
tool_fetch_documents_DOI.py ADDED
File without changes
tool_query_FAISS_vector.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+
4
+
5
+
6
def query_vector_store(query: str, store_name: str, top_k: int = 5) -> dict:
    """
    Query a specific vector store to retrieve top_k documents related to the
    user question. Each document carries metadata identifying its source,
    which must be reported clearly.

    Args:
        query (str): User's question
        store_name (str): Which vector store to search
        top_k (int): Number of chunks to retrieve

    Returns:
        dict: Retrieved context, sources, store_name — or an "error" key
        when the store does not exist yet.
    """
    from langchain_community.vectorstores import FAISS
    # Bug fix: HuggingFaceEmbeddings was never imported -> NameError at runtime.
    from langchain_community.embeddings import HuggingFaceEmbeddings

    stores_root = "./vector_stores"
    vector_stores = os.listdir(stores_root) if os.path.isdir(stores_root) else []
    if store_name not in vector_stores:
        return {"error": f"Vector store '{store_name}' not found, you must create it first with tool create faiss vector"}
    # Bug fix: f"./vector_stores{store_name}" was missing the path separator.
    store_path = os.path.join(stores_root, store_name)

    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={"device": "mps"},  # NOTE(review): Apple-GPU specific — confirm target hardware
        encode_kwargs={"normalize_embeddings": True},
    )

    vector_store = FAISS.load_local(
        store_path,
        embedding_model,
        allow_dangerous_deserialization=True  # only load indexes you created yourself
    )

    # Bug fix: similarity_search returns Documents (not subscriptable dicts)
    # and carries no scores; use the *_with_score variant + attribute access.
    results = vector_store.similarity_search_with_score(query, k=top_k)

    context = "\n\n".join(doc.page_content for doc, _score in results)
    sources = [
        {"ids": doc.metadata.get("source"), "relevance": float(score)}
        for doc, score in results
    ]

    return {
        "context": context,
        "sources": sources,
        "store_name": store_name
    }
+ }