| |
| from openai import OpenAI |
| import json |
| import requests |
| import pandas as pd |
| from bs4 import BeautifulSoup |
| import time |
| import os |
| from typing import List, Dict |
| import xml.etree.ElementTree as ET |
| import re |
| import sys |
|
|
| |
# Load the OpenAI API key from the environment; fail fast with setup
# guidance so later API calls don't die with an obscure auth error.
try:
    API_KEY = os.environ['OPENAI_API_KEY']
    if not API_KEY:
        # Treat an empty value the same as a missing variable.
        raise KeyError("OPENAI_API_KEY is empty")
except KeyError:
    print("ERROR: OPENAI_API_KEY environment variable not found or empty.")
    print("Please set your OpenAI API key as an environment variable:")
    print("export OPENAI_API_KEY='your-api-key-here'")
    sys.exit(1)

# Shared OpenAI client used by every pipeline step below.
client = OpenAI(api_key=API_KEY)
|
|
| |
def safe_json_loads(json_string: str) -> dict:
    """Parse a JSON string defensively, recovering from common LLM output glitches.

    Tries, in order:
      1. Plain ``json.loads``.
      2. Stripping ASCII control characters (raw newlines/tabs inside string
         values are the usual cause of tool-call argument parse failures).
      3. Repairing frequent structural mistakes: unquoted keys, unquoted bare
         word values, and trailing commas.
      4. Regex extraction of simple ``"key": "value"`` pairs as a last resort.

    Args:
        json_string: The (possibly malformed) JSON text.

    Returns:
        The parsed object, a partial dict of recovered string pairs, or {}
        when nothing could be salvaged. Never raises.
    """
    # Pass 1: the happy path.
    try:
        return json.loads(json_string)
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        print("Attempting to clean control characters...")

    # Pass 2: drop control characters outright (they are illegal inside
    # JSON string values unless escaped).
    cleaned = re.sub(r'[\x00-\x1f\x7f]', '', json_string)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e2:
        print(f"Still failed after cleaning: {e2}")

    # Pass 3: repair common structural mistakes.
    try:
        # Quote bare object keys: {key: ...} -> {"key": ...}
        cleaned = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', cleaned)
        # Quote bare word values: : some words} -> : "some words"}
        cleaned = re.sub(r':\s*([a-zA-Z][a-zA-Z0-9\s]*[a-zA-Z0-9])\s*([,}])', r': "\1"\2', cleaned)
        # Remove trailing commas before a closing brace/bracket.
        cleaned = re.sub(r',\s*}', '}', cleaned)
        cleaned = re.sub(r',\s*]', ']', cleaned)
        return json.loads(cleaned)
    except json.JSONDecodeError as e3:
        print(f"All JSON parsing attempts failed: {e3}")
        print(f"Problematic JSON string: {json_string[:200]}...")

    # Pass 4: salvage whatever simple string pairs are present. (The old
    # version wrapped this in a bare `except:` and also ran no-op \n/\r
    # substitutions plus a quote-escaping regex that could corrupt valid
    # text; all of that is gone.)
    result = {}
    for key, value in re.findall(r'"([^"]+)"\s*:\s*"([^"]*)"', json_string):
        result[key] = value
    if result:
        print(f"Extracted partial JSON: {result}")
        return result

    return {}
|
|
| |
| |
def generate_pubmed_prompt(query: str) -> str:
    """
    Transform natural language user query into optimized PubMed search string.

    Makes one LLM call with a forced tool call so the model must return a
    structured (search_string, explanation) pair rather than free text.

    Args:
        query (str): Natural language query from user

    Returns:
        str: Optimized PubMed search string with Boolean operators and field
        tags. Falls back to the raw query tagged with [tiab] if the LLM call
        fails or no tool call is returned.
    """

    def create_pubmed_search(search_string: str, explanation: str) -> dict:
        """Function that creates a PubMed search string - called by the LLM"""
        # Local "tool": just logs what the model produced and echoes it back.
        print(f"LLM called create_pubmed_search with: {search_string}")
        print(f"Explanation: {explanation}")
        return {"status": "success", "search_string": search_string}

    # JSON schema advertised to the model for the forced tool call.
    tools = [{
        "type": "function",
        "function": {
            "name": "create_pubmed_search",
            "description": "Create an optimized PubMed search string from a natural language query",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_string": {
                        "type": "string",
                        "description": "The optimized PubMed search string with Boolean operators, field tags, and filters"
                    },
                    "explanation": {
                        "type": "string",
                        "description": "Brief explanation of how the search string was constructed"
                    }
                },
                "required": ["search_string", "explanation"]
            }
        }
    }]

    # Prompt engineering: deliberately biases toward BROAD searches because
    # over-specific strings tend to return zero PubMed results.
    system_prompt = """You are a medical research expert who specializes in creating optimized PubMed search queries.

Your task is to transform a natural language query into a PubMed search string that will return the most relevant research.

CRITICAL REQUIREMENTS:
1. Start with BROAD, GENERAL terms to ensure results are found
2. Use the most common, widely-used medical terminology
3. Avoid overly specific or rare phrases that might return 0 results
4. Focus on 2-3 core concepts maximum to avoid over-restriction
5. Use field tags like [tiab] (title/abstract), [mesh] (MeSH terms)
6. Add Boolean operators (AND, OR, NOT) appropriately

SEARCH STRATEGY:
- Begin with the most fundamental concept (e.g., "semantic variant PPA" or "svPPA")
- Add 1-2 additional key concepts with OR operators for synonyms
- Avoid complex multi-concept searches that are too restrictive
- If the query is very specific, start broader and let the user refine

Example transformations:
- "latest research on svPPA" → "(svPPA OR "semantic variant primary progressive aphasia")[tiab]"
- "Alzheimer treatment" → "(Alzheimer[tiab] OR "Alzheimer disease"[mesh]) AND (treatment[tiab] OR therapy[tiab])"
- Complex query → Start with main disease/condition, then add 1-2 key concepts

You MUST call the create_pubmed_search function with your optimized search string and explanation."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Transform this query into a PubMed search string: {query}"}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            # Force the model to call our tool rather than answer in prose.
            tool_choice={"type": "function", "function": {"name": "create_pubmed_search"}}
        )

        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            # Tool-call arguments arrive as a JSON string; parse defensively.
            function_args = safe_json_loads(tool_call.function.arguments)

            result = create_pubmed_search(
                search_string=function_args["search_string"],
                explanation=function_args["explanation"]
            )

            return result["search_string"]
        else:
            # Should not happen with a forced tool_choice, but guard anyway.
            print("LLM did not make a function call, using fallback")
            return f'"{query}"[tiab]'

    except Exception as e:
        # Any API/parse failure degrades to a literal title/abstract search.
        print(f"Error generating PubMed prompt: {e}")
        return f'"{query}"[tiab]'
|
| |
| |
def search_pubmed(search_str: str, max_results: int = 10) -> List[str]:
    """
    Search PubMed using Entrez API and return list of PMIDs.

    One LLM call (forced tool call) decides the final query/parameters; the
    actual HTTP request to NCBI esearch is made locally.

    Args:
        search_str (str): PubMed search string
        max_results (int): Maximum number of results to return (default: 10)

    Returns:
        List[str]: List of PubMed IDs (PMIDs); empty on any failure.
    """

    def execute_pubmed_search(search_query: str, max_count: int, sort_by: str = "relevance", use_mesh: bool = True) -> dict:
        """Run the Entrez esearch request - called by the LLM.

        Args:
            search_query: The PubMed query string to execute.
            max_count: Maximum number of PMIDs to return.
            sort_by: Entrez ``sort`` parameter ("relevance" or "date").
            use_mesh: Advertised to the LLM for decision-making only; esearch
                itself has no such switch, so it is logged but unused here.
        """
        print(f"LLM called execute_pubmed_search with query: {search_query}, max: {max_count}, sort: {sort_by}, use_mesh: {use_mesh}")

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

        try:
            search_url = f"{base_url}esearch.fcgi"
            # FIX: the previous version also sent `field="relevance"`, but
            # Entrez `field` is a *search field* restriction (e.g. "title"),
            # not a ranking option — "relevance" is not a valid field and
            # could distort the query. Ranking is controlled by `sort` alone.
            search_params = {
                "db": "pubmed",
                "term": search_query,
                # Over-fetch so max_count remains satisfiable after trimming.
                "retmax": max_count * 2,
                "retmode": "json",
                "sort": sort_by,
            }

            # Timeout keeps the pipeline from hanging on a stalled request.
            response = requests.get(search_url, params=search_params, timeout=30)
            response.raise_for_status()

            search_data = response.json()

            if "esearchresult" in search_data and "idlist" in search_data["esearchresult"]:
                pmids = search_data["esearchresult"]["idlist"]
                count = search_data["esearchresult"].get("count", "0")

                # Trim the over-fetched list back down to the requested size.
                pmids = pmids[:max_count]

                print(f"Found {count} total results, returning top {len(pmids)} most relevant PMIDs")
                return {
                    "status": "success",
                    "pmids": pmids,
                    "total_count": count,
                    "returned_count": len(pmids),
                    "sort_method": sort_by
                }
            else:
                return {
                    "status": "error",
                    "message": "No results found or unexpected response format",
                    "pmids": []
                }

        except Exception as e:
            # Network/JSON failures are reported as a structured error so the
            # caller degrades gracefully to an empty PMID list.
            return {
                "status": "error",
                "message": f"Search error: {str(e)}",
                "pmids": []
            }

    # JSON schema advertised to the model for the forced tool call.
    tools = [{
        "type": "function",
        "function": {
            "name": "execute_pubmed_search",
            "description": "Execute a PubMed search and return PMIDs, prioritizing relevance over recency",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The PubMed search query to execute"
                    },
                    "max_count": {
                        "type": "integer",
                        "description": "Maximum number of results to return (default: 10)"
                    },
                    "sort_by": {
                        "type": "string",
                        "enum": ["relevance", "date"],
                        "description": "Sort results by relevance (recommended) or date. Use relevance to get the most relevant results regardless of publication date."
                    },
                    "use_mesh": {
                        "type": "boolean",
                        "description": "Whether to consider MeSH terms for relevance (default: true)"
                    }
                },
                "required": ["search_query", "max_count"]
            }
        }
    }]

    system_prompt = """You are a PubMed search expert. Your task is to execute a PubMed search using the provided search string.

CRITICAL: Always prioritize RELEVANCE over recency. The goal is to find the most relevant research for the query, not necessarily the most recent.

Guidelines:
1. Use "relevance" as the sort method to get the most relevant results
2. Don't limit by date - relevant research can be from any time period
3. Focus on finding papers that directly address the research question
4. Consider that seminal or highly cited papers may be older but more relevant

You should call the execute_pubmed_search function with the search query and appropriate parameters.
For research queries, always use sort_by="relevance" to get the most relevant results."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Execute this PubMed search: {search_str} (max {max_results} results, prioritize relevance)"}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            tool_choice={"type": "function", "function": {"name": "execute_pubmed_search"}}
        )

        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            # Tool-call arguments arrive as a JSON string; parse defensively.
            function_args = safe_json_loads(tool_call.function.arguments)

            result = execute_pubmed_search(
                search_query=function_args["search_query"],
                max_count=function_args.get("max_count", max_results),
                sort_by=function_args.get("sort_by", "relevance"),
                use_mesh=function_args.get("use_mesh", True)
            )

            return result["pmids"]
        else:
            print("LLM did not make a function call, using fallback")
            return []

    except Exception as e:
        print(f"Error in PubMed search: {e}")
        return []
|
|
| |
| |
def fetch_pubmed_abstracts(pmids: List[str]) -> List[Dict]:
    """
    Fetch abstracts and metadata for given PMIDs using Entrez API.

    One LLM call (forced tool call) decides the retrieval options; the actual
    efetch request and XML parsing happen locally.

    Args:
        pmids (List[str]): List of PubMed IDs (PMIDs)

    Returns:
        List[Dict]: List of dictionaries containing article metadata
        (keys: pmid, title, abstract, authors, journal, year); empty on failure.
    """

    def retrieve_abstracts(pmid_list: List[str], include_authors: bool = True, include_journal: bool = True) -> dict:
        """Fetch and parse PubMed efetch XML for the given PMIDs - called by the LLM."""
        print(f"LLM called retrieve_abstracts for {len(pmid_list)} PMIDs")

        if not pmid_list:
            return {"status": "error", "message": "No PMIDs provided", "abstracts": []}

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

        try:
            # Single batched efetch request for all PMIDs at once.
            fetch_url = f"{base_url}efetch.fcgi"
            fetch_params = {
                "db": "pubmed",
                "id": ",".join(pmid_list),
                "retmode": "xml",
                "rettype": "abstract"
            }

            # Timeout keeps the pipeline from hanging on a stalled request.
            response = requests.get(fetch_url, params=fetch_params, timeout=30)
            response.raise_for_status()

            root = ET.fromstring(response.content)

            abstracts = []

            for article in root.findall(".//PubmedArticle"):
                try:
                    pmid = article.find(".//PMID")
                    pmid_text = pmid.text if pmid is not None else "Unknown"

                    # itertext() flattens inline markup (<i>, <sup>, ...)
                    # that `.text` alone would silently drop.
                    title_elem = article.find(".//ArticleTitle")
                    title = "".join(title_elem.itertext()).strip() if title_elem is not None else ""
                    if not title:
                        title = "No title available"

                    # FIX: structured abstracts contain several labeled
                    # <AbstractText> sections (BACKGROUND, METHODS, ...);
                    # the old code kept only the first one, truncating most
                    # of the abstract. Join every section, keeping labels.
                    abstract_parts = []
                    for abstract_elem in article.findall(".//AbstractText"):
                        text = "".join(abstract_elem.itertext()).strip()
                        if text:
                            label = abstract_elem.get("Label")
                            abstract_parts.append(f"{label}: {text}" if label else text)
                    abstract = " ".join(abstract_parts) if abstract_parts else "No abstract available"

                    authors = []
                    if include_authors:
                        author_list = article.find(".//AuthorList")
                        if author_list is not None:
                            for author in author_list.findall(".//Author"):
                                last_name = author.find("LastName")
                                first_name = author.find("ForeName")
                                if last_name is not None and first_name is not None:
                                    authors.append(f"{first_name.text} {last_name.text}")
                                elif last_name is not None:
                                    # Collective/single-name authors carry only LastName.
                                    authors.append(last_name.text)

                    journal = "Unknown journal"
                    if include_journal:
                        journal_elem = article.find(".//Journal/Title")
                        journal = journal_elem.text if journal_elem is not None else "Unknown journal"

                    # Publication year. Some records carry only a free-text
                    # <MedlineDate> like "2001 Jan-Feb" instead of <Year>,
                    # so fall back to the first 4-digit run found there.
                    pub_date = article.find(".//PubDate")
                    year = "Unknown"
                    if pub_date is not None:
                        year_elem = pub_date.find("Year")
                        if year_elem is not None and year_elem.text:
                            year = year_elem.text
                        else:
                            medline_date = pub_date.find("MedlineDate")
                            if medline_date is not None and medline_date.text:
                                year_match = re.search(r"\d{4}", medline_date.text)
                                if year_match:
                                    year = year_match.group(0)

                    abstracts.append({
                        "pmid": pmid_text,
                        "title": title,
                        "abstract": abstract,
                        "authors": authors,
                        "journal": journal,
                        "year": year
                    })

                except Exception as e:
                    # Skip malformed records rather than failing the batch.
                    print(f"Error parsing article: {e}")
                    continue

            print(f"Successfully retrieved {len(abstracts)} abstracts")
            return {
                "status": "success",
                "abstracts": abstracts,
                "retrieved_count": len(abstracts),
                "requested_count": len(pmid_list)
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Retrieval error: {str(e)}",
                "abstracts": []
            }

    # JSON schema advertised to the model for the forced tool call.
    tools = [{
        "type": "function",
        "function": {
            "name": "retrieve_abstracts",
            "description": "Retrieve abstracts and metadata for given PMIDs",
            "parameters": {
                "type": "object",
                "properties": {
                    "pmid_list": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of PubMed IDs to retrieve abstracts for"
                    },
                    "include_authors": {
                        "type": "boolean",
                        "description": "Whether to include author information (default: true)"
                    },
                    "include_journal": {
                        "type": "boolean",
                        "description": "Whether to include journal information (default: true)"
                    }
                },
                "required": ["pmid_list"]
            }
        }
    }]

    system_prompt = """You are a PubMed data retrieval expert. Your task is to retrieve abstracts and metadata for the provided PMIDs.

You should call the retrieve_abstracts function with the PMID list and appropriate parameters.
Consider whether to include authors and journal information based on the retrieval needs."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Retrieve abstracts for these PMIDs: {pmids}"}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            tool_choice={"type": "function", "function": {"name": "retrieve_abstracts"}}
        )

        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            # Tool-call arguments arrive as a JSON string; parse defensively.
            function_args = safe_json_loads(tool_call.function.arguments)

            result = retrieve_abstracts(
                pmid_list=function_args["pmid_list"],
                include_authors=function_args.get("include_authors", True),
                include_journal=function_args.get("include_journal", True)
            )

            return result["abstracts"]
        else:
            print("LLM did not make a function call, using fallback")
            return []

    except Exception as e:
        print(f"Error fetching abstracts: {e}")
        return []
|
|
| |
| |
def summarize_abstracts(abstracts: List[Dict], original_query: str) -> str:
    """
    Summarize abstracts using LLM based on original query with inline citations and quotes.

    Builds a numbered plain-text digest of the articles, then makes one LLM
    call with a forced tool call so the model returns structured output
    (summary text, cited key findings, confidence level).

    Args:
        abstracts (List[Dict]): List of article dictionaries (as produced by
            fetch_pubmed_abstracts: pmid, title, abstract, authors, journal, year)
        original_query (str): Original user query

    Returns:
        str: Summary of the abstracts with inline citations and quotes, or an
        explanatory/fallback message on empty input or failure.
    """

    # Short-circuit: nothing to summarize, so explain likely causes instead
    # of making a pointless LLM call.
    if not abstracts:
        return "No research articles were found for the given query. This could be due to:\n1. The search terms being too specific or restrictive\n2. Limited research on this particular topic\n3. The need to use broader or alternative search terms\n\nConsider trying a more general search or rephrasing the query with more common medical terminology."

    def create_summary_with_citations(summary_text: str, key_findings: List[Dict], confidence_level: str) -> dict:
        """Function that creates a summary with citations and quotes - called by the LLM"""
        # Local "tool": logs metadata and echoes the structured result back.
        print(f"LLM called create_summary_with_citations with confidence: {confidence_level}")
        print(f"Key findings with citations: {len(key_findings)} items identified")

        return {
            "status": "success",
            "summary": summary_text,
            "key_findings": key_findings,
            "confidence": confidence_level
        }

    # JSON schema for the forced tool call: summary text, a structured list
    # of cited findings (each with a supporting quote and PMID), and a
    # confidence rating.
    tools = [{
        "type": "function",
        "function": {
            "name": "create_summary_with_citations",
            "description": "Create a comprehensive summary of research abstracts with inline citations and direct quotes",
            "parameters": {
                "type": "object",
                "properties": {
                    "summary_text": {
                        "type": "string",
                        "description": "The main summary text with inline citations [1], [2], etc. and direct quotes from abstracts"
                    },
                    "key_findings": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "finding": {"type": "string"},
                                "citation": {"type": "string"},
                                "quote": {"type": "string"},
                                "pmid": {"type": "string"}
                            },
                            "required": ["finding", "citation", "quote", "pmid"]
                        },
                        "description": "List of key findings with their citations and supporting quotes"
                    },
                    "confidence_level": {
                        "type": "string",
                        "enum": ["high", "medium", "low"],
                        "description": "Confidence level in the summary based on available evidence"
                    }
                },
                "required": ["summary_text", "key_findings", "confidence_level"]
            }
        }
    }]

    # Build the numbered digest; the [i] numbers are what the model cites
    # inline as [1], [2], ... and what format_references must match later.
    abstracts_text = ""
    for i, article in enumerate(abstracts, 1):
        abstracts_text += f"\n[{i}] Article {i}:\n"
        abstracts_text += f"PMID: {article['pmid']}\n"
        abstracts_text += f"Title: {article['title']}\n"
        # Only the first three authors are included to keep the prompt compact.
        abstracts_text += f"Authors: {', '.join(article['authors'][:3])}\n"
        abstracts_text += f"Journal: {article['journal']} ({article['year']})\n"
        abstracts_text += f"Abstract: {article['abstract']}\n"
        abstracts_text += "-" * 50 + "\n"

    system_prompt = """You are a medical research expert who specializes in summarizing scientific literature with proper citations.

Your task is to analyze the provided research abstracts and create a comprehensive summary that directly answers the original user query.

CRITICAL REQUIREMENTS:
1. Use inline citations [1], [2], [3], etc. for EVERY fact or claim
2. Include DIRECT QUOTES from the abstracts to support key findings - use quotation marks
3. Focus on answering the specific question asked
4. Identify key findings and trends across the studies
5. Note any conflicting evidence or limitations
6. Provide evidence-backed statements with specific citations
7. Assess the confidence level based on the quality and quantity of evidence
8. Create COMPREHENSIVE summaries that cover multiple aspects of the research
9. Aim for substantial length (300-800 words) to provide thorough coverage
10. Include both supporting and contradictory findings when present

FORMATTING REQUIREMENTS:
- Use [1], [2], [3], etc. for inline citations
- Include direct quotes in quotation marks with citations, e.g., "quote here" [1]
- Structure the summary to flow logically with multiple paragraphs
- Make sure every claim is supported by at least one citation
- Include at least 3-5 direct quotes from the abstracts
- Each major finding should have a supporting quote
- Cover multiple aspects: mechanisms, clinical features, diagnostic criteria, outcomes, etc.
- Provide context and background when relevant

CONTENT REQUIREMENTS:
- Start with a broad overview of the topic
- Include specific mechanisms or pathophysiology when discussed
- Cover clinical manifestations and diagnostic features
- Address treatment approaches if mentioned
- Discuss prognosis and outcomes
- Note any controversies or conflicting evidence
- Include relevant anatomical or neuroimaging findings
- Mention study limitations and future research directions

EXAMPLE FORMAT (like the provided example):
"Frontotemporal dementia patients, including those with the semantic variant of primary progressive aphasia (svPPA), exhibit degradation in semantic memory due to atrophy in the anterior temporal lobe (ATL), which interferes with hierarchical semantic categorization" [2][4][5]. The ATL is proposed to be a functionally unitary 'semantic hub' that supports both social and non-social semantic knowledge, indicating that semantic degradation in svPPA might reflect a disintegration of complex conceptual networks within this region [4].

You MUST call the create_summary_with_citations function with your comprehensive analysis."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Original query: {original_query}\n\nResearch abstracts:\n{abstracts_text}"}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            # Force the model to call our summary tool rather than free-text.
            tool_choice={"type": "function", "function": {"name": "create_summary_with_citations"}}
        )

        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            # Tool-call arguments arrive as a JSON string; parse defensively.
            function_args = safe_json_loads(tool_call.function.arguments)

            result = create_summary_with_citations(
                summary_text=function_args["summary_text"],
                key_findings=function_args["key_findings"],
                confidence_level=function_args["confidence_level"]
            )

            # Only the narrative summary is surfaced; key_findings/confidence
            # are logged by the helper but not returned to the caller.
            return result["summary"]
        else:
            print("LLM did not make a function call, using fallback")
            return "Unable to generate summary due to technical issues."

    except Exception as e:
        print(f"Error summarizing abstracts: {e}")
        return "Error occurred during summarization."
|
|
| |
| |
def format_references(abstracts: List[Dict], ) -> str:
    """
    Format references for the articles in a standardized format.

    Delegates the actual citation formatting to one LLM call with a forced
    tool call; the reference numbering is expected to match the inline [1],
    [2], ... citations produced by summarize_abstracts.

    Args:
        abstracts (List[Dict]): List of article dictionaries (pmid, title,
            authors, journal, year)

    Returns:
        str: Formatted reference list, one reference per line, or an
        explanatory message on empty input or failure.
    """

    # Short-circuit: no articles means no references, and no LLM call needed.
    if not abstracts:
        return "No references available - no articles were found for the given query."

    def format_reference_list(references: List[str], format_style: str = "APA") -> dict:
        """Function that formats references - called by the LLM"""
        # Local "tool": logs and echoes the formatted references back.
        print(f"LLM called format_reference_list with style: {format_style}")
        print(f"Formatted {len(references)} references")

        return {
            "status": "success",
            "formatted_references": references,
            "style": format_style,
            "count": len(references)
        }

    # JSON schema advertised to the model for the forced tool call.
    tools = [{
        "type": "function",
        "function": {
            "name": "format_reference_list",
            "description": "Format a list of references in a standardized citation style",
            "parameters": {
                "type": "object",
                "properties": {
                    "references": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of formatted references"
                    },
                    "format_style": {
                        "type": "string",
                        "enum": ["APA", "MLA", "Chicago", "Vancouver"],
                        "description": "Citation format style to use (default: APA)"
                    }
                },
                "required": ["references"]
            }
        }
    }]

    # Flatten the article dicts into the numbered payload sent to the model;
    # "number" mirrors the inline-citation index used in the summary.
    articles_data = []
    for i, article in enumerate(abstracts, 1):
        articles_data.append({
            "number": i,
            "pmid": article["pmid"],
            "title": article["title"],
            "authors": article["authors"],
            "journal": article["journal"],
            "year": article["year"]
        })

    system_prompt = """You are a bibliographic expert who specializes in formatting academic references.

Your task is to format the provided article information into a standardized reference list.

Guidelines:
1. Use consistent formatting throughout
2. Include all necessary bibliographic information
3. Follow standard citation conventions
4. Number the references sequentially [1], [2], [3], etc.
5. Include PMID when available
6. Make sure the numbering matches the inline citations in the summary

You MUST call the format_reference_list function with your formatted references."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Format references for these articles: {json.dumps(articles_data, indent=2)}"}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            tool_choice={"type": "function", "function": {"name": "format_reference_list"}}
        )

        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            # Tool-call arguments arrive as a JSON string; parse defensively.
            function_args = safe_json_loads(tool_call.function.arguments)

            result = format_reference_list(
                references=function_args["references"],
                format_style=function_args.get("format_style", "APA")
            )

            # Join the per-reference strings into a single newline-separated block.
            return "\n".join(result["formatted_references"])
        else:
            print("LLM did not make a function call, using fallback")
            return "Unable to format references due to technical issues."

    except Exception as e:
        print(f"Error formatting references: {e}")
        return "Error occurred during reference formatting."
|
|
| |
| |
def run_research_pipeline(user_query: str, max_results: int = 10) -> Dict:
    """
    Run the complete research pipeline with LLM function calls.

    Steps: (1) generate a PubMed search string, (2) execute the search,
    (optionally run a broader fallback search when no PMIDs are found),
    (3) fetch abstracts, (4) summarize with citations, (5) format references.
    Each step performs exactly one LLM function call.

    Args:
        user_query (str): Original user query
        max_results (int): Maximum number of results to retrieve

    Returns:
        Dict: Complete pipeline results with keys original_query,
        search_string, pmids, abstracts, summary, references, llm_calls.
    """

    print("=== Research Pipeline with LLM Function Calling ===")
    print(f"Query: {user_query}")
    print(f"Max results: {max_results}")
    print()

    # FIX: the call count was previously hard-coded to 5, which under-counted
    # whenever the fallback-search path ran (two extra LLM calls). Count as
    # we go instead.
    llm_calls = 0

    print("Step 1: LLM generating PubMed search prompt...")
    search_string = generate_pubmed_prompt(user_query)
    llm_calls += 1
    print(f"Generated search: {search_string}")
    print()

    print("Step 2: LLM executing PubMed search...")
    pmids = search_pubmed(search_string, max_results)
    llm_calls += 1
    print(f"Found PMIDs: {pmids}")
    print()

    # If the optimized search came up empty, retry once with a deliberately
    # broader search (adds two more LLM calls: broadening + re-search).
    if not pmids:
        print("No results found with initial search. Trying broader fallback search...")
        fallback_search = create_fallback_search(user_query)
        llm_calls += 1
        print(f"Fallback search: {fallback_search}")
        pmids = search_pubmed(fallback_search, max_results)
        llm_calls += 1
        print(f"Fallback search found PMIDs: {pmids}")
        print()

    print("Step 3: LLM retrieving abstracts...")
    abstracts = fetch_pubmed_abstracts(pmids)
    llm_calls += 1
    print(f"Retrieved {len(abstracts)} abstracts")
    print(abstracts)
    print()

    print("Step 4: LLM summarizing abstracts...")
    summary = summarize_abstracts(abstracts, user_query)
    llm_calls += 1
    print(f"Generated summary ({len(summary)} characters)")
    print(summary)
    print()

    print("Step 5: LLM formatting references...")
    references = format_references(abstracts)
    llm_calls += 1
    print(f"Formatted {len(abstracts)} references")
    print()

    # Bundle everything the caller might want to inspect or display.
    results = {
        "original_query": user_query,
        "search_string": search_string,
        "pmids": pmids,
        "abstracts": abstracts,
        "summary": summary,
        "references": references,
        "llm_calls": llm_calls
    }

    print("=== Pipeline Complete ===")
    print(f"Total LLM function calls: {results['llm_calls']}")
    print(f"Articles processed: {len(abstracts)}")
    print()

    return results
|
|
def create_fallback_search(query: str) -> str:
    """
    Create a broader fallback search when the initial search returns no results.

    Makes one LLM call with a forced tool call that asks the model to reduce
    the query to its most fundamental concepts; if that fails, degrades to a
    deterministic keyword extraction via extract_basic_terms().

    Args:
        query (str): Original user query

    Returns:
        str: Broader search string
    """

    def create_broad_search(search_string: str, explanation: str) -> dict:
        """Function that creates a broad search string - called by the LLM"""
        # Local "tool": logs what the model produced and echoes it back.
        print(f"LLM called create_broad_search with: {search_string}")
        print(f"Explanation: {explanation}")
        return {"status": "success", "search_string": search_string}

    # JSON schema advertised to the model for the forced tool call.
    tools = [{
        "type": "function",
        "function": {
            "name": "create_broad_search",
            "description": "Create a broad PubMed search string when the initial search returns no results",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_string": {
                        "type": "string",
                        "description": "A broad PubMed search string using only the most fundamental concepts"
                    },
                    "explanation": {
                        "type": "string",
                        "description": "Brief explanation of how the broad search string was constructed"
                    }
                },
                "required": ["search_string", "explanation"]
            }
        }
    }]

    system_prompt = """You are a medical research expert who needs to create a BROAD search when the initial search returned no results.

Your task is to extract the most fundamental, widely-used medical terms from the query and create a simple search that will definitely return results.

CRITICAL REQUIREMENTS:
1. Use ONLY the most basic, fundamental medical terms
2. Focus on 1-2 core concepts maximum
3. Use the most common synonyms and variations
4. Avoid complex or specific terminology
5. Use broad field tags like [tiab] or [mesh]
6. Use OR operators liberally for synonyms

EXAMPLES:
- Complex query about "progressive lexical-semantic degradation in semantic variant PPA" → "(svPPA OR "semantic variant primary progressive aphasia")[tiab]"
- Query about "heteromodal conceptual hubs" → "(conceptual OR semantic)[tiab] AND (temporal lobe OR brain)[tiab]"
- Very specific query → Start with the main disease/condition name only

You MUST call the create_broad_search function with your broad search string and explanation."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Create a broad search for this query that returned no results: {query}"}
    ]

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=messages,
            tools=tools,
            tool_choice={"type": "function", "function": {"name": "create_broad_search"}}
        )

        if response.choices[0].message.tool_calls:
            tool_call = response.choices[0].message.tool_calls[0]
            # Tool-call arguments arrive as a JSON string; parse defensively.
            function_args = safe_json_loads(tool_call.function.arguments)

            result = create_broad_search(
                search_string=function_args["search_string"],
                explanation=function_args["explanation"]
            )

            return result["search_string"]
        else:
            print("LLM did not make a function call, using fallback")
            # Last resort: deterministic keyword extraction, no LLM involved.
            basic_terms = extract_basic_terms(query)
            return f'"{basic_terms}"[tiab]'

    except Exception as e:
        print(f"Error creating fallback search: {e}")
        # Same deterministic last resort on any API failure.
        basic_terms = extract_basic_terms(query)
        return f'"{basic_terms}"[tiab]'
|
|
def extract_basic_terms(query: str) -> str:
    """
    Extract basic medical terms from a query as a deterministic fallback.

    Scans the query for a small vocabulary of common neurology/dementia
    terms and returns up to two of them quoted and OR-joined (PubMed-ready);
    when none match, falls back to the first three words of the query.

    Args:
        query (str): Original user query

    Returns:
        str: Basic search terms
    """
    # Ordered vocabulary: earlier entries win when more than two terms match.
    common_terms = [
        "PPA", "svPPA", "semantic variant", "primary progressive aphasia",
        "Alzheimer", "dementia", "temporal lobe", "semantic",
        "cognitive", "neurological", "brain", "neurodegenerative"
    ]

    query_lower = query.lower()
    found_terms = []

    for term in common_terms:
        # FIX: match on word boundaries instead of plain substring search,
        # so short terms like "PPA" no longer fire inside unrelated words
        # ("happy", "apparent").
        if re.search(r'\b' + re.escape(term.lower()) + r'\b', query_lower):
            found_terms.append(term)

    if found_terms:
        # At most two terms keeps the fallback search broad but focused.
        return " OR ".join(f'"{term}"' for term in found_terms[:2])
    else:
        # No known vocabulary present: use the query's first few words.
        words = query.split()[:3]
        return " ".join(words)
|
|
| |
| |
| if __name__ == "__main__": |
| |
| test_query = "To what extent does the progressive lexical-semantic degradation in semantic variant PPA reflect a disintegration of heteromodal conceptual hubs within the anterior temporal lobe, and how might this selectively compromise hierarchical semantic categorization while sparing syntactic scaffolding and non-verbal cognition?" |
| |
| print("=== DIRECT LLM RESPONSE (for comparison) ===") |
| print(f"Query: {test_query}") |
| print() |
| |
| |
| direct_prompt = f"""You are a medical research expert. Please answer this question based on your knowledge. Write it as a summary. |
| |
| {test_query}""" |
| |
| try: |
| direct_response = client.chat.completions.create( |
| model="gpt-4", |
| messages=[ |
| {"role": "system", "content": "You are a medical research expert specializing in neurology and cognitive disorders."}, |
| {"role": "user", "content": direct_prompt} |
| ] |
| ) |
| |
| print("Direct LLM Response:") |
| print(direct_response.choices[0].message.content) |
| print() |
| print("=" * 80) |
| print() |
| |
| |
| print("=== STRUCTURED PIPELINE RESPONSE ===") |
| results = run_research_pipeline(test_query, max_results=5) |
| |
| |
| print("=== Final Results ===") |
| print(f"Summary: {results['summary']}") |
| print() |
| print("References:") |
| print(results['references']) |
| |
| except Exception as e: |
| print(f"Error in direct LLM response: {e}") |
| |
| print("=== STRUCTURED PIPELINE RESPONSE ===") |
| results = run_research_pipeline(test_query, max_results=5) |
| |
| |
| print("=== Final Results ===") |
| print(f"Summary: {results['summary']}") |
| print() |
| print("References:") |
| print(results['references']) |
|
|
| |
|
|
|
|