# MedSearchPro / data_sources / pubmed_client.py
# (Imported from Hugging Face deployment — commit "Initial Backend Deployment", rev 1367957)
# data_sources/pubmed_client.py
import re
import time
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional

import requests
class PubMedClient:
    """Client for the NCBI E-utilities API to search and fetch PubMed papers.

    Uses ``esearch`` to find PMIDs for a query, then ``efetch`` to pull the
    full XML records, which are parsed into plain dicts.
    """

    # NCBI allows at most 3 requests/second without an API key; pause
    # between the esearch and efetch calls to stay under that limit.
    _REQUEST_DELAY = 0.34

    def __init__(self):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def search_papers(self, query: str, max_results: int = 50) -> List[Dict]:
        """Search PubMed for papers matching *query*.

        Args:
            query: PubMed search term (any valid Entrez query string).
            max_results: Maximum number of PMIDs to request.

        Returns:
            List of parsed paper dicts (papers without an abstract are
            dropped); an empty list on any request/parse failure.
        """
        search_url = f"{self.base_url}esearch.fcgi"
        params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'sort': 'relevance',
            'retmode': 'json'
        }
        try:
            # timeout prevents an unresponsive server from hanging the caller;
            # raise_for_status stops HTML error pages reaching the JSON parser.
            response = requests.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            pmids = data.get('esearchresult', {}).get('idlist', [])
            if not pmids:
                return []
            # Honor NCBI's rate limit before the follow-up efetch request.
            time.sleep(self._REQUEST_DELAY)
            # Get detailed paper info
            return self._fetch_paper_details(pmids)
        except Exception as e:
            # Best-effort API: a failed search yields no results, not a crash.
            print(f"PubMed search error: {e}")
            return []

    def _fetch_paper_details(self, pmids: List[str]) -> List[Dict]:
        """Fetch full XML records for *pmids* via efetch and parse them.

        Only papers that parsed successfully AND have a non-empty abstract
        are returned.
        """
        fetch_url = f"{self.base_url}efetch.fcgi"
        params = {
            'db': 'pubmed',
            'id': ','.join(pmids),
            'retmode': 'xml'
        }
        response = requests.get(fetch_url, params=params, timeout=60)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        papers = []
        for article in root.findall('.//PubmedArticle'):
            paper_data = self._parse_article(article)
            if paper_data and paper_data.get('abstract'):
                papers.append(paper_data)
        return papers

    def _parse_article(self, article) -> Optional[Dict]:
        """Parse one ``<PubmedArticle>`` element into a structured dict.

        Args:
            article: An ``xml.etree.ElementTree.Element`` for a PubmedArticle.

        Returns:
            Dict with source/title/abstract/authors/journal/publication_date/
            doi/domain keys, or None if the element cannot be parsed.
        """
        try:
            # Titles may contain inline markup (<i>, <sub>, ...): itertext()
            # collects every text fragment instead of only the leading one.
            title_elem = article.find('.//ArticleTitle')
            title = (
                ''.join(title_elem.itertext()).strip()
                if title_elem is not None else "No title"
            )

            # Structured abstracts carry several <AbstractText> sections
            # (Background, Methods, Results, ...); join them all rather than
            # keeping only the first section.
            abstract_parts = [
                ''.join(elem.itertext()).strip()
                for elem in article.findall('.//AbstractText')
            ]
            abstract = ' '.join(part for part in abstract_parts if part)

            # Extract authors ("ForeName LastName"); collective/partial
            # author entries without both names are skipped.
            authors = []
            for author_elem in article.findall('.//Author'):
                last_name = author_elem.find('LastName')
                fore_name = author_elem.find('ForeName')
                if last_name is not None and fore_name is not None:
                    authors.append(f"{fore_name.text} {last_name.text}")

            # Extract journal and publication year.
            journal_elem = article.find('.//Journal/Title')
            journal = journal_elem.text if journal_elem is not None else ""
            pub_date_elem = article.find('.//PubDate/Year')
            pub_date = pub_date_elem.text if pub_date_elem is not None else ""

            # Extract DOI from the article ID list.
            article_id_elem = article.find('.//ArticleId[@IdType="doi"]')
            doi = article_id_elem.text if article_id_elem is not None else ""

            return {
                'source': 'pubmed',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'domain': self._infer_domain(title, abstract)
            }
        except Exception as e:
            print(f"Error parsing article: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Infer the medical domain from title + abstract keyword counts.

        Keywords are matched on word boundaries, so short terms such as
        'ct', 'dna' or 'rna' no longer fire inside unrelated words
        ('detection', 'cdna', 'alternative'). Covers all 11 domains;
        returns 'general_medical' when nothing matches.
        """
        text = f"{title} {abstract}".lower()
        # Complete domain keywords for all 11 domains.
        domain_keywords = {
            'medical_imaging': [
                'imaging', 'radiology', 'ct', 'mri', 'x-ray', 'ultrasound', 'segmentation',
                'detection', 'diagnosis', 'radiomics', 'medical image', 'computed tomography',
                'magnetic resonance', 'scan', 'radiological'
            ],
            'computational_biology': [
                'computational biology', 'systems biology', 'biological networks', 'pathway analysis',
                'gene regulatory', 'network biology', 'multi-scale modeling', 'biological modeling',
                'computational modeling', 'systems modeling'
            ],
            'bioinformatics': [
                'bioinformatics', 'sequence analysis', 'structural bioinformatics', 'omics data',
                'genome annotation', 'phylogenetic', 'protein structure', 'biological database',
                'computational genomics', 'metagenomics', 'sequence alignment', 'blast'
            ],
            'genomics': [
                'genomics', 'sequencing', 'dna', 'rna', 'genome', 'variant', 'mutation', 'gwas',
                'next generation sequencing', 'whole genome', 'precision genomics', 'functional genomics',
                'comparative genomics', 'epigenomics', 'population genomics'
            ],
            'deep_learning_medicine': [
                'deep learning', 'neural network', 'ai clinical', 'medical ai', 'clinical decision support',
                'healthcare machine learning', 'medical pattern recognition', 'ai assisted diagnosis',
                'predictive modeling healthcare', 'clinical ai', 'neural networks healthcare'
            ],
            'drug_discovery': [
                'drug discovery', 'virtual screening', 'de novo drug', 'molecular docking', 'drug target',
                'pharmacoinformatics', 'cheminformatics', 'drug repurposing', 'molecular property',
                'admet', 'compound', 'pharmaceutical', 'screening'
            ],
            'disease_prediction': [
                'disease prediction', 'risk stratification', 'prognostic model', 'clinical prediction',
                'disease onset', 'predictive modeling', 'risk prediction', 'early detection',
                'progression prediction', 'clinical risk', 'prognosis'
            ],
            'robotics_surgery': [
                'surgical robotics', 'robot-assisted surgery', 'surgical ai', 'operative robotics',
                'surgical navigation', 'minimally invasive robotics', 'surgical automation',
                'computer assisted surgery', 'surgical planning', 'medical robotics'
            ],
            'diagnostics': [
                'diagnostic ai', 'clinical decision support', 'differential diagnosis', 'diagnostic accuracy',
                'ai diagnostic', 'automated diagnosis', 'point-of-care diagnostic', 'diagnostic imaging',
                'laboratory diagnostic', 'ai assisted diagnosis'
            ],
            'epidemiology': [
                'epidemiology', 'disease surveillance', 'outbreak prediction', 'public health surveillance',
                'infectious disease modeling', 'epidemiological forecasting', 'disease transmission',
                'public health analytics', 'epidemic prediction', 'computational epidemiology'
            ],
            'public_health': [
                'public health', 'health policy', 'population health', 'health interventions',
                'public health informatics', 'health equity', 'healthcare access', 'public health decision',
                'community health', 'health outcomes'
            ]
        }
        # Count whole-word keyword matches for each domain.
        domain_scores = {
            domain: sum(
                1 for keyword in keywords
                if re.search(r'\b' + re.escape(keyword) + r'\b', text)
            )
            for domain, keywords in domain_keywords.items()
        }
        # Return the domain with the highest score, or 'general_medical'
        # if no keyword matched at all.
        best_domain = max(domain_scores, key=domain_scores.get)
        return best_domain if domain_scores[best_domain] > 0 else 'general_medical'