Spaces:
Sleeping
Sleeping
# data_sources/pubmed_client.py
import time
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional

import requests
class PubMedClient:
    """Minimal client for the NCBI PubMed E-utilities API.

    Workflow: ``search_papers`` runs an *esearch* query for PMIDs, then
    *efetch* retrieves article XML, which is parsed into flat dicts and
    tagged with an inferred research domain via keyword matching.
    """

    # Seconds to wait for any E-utilities HTTP response. Without a
    # timeout, requests.get can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Keyword lists used to infer a paper's domain (11 domains).
    # Hoisted to a class attribute so the dict is built once instead of
    # on every _infer_domain call. Matching is plain substring search
    # over the lowercased title+abstract.
    DOMAIN_KEYWORDS: Dict[str, List[str]] = {
        'medical_imaging': [
            'imaging', 'radiology', 'ct', 'mri', 'x-ray', 'ultrasound', 'segmentation',
            'detection', 'diagnosis', 'radiomics', 'medical image', 'computed tomography',
            'magnetic resonance', 'scan', 'radiological'
        ],
        'computational_biology': [
            'computational biology', 'systems biology', 'biological networks', 'pathway analysis',
            'gene regulatory', 'network biology', 'multi-scale modeling', 'biological modeling',
            'computational modeling', 'systems modeling'
        ],
        'bioinformatics': [
            'bioinformatics', 'sequence analysis', 'structural bioinformatics', 'omics data',
            'genome annotation', 'phylogenetic', 'protein structure', 'biological database',
            'computational genomics', 'metagenomics', 'sequence alignment', 'blast'
        ],
        'genomics': [
            'genomics', 'sequencing', 'dna', 'rna', 'genome', 'variant', 'mutation', 'gwas',
            'next generation sequencing', 'whole genome', 'precision genomics', 'functional genomics',
            'comparative genomics', 'epigenomics', 'population genomics'
        ],
        'deep_learning_medicine': [
            'deep learning', 'neural network', 'ai clinical', 'medical ai', 'clinical decision support',
            'healthcare machine learning', 'medical pattern recognition', 'ai assisted diagnosis',
            'predictive modeling healthcare', 'clinical ai', 'neural networks healthcare'
        ],
        'drug_discovery': [
            'drug discovery', 'virtual screening', 'de novo drug', 'molecular docking', 'drug target',
            'pharmacoinformatics', 'cheminformatics', 'drug repurposing', 'molecular property',
            'admet', 'compound', 'pharmaceutical', 'screening'
        ],
        'disease_prediction': [
            'disease prediction', 'risk stratification', 'prognostic model', 'clinical prediction',
            'disease onset', 'predictive modeling', 'risk prediction', 'early detection',
            'progression prediction', 'clinical risk', 'prognosis'
        ],
        'robotics_surgery': [
            'surgical robotics', 'robot-assisted surgery', 'surgical ai', 'operative robotics',
            'surgical navigation', 'minimally invasive robotics', 'surgical automation',
            'computer assisted surgery', 'surgical planning', 'medical robotics'
        ],
        'diagnostics': [
            'diagnostic ai', 'clinical decision support', 'differential diagnosis', 'diagnostic accuracy',
            'ai diagnostic', 'automated diagnosis', 'point-of-care diagnostic', 'diagnostic imaging',
            'laboratory diagnostic', 'ai assisted diagnosis'
        ],
        'epidemiology': [
            'epidemiology', 'disease surveillance', 'outbreak prediction', 'public health surveillance',
            'infectious disease modeling', 'epidemiological forecasting', 'disease transmission',
            'public health analytics', 'epidemic prediction', 'computational epidemiology'
        ],
        'public_health': [
            'public health', 'health policy', 'population health', 'health interventions',
            'public health informatics', 'health equity', 'healthcare access', 'public health decision',
            'community health', 'health outcomes'
        ]
    }

    def __init__(self):
        # Base endpoint for all E-utilities calls.
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def search_papers(self, query: str, max_results: int = 50) -> List[Dict]:
        """Search PubMed and return parsed papers that have an abstract.

        Args:
            query: PubMed query string (esearch ``term`` syntax).
            max_results: Maximum number of PMIDs to request.

        Returns:
            A list of paper dicts (see ``_parse_article``); empty when
            nothing matches or any network/parse error occurs.
        """
        search_url = f"{self.base_url}esearch.fcgi"
        params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'sort': 'relevance',
            'retmode': 'json',
        }
        try:
            response = requests.get(
                search_url, params=params, timeout=self.REQUEST_TIMEOUT
            )
            # Fail loudly on HTTP errors instead of trying to parse an
            # HTML error page as JSON.
            response.raise_for_status()
            data = response.json()
            pmids = data.get('esearchresult', {}).get('idlist', [])
            if not pmids:
                return []
            # Get detailed paper info for the matched PMIDs.
            return self._fetch_paper_details(pmids)
        except Exception as e:
            # Best-effort API: report the problem and degrade to an
            # empty result set rather than crashing the caller.
            print(f"PubMed search error: {e}")
            return []

    def _fetch_paper_details(self, pmids: List[str]) -> List[Dict]:
        """Fetch article XML for *pmids* and parse it into dicts.

        Only articles that parsed successfully AND have a non-empty
        abstract are kept. Network/HTTP errors propagate to the caller
        (``search_papers`` catches them).
        """
        fetch_url = f"{self.base_url}efetch.fcgi"
        params = {
            'db': 'pubmed',
            'id': ','.join(pmids),
            'retmode': 'xml',
        }
        # NCBI allows at most 3 requests/sec without an API key; pause
        # briefly so back-to-back esearch/efetch calls stay within it.
        time.sleep(0.34)
        response = requests.get(
            fetch_url, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        root = ET.fromstring(response.content)
        papers = []
        for article in root.findall('.//PubmedArticle'):
            paper_data = self._parse_article(article)
            if paper_data and paper_data.get('abstract'):
                papers.append(paper_data)
        return papers

    def _parse_article(self, article) -> Optional[Dict]:
        """Parse one ``<PubmedArticle>`` element into a flat dict.

        Returns None when the element cannot be parsed.
        """
        try:
            # itertext() concatenates text across inline markup such as
            # <i>/<sub> inside titles, which bare .text would truncate.
            title_elem = article.find('.//ArticleTitle')
            if title_elem is not None:
                title = ''.join(title_elem.itertext()).strip()
            else:
                title = "No title"

            # Structured abstracts carry several <AbstractText> sections
            # (Background, Methods, ...); join them all instead of
            # keeping only the first.
            abstract = ' '.join(
                ''.join(sec.itertext()).strip()
                for sec in article.findall('.//AbstractText')
            ).strip()

            # Extract authors; skip entries missing either name part
            # (e.g. CollectiveName authors).
            authors = []
            for author_elem in article.findall('.//Author'):
                last_name = author_elem.find('LastName')
                fore_name = author_elem.find('ForeName')
                if last_name is not None and fore_name is not None:
                    authors.append(f"{fore_name.text} {last_name.text}")

            # Journal title and publication year (empty string if absent).
            journal_elem = article.find('.//Journal/Title')
            journal = journal_elem.text if journal_elem is not None else ""
            pub_date_elem = article.find('.//PubDate/Year')
            pub_date = pub_date_elem.text if pub_date_elem is not None else ""

            # DOI, when present among the article IDs.
            article_id_elem = article.find('.//ArticleId[@IdType="doi"]')
            doi = article_id_elem.text if article_id_elem is not None else ""

            return {
                'source': 'pubmed',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'domain': self._infer_domain(title, abstract),
            }
        except Exception as e:
            print(f"Error parsing article: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Infer the medical domain from title and abstract.

        Counts keyword substring hits per domain over the lowercased
        title+abstract and returns the domain with the most hits; ties
        go to the first domain in DOMAIN_KEYWORDS order. Returns
        'general_medical' when no keyword matches at all.
        """
        text = f"{title} {abstract}".lower()
        best_domain, best_score = 'general_medical', 0
        for domain, keywords in self.DOMAIN_KEYWORDS.items():
            score = sum(1 for keyword in keywords if keyword in text)
            if score > best_score:
                best_domain, best_score = domain, score
        return best_domain