# MedSearchPro / data_sources / pubmed_client.py
# (Imported from Hugging Face deployment — commit "Initial Backend Deployment", rev 1367957)
# data_sources/pubmed_client.py
import re
import time
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional

import requests
class PubMedClient:
    """Client for the NCBI E-utilities API to search and fetch PubMed papers.

    Uses ``esearch`` to find PMIDs for a query, then ``efetch`` to pull the
    full XML records, which are parsed into plain dicts.
    """

    # NCBI allows at most 3 requests/second without an API key; pause
    # between the esearch and efetch calls to stay under that limit.
    _REQUEST_DELAY = 0.34

    def __init__(self):
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def search_papers(self, query: str, max_results: int = 50) -> List[Dict]:
        """Search PubMed for papers matching *query*.

        Args:
            query: PubMed search term (any valid Entrez query string).
            max_results: Maximum number of PMIDs to request.

        Returns:
            List of parsed paper dicts (papers without an abstract are
            dropped); an empty list on any request/parse failure.
        """
        search_url = f"{self.base_url}esearch.fcgi"
        params = {
            'db': 'pubmed',
            'term': query,
            'retmax': max_results,
            'sort': 'relevance',
            'retmode': 'json'
        }
        try:
            # timeout prevents an unresponsive server from hanging the caller;
            # raise_for_status stops HTML error pages reaching the JSON parser.
            response = requests.get(search_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            pmids = data.get('esearchresult', {}).get('idlist', [])
            if not pmids:
                return []
            # Honor NCBI's rate limit before the follow-up efetch request.
            time.sleep(self._REQUEST_DELAY)
            # Get detailed paper info
            return self._fetch_paper_details(pmids)
        except Exception as e:
            # Best-effort API: a failed search yields no results, not a crash.
            print(f"PubMed search error: {e}")
            return []

    def _fetch_paper_details(self, pmids: List[str]) -> List[Dict]:
        """Fetch full XML records for *pmids* via efetch and parse them.

        Only papers that parsed successfully AND have a non-empty abstract
        are returned.
        """
        fetch_url = f"{self.base_url}efetch.fcgi"
        params = {
            'db': 'pubmed',
            'id': ','.join(pmids),
            'retmode': 'xml'
        }
        response = requests.get(fetch_url, params=params, timeout=60)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        papers = []
        for article in root.findall('.//PubmedArticle'):
            paper_data = self._parse_article(article)
            if paper_data and paper_data.get('abstract'):
                papers.append(paper_data)
        return papers

    def _parse_article(self, article) -> Optional[Dict]:
        """Parse one ``<PubmedArticle>`` element into a structured dict.

        Args:
            article: An ``xml.etree.ElementTree.Element`` for a PubmedArticle.

        Returns:
            Dict with source/title/abstract/authors/journal/publication_date/
            doi/domain keys, or None if the element cannot be parsed.
        """
        try:
            # Titles may contain inline markup (<i>, <sub>, ...): itertext()
            # collects every text fragment instead of only the leading one.
            title_elem = article.find('.//ArticleTitle')
            title = (
                ''.join(title_elem.itertext()).strip()
                if title_elem is not None else "No title"
            )

            # Structured abstracts carry several <AbstractText> sections
            # (Background, Methods, Results, ...); join them all rather than
            # keeping only the first section.
            abstract_parts = [
                ''.join(elem.itertext()).strip()
                for elem in article.findall('.//AbstractText')
            ]
            abstract = ' '.join(part for part in abstract_parts if part)

            # Extract authors ("ForeName LastName"); collective/partial
            # author entries without both names are skipped.
            authors = []
            for author_elem in article.findall('.//Author'):
                last_name = author_elem.find('LastName')
                fore_name = author_elem.find('ForeName')
                if last_name is not None and fore_name is not None:
                    authors.append(f"{fore_name.text} {last_name.text}")

            # Extract journal and publication year.
            journal_elem = article.find('.//Journal/Title')
            journal = journal_elem.text if journal_elem is not None else ""
            pub_date_elem = article.find('.//PubDate/Year')
            pub_date = pub_date_elem.text if pub_date_elem is not None else ""

            # Extract DOI from the article ID list.
            article_id_elem = article.find('.//ArticleId[@IdType="doi"]')
            doi = article_id_elem.text if article_id_elem is not None else ""

            return {
                'source': 'pubmed',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'domain': self._infer_domain(title, abstract)
            }
        except Exception as e:
            print(f"Error parsing article: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Infer the medical domain from title + abstract keyword counts.

        Keywords are matched on word boundaries, so short terms such as
        'ct', 'dna' or 'rna' no longer fire inside unrelated words
        ('detection', 'cdna', 'alternative'). Covers all 11 domains;
        returns 'general_medical' when nothing matches.
        """
        text = f"{title} {abstract}".lower()
        # Complete domain keywords for all 11 domains.
        domain_keywords = {
            'medical_imaging': [
                'imaging', 'radiology', 'ct', 'mri', 'x-ray', 'ultrasound', 'segmentation',
                'detection', 'diagnosis', 'radiomics', 'medical image', 'computed tomography',
                'magnetic resonance', 'scan', 'radiological'
            ],
            'computational_biology': [
                'computational biology', 'systems biology', 'biological networks', 'pathway analysis',
                'gene regulatory', 'network biology', 'multi-scale modeling', 'biological modeling',
                'computational modeling', 'systems modeling'
            ],
            'bioinformatics': [
                'bioinformatics', 'sequence analysis', 'structural bioinformatics', 'omics data',
                'genome annotation', 'phylogenetic', 'protein structure', 'biological database',
                'computational genomics', 'metagenomics', 'sequence alignment', 'blast'
            ],
            'genomics': [
                'genomics', 'sequencing', 'dna', 'rna', 'genome', 'variant', 'mutation', 'gwas',
                'next generation sequencing', 'whole genome', 'precision genomics', 'functional genomics',
                'comparative genomics', 'epigenomics', 'population genomics'
            ],
            'deep_learning_medicine': [
                'deep learning', 'neural network', 'ai clinical', 'medical ai', 'clinical decision support',
                'healthcare machine learning', 'medical pattern recognition', 'ai assisted diagnosis',
                'predictive modeling healthcare', 'clinical ai', 'neural networks healthcare'
            ],
            'drug_discovery': [
                'drug discovery', 'virtual screening', 'de novo drug', 'molecular docking', 'drug target',
                'pharmacoinformatics', 'cheminformatics', 'drug repurposing', 'molecular property',
                'admet', 'compound', 'pharmaceutical', 'screening'
            ],
            'disease_prediction': [
                'disease prediction', 'risk stratification', 'prognostic model', 'clinical prediction',
                'disease onset', 'predictive modeling', 'risk prediction', 'early detection',
                'progression prediction', 'clinical risk', 'prognosis'
            ],
            'robotics_surgery': [
                'surgical robotics', 'robot-assisted surgery', 'surgical ai', 'operative robotics',
                'surgical navigation', 'minimally invasive robotics', 'surgical automation',
                'computer assisted surgery', 'surgical planning', 'medical robotics'
            ],
            'diagnostics': [
                'diagnostic ai', 'clinical decision support', 'differential diagnosis', 'diagnostic accuracy',
                'ai diagnostic', 'automated diagnosis', 'point-of-care diagnostic', 'diagnostic imaging',
                'laboratory diagnostic', 'ai assisted diagnosis'
            ],
            'epidemiology': [
                'epidemiology', 'disease surveillance', 'outbreak prediction', 'public health surveillance',
                'infectious disease modeling', 'epidemiological forecasting', 'disease transmission',
                'public health analytics', 'epidemic prediction', 'computational epidemiology'
            ],
            'public_health': [
                'public health', 'health policy', 'population health', 'health interventions',
                'public health informatics', 'health equity', 'healthcare access', 'public health decision',
                'community health', 'health outcomes'
            ]
        }
        # Count whole-word keyword matches for each domain.
        domain_scores = {
            domain: sum(
                1 for keyword in keywords
                if re.search(r'\b' + re.escape(keyword) + r'\b', text)
            )
            for domain, keywords in domain_keywords.items()
        }
        # Return the domain with the highest score, or 'general_medical'
        # if no keyword matched at all.
        best_domain = max(domain_scores, key=domain_scores.get)
        return best_domain if domain_scores[best_domain] > 0 else 'general_medical'