Spaces:

paulhemb
/

MedSearchPro

Sleeping

App Files Files Community

MedSearchPro / data_sources /crossref_client.py

paulhemb

Initial Backend Deployment

1367957 24 days ago

raw

history blame contribute delete

7.59 kB

	# data_sources/crossref_client.py (FINAL FLEXIBLE VERSION - 2025)
	import re
	from typing import Dict, List, Optional

	import requests


	class CrossrefClient:
	    """Small client for the Crossref REST API ``/works`` search endpoint.

	    Runs keyword searches (optionally restricted to works published from a
	    given year onwards) and flattens every returned work into a plain dict
	    with title, abstract, authors, journal, date, DOI, URL, reference count
	    and a keyword-inferred research ``domain`` label.

	    Every request carries a ``mailto`` parameter so that Crossref can
	    identify the caller (its "polite" usage convention).
	    """

	    # Number of rows requested per HTTP call; larger requests are paged.
	    MAX_ROWS_PER_REQUEST = 500
	    # Offset-based paging is only followed up to this many records.
	    MAX_TOTAL_RESULTS = 10000

	    # Keywords of at most this length are treated as acronyms and matched as
	    # whole words; plain substring matching would let 'ct' hit "effect" and
	    # 'rna' hit "journal" or "alternative".
	    _SHORT_KEYWORD_LEN = 4

	    # Ordered (domain, keywords) pairs: the first domain with a hit wins.
	    # 'public health' lives only under 'public_health'; listing it under
	    # 'epidemiology' as well would shadow it, since that domain is checked first.
	    _DOMAIN_KEYWORDS = (
	        ('medical_imaging', ('imaging', 'mri', 'ct', 'radiology', 'ultrasound', 'segmentation')),
	        ('deep_learning_medicine', ('deep learning', 'neural network', 'ai', 'machine learning',
	                                    'artificial intelligence')),
	        ('drug_discovery', ('drug discovery', 'virtual screening', 'molecular docking', 'compound')),
	        ('genomics', ('genomics', 'sequencing', 'dna', 'rna', 'gwas')),
	        ('diagnostics', ('diagnostic', 'diagnosis', 'clinical decision', 'biomarker')),
	        ('epidemiology', ('epidemiology', 'outbreak', 'surveillance')),
	        ('public_health', ('public health', 'health policy', 'population health')),
	    )

	    def __init__(self, email=None):
	        """Create a client.

	        Args:
	            email: Contact address sent as ``mailto`` with every request.
	                Falls back to the project's default address when omitted.
	        """
	        self.base_url = "https://api.crossref.org/works"
	        self.email = email or "hembrompaul75@gmail.com"

	    def search_papers(self, query: str, max_results: int = 1000,
	                      start_year: Optional[int] = None) -> List[Dict]:
	        """Search Crossref and return parsed papers.

	        Args:
	            query: Free-text search query.
	            max_results: Upper bound on the number of works fetched. Values
	                above ``MAX_ROWS_PER_REQUEST`` are retrieved in several
	                pages; the total is capped at ``MAX_TOTAL_RESULTS``.
	            start_year: When given, only works published on or after
	                1 January of that year are requested.

	        Returns:
	            A list of paper dicts (see ``_parse_result``). Works without a
	            title are skipped, so the list may be shorter than
	            ``max_results``. An empty list is returned when nothing could
	            be fetched; if a later page fails, the papers collected so far
	            are returned.
	        """
	        if not max_results or max_results <= 0:
	            return []

	        base_params = {
	            'query': query,
	            'mailto': self.email,  # identifies the caller to Crossref
	        }
	        if start_year:
	            base_params['filter'] = f'from-pub-date:{start_year}-01-01'
	            print(f" Date filter: from {start_year} onwards")

	        print(f"Searching Crossref: '{query}'")

	        target = min(max_results, self.MAX_TOTAL_RESULTS)
	        papers: List[Dict] = []
	        fetched = 0
	        while fetched < target:
	            rows = min(target - fetched, self.MAX_ROWS_PER_REQUEST)
	            params = dict(base_params, rows=rows)
	            if fetched:
	                # The first request is sent without an offset, exactly as a
	                # single-page search would be.
	                params['offset'] = fetched

	            items = self._fetch_items(params)
	            if items is None:
	                # Error already reported; hand back whatever was collected.
	                return papers
	            if not items:
	                break

	            fetched += len(items)
	            for item in items:
	                paper = self._parse_result(item)
	                if paper:
	                    papers.append(paper)

	            if len(items) < rows:
	                # A short page means the result set is exhausted.
	                break

	        print(f"Crossref: Found {len(papers)} papers")
	        return papers

	    def _fetch_items(self, params: Dict) -> Optional[List[Dict]]:
	        """Perform one GET request and return the raw ``items`` list.

	        Returns ``None`` (after printing a diagnostic) on a network error,
	        a non-200 status or an undecodable body; returns an empty list when
	        the response is valid but carries no items.
	        """
	        try:
	            response = requests.get(self.base_url, params=params, timeout=15)
	        except requests.RequestException as e:
	            print(f"Crossref request failed: {e}")
	            return None

	        if response.status_code != 200:
	            print(f"Crossref API error {response.status_code}: {response.text[:200]}")
	            return None

	        try:
	            data = response.json()
	        except ValueError as e:  # body was not valid JSON
	            print(f"Crossref request failed: {e}")
	            return None

	        message = data.get('message') if isinstance(data, dict) else None
	        items = message.get('items') if isinstance(message, dict) else None
	        return items if isinstance(items, list) else []

	    def _parse_result(self, result: Dict) -> Optional[Dict]:
	        """Convert one Crossref work record into a flat paper dict.

	        Args:
	            result: A single element of the API's ``message.items`` list.

	        Returns:
	            A dict with the keys ``source``, ``title``, ``abstract``,
	            ``authors``, ``journal``, ``publication_date``, ``doi``, ``url``,
	            ``reference_count`` and ``domain``; or ``None`` when the record
	            has no title or is malformed.
	        """
	        try:
	            title = self._first_text(result.get('title'))
	            if not title:
	                return None

	            # Abstracts often contain JATS/HTML markup such as <jats:p>...</jats:p>.
	            abstract = ''
	            raw_abstract = result.get('abstract')
	            if isinstance(raw_abstract, str):
	                abstract = re.sub(r'<[^>]+>', ' ', raw_abstract)
	                abstract = re.sub(r'\s+', ' ', abstract).strip()

	            authors = []
	            for auth in result.get('author') or []:
	                if not isinstance(auth, dict):
	                    continue
	                given = auth.get('given') or ''
	                family = auth.get('family') or ''
	                # Organisational authors carry only a 'name' field.
	                name = f"{given} {family}".strip() or (auth.get('name') or '').strip()
	                if name:
	                    authors.append(name)

	            journal = self._first_text(result.get('container-title'))

	            # Prefer the publication date; fall back to the record's creation
	            # timestamp whenever 'published' is missing or carries no parts.
	            pub_date = self._format_date_parts(result.get('published'))
	            if not pub_date:
	                created = result.get('created') or {}
	                pub_date = (created.get('date-time') or '')[:10]

	            doi = result.get('DOI') or ''
	            url = result.get('URL') or (f"https://doi.org/{doi}" if doi else '')

	            return {
	                'source': 'crossref',
	                'title': title,
	                'abstract': abstract,
	                'authors': authors,
	                'journal': journal,
	                'publication_date': pub_date,
	                'doi': doi,
	                'url': url,
	                'reference_count': result.get('reference-count') or 0,
	                'domain': self._infer_domain(title, abstract),
	            }

	        except (AttributeError, KeyError, IndexError, TypeError, ValueError) as e:
	            # Best effort: one malformed record must not abort the whole search.
	            print(f"Error parsing Crossref item: {e}")
	            return None

	    @staticmethod
	    def _first_text(value) -> str:
	        """Return the first entry of a list-valued field (or the string itself), stripped."""
	        if isinstance(value, list):
	            value = value[0] if value else ''
	        return value.strip() if isinstance(value, str) else ''

	    @staticmethod
	    def _format_date_parts(date_field) -> str:
	        """Render a ``{'date-parts': [[Y, M, D]]}`` field as zero-padded ``YYYY[-MM[-DD]]``.

	        Returns an empty string when the field is absent or has no usable parts.
	        """
	        if not isinstance(date_field, dict):
	            return ''
	        date_parts = date_field.get('date-parts') or [[]]
	        parts = date_parts[0] or []

	        pieces = []
	        for index, part in enumerate(parts[:3]):
	            if not part:
	                break  # stop at the first missing component
	            try:
	                number = int(part)
	            except (TypeError, ValueError):
	                break
	            pieces.append(f"{number:04d}" if index == 0 else f"{number:02d}")
	        return '-'.join(pieces)

	    def _infer_domain(self, title: str, abstract: str) -> str:
	        """Guess a research domain from keywords in the title and abstract.

	        Domains are tested in the order of ``_DOMAIN_KEYWORDS`` and the first
	        one with a matching keyword is returned; ``'general_medical'`` is
	        the fallback when nothing matches.
	        """
	        text = f"{title} {abstract}".lower()

	        for domain, keywords in self._DOMAIN_KEYWORDS:
	            if any(self._keyword_in_text(keyword, text) for keyword in keywords):
	                return domain

	        return 'general_medical'

	    @classmethod
	    def _keyword_in_text(cls, keyword: str, text: str) -> bool:
	        """Return True when *keyword* occurs in the (lower-cased) *text*.

	        Short acronyms must appear as whole words (an optional plural "s" is
	        tolerated); longer keywords keep plain substring matching so that
	        compounds such as "neuroimaging" still count.
	        """
	        if len(keyword) <= cls._SHORT_KEYWORD_LEN:
	            return re.search(rf'\b{re.escape(keyword)}s?\b', text) is not None
	        return keyword in text


	# ====================== QUICK TEST ======================
	if __name__ == "__main__":
	    # Manual smoke test: performs live requests against the Crossref API.
	    client = CrossrefClient(email="hembrompaul75@gmail.com")  # Use your real email

	    print("Testing Enhanced Crossref Client\n" + "=" * 60)

	    # Example 1: recent papers only
	    print("\n1. Fetching recent papers (from 2023 onwards):")
	    recent_papers = client.search_papers("immunotherapy cancer", max_results=15, start_year=2023)
	    print(f" Found {len(recent_papers)} recent papers")

	    # Example 2: all papers (no date filter)
	    print("\n2. Fetching all papers (no date filter):")
	    all_papers = client.search_papers("machine learning diagnosis", max_results=10)
	    print(f" Found {len(all_papers)} total papers")

	    # Example 3: a more specific medical query
	    print("\n3. Testing specific medical terms:")
	    specific_papers = client.search_papers("liquid biopsy early detection", max_results=5, start_year=2020)
	    print(f" Found {len(specific_papers)} papers")

	    # Display the first few results of example 1
	    if recent_papers:
	        print(f"\nSample recent results (showing {min(3, len(recent_papers))} of {len(recent_papers)}):")
	        for i, p in enumerate(recent_papers[:3], 1):
	            print(f"\n{i}. {p['title'][:80]}...")
	            print(f" Journal: {p['journal'][:40] if p['journal'] else 'N/A'}")
	            # A plain '|' separator: '\|' is not a valid escape sequence and
	            # would print a stray backslash.
	            print(f" Date: {p['publication_date']} | Domain: {p['domain']}")

	    print("\nEnhanced Crossref client is WORKING and FLEXIBLE!")