# MedSearchPro / data_sources/crossref_client.py
# Author: paulhemb
# Commit: Initial Backend Deployment (1367957)
# data_sources/crossref_client.py (FINAL FLEXIBLE VERSION - 2025)
import html
import re
from typing import Dict, List, Optional

import requests
class CrossrefClient:
    """Search the Crossref REST API ``/works`` endpoint and normalise results.

    Every request carries a ``mailto`` parameter with the configured e-mail
    address. Each returned work is flattened into a plain dict (title,
    abstract, authors, journal, publication date, DOI, URL, reference count)
    and tagged with a coarse keyword-based research domain.
    """

    # Contact address used when the caller does not supply one.
    DEFAULT_EMAIL = "hembrompaul75@gmail.com"

    # Rows requested per HTTP call; larger requests are split into pages.
    MAX_ROWS_PER_REQUEST = 500

    # Offset paging is stopped here; Crossref's documentation points to
    # cursors for deeper result sets, which this client does not implement.
    MAX_OFFSET = 10000

    # Date fields tried in order before falling back to the 'created' stamp.
    _DATE_FIELDS = ('published', 'published-print', 'published-online', 'issued')

    # Ordered (domain, keywords) pairs: the first domain with a hit wins.
    _DOMAIN_KEYWORDS = (
        ('medical_imaging', ('imaging', 'mri', 'ct', 'radiology', 'ultrasound',
                             'segmentation')),
        ('deep_learning_medicine', ('deep learning', 'neural network', 'ai',
                                    'machine learning',
                                    'artificial intelligence')),
        ('drug_discovery', ('drug discovery', 'virtual screening',
                            'molecular docking', 'compound')),
        ('genomics', ('genomics', 'sequencing', 'dna', 'rna', 'gwas')),
        ('diagnostics', ('diagnostic', 'diagnosis', 'clinical decision',
                         'biomarker')),
        # 'public health' is deliberately NOT listed here: it used to shadow
        # the dedicated public_health domain below, making it unreachable
        # for that keyword.
        ('epidemiology', ('epidemiology', 'outbreak', 'surveillance')),
        ('public_health', ('public health', 'health policy',
                           'population health')),
    )

    # Two-letter acronyms must stand alone as a word, otherwise 'ct' fires on
    # "effect"/"direct" and 'ai' on "brain"/"maintain".
    _WHOLE_WORD_KEYWORDS = frozenset({'ct', 'ai'})

    # These acronyms must END a word (optionally pluralised), so "mRNA",
    # "cDNA" and "fMRI" still match but "journal"/"international" do not.
    _WORD_SUFFIX_KEYWORDS = frozenset({'mri', 'dna', 'rna', 'gwas'})

    def __init__(self, email: Optional[str] = None):
        """Create a client.

        Args:
            email: Contact address sent as ``mailto`` with every request.
                Falls back to ``DEFAULT_EMAIL`` when omitted or empty.
        """
        self.base_url = "https://api.crossref.org/works"
        self.email = email or self.DEFAULT_EMAIL

    def search_papers(self, query: str, max_results: int = 1000,
                      start_year: Optional[int] = None) -> List[Dict]:
        """Search Crossref and return up to ``max_results`` parsed papers.

        Results are fetched in pages of at most ``MAX_ROWS_PER_REQUEST`` rows
        using the ``offset`` parameter, so requests for more than one page
        are honoured instead of being silently truncated.

        Args:
            query: Free-text query string.
            max_results: Upper bound on the number of papers returned.
                Non-positive values return an empty list without any request.
            start_year: When truthy, only works published on or after
                1 January of this year are requested.

        Returns:
            A list of dicts as produced by ``_parse_result``. On an HTTP or
            decoding error the papers collected so far are returned (an empty
            list if the first request fails); no exception is raised.
        """
        if max_results <= 0:
            return []

        params = {
            'query': query,
            'mailto': self.email,
        }
        if start_year:
            params['filter'] = f'from-pub-date:{start_year}-01-01'
            print(f" Date filter: from {start_year} onwards")

        print(f"Searching Crossref: '{query}'")
        papers: List[Dict] = []
        offset = 0
        while len(papers) < max_results and offset < self.MAX_OFFSET:
            rows = min(max_results - len(papers),
                       self.MAX_ROWS_PER_REQUEST,
                       self.MAX_OFFSET - offset)
            params['rows'] = rows
            if offset:
                # The first request stays identical to a plain single query.
                params['offset'] = offset

            try:
                response = requests.get(self.base_url, params=params,
                                        timeout=15)
                if response.status_code != 200:
                    print(f"Crossref API error {response.status_code}: "
                          f"{response.text[:200]}")
                    return papers
                items = response.json().get('message', {}).get('items', [])
            except (requests.RequestException, ValueError,
                    AttributeError) as e:
                # Network failure, invalid JSON, or a non-object JSON payload.
                print(f"Crossref request failed: {e}")
                return papers

            if not items:
                break

            for item in items:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)
                    if len(papers) >= max_results:
                        break

            offset += len(items)
            if len(items) < rows:
                # A short page means the result set is exhausted.
                break

        print(f"Crossref: Found {len(papers)} papers")
        return papers

    @staticmethod
    def _first_text(value) -> str:
        """Return ``value`` (or its first element if a list) when it is a string."""
        if isinstance(value, list):
            value = value[0] if value else ''
        return value if isinstance(value, str) else ''

    @staticmethod
    def _format_date_parts(date_field) -> str:
        """Render a ``{'date-parts': [[y, m, d]]}`` field as a zero-padded date.

        Partial dates are supported ("2023", "2023-05"). Returns '' when the
        field is missing, malformed, or has no usable year.
        """
        if not isinstance(date_field, dict):
            return ''
        date_parts = date_field.get('date-parts') or []
        first = date_parts[0] if date_parts else []
        if not isinstance(first, (list, tuple)):
            return ''

        pieces = []
        for index, part in enumerate(first[:3]):
            if not part:
                # A null/zero component ends the usable prefix of the date.
                break
            try:
                number = int(part)
            except (TypeError, ValueError):
                break
            pieces.append(str(number) if index == 0 else f"{number:02d}")
        return '-'.join(pieces)

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """Convert one Crossref work item into the client's paper dict.

        Args:
            result: A single element of ``message.items`` from the API.

        Returns:
            A dict with keys ``source``, ``title``, ``abstract``, ``authors``,
            ``journal``, ``publication_date``, ``doi``, ``url``,
            ``reference_count`` and ``domain``; or ``None`` when the item has
            no title or cannot be parsed.
        """
        try:
            # Titles arrive as a list; collapse stray newlines/indentation.
            title = ' '.join(self._first_text(result.get('title')).split())
            if not title:
                return None

            # Abstracts are often JATS XML such as <jats:p>...</jats:p>.
            abstract = ''
            raw_abstract = result.get('abstract')
            if isinstance(raw_abstract, str):
                # Strip tags BEFORE unescaping so an escaped "&lt;" in the
                # text is never mistaken for the start of a tag.
                abstract = re.sub(r'<[^>]+>', ' ', raw_abstract)
                abstract = html.unescape(abstract)
                abstract = re.sub(r'\s+', ' ', abstract).strip()

            authors = []
            for entry in result.get('author') or []:
                if not isinstance(entry, dict):
                    continue
                given = entry.get('given') or ''
                family = entry.get('family') or ''
                name = f"{given} {family}".strip()
                if not name:
                    # Organisational authors only carry a 'name' key.
                    name = (entry.get('name') or '').strip()
                if name:
                    authors.append(name)

            journal = self._first_text(result.get('container-title'))

            # First usable publication date wins; 'created' is the last resort.
            pub_date = ''
            for field in self._DATE_FIELDS:
                pub_date = self._format_date_parts(result.get(field))
                if pub_date:
                    break
            if not pub_date:
                created = result.get('created')
                stamp = created.get('date-time') if isinstance(created, dict) else ''
                if isinstance(stamp, str):
                    pub_date = stamp[:10]

            doi = result.get('DOI') or ''
            # Fall back to the DOI resolver when URL is missing OR empty.
            url = result.get('URL') or (f"https://doi.org/{doi}" if doi else '')

            return {
                'source': 'crossref',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'url': url,
                'reference_count': result.get('reference-count', 0),
                'domain': self._infer_domain(title, abstract),
            }
        except (AttributeError, IndexError, KeyError, TypeError,
                ValueError) as e:
            # Best effort: one malformed item must not abort the whole search.
            print(f"Error parsing Crossref item: {e}")
            return None

    @classmethod
    def _keyword_matches(cls, keyword: str, text: str) -> bool:
        """Return True when ``keyword`` occurs in the lower-cased ``text``.

        Short acronyms are matched on word boundaries (see the two keyword
        sets on the class); everything else is a plain substring test.
        """
        if keyword in cls._WHOLE_WORD_KEYWORDS:
            return re.search(r'\b' + re.escape(keyword) + r'\b', text) is not None
        if keyword in cls._WORD_SUFFIX_KEYWORDS:
            return re.search(re.escape(keyword) + r's?\b', text) is not None
        return keyword in text

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Classify a paper into a coarse domain from its title and abstract.

        Domains are checked in the order of ``_DOMAIN_KEYWORDS``; the first
        one with a matching keyword is returned, otherwise
        ``'general_medical'``.
        """
        text = f"{title} {abstract}".lower()
        for domain, keywords in self._DOMAIN_KEYWORDS:
            if any(self._keyword_matches(k, text) for k in keywords):
                return domain
        return 'general_medical'
# ====================== QUICK TEST ======================
if __name__ == "__main__":
    # Manual smoke test: runs three live searches against Crossref and prints
    # a short preview of the first result set. Requires network access.
    demo_client = CrossrefClient(email="hembrompaul75@gmail.com")  # Use your real email
    banner_rule = "=" * 60
    print("Testing Enhanced Crossref Client\n" + banner_rule)

    # Search 1: restricted to works published from 2023 onwards
    print("\n1. Fetching recent papers (from 2023 onwards):")
    recent = demo_client.search_papers(
        "immunotherapy cancer",
        max_results=15,
        start_year=2023,
    )
    print(f" Found {len(recent)} recent papers")

    # Search 2: no date restriction at all
    print("\n2. Fetching all papers (no date filter):")
    unfiltered = demo_client.search_papers("machine learning diagnosis", max_results=10)
    print(f" Found {len(unfiltered)} total papers")

    # Search 3: a narrower medical query; only the client's own output is shown
    print("\n3. Testing specific medical terms:")
    specific = demo_client.search_papers(
        "liquid biopsy early detection",
        max_results=5,
        start_year=2020,
    )

    # Preview at most three of the recent results
    if recent:
        preview = recent[:3]
        print(f"\nSample recent results (showing {len(preview)} of {len(recent)}):")
        for position, paper in enumerate(preview, 1):
            if paper['journal']:
                journal_label = paper['journal'][:40]
            else:
                journal_label = 'N/A'
            print(f"\n{position}. {paper['title'][:80]}...")
            print(f" Journal: {journal_label}")
            print(f" Date: {paper['publication_date']} | Domain: {paper['domain']}")

    print("\nEnhanced Crossref client is WORKING and FLEXIBLE!")