# data_sources/crossref_client.py (FINAL FLEXIBLE VERSION - 2025)
"""Crossref works-search client using the polite pool, with flexible filters."""

import re
from typing import Dict, List, Optional


class CrossrefClient:
    """
    Fully working and flexible Crossref client (2025).

    - Sends a contact email ('mailto') for polite API access
    - Strips JATS/HTML markup from abstracts
    - Falls back from the item URL to a DOI link
    - Infers a coarse medical domain from title + abstract keywords
    - Configurable date filtering via the ``start_year`` parameter
    - At most 500 results per API request (Crossref page cap used here)
    """

    def __init__(self, email: Optional[str] = None):
        # Endpoint for the Crossref "works" search API.
        self.base_url = "https://api.crossref.org/works"
        # Contact address required for Crossref's "polite" pool.
        # Provide your own during initialization or set it later.
        self.email = email or "hembrompaul75@gmail.com"

    def search_papers(self, query: str, max_results: int = 1000,
                      start_year: Optional[int] = None) -> List[Dict]:
        """
        Search Crossref with flexible date filtering.

        Args:
            query: Free-text search string.
            max_results: Upper bound on returned papers. Only one request
                is issued, so at most 500 items come back from the API.
            start_year: If given, restrict to works published on or after
                January 1 of that year.

        Returns:
            List of parsed paper dicts; empty list on any request error.
        """
        # Imported lazily so the pure parsing/inference helpers stay
        # usable even where 'requests' is not installed.
        import requests

        params = {
            'query': query,
            'rows': min(max_results, 500),  # Max 500 per API request
            'mailto': self.email  # REQUIRED for polite API access
        }

        # Add the date filter ONLY if start_year is provided.
        if start_year:
            params['filter'] = f'from-pub-date:{start_year}-01-01'
            print(f" Date filter: from {start_year} onwards")

        try:
            print(f"Searching Crossref: '{query}'")
            response = requests.get(self.base_url, params=params, timeout=15)
            if response.status_code != 200:
                print(f"Crossref API error {response.status_code}: {response.text[:200]}")
                return []

            items = response.json().get('message', {}).get('items', [])

            papers: List[Dict] = []
            for item in items:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)
                # Stop once the caller's requested max_results is reached.
                if len(papers) >= max_results:
                    break

            print(f"Crossref: Found {len(papers)} papers")
            return papers

        # FIX: narrowed from a blanket Exception — network failures and
        # malformed JSON (ValueError) are the realistic failure modes here.
        except (requests.RequestException, ValueError) as e:
            print(f"Crossref request failed: {e}")
            return []

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """
        Parse one raw Crossref item into a flat paper dict.

        Returns:
            The parsed dict, or None when the item has no usable title
            or parsing raises.
        """
        try:
            # Title: Crossref usually delivers it as a one-element list.
            title = 'No title'
            if result.get('title'):
                title = result['title'][0] if isinstance(result['title'], list) else result['title']
            if not title or title == 'No title':
                return None

            # Abstract: often contains JATS/HTML tags — strip them all,
            # then collapse the leftover whitespace.
            abstract = ''
            raw = result.get('abstract')
            if isinstance(raw, str):
                abstract = re.sub(r'<[^>]+>', ' ', raw)
                abstract = re.sub(r'\s+', ' ', abstract).strip()

            # Authors: join given + family names, skipping empty entries.
            authors = []
            for auth in result.get('author', []):
                name = f"{auth.get('given', '')} {auth.get('family', '')}".strip()
                if name:
                    authors.append(name)

            # Journal: 'container-title' is also typically a list.
            journal = ''
            if result.get('container-title'):
                journal = (result['container-title'][0]
                           if isinstance(result['container-title'], list)
                           else result['container-title'])

            # Publication date: prefer 'published' date-parts, then fall
            # back to the 'created' timestamp (first 10 chars = date).
            pub_date = ''
            if result.get('published'):
                parts = result['published'].get('date-parts', [[]])[0]
                if parts:
                    pub_date = '-'.join(str(p) for p in parts[:3] if p)
            elif result.get('created'):
                dt = result['created'].get('date-time', '')
                if dt:
                    pub_date = dt[:10]

            # DOI & URL: synthesize a doi.org link when no URL is given.
            doi = result.get('DOI', '')
            url = result.get('URL', f"https://doi.org/{doi}" if doi else '')

            return {
                'source': 'crossref',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'url': url,
                'reference_count': result.get('reference-count', 0),
                'domain': self._infer_domain(title, abstract)
            }
        except Exception as e:
            # Best-effort parsing: a malformed item is reported, not fatal.
            print(f"Error parsing Crossref item: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """
        Standalone domain inference — no external import needed.

        Keywords are matched with a leading word boundary, so short
        keywords no longer fire on mid-word substrings (e.g. 'ct' used
        to match 'effect'/'detection'), while prefix matches such as
        'diagnostic' -> 'diagnostics' still work. First matching domain
        (dict insertion order) wins.
        """
        text = f"{title} {abstract}".lower()
        domain_map = {
            'medical_imaging': ['imaging', 'mri', 'ct', 'radiology', 'ultrasound', 'segmentation'],
            'deep_learning_medicine': ['deep learning', 'neural network', 'ai ', 'machine learning', 'artificial intelligence'],
            'drug_discovery': ['drug discovery', 'virtual screening', 'molecular docking', 'compound'],
            'genomics': ['genomics', 'sequencing', 'dna', 'rna', 'gwas'],
            'diagnostics': ['diagnostic', 'diagnosis', 'clinical decision', 'biomarker'],
            'epidemiology': ['epidemiology', 'outbreak', 'surveillance', 'public health'],
            'public_health': ['public health', 'health policy', 'population health']
        }
        for domain, keywords in domain_map.items():
            for kw in keywords:
                # FIX: anchor at a word boundary instead of a raw
                # substring test, which mis-classified most papers
                # containing e.g. 'effect' (substring 'ct').
                if re.search(r'\b' + re.escape(kw.strip()), text):
                    return domain
        return 'general_medical'


# ====================== QUICK TEST ======================
if __name__ == "__main__":
    # Test the enhanced client
    client = CrossrefClient(email="hembrompaul75@gmail.com")  # Use your real email
    print("Testing Enhanced Crossref Client\n" + "=" * 60)

    # Example 1: Recent papers only
    print("\n1. Fetching recent papers (from 2023 onwards):")
    recent_papers = client.search_papers("immunotherapy cancer", max_results=15, start_year=2023)
    print(f" Found {len(recent_papers)} recent papers")

    # Example 2: All papers (no date filter)
    print("\n2. Fetching all papers (no date filter):")
    all_papers = client.search_papers("machine learning diagnosis", max_results=10)
    print(f" Found {len(all_papers)} total papers")

    # Example 3: Using the medical terms we discussed
    print("\n3. Testing specific medical terms:")
    specific_papers = client.search_papers("liquid biopsy early detection", max_results=5, start_year=2020)

    # Display first few results
    if recent_papers:
        print(f"\nSample recent results (showing {min(3, len(recent_papers))} of {len(recent_papers)}):")
        for i, p in enumerate(recent_papers[:3], 1):
            print(f"\n{i}. {p['title'][:80]}...")
            print(f" Journal: {p['journal'][:40] if p['journal'] else 'N/A'}")
            print(f" Date: {p['publication_date']} | Domain: {p['domain']}")

    print(f"\nEnhanced Crossref client is WORKING and FLEXIBLE!")