Spaces:
Sleeping
Sleeping
| # data_sources/crossref_client.py (FINAL FLEXIBLE VERSION - 2025) | |
import re
from typing import Dict, List, Optional

import requests
class CrossrefClient:
    """Small client for the Crossref REST API ``/works`` search endpoint.

    Each hit is normalised into a plain dict with the keys ``source``,
    ``title``, ``abstract``, ``authors``, ``journal``, ``publication_date``,
    ``doi``, ``url``, ``reference_count`` and ``domain``.

    Network and parsing problems are reported with ``print`` and never
    raised: a failed search yields whatever was collected so far (possibly
    an empty list) and a malformed item is skipped.
    """

    # Contact address sent as ``mailto`` when the caller supplies none.
    # Callers should pass their own address to ``__init__``.
    DEFAULT_EMAIL = "hembrompaul75@gmail.com"

    # Page size used for a single HTTP request.
    MAX_ROWS_PER_REQUEST = 500

    # Offset-based paging is only used up to this many records; per the
    # Crossref REST API documentation deeper paging needs cursors instead.
    MAX_OFFSET = 10000

    # Seconds to wait for each HTTP request.
    REQUEST_TIMEOUT = 15

    # Keywords no longer than this are acronyms ('ct', 'ai', 'mri', 'dna',
    # 'rna', 'gwas') and must match a whole word; plain substring matching
    # would find 'ct' in "effects" and 'rna' in "journal".
    _ACRONYM_MAX_LEN = 4

    # Checked in order; the first domain with a matching keyword wins.
    _DOMAIN_KEYWORDS = {
        'medical_imaging': ('imaging', 'mri', 'ct', 'radiology', 'ultrasound', 'segmentation'),
        'deep_learning_medicine': ('deep learning', 'neural network', 'ai', 'machine learning',
                                   'artificial intelligence'),
        'drug_discovery': ('drug discovery', 'virtual screening', 'molecular docking', 'compound'),
        'genomics': ('genomics', 'sequencing', 'dna', 'rna', 'gwas'),
        'diagnostics': ('diagnostic', 'diagnosis', 'clinical decision', 'biomarker'),
        # 'public health' is intentionally NOT listed here: it belongs to the
        # 'public_health' domain below and would otherwise shadow it.
        'epidemiology': ('epidemiology', 'outbreak', 'surveillance'),
        'public_health': ('public health', 'health policy', 'population health'),
    }

    _TAG_RE = re.compile(r'<[^>]+>')
    _WHITESPACE_RE = re.compile(r'\s+')
    _WORD_RE = re.compile(r'[a-z0-9]+')

    def __init__(self, email=None):
        """Create a client.

        Args:
            email: Contact address sent to Crossref as the ``mailto`` query
                parameter. Falls back to ``DEFAULT_EMAIL`` when omitted or
                empty.
        """
        self.base_url = "https://api.crossref.org/works"
        self.email = email or self.DEFAULT_EMAIL

    def search_papers(self, query: str, max_results: int = 1000,
                      start_year: Optional[int] = None) -> List[Dict]:
        """Search Crossref and return up to ``max_results`` parsed papers.

        Results are fetched in pages of at most ``MAX_ROWS_PER_REQUEST``
        items, advancing with the ``offset`` parameter until enough papers
        were parsed, Crossref runs out of results, or ``MAX_OFFSET`` is hit.

        Args:
            query: Free-text query sent as the ``query`` parameter.
            max_results: Upper bound on the number of papers returned.
                Values <= 0 return an empty list without any request.
            start_year: When truthy, restrict results with the filter
                ``from-pub-date:<start_year>-01-01``.

        Returns:
            A list of paper dicts (see the class docstring). On a request
            failure the papers collected before the failure are returned.
        """
        if max_results <= 0:
            return []

        base_params = {
            'query': query,
            'mailto': self.email,  # identifies the caller to Crossref
        }
        if start_year:
            base_params['filter'] = f'from-pub-date:{start_year}-01-01'
            print(f" Date filter: from {start_year} onwards")

        print(f"Searching Crossref: '{query}'")

        papers: List[Dict] = []
        offset = 0
        while len(papers) < max_results and offset < self.MAX_OFFSET:
            rows = min(max_results - len(papers),
                       self.MAX_ROWS_PER_REQUEST,
                       self.MAX_OFFSET - offset)
            page_params = dict(base_params, rows=rows)
            if offset:
                # Only sent from the second page on, so the first request
                # carries exactly the parameters a single-page search needs.
                page_params['offset'] = offset

            items = self._fetch_page(page_params)
            if items is None:
                # Request failed (already reported); keep earlier pages.
                return papers

            for item in items:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)

            offset += len(items)
            if len(items) < rows:
                # A short page means Crossref has no further results.
                break

        print(f"Crossref: Found {len(papers)} papers")
        return papers

    def _fetch_page(self, params: Dict) -> Optional[List[Dict]]:
        """Run one GET request and return its raw ``items`` list.

        Returns ``None`` (after printing the reason) when the request fails,
        the status is not 200, or the body is not JSON. A well-formed answer
        without items yields an empty list.
        """
        try:
            response = requests.get(self.base_url, params=params,
                                    timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                print(f"Crossref API error {response.status_code}: {response.text[:200]}")
                return None
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors raised by response.json().
            print(f"Crossref request failed: {e}")
            return None

        # Validate the shape instead of trusting it, so an unexpected
        # payload degrades to "no items" rather than raising.
        message = data.get('message') if isinstance(data, dict) else None
        items = message.get('items') if isinstance(message, dict) else None
        return items if isinstance(items, list) else []

    @staticmethod
    def _first_or_self(value):
        """Return the first element of a list, the value itself, or ''."""
        if not value:
            return ''
        return value[0] if isinstance(value, list) else value

    @staticmethod
    def _format_date_parts(parts) -> str:
        """Format Crossref ``[year, month, day]`` parts as an ISO-style date.

        Month and day are zero-padded; formatting stops at the first missing
        part, so ``[2023, 5]`` gives ``'2023-05'`` and ``[None]`` gives ``''``.
        """
        numbers = []
        for part in (parts or [])[:3]:
            if not part:
                break
            numbers.append(int(part))
        if not numbers:
            return ''
        year, rest = numbers[0], numbers[1:]
        return '-'.join([f"{year:04d}"] + [f"{n:02d}" for n in rest])

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """Convert one raw Crossref item into a paper dict.

        Returns ``None`` when the item has no usable title or is malformed
        (the parsing error is printed).
        """
        try:
            # Title may be a list of strings or a bare string.
            title = self._first_or_self(result.get('title'))
            if not title or title == 'No title':
                return None

            # Abstracts often carry markup such as <jats:p>...</jats:p>.
            abstract = ''
            raw_abstract = result.get('abstract')
            if isinstance(raw_abstract, str):
                abstract = self._TAG_RE.sub(' ', raw_abstract)
                abstract = self._WHITESPACE_RE.sub(' ', abstract).strip()

            authors = []
            for auth in result.get('author') or []:
                given = auth.get('given') or ''
                family = auth.get('family') or ''
                name = f"{given} {family}".strip()
                if not name:
                    # Entries without given/family (e.g. organisations) may
                    # carry a single 'name' field instead.
                    name = (auth.get('name') or '').strip()
                if name:
                    authors.append(name)

            journal = self._first_or_self(result.get('container-title'))

            # Prefer 'published'; fall back to 'created' whenever that
            # yields nothing, including when 'published' has empty parts.
            pub_date = ''
            published = result.get('published')
            if published:
                date_parts = published.get('date-parts') or [[]]
                pub_date = self._format_date_parts(date_parts[0])
            if not pub_date and result.get('created'):
                pub_date = (result['created'].get('date-time') or '')[:10]

            doi = result.get('DOI', '')
            url = result.get('URL', f"https://doi.org/{doi}" if doi else '')

            return {
                'source': 'crossref',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'url': url,
                'reference_count': result.get('reference-count', 0),
                'domain': self._infer_domain(title, abstract),
            }
        except (AttributeError, TypeError, KeyError, IndexError, ValueError) as e:
            print(f"Error parsing Crossref item: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Guess a coarse domain label from the title and abstract.

        Short acronym keywords must appear as whole words; longer keywords
        match as case-insensitive substrings (so 'diagnostic' also matches
        "diagnostics"). Returns ``'general_medical'`` when nothing matches.
        """
        text = f"{title} {abstract}".lower()
        words = set(self._WORD_RE.findall(text))

        for domain, keywords in self._DOMAIN_KEYWORDS.items():
            for keyword in keywords:
                if len(keyword) <= self._ACRONYM_MAX_LEN:
                    if keyword in words:
                        return domain
                elif keyword in text:
                    return domain
        return 'general_medical'
# ====================== QUICK TEST ======================
if __name__ == "__main__":
    # Manual smoke test: runs three searches through CrossrefClient and
    # prints a summary of what came back.
    client = CrossrefClient(email="hembrompaul75@gmail.com")  # Use your real email
    print("Testing Enhanced Crossref Client\n" + "=" * 60)

    # Example 1: Recent papers only
    print("\n1. Fetching recent papers (from 2023 onwards):")
    recent_papers = client.search_papers("immunotherapy cancer", max_results=15, start_year=2023)
    print(f" Found {len(recent_papers)} recent papers")

    # Example 2: All papers (no date filter)
    print("\n2. Fetching all papers (no date filter):")
    all_papers = client.search_papers("machine learning diagnosis", max_results=10)
    print(f" Found {len(all_papers)} total papers")

    # Example 3: A more specific medical query with a date filter
    print("\n3. Testing specific medical terms:")
    specific_papers = client.search_papers("liquid biopsy early detection", max_results=5, start_year=2020)
    # Report this search like the other two instead of discarding its result.
    print(f" Found {len(specific_papers)} specific papers")

    # Display first few results
    if recent_papers:
        print(f"\nSample recent results (showing {min(3, len(recent_papers))} of {len(recent_papers)}):")
        for i, p in enumerate(recent_papers[:3], 1):
            print(f"\n{i}. {p['title'][:80]}...")
            print(f" Journal: {p['journal'][:40] if p['journal'] else 'N/A'}")
            print(f" Date: {p['publication_date']} | Domain: {p['domain']}")

    # Only claim success when at least one search actually returned data;
    # search_papers reports failures by returning an empty list.
    if recent_papers or all_papers or specific_papers:
        print("\nEnhanced Crossref client is WORKING and FLEXIBLE!")
    else:
        print("\nNo results returned - check network access and the Crossref API status.")