# Viewer header (not part of the module) - Spaces: Running | File size: 7,594 Bytes | 1367957
# data_sources/crossref_client.py (FINAL FLEXIBLE VERSION - 2025)
import re
from typing import Dict, List, Optional

import requests
class CrossrefClient:
    """Search client for the Crossref REST API ``/works`` endpoint.

    Each hit is normalised into a plain dictionary (see ``_parse_result``)
    and tagged with a coarse, keyword-based research domain.

    Progress and error messages are reported with ``print``. Request
    failures and malformed items are reported and skipped rather than raised.
    """

    # Largest page requested from the API in a single call.
    MAX_ROWS_PER_REQUEST = 500
    # Offset-based paging is not attempted beyond this many records.
    MAX_OFFSET = 10000
    # Seconds to wait for each HTTP request.
    REQUEST_TIMEOUT = 15
    # Keywords this short are treated as acronyms and matched as whole words,
    # so that e.g. 'ct' does not fire on "effect" or 'rna' on "journal".
    _ACRONYM_MAX_LEN = 3

    _TAG_RE = re.compile(r'<[^>]+>')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Checked in insertion order; the first domain with a matching keyword wins.
    _DOMAIN_KEYWORDS = {
        'medical_imaging': ['imaging', 'mri', 'ct', 'radiology', 'ultrasound', 'segmentation'],
        'deep_learning_medicine': ['deep learning', 'neural network', 'ai', 'machine learning',
                                   'artificial intelligence'],
        'drug_discovery': ['drug discovery', 'virtual screening', 'molecular docking', 'compound'],
        'genomics': ['genomics', 'sequencing', 'dna', 'rna', 'mrna', 'gwas'],
        'diagnostics': ['diagnostic', 'diagnosis', 'clinical decision', 'biomarker'],
        'epidemiology': ['epidemiology', 'outbreak', 'surveillance', 'public health'],
        'public_health': ['public health', 'health policy', 'population health'],
    }

    def __init__(self, email: Optional[str] = None):
        """Create a client.

        Args:
            email: Contact address sent as the ``mailto`` query parameter.
                When omitted, the built-in default address is used.
        """
        self.base_url = "https://api.crossref.org/works"
        # NOTE(review): the fallback is a hard-coded personal address; callers
        # should pass their own contact email.
        self.email = email or "hembrompaul75@gmail.com"

    def search_papers(self, query: str, max_results: int = 1000,
                      start_year: Optional[int] = None) -> List[Dict]:
        """Search Crossref and return up to ``max_results`` parsed papers.

        Requests are issued in pages of at most ``MAX_ROWS_PER_REQUEST`` rows
        and continue (via ``offset``) until enough papers were collected, the
        API runs out of results, or ``MAX_OFFSET`` is reached.

        Args:
            query: Free-text query string.
            max_results: Maximum number of papers to return. Values <= 0
                yield an empty list without contacting the API.
            start_year: When truthy, only works published from 1 January of
                that year onwards are requested.

        Returns:
            A list of dictionaries as produced by ``_parse_result``. If a
            request fails, the papers gathered before the failure are
            returned (an empty list when the first request fails).
        """
        if max_results <= 0:
            return []

        base_params = {'query': query}
        if self.email:
            base_params['mailto'] = self.email
        if start_year:
            base_params['filter'] = f'from-pub-date:{start_year}-01-01'
            print(f" Date filter: from {start_year} onwards")

        print(f"Searching Crossref: '{query}'")

        page_size = min(max_results, self.MAX_ROWS_PER_REQUEST)
        papers: List[Dict] = []
        offset = 0

        while len(papers) < max_results and offset < self.MAX_OFFSET:
            params = dict(base_params, rows=page_size)
            if offset:
                # Only sent for follow-up pages so the first request is unchanged.
                params['offset'] = offset

            try:
                response = requests.get(self.base_url, params=params,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print(f"Crossref request failed: {exc}")
                return papers

            if response.status_code != 200:
                print(f"Crossref API error {response.status_code}: {response.text[:200]}")
                return papers

            try:
                payload = response.json()
            except ValueError as exc:  # body was not valid JSON
                print(f"Crossref request failed: {exc}")
                return papers

            message = payload.get('message') if isinstance(payload, dict) else None
            items = (message or {}).get('items') or []

            for item in items:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)
                    if len(papers) >= max_results:
                        break

            if len(items) < page_size:
                # A short (or empty) page means there is nothing left to fetch.
                break
            offset += page_size

        print(f"Crossref: Found {len(papers)} papers")
        return papers

    @staticmethod
    def _first_value(value):
        """Return the first element of a list, the value itself, or ``''``."""
        if isinstance(value, list):
            return value[0] if value else ''
        return value or ''

    @staticmethod
    def _format_date_parts(parts) -> str:
        """Turn ``[year, month, day]`` into ``YYYY[-MM[-DD]]``.

        Formatting stops at the first missing component so that a gap never
        produces a misleading date such as year-day.
        """
        pieces = []
        for index, part in enumerate((parts or [])[:3]):
            if not part:
                break
            pieces.append(str(part) if index == 0 else str(part).zfill(2))
        return '-'.join(pieces)

    def _extract_publication_date(self, result: Dict) -> str:
        """Return the best available publication date, or ``''``.

        The ``date-parts`` of several publication fields are tried in turn;
        if none yields a date, the date portion of ``created.date-time`` is
        used.
        """
        for key in ('published', 'published-print', 'published-online', 'issued'):
            date_parts = (result.get(key) or {}).get('date-parts') or []
            formatted = self._format_date_parts(date_parts[0] if date_parts else [])
            if formatted:
                return formatted

        created = (result.get('created') or {}).get('date-time') or ''
        return created[:10]

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """Convert one Crossref work item into a flat paper dictionary.

        Args:
            result: A single element of ``message.items`` from the API.

        Returns:
            A dictionary with the keys ``source``, ``title``, ``abstract``,
            ``authors``, ``journal``, ``publication_date``, ``doi``, ``url``,
            ``reference_count`` and ``domain``; or ``None`` when the item has
            no title or is structurally malformed.
        """
        try:
            title = self._first_value(result.get('title'))
            if not title:
                return None

            # Abstracts often carry markup such as <jats:p>...</jats:p>.
            abstract = ''
            raw_abstract = result.get('abstract')
            if isinstance(raw_abstract, str):
                without_tags = self._TAG_RE.sub(' ', raw_abstract)
                abstract = self._WHITESPACE_RE.sub(' ', without_tags).strip()

            authors = []
            for author in result.get('author') or []:
                full_name = f"{author.get('given', '')} {author.get('family', '')}".strip()
                # Organisational authors carry only a 'name' field.
                full_name = full_name or (author.get('name') or '').strip()
                if full_name:
                    authors.append(full_name)

            journal = self._first_value(result.get('container-title'))
            doi = result.get('DOI', '')
            # Fall back to the resolver link when URL is missing or empty.
            url = result.get('URL') or (f"https://doi.org/{doi}" if doi else '')

            return {
                'source': 'crossref',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': self._extract_publication_date(result),
                'doi': doi,
                'url': url,
                'reference_count': result.get('reference-count', 0),
                'domain': self._infer_domain(title, abstract),
            }
        except (AttributeError, TypeError, KeyError, IndexError, ValueError) as exc:
            # Best-effort parsing: one malformed item must not abort the search.
            print(f"Error parsing Crossref item: {exc}")
            return None

    @classmethod
    def _keyword_in_text(cls, keyword: str, text: str) -> bool:
        """Report whether ``keyword`` occurs in the (lower-cased) ``text``.

        Short acronyms must appear as whole words; longer keywords keep plain
        substring matching so that inflected or compound forms still match.
        """
        if len(keyword) <= cls._ACRONYM_MAX_LEN:
            return re.search(r'\b' + re.escape(keyword) + r'\b', text) is not None
        return keyword in text

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Classify a paper into a coarse domain from its title and abstract.

        Returns:
            The first key of ``_DOMAIN_KEYWORDS`` with a matching keyword, or
            ``'general_medical'`` when nothing matches.
        """
        text = f"{title} {abstract}".lower()
        for domain, keywords in self._DOMAIN_KEYWORDS.items():
            if any(self._keyword_in_text(keyword, text) for keyword in keywords):
                return domain
        return 'general_medical'
# ====================== QUICK TEST ======================
def main() -> None:
    """Run a few live example searches and print a short summary.

    This performs real HTTP requests against the Crossref API.
    """
    client = CrossrefClient(email="hembrompaul75@gmail.com")  # Use your real email
    print("Testing Enhanced Crossref Client\n" + "=" * 60)

    # Example 1: recent papers only
    print("\n1. Fetching recent papers (from 2023 onwards):")
    recent_papers = client.search_papers("immunotherapy cancer", max_results=15, start_year=2023)
    print(f" Found {len(recent_papers)} recent papers")

    # Example 2: no date filter
    print("\n2. Fetching all papers (no date filter):")
    all_papers = client.search_papers("machine learning diagnosis", max_results=10)
    print(f" Found {len(all_papers)} total papers")

    # Example 3: a more specific medical query
    print("\n3. Testing specific medical terms:")
    specific_papers = client.search_papers("liquid biopsy early detection", max_results=5, start_year=2020)
    print(f" Found {len(specific_papers)} papers")

    # Show the first few results of the first search.
    if recent_papers:
        print(f"\nSample recent results (showing {min(3, len(recent_papers))} of {len(recent_papers)}):")
        for i, p in enumerate(recent_papers[:3], 1):
            print(f"\n{i}. {p['title'][:80]}...")
            print(f" Journal: {p['journal'][:40] if p['journal'] else 'N/A'}")
            print(f" Date: {p['publication_date']} | Domain: {p['domain']}")

    # Only claim success when at least one search actually returned data.
    if recent_papers or all_papers or specific_papers:
        print("\nEnhanced Crossref client is WORKING and FLEXIBLE!")
    else:
        print("\nNo results were returned - check network access and the messages above.")


if __name__ == "__main__":
    main()