# MedSearchPro / data_sources / core_client.py
# Author: paulhemb — "Initial Backend Deployment" (commit 1367957)
# data_sources/core_client.py
import json
import time
from typing import Dict, List, Optional

import requests
class CoreClient:
    """
    Client for the CORE v3 API (https://api.core.ac.uk/v3/).

    CORE is a massive open-access repository, useful for finding obscure
    papers and preprints, but the API requires a key for decent rate
    limits and returns inconsistently-shaped records, so parsing is
    defensive throughout.
    """

    # Seconds to wait for the CORE API before giving up; without a timeout
    # a stalled connection would hang the whole search pipeline.
    REQUEST_TIMEOUT = 30

    # CORE is strict about page size; never request more than this per call.
    MAX_PAGE_SIZE = 25

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: CORE API key (free from https://core.ac.uk/services/api).
                     When None (the default, matching previous behavior),
                     search_papers() is a no-op that returns [].
        """
        self.base_url = "https://api.core.ac.uk/v3/"
        # CORE requires an API key for decent rate limits.
        self.api_key = api_key

    def search_papers(self, query: str, max_results: int = 25) -> List[Dict]:
        """
        Search the CORE repository for works matching `query`.

        Args:
            query: Free-text search query.
            max_results: Maximum number of papers to return (capped at
                MAX_PAGE_SIZE per request by the API).

        Returns:
            A list of parsed paper dicts (see _parse_result). Returns []
            when no API key is configured, on rate limiting (HTTP 429),
            on any non-200 response, or on a network/parse failure.
            Never raises.
        """
        if not self.api_key:
            print(" ⚠️ CORE: No API key, skipping (get free key from core.ac.uk)")
            return []

        search_url = f"{self.base_url}search/works"
        params = {
            'q': query,
            'limit': min(max_results, self.MAX_PAGE_SIZE),  # They're strict about limits
            'offset': 0
        }
        headers = {
            'Authorization': f'Bearer {self.api_key}'
        }

        try:
            response = requests.get(
                search_url,
                params=params,
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 429:
                print(" ⚠️ CORE: Rate limited, skipping")
                return []
            elif response.status_code != 200:
                print(f" ⚠️ CORE: API error {response.status_code}")
                return []

            data = response.json()
            papers = []
            for result in data.get('results', [])[:max_results]:
                paper_data = self._parse_result(result)
                if paper_data:  # _parse_result returns None for unusable records
                    papers.append(paper_data)

            print(f" ✅ CORE: Found {len(papers)} papers")
            return papers

        except Exception as e:
            # Best-effort boundary: a failed CORE search should not crash
            # the caller's aggregation across other sources.
            print(f" ❌ CORE search error: {e}")
            return []

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """
        Normalize one CORE API record into the shared paper dict shape.

        CORE's format is inconsistent: the abstract, authors, date, and DOI
        can each appear under different keys or types, so every field is
        read defensively.

        Returns:
            A paper dict, or None when the record has no usable title or
            parsing fails entirely.
        """
        try:
            title = result.get('title', 'No title')
            if not title or title == 'No title':
                return None

            # Abstract might be in different fields.
            abstract = result.get('abstract', '')
            if not abstract:
                abstract = result.get('description', '')

            # Authors - could be a list of dicts, a list of strings, or a scalar.
            authors = []
            authors_data = result.get('authors', [])
            if isinstance(authors_data, list):
                for author in authors_data:
                    if isinstance(author, dict):
                        authors.append(author.get('name', ''))
                    else:
                        authors.append(str(author))
            elif authors_data:
                authors = [authors_data]

            # Date - could be in multiple formats / keys.
            published_date = result.get('publishedDate', '')
            if not published_date:
                published_date = result.get('year', '')

            # PDF link - only trust downloadUrl when it looks like a PDF.
            pdf_link = None
            download_url = result.get('downloadUrl', '')
            if download_url and 'pdf' in download_url.lower():
                pdf_link = download_url

            # DOI - sometimes a list; take the first entry.
            doi = result.get('doi', '')
            if doi and isinstance(doi, list):
                doi = doi[0] if doi else ''

            return {
                'source': 'core',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': result.get('publisher', 'CORE Repository'),
                'publication_date': str(published_date),
                'doi': doi,
                'pdf_link': pdf_link,
                'domain': self._infer_domain(title, abstract),
                'data_quality': 'variable'  # Warn about CORE data quality
            }

        except Exception as e:
            print(f" ❌ Error parsing CORE result: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """
        Infer the medical domain from paper content.

        Delegates to PubMedClient._infer_domain so every source classifies
        papers with the same keyword rules. Imported lazily to avoid a
        circular import at module load time.
        """
        from .pubmed_client import PubMedClient
        pubmed_client = PubMedClient()
        return pubmed_client._infer_domain(title, abstract)