# data_sources/core_client.py
import requests
import time
from typing import List, Dict, Optional
import json


class CoreClient:
    """
    CORE API client - Massive repository but challenging API.

    Good for finding obscure papers and preprints. Requires a free API key
    (https://core.ac.uk/services/api) for usable rate limits; without one,
    `search_papers` short-circuits and returns an empty list.
    """

    def __init__(self):
        self.base_url = "https://api.core.ac.uk/v3/"
        # CORE requires API key for decent rate limits
        self.api_key = None  # Would need to get from https://core.ac.uk/services/api

    def search_papers(self, query: str, max_results: int = 25) -> List[Dict]:
        """
        Search the CORE repository - LIMITED due to API constraints.

        Args:
            query: Free-text search string passed to the `q` parameter.
            max_results: Upper bound on returned papers (capped at 25; the
                CORE API is strict about per-request limits).

        Returns:
            A list of normalized paper dicts (see `_parse_result`), or an
            empty list on missing API key, rate limiting, HTTP errors, or
            any request/parsing failure.
        """
        if not self.api_key:
            print(" ⚠️ CORE: No API key, skipping (get free key from core.ac.uk)")
            return []

        search_url = f"{self.base_url}search/works"
        params = {
            'q': query,
            'limit': min(max_results, 25),  # They're strict about limits
            'offset': 0
        }
        headers = {
            'Authorization': f'Bearer {self.api_key}'
        }

        try:
            # timeout prevents an indefinite hang on a stalled connection
            response = requests.get(search_url, params=params, headers=headers,
                                    timeout=30)

            if response.status_code == 429:
                print(" ⚠️ CORE: Rate limited, skipping")
                return []
            elif response.status_code != 200:
                print(f" ⚠️ CORE: API error {response.status_code}")
                return []

            data = response.json()
            papers = []
            for result in data.get('results', [])[:max_results]:
                paper_data = self._parse_result(result)
                if paper_data:
                    papers.append(paper_data)

            print(f" ✅ CORE: Found {len(papers)} papers")
            return papers

        except Exception as e:
            # Best-effort source: any failure degrades to "no results"
            # rather than aborting the caller's aggregation.
            print(f" ❌ CORE search error: {e}")
            return []

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """
        Parse a single CORE API result - dealing with their inconsistent format.

        Returns a normalized paper dict, or None when the record has no
        usable title or parsing fails. (The original annotation claimed
        `Dict` but None was always a possible return.)
        """
        try:
            title = result.get('title', 'No title')
            if not title or title == 'No title':
                return None

            # Abstract might be in different fields
            abstract = result.get('abstract', '')
            if not abstract:
                abstract = result.get('description', '')

            # Authors - could be string or list; entries may be dicts or
            # plain strings. Empty names are dropped.
            authors = []
            authors_data = result.get('authors', [])
            if isinstance(authors_data, list):
                for author in authors_data:
                    if isinstance(author, dict):
                        name = author.get('name', '')
                    else:
                        name = str(author)
                    if name:
                        authors.append(name)
            elif authors_data:
                authors = [authors_data]

            # Date - could be in multiple formats
            published_date = result.get('publishedDate', '')
            if not published_date:
                published_date = result.get('year', '')

            # PDF link - only keep download URLs that look like PDFs
            pdf_link = None
            download_url = result.get('downloadUrl', '')
            if download_url and 'pdf' in download_url.lower():
                pdf_link = download_url

            # DOI - sometimes arrives as a list; take the first entry
            doi = result.get('doi', '')
            if doi and isinstance(doi, list):
                doi = doi[0] if doi else ''

            return {
                'source': 'core',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': result.get('publisher', 'CORE Repository'),
                'publication_date': str(published_date),
                'doi': doi,
                'pdf_link': pdf_link,
                'domain': self._infer_domain(title, abstract),
                'data_quality': 'variable'  # Warn about CORE data quality
            }

        except Exception as e:
            print(f" ❌ Error parsing CORE result: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """
        Infer domain from paper content.

        Delegates to PubMedClient's classifier so all sources share one
        domain taxonomy. NOTE(review): this constructs a fresh PubMedClient
        per call — cheap if its __init__ does no I/O; verify and cache the
        instance otherwise.
        """
        from .pubmed_client import PubMedClient
        pubmed_client = PubMedClient()
        return pubmed_client._infer_domain(title, abstract)