Spaces:
Running
Running
# data_sources/core_client.py
import json
import time
from typing import Dict, List, Optional

import requests
class CoreClient:
    """
    Client for the CORE (core.ac.uk) v3 API — a massive repository of
    open-access papers and preprints, useful for finding obscure works.

    A free API key (https://core.ac.uk/services/api) is required for
    decent rate limits; without one every search is skipped gracefully
    and returns no results, so CORE can be treated as best-effort.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: Bearer token for the CORE API. Defaults to None,
                in which case `search_papers` skips the request entirely.
        """
        self.base_url = "https://api.core.ac.uk/v3/"
        # CORE requires an API key for decent rate limits.
        self.api_key = api_key

    def search_papers(self, query: str, max_results: int = 25) -> List[Dict]:
        """
        Search the CORE repository — LIMITED due to API constraints.

        Args:
            query: Free-text search query.
            max_results: Upper bound on returned papers (capped at 25,
                since CORE is strict about page sizes).

        Returns:
            A list of normalized paper dicts (see `_parse_result`).
            Returns an empty list when no API key is set, on rate
            limiting, on any non-200 response, or on any exception —
            callers can treat this source as best-effort.
        """
        if not self.api_key:
            print(" ⚠️ CORE: No API key, skipping (get free key from core.ac.uk)")
            return []

        search_url = f"{self.base_url}search/works"
        params = {
            'q': query,
            'limit': min(max_results, 25),  # They're strict about limits
            'offset': 0
        }
        headers = {
            'Authorization': f'Bearer {self.api_key}'
        }

        try:
            # Timeout so a hung connection cannot stall the caller forever.
            response = requests.get(
                search_url, params=params, headers=headers, timeout=30
            )
            if response.status_code == 429:
                print(" ⚠️ CORE: Rate limited, skipping")
                return []
            elif response.status_code != 200:
                print(f" ⚠️ CORE: API error {response.status_code}")
                return []

            data = response.json()
            papers = []
            for result in data.get('results', [])[:max_results]:
                paper_data = self._parse_result(result)
                if paper_data:
                    papers.append(paper_data)

            print(f" ✅ CORE: Found {len(papers)} papers")
            return papers
        except Exception as e:
            # Best-effort source: log and degrade to an empty result set.
            print(f" ❌ CORE search error: {e}")
            return []

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """
        Parse a raw CORE API result into a normalized paper dict.

        CORE's result format is inconsistent (fields appear under
        different names, and may be strings or lists), so each field
        is probed defensively.

        Returns:
            A normalized dict, or None when the result has no usable
            title or parsing fails for any reason.
        """
        try:
            title = result.get('title', 'No title')
            if not title or title == 'No title':
                return None

            # Abstract might be in different fields.
            abstract = result.get('abstract', '')
            if not abstract:
                abstract = result.get('description', '')

            # Authors — could be a string or a list of dicts/strings.
            authors = []
            authors_data = result.get('authors', [])
            if isinstance(authors_data, list):
                for author in authors_data:
                    if isinstance(author, dict):
                        authors.append(author.get('name', ''))
                    else:
                        authors.append(str(author))
            elif authors_data:
                authors = [authors_data]

            # Date — could be in multiple formats; fall back to year.
            published_date = result.get('publishedDate', '')
            if not published_date:
                published_date = result.get('year', '')

            # PDF link — only keep the download URL if it looks like a PDF.
            pdf_link = None
            download_url = result.get('downloadUrl', '')
            if download_url and 'pdf' in download_url.lower():
                pdf_link = download_url

            # DOI — sometimes arrives as a list; take the first entry.
            doi = result.get('doi', '')
            if doi and isinstance(doi, list):
                doi = doi[0] if doi else ''

            return {
                'source': 'core',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': result.get('publisher', 'CORE Repository'),
                'publication_date': str(published_date),
                'doi': doi,
                'pdf_link': pdf_link,
                'domain': self._infer_domain(title, abstract),
                'data_quality': 'variable'  # Warn about CORE data quality
            }
        except Exception as e:
            print(f" ❌ Error parsing CORE result: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """
        Infer the research domain from paper content.

        Delegates to PubMedClient's classifier so all sources share one
        domain taxonomy. Imported lazily to avoid a circular import at
        module load time.
        """
        from .pubmed_client import PubMedClient
        pubmed_client = PubMedClient()
        return pubmed_client._infer_domain(title, abstract)