# MedSearchPro / data_sources / core_client.py
# Author: paulhemb — "Initial Backend Deployment" (commit 1367957)
# data_sources/core_client.py
import json
import time
from typing import Dict, List, Optional

import requests
class CoreClient:
    """
    Client for the CORE v3 API (https://api.core.ac.uk/v3/).

    CORE is a massive open-access repository, useful for finding obscure
    papers and preprints, but the API requires a key for decent rate
    limits and returns inconsistently-shaped records, so parsing is
    defensive throughout.
    """

    # Seconds to wait for the CORE API before giving up; without a timeout
    # a stalled connection would hang the whole search pipeline.
    REQUEST_TIMEOUT = 30

    # CORE is strict about page size; never request more than this per call.
    MAX_PAGE_SIZE = 25

    def __init__(self, api_key: Optional[str] = None):
        """
        Args:
            api_key: CORE API key (free from https://core.ac.uk/services/api).
                     When None (the default, matching previous behavior),
                     search_papers() is a no-op that returns [].
        """
        self.base_url = "https://api.core.ac.uk/v3/"
        # CORE requires an API key for decent rate limits.
        self.api_key = api_key

    def search_papers(self, query: str, max_results: int = 25) -> List[Dict]:
        """
        Search the CORE repository for works matching `query`.

        Args:
            query: Free-text search query.
            max_results: Maximum number of papers to return (capped at
                MAX_PAGE_SIZE per request by the API).

        Returns:
            A list of parsed paper dicts (see _parse_result). Returns []
            when no API key is configured, on rate limiting (HTTP 429),
            on any non-200 response, or on a network/parse failure.
            Never raises.
        """
        if not self.api_key:
            print(" ⚠️ CORE: No API key, skipping (get free key from core.ac.uk)")
            return []

        search_url = f"{self.base_url}search/works"
        params = {
            'q': query,
            'limit': min(max_results, self.MAX_PAGE_SIZE),  # They're strict about limits
            'offset': 0
        }
        headers = {
            'Authorization': f'Bearer {self.api_key}'
        }

        try:
            response = requests.get(
                search_url,
                params=params,
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code == 429:
                print(" ⚠️ CORE: Rate limited, skipping")
                return []
            elif response.status_code != 200:
                print(f" ⚠️ CORE: API error {response.status_code}")
                return []

            data = response.json()
            papers = []
            for result in data.get('results', [])[:max_results]:
                paper_data = self._parse_result(result)
                if paper_data:  # _parse_result returns None for unusable records
                    papers.append(paper_data)

            print(f" ✅ CORE: Found {len(papers)} papers")
            return papers

        except Exception as e:
            # Best-effort boundary: a failed CORE search should not crash
            # the caller's aggregation across other sources.
            print(f" ❌ CORE search error: {e}")
            return []

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """
        Normalize one CORE API record into the shared paper dict shape.

        CORE's format is inconsistent: the abstract, authors, date, and DOI
        can each appear under different keys or types, so every field is
        read defensively.

        Returns:
            A paper dict, or None when the record has no usable title or
            parsing fails entirely.
        """
        try:
            title = result.get('title', 'No title')
            if not title or title == 'No title':
                return None

            # Abstract might be in different fields.
            abstract = result.get('abstract', '')
            if not abstract:
                abstract = result.get('description', '')

            # Authors - could be a list of dicts, a list of strings, or a scalar.
            authors = []
            authors_data = result.get('authors', [])
            if isinstance(authors_data, list):
                for author in authors_data:
                    if isinstance(author, dict):
                        authors.append(author.get('name', ''))
                    else:
                        authors.append(str(author))
            elif authors_data:
                authors = [authors_data]

            # Date - could be in multiple formats / keys.
            published_date = result.get('publishedDate', '')
            if not published_date:
                published_date = result.get('year', '')

            # PDF link - only trust downloadUrl when it looks like a PDF.
            pdf_link = None
            download_url = result.get('downloadUrl', '')
            if download_url and 'pdf' in download_url.lower():
                pdf_link = download_url

            # DOI - sometimes a list; take the first entry.
            doi = result.get('doi', '')
            if doi and isinstance(doi, list):
                doi = doi[0] if doi else ''

            return {
                'source': 'core',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': result.get('publisher', 'CORE Repository'),
                'publication_date': str(published_date),
                'doi': doi,
                'pdf_link': pdf_link,
                'domain': self._infer_domain(title, abstract),
                'data_quality': 'variable'  # Warn about CORE data quality
            }

        except Exception as e:
            print(f" ❌ Error parsing CORE result: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """
        Infer the medical domain from paper content.

        Delegates to PubMedClient._infer_domain so every source classifies
        papers with the same keyword rules. Imported lazily to avoid a
        circular import at module load time.
        """
        from .pubmed_client import PubMedClient
        pubmed_client = PubMedClient()
        return pubmed_client._infer_domain(title, abstract)