# File size: 4,400 Bytes
# 1367957
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# data_sources/core_client.py
import json
import time
from typing import Dict, List, Optional

import requests


class CoreClient:
    """
    Client for the CORE (core.ac.uk) v3 search API.

    Search results are normalised into plain dictionaries. Every call is
    best-effort: problems are reported with ``print`` and yield an empty
    result (or a skipped paper) instead of raising to the caller.
    """

    # Largest page size this client asks CORE for in a single request.
    MAX_PAGE_SIZE = 25
    # Seconds to wait for CORE before giving up on a request.
    DEFAULT_TIMEOUT = 30
    # Domain label used when the shared classifier cannot be imported or run.
    # NOTE(review): confirm downstream consumers accept this value.
    FALLBACK_DOMAIN = 'unknown'

    def __init__(self, api_key: Optional[str] = None,
                 timeout: float = DEFAULT_TIMEOUT):
        """
        Args:
            api_key: CORE API key (obtainable from
                https://core.ac.uk/services/api). Without one,
                ``search_papers`` skips the request and returns ``[]``.
            timeout: Seconds to wait for the HTTP response.
        """
        self.base_url = "https://api.core.ac.uk/v3/"
        self.api_key = api_key
        self.timeout = timeout

    def search_papers(self, query: str, max_results: int = 25) -> List[Dict]:
        """
        Search CORE works matching ``query``.

        Args:
            query: Search expression sent as the ``q`` parameter.
            max_results: Upper bound on returned papers; the request itself
                is capped at ``MAX_PAGE_SIZE``.

        Returns:
            A list of paper dictionaries as built by ``_parse_result``.
            Empty when no API key is set, ``max_results`` is not positive,
            the API responds with a non-200 status, or any error occurs.
        """
        if not self.api_key:
            print("   ⚠️  CORE: No API key, skipping (get free key from core.ac.uk)")
            return []

        # A non-positive bound would produce a bad ``limit`` and a negative
        # slice below; there is nothing to fetch, so skip the request.
        if max_results <= 0:
            return []

        search_url = f"{self.base_url}search/works"
        params = {
            'q': query,
            'limit': min(max_results, self.MAX_PAGE_SIZE),
            'offset': 0,
        }
        headers = {'Authorization': f'Bearer {self.api_key}'}

        # Deliberately broad catch: this client is a best-effort data source
        # and must not raise to its caller.
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                search_url,
                params=params,
                headers=headers,
                timeout=self.timeout,
            )

            if response.status_code == 429:
                print("   ⚠️  CORE: Rate limited, skipping")
                return []
            if response.status_code != 200:
                print(f"   ⚠️  CORE: API error {response.status_code}")
                return []

            data = response.json()
            # Tolerate a non-object payload or a null ``results`` field.
            results = data.get('results') if isinstance(data, dict) else None
            parsed = (
                self._parse_result(result)
                for result in (results or [])[:max_results]
            )
            papers = [paper for paper in parsed if paper]

            print(f"   ✅ CORE: Found {len(papers)} papers")
            return papers

        except Exception as e:
            print(f"   ❌ CORE search error: {e}")
            return []

    @staticmethod
    def _extract_authors(authors_data) -> List[str]:
        """Return non-empty author names from a list, dict, or single value."""
        if isinstance(authors_data, dict):
            authors_data = [authors_data]
        if isinstance(authors_data, list):
            names = (
                entry.get('name') if isinstance(entry, dict) else entry
                for entry in authors_data
            )
            # Drop null/empty names instead of emitting '' or 'None'.
            return [str(name) for name in names if name]
        return [str(authors_data)] if authors_data else []

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """
        Convert one raw CORE result into the normalised paper dictionary.

        JSON ``null`` values are treated like missing fields, so they never
        surface as ``None`` or the string ``'None'`` in text fields.

        Returns:
            The paper dictionary, or ``None`` when the result has no usable
            title or cannot be parsed.
        """
        try:
            title = (result.get('title') or '').strip()
            if not title or title == 'No title':
                return None

            # Abstract may be supplied under either field.
            abstract = result.get('abstract') or result.get('description') or ''

            # Prefer the full date, then fall back to a bare year.
            # NOTE(review): 'yearPublished' is assumed to be the CORE v3
            # year field - confirm against the API docs. 'year' is kept so
            # existing payloads parse as before.
            published_date = (
                result.get('publishedDate')
                or result.get('yearPublished')
                or result.get('year')
                or ''
            )

            # Only keep the download URL when it looks like a PDF.
            download_url = result.get('downloadUrl') or ''
            pdf_link = download_url if 'pdf' in download_url.lower() else None

            # DOI may arrive as a list; use its first entry.
            doi = result.get('doi') or ''
            if isinstance(doi, list):
                doi = doi[0] if doi else ''

            return {
                'source': 'core',
                'title': title,
                'abstract': abstract,
                'authors': self._extract_authors(result.get('authors')),
                'journal': result.get('publisher') or 'CORE Repository',
                'publication_date': str(published_date),
                'doi': doi,
                'pdf_link': pdf_link,
                'domain': self._infer_domain(title, abstract),
                'data_quality': 'variable'  # Warn about CORE data quality
            }

        except Exception as e:
            print(f"   ❌ Error parsing CORE result: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """
        Infer the paper's domain by delegating to ``PubMedClient``.

        Returns ``FALLBACK_DOMAIN`` when the classifier cannot be imported
        or raises, so a classification problem does not discard the paper.
        """
        try:
            # Function-scope import keeps this module importable even when
            # the sibling pubmed_client module is not.
            from .pubmed_client import PubMedClient
            return PubMedClient()._infer_domain(title, abstract)
        except Exception as e:
            print(f"   ⚠️  CORE: Domain inference unavailable: {e}")
            return self.FALLBACK_DOMAIN