# Hugging Face Space: paulhemb/MedSearchPro (status: Running)
# File size: 7,594 Bytes
# data_sources/crossref_client.py (FINAL FLEXIBLE VERSION - 2025)
import html
import re
from typing import Dict, List, Optional

import requests


class CrossrefClient:
    """Small client for the Crossref REST API ``/works`` search endpoint.

    Each hit is normalised into a flat dict with the keys ``source``,
    ``title``, ``abstract``, ``authors``, ``journal``, ``publication_date``,
    ``doi``, ``url``, ``reference_count`` and ``domain``.

    Network and parsing problems are reported with ``print`` and never raised
    to the caller: searching is best-effort and returns whatever could be
    retrieved (possibly an empty list).
    """

    # Largest page this client requests in one call.
    MAX_ROWS_PER_REQUEST = 500
    # Offset-based paging on /works is limited by Crossref; going deeper
    # would require cursor-based paging, which this client does not implement.
    MAX_OFFSET = 10000
    # Contact address sent as ``mailto`` when the caller supplies none.
    DEFAULT_EMAIL = "hembrompaul75@gmail.com"

    _TAG_RE = re.compile(r'<[^>]+>')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Date fields holding ``date-parts``, tried in order of preference.
    _DATE_FIELDS = ('published', 'published-print', 'published-online', 'issued')

    # Ordered (domain, pattern) pairs; the first match wins. Patterns run on
    # lower-cased text. Long keywords are plain substrings, but short acronyms
    # carry word boundaries: a bare substring test for "ct" or "rna" fires on
    # ordinary words such as "detection" or "journal".
    _DOMAIN_PATTERNS = (
        ('medical_imaging', re.compile(
            r'imaging|mris?\b|\bct\b|radiology|ultrasound|segmentation')),
        ('deep_learning_medicine', re.compile(
            r'deep learning|neural network|\bai\b|machine learning|artificial intelligence')),
        ('drug_discovery', re.compile(
            r'drug discovery|virtual screening|molecular docking|compound')),
        # Trailing boundary only, so "mRNA", "miRNA" and "ctDNA" still match.
        ('genomics', re.compile(
            r'genomics|sequencing|(?:dna|rna)s?\b|gwas')),
        ('diagnostics', re.compile(
            r'diagnostic|diagnosis|clinical decision|biomarker')),
        # "public health" is intentionally not listed here: it belongs to the
        # public_health domain below and would otherwise shadow it.
        ('epidemiology', re.compile(
            r'epidemiology|outbreak|surveillance')),
        ('public_health', re.compile(
            r'public health|health policy|population health')),
    )

    def __init__(self, email=None, timeout=15):
        """Create a client.

        Args:
            email: Contact address sent as the ``mailto`` query parameter.
                Falls back to ``DEFAULT_EMAIL`` when omitted or empty.
            timeout: Per-request timeout in seconds passed to ``requests``.
        """
        self.base_url = "https://api.crossref.org/works"
        self.email = email or self.DEFAULT_EMAIL
        self.timeout = timeout

    def search_papers(self, query: str, max_results: int = 1000,
                      start_year: Optional[int] = None) -> List[Dict]:
        """Search Crossref works and return parsed papers.

        Results are fetched in pages of at most ``MAX_ROWS_PER_REQUEST`` items
        until ``max_results`` raw items have been requested, the API runs out
        of results, or ``MAX_OFFSET`` is reached. Items without a title are
        skipped and not back-filled, so fewer than ``max_results`` papers may
        be returned.

        Args:
            query: Free-text query string.
            max_results: Upper bound on the number of papers returned.
            start_year: When truthy, only works published from 1 January of
                this year onwards are requested.

        Returns:
            A list of paper dicts (see the class docstring). Empty when
            ``max_results`` is not positive or the first request fails; if a
            later page fails, the papers collected so far are returned.
        """
        if not max_results or max_results <= 0:
            return []

        params = {
            'query': query,
            'mailto': self.email,  # identifies the caller to Crossref
        }

        # Add date filter ONLY if start_year is provided
        if start_year:
            params['filter'] = f'from-pub-date:{start_year}-01-01'
            print(f"  Date filter: from {start_year} onwards")

        print(f"Searching Crossref: '{query}'")

        papers: List[Dict] = []
        limit = min(max_results, self.MAX_OFFSET)
        fetched = 0  # raw items requested so far; doubles as the next offset
        while fetched < limit:
            rows = min(limit - fetched, self.MAX_ROWS_PER_REQUEST)
            page_params = dict(params, rows=rows)
            if fetched:
                page_params['offset'] = fetched

            items = self._fetch_page(page_params)
            if items is None:
                if not papers:
                    return []
                break  # keep what earlier pages delivered

            for item in items[:rows]:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)

            if len(items) < rows:
                break  # short page: no further results available
            fetched += rows

        print(f"Crossref: Found {len(papers)} papers")
        return papers

    def _fetch_page(self, params: Dict) -> Optional[List[Dict]]:
        """Request one page of results.

        Returns:
            The raw item list (possibly empty), or ``None`` when the request
            failed or returned a non-200 status; the failure is printed.
        """
        # Deliberately broad: searching is best-effort, so any transport or
        # decoding error is reported and turned into "no page".
        try:
            response = requests.get(self.base_url, params=params, timeout=self.timeout)

            if response.status_code != 200:
                print(f"Crossref API error {response.status_code}: {response.text[:200]}")
                return None

            message = response.json().get('message') or {}
            return message.get('items') or []

        except Exception as e:
            print(f"Crossref request failed: {e}")
            return None

    @staticmethod
    def _first(value):
        """Return the first element of a list (``None`` if empty), else ``value``."""
        if isinstance(value, list):
            return value[0] if value else None
        return value

    @staticmethod
    def _format_date_parts(date_field) -> str:
        """Format a ``{'date-parts': [[y, m, d]]}`` field as a date string.

        Month and day are zero-padded so the result has the same shape as the
        ``created`` timestamp fallback (``YYYY``, ``YYYY-MM`` or
        ``YYYY-MM-DD``). Returns ``''`` when the field is missing, empty or
        malformed.
        """
        if not isinstance(date_field, dict):
            return ''
        date_parts = date_field.get('date-parts') or []
        first = date_parts[0] if date_parts else None
        if not isinstance(first, (list, tuple)):
            return ''

        components = []
        for value in first[:3]:
            if not value:  # stop at the first missing component
                break
            components.append(value)
        if not components:
            return ''

        year, *rest = components
        try:
            return '-'.join([str(year)] + [f"{int(v):02d}" for v in rest])
        except (TypeError, ValueError):
            return ''

    def _parse_result(self, result: Dict) -> Optional[Dict]:
        """Convert one raw Crossref item into a flat paper dict.

        Args:
            result: One element of the ``items`` list of a Crossref response.

        Returns:
            The normalised paper dict, or ``None`` when the item has no title
            or cannot be parsed (the error is printed).
        """
        try:
            # Title arrives as a list; an absent or empty title disqualifies the item.
            title = self._first(result.get('title'))
            if not title:
                return None

            # Abstract: often contains markup like <jats:p>...</jats:p>
            abstract = ''
            raw_abstract = result.get('abstract')
            if isinstance(raw_abstract, str):
                abstract = self._TAG_RE.sub(' ', raw_abstract)
                # Decode entities only after tag removal so an escaped "&lt;"
                # in the text is not mistaken for markup.
                abstract = html.unescape(abstract)
                abstract = self._WHITESPACE_RE.sub(' ', abstract).strip()

            # Authors: personal names, or the bare 'name' entry when an author
            # has no given/family parts.
            authors = []
            for auth in result.get('author') or []:
                if not isinstance(auth, dict):
                    continue
                name = f"{auth.get('given') or ''} {auth.get('family') or ''}".strip()
                name = name or (auth.get('name') or '').strip()
                if name:
                    authors.append(name)

            journal = self._first(result.get('container-title')) or ''

            # Publication date: first usable date-parts field, then the
            # date portion of the 'created' timestamp.
            pub_date = ''
            for field in self._DATE_FIELDS:
                pub_date = self._format_date_parts(result.get(field))
                if pub_date:
                    break
            if not pub_date:
                created = result.get('created')
                stamp = created.get('date-time', '') if isinstance(created, dict) else ''
                pub_date = stamp[:10] if isinstance(stamp, str) else ''

            # DOI & URL (fall back to the doi.org resolver link)
            doi = result.get('DOI') or ''
            url = result.get('URL') or (f"https://doi.org/{doi}" if doi else '')

            return {
                'source': 'crossref',
                'title': title,
                'abstract': abstract,
                'authors': authors,
                'journal': journal,
                'publication_date': pub_date,
                'doi': doi,
                'url': url,
                'reference_count': result.get('reference-count', 0),
                'domain': self._infer_domain(title, abstract)
            }

        except Exception as e:
            # Last-resort guard: one malformed item must not abort the search.
            print(f"Error parsing Crossref item: {e}")
            return None

    def _infer_domain(self, title: str, abstract: str) -> str:
        """Classify a paper into a coarse domain from its title and abstract.

        The domains in ``_DOMAIN_PATTERNS`` are tried in order and the first
        whose pattern occurs in the lower-cased text wins.

        Returns:
            The matching domain name, or ``'general_medical'`` if none match.
        """
        text = f"{title} {abstract}".lower()

        for domain, pattern in self._DOMAIN_PATTERNS:
            if pattern.search(text):
                return domain

        return 'general_medical'


# ====================== QUICK TEST ======================
def _run_quick_test():
    """Run three live Crossref queries as a manual smoke test.

    Needs network access. Prints the number of hits for each query, a short
    sample of the first result set, and a final status line that reflects
    whether any query actually returned papers.
    """
    client = CrossrefClient(email="hembrompaul75@gmail.com")  # Use your real email

    print("Testing Enhanced Crossref Client\n" + "=" * 60)

    # Example 1: Recent papers only
    print("\n1. Fetching recent papers (from 2023 onwards):")
    recent_papers = client.search_papers("immunotherapy cancer", max_results=15, start_year=2023)
    print(f"   Found {len(recent_papers)} recent papers")

    # Example 2: All papers (no date filter)
    print("\n2. Fetching all papers (no date filter):")
    all_papers = client.search_papers("machine learning diagnosis", max_results=10)
    print(f"   Found {len(all_papers)} total papers")

    # Example 3: A narrower medical query with a date filter
    print("\n3. Testing specific medical terms:")
    specific_papers = client.search_papers("liquid biopsy early detection", max_results=5, start_year=2020)
    print(f"   Found {len(specific_papers)} specific papers")

    # Display first few results
    if recent_papers:
        print(f"\nSample recent results (showing {min(3, len(recent_papers))} of {len(recent_papers)}):")
        for i, p in enumerate(recent_papers[:3], 1):
            print(f"\n{i}. {p['title'][:80]}...")
            print(f"    Journal: {p['journal'][:40] if p['journal'] else 'N/A'}")
            print(f"    Date: {p['publication_date']} | Domain: {p['domain']}")

    # Only claim success when at least one query produced results; the client
    # swallows request errors, so an empty outcome usually means a failure.
    if recent_papers or all_papers or specific_papers:
        print("\nEnhanced Crossref client is WORKING and FLEXIBLE!")
    else:
        print("\nNo results returned - check network access and the Crossref API status.")


if __name__ == "__main__":
    _run_quick_test()