Spaces:

paulhemb
/

MedSearchPro

Running

File size: 2,954 Bytes
# processing/paper_processor.py
import hashlib
import json
from typing import Dict, List, Optional


class PaperProcessor:
    """Clean, de-duplicate and normalize paper records for vector storage.

    The instance remembers the IDs of every paper it has accepted in
    ``processed_papers``, so duplicates are dropped both within one
    ``process_papers`` call and across successive calls.
    """

    # Abstracts shorter than this many characters are considered too short
    # to be useful and the paper is discarded.
    MIN_ABSTRACT_LENGTH = 100

    def __init__(self):
        # IDs (MD5 hex digests) of the papers accepted so far.
        self.processed_papers = set()

    def process_papers(self, papers: List[Dict]) -> List[Dict]:
        """Process and clean papers for vector storage.

        Args:
            papers: Raw paper records. ``None`` is treated as an empty list.

        Returns:
            The cleaned records, in input order, for papers that were not
            seen before and that have a usable abstract.
        """
        processed = []

        for paper in papers or []:
            paper_id = self._generate_paper_id(paper)
            if paper_id in self.processed_papers:
                continue

            cleaned_paper = self._clean_paper(paper)
            if cleaned_paper:
                processed.append(cleaned_paper)
                # Only accepted papers are remembered, so a rejected record
                # can be retried later with better data.
                self.processed_papers.add(paper_id)

        return processed

    def _generate_paper_id(self, paper: Dict) -> str:
        """Generate a stable ID (MD5 hex digest) for a paper.

        The ID is derived from the title, DOI and source id. When a paper
        has none of those, the abstract is hashed instead; otherwise every
        such paper would share one ID and all but the first would be
        dropped as duplicates.
        """
        if any(paper.get(key) for key in ('title', 'doi', 'id')):
            content = f"{paper.get('title', '')}{paper.get('doi', '')}{paper.get('id', '')}"
        else:
            content = str(paper.get('abstract', ''))
        # MD5 is used only as a fingerprint here, not for security.
        return hashlib.md5(content.encode()).hexdigest()

    @staticmethod
    def _coerce_text(value, *keys: str) -> str:
        """Return ``value`` as stripped text.

        ``None`` becomes ``''``. For a dict, the first truthy value found
        under ``keys`` is used, falling back to the dict's string form.
        Every result goes through ``str`` so non-string payloads cannot
        break the ``strip`` call.
        """
        if value is None:
            return ''
        if isinstance(value, dict):
            for key in keys:
                candidate = value.get(key)
                if candidate:
                    return str(candidate).strip()
        return str(value).strip()

    def _clean_paper(self, paper: Dict) -> Optional[Dict]:
        """Clean and standardize paper data.

        Args:
            paper: Raw paper record. ``abstract`` and ``title`` may be plain
                values or dicts carrying the text under ``text`` (or under
                ``abstract`` / ``title`` respectively).

        Returns:
            The normalized record, or ``None`` when the abstract is missing
            or shorter than ``MIN_ABSTRACT_LENGTH`` characters.
        """
        abstract = self._coerce_text(paper.get('abstract', ''), 'text', 'abstract')
        if len(abstract) < self.MIN_ABSTRACT_LENGTH:
            return None

        title = self._coerce_text(paper.get('title', ''), 'text', 'title')

        # Ensure authors is a list
        authors = paper.get('authors', [])
        if authors is None:
            authors = []
        elif isinstance(authors, str):
            authors = [authors]
        elif isinstance(authors, dict):
            authors = [str(authors)]
        elif isinstance(authors, (tuple, set, frozenset)):
            authors = list(authors)

        return {
            'id': self._generate_paper_id(paper),
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'source': paper.get('source', ''),
            'domain': paper.get('domain', 'general_medical'),
            'publication_date': paper.get('publication_date', ''),
            'journal': paper.get('journal', ''),
            'doi': paper.get('doi', ''),
            'pdf_link': paper.get('pdf_link', ''),
            'content_for_embedding': f"Title: {title}\nAbstract: {abstract}"
        }