# MedSearchPro / processing / paper_processor.py
# paulhemb's picture
# Initial Backend Deployment
# 1367957
# processing/paper_processor.py
import hashlib
import json
from typing import Dict, List, Optional
class PaperProcessor:
    """Clean, de-duplicate and standardise raw paper records.

    Each accepted paper is turned into a flat dict with a stable ``id`` and a
    ``content_for_embedding`` string built from its title and abstract.
    IDs of accepted papers are remembered on the instance, so the same paper
    is returned at most once across repeated ``process_papers`` calls.
    """

    def __init__(self, min_abstract_length: int = 100):
        """Create a processor.

        Args:
            min_abstract_length: Abstracts shorter than this many characters
                (after stripping whitespace) cause the paper to be rejected.
        """
        # IDs (MD5 hex digests) of every paper accepted so far.
        self.processed_papers = set()
        self.min_abstract_length = min_abstract_length

    def process_papers(self, papers: List[Dict]) -> List[Dict]:
        """Process and clean papers for vector storage.

        Args:
            papers: Raw paper records.

        Returns:
            The cleaned records, in input order, skipping papers that were
            already accepted earlier or whose abstract is unusable.
        """
        processed = []
        for paper in papers:
            paper_id = self._generate_paper_id(paper)
            if paper_id in self.processed_papers:
                continue

            cleaned_paper = self._clean_paper(paper)
            if cleaned_paper:
                processed.append(cleaned_paper)
                # Only accepted papers are recorded as processed.
                self.processed_papers.add(paper_id)
        return processed

    def _generate_paper_id(self, paper: Dict) -> str:
        """Return the MD5 hex digest of the paper's title, DOI and id fields.

        The three raw values are concatenated without a separator; missing
        keys contribute an empty string. The exact formula is kept unchanged
        so that IDs stay identical to ones generated previously.
        """
        content = f"{paper.get('title', '')}{paper.get('doi', '')}{paper.get('id', '')}"
        return hashlib.md5(content.encode()).hexdigest()

    @staticmethod
    def _extract_text(raw, keys) -> str:
        """Coerce a raw field value to a stripped string.

        ``None`` becomes ``''``. For a dict, the first truthy value found
        under ``keys`` is used; if none is present the dict's string form is
        used as a last resort. Anything else is passed through ``str()``.
        """
        if raw is None:
            return ''
        if isinstance(raw, dict):
            for key in keys:
                value = raw.get(key)
                if value:
                    # str() guards against non-string payloads (lists, numbers).
                    return str(value).strip()
        return str(raw).strip()

    @staticmethod
    def _normalize_authors(authors) -> List:
        """Return ``authors`` as a list, whatever shape it arrived in."""
        if authors is None:
            return []
        if isinstance(authors, list):
            return authors
        if isinstance(authors, (tuple, set, frozenset)):
            return list(authors)
        if isinstance(authors, str):
            return [authors]
        # Dicts and any other scalar are kept as a single stringified entry.
        return [str(authors)]

    def _clean_paper(self, paper: Dict) -> Optional[Dict]:
        """Clean and standardize paper data.

        Args:
            paper: Raw paper record.

        Returns:
            The standardised record, or ``None`` when the abstract is empty
            or shorter than ``min_abstract_length``.
        """
        abstract = self._extract_text(paper.get('abstract', ''), ('text', 'abstract'))
        if not abstract or len(abstract) < self.min_abstract_length:
            # Too short, probably not useful.
            return None

        title = self._extract_text(paper.get('title', ''), ('text', 'title'))
        authors = self._normalize_authors(paper.get('authors', []))

        return {
            'id': self._generate_paper_id(paper),
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'source': paper.get('source', ''),
            'domain': paper.get('domain', 'general_medical'),
            'publication_date': paper.get('publication_date', ''),
            'journal': paper.get('journal', ''),
            'doi': paper.get('doi', ''),
            'pdf_link': paper.get('pdf_link', ''),
            'content_for_embedding': f"Title: {title}\nAbstract: {abstract}",
        }