# processing/paper_processor.py
import hashlib
import json
from typing import Dict, List, Optional


class PaperProcessor:
    """Clean, de-duplicate and standardize paper records for vector storage.

    The instance remembers the IDs of every paper it has accepted, so a
    paper seen in an earlier ``process_papers`` call is skipped in later
    calls on the same instance.
    """

    # Abstracts shorter than this many characters (after stripping) are
    # considered too short to be useful and the paper is dropped.
    MIN_ABSTRACT_LENGTH = 100

    def __init__(self):
        # IDs (MD5 hex digests) of papers already accepted by this instance.
        self.processed_papers = set()

    def process_papers(self, papers: List[Dict]) -> List[Dict]:
        """Process and clean papers for vector storage.

        Args:
            papers: Raw paper dictionaries. ``None`` or an empty list yields
                an empty result.

        Returns:
            The cleaned papers, in input order, excluding duplicates (by
            generated ID) and papers rejected by ``_clean_paper``.
        """
        processed = []
        for paper in papers or []:
            paper_id = self._generate_paper_id(paper)
            if paper_id in self.processed_papers:
                continue

            cleaned_paper = self._clean_paper(paper)
            if cleaned_paper:
                processed.append(cleaned_paper)
                # Only accepted papers are remembered; a rejected paper may
                # be retried later (e.g. with a fuller abstract).
                self.processed_papers.add(paper_id)
        return processed

    def _generate_paper_id(self, paper: Dict) -> str:
        """Generate a deterministic ID for a paper.

        The ID is the MD5 hex digest of the paper's raw ``title``, ``doi``
        and ``id`` values concatenated. MD5 is used only as a fingerprint
        here, not for security. The format is kept unchanged so that IDs
        stay stable across versions.
        """
        content = f"{paper.get('title', '')}{paper.get('doi', '')}{paper.get('id', '')}"
        return hashlib.md5(content.encode()).hexdigest()

    @staticmethod
    def _extract_text(value, keys) -> str:
        """Return stripped text from a value that may be a string or a dict.

        Args:
            value: The raw field value. ``None`` is treated as empty text
                rather than the literal string ``'None'``.
            keys: Dict keys to try, in order, when ``value`` is a dict; the
                first truthy entry wins. If none is truthy, the string form
                of the whole dict is used.

        Returns:
            The extracted text with surrounding whitespace removed.
        """
        if value is None:
            return ''
        if isinstance(value, dict):
            for key in keys:
                candidate = value.get(key)
                if candidate:
                    # Coerce: nested values are not guaranteed to be strings.
                    return str(candidate).strip()
        return str(value).strip()

    def _clean_paper(self, paper: Dict) -> Optional[Dict]:
        """Clean and standardize paper data.

        Args:
            paper: Raw paper dictionary. ``abstract`` and ``title`` may each
                be a string or a dict wrapping the text.

        Returns:
            A standardized dictionary, or ``None`` when the abstract is
            missing or shorter than ``MIN_ABSTRACT_LENGTH`` characters.
        """
        abstract = self._extract_text(paper.get('abstract', ''), ('text', 'abstract'))
        if len(abstract) < self.MIN_ABSTRACT_LENGTH:
            # Too short, probably not useful
            return None

        title = self._extract_text(paper.get('title', ''), ('text', 'title'))

        # Ensure authors is a list
        authors = paper.get('authors', [])
        if authors is None:
            authors = []
        elif isinstance(authors, str):
            authors = [authors]
        elif isinstance(authors, dict):
            authors = [str(authors)]

        return {
            'id': self._generate_paper_id(paper),
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'source': paper.get('source', ''),
            'domain': paper.get('domain', 'general_medical'),
            'publication_date': paper.get('publication_date', ''),
            'journal': paper.get('journal', ''),
            'doi': paper.get('doi', ''),
            'pdf_link': paper.get('pdf_link', ''),
            'content_for_embedding': f"Title: {title}\nAbstract: {abstract}",
        }