Spaces:
Running
Running
# processing/paper_processor.py
import hashlib
import json
from typing import Dict, List, Optional
class PaperProcessor:
    """Clean, standardise and de-duplicate raw paper records.

    An instance remembers the IDs of every paper it has accepted in
    ``processed_papers``, so the same paper is emitted at most once, whether
    it recurs within one ``process_papers`` call or across several calls.
    """

    # Abstracts shorter than this many characters (after stripping) are
    # rejected as too short to be useful. Override on a subclass or on an
    # instance to tune the threshold.
    MIN_ABSTRACT_LENGTH = 100

    def __init__(self):
        # IDs (MD5 hex digests) of the papers already accepted by this instance.
        self.processed_papers = set()

    def process_papers(self, papers: List[Dict]) -> List[Dict]:
        """Process and clean papers for vector storage.

        Args:
            papers: Raw paper dicts. ``None`` is treated as an empty list.

        Returns:
            The cleaned papers, in input order, excluding papers already
            accepted by this instance and papers rejected by
            ``_clean_paper``.
        """
        processed = []
        for paper in papers or []:
            paper_id = self._generate_paper_id(paper)
            if paper_id in self.processed_papers:
                continue

            cleaned_paper = self._clean_paper(paper)
            if cleaned_paper:
                processed.append(cleaned_paper)
                # Only accepted papers are remembered, so a rejected paper is
                # re-evaluated if it is submitted again.
                self.processed_papers.add(paper_id)
        return processed

    def _generate_paper_id(self, paper: Dict) -> str:
        """Return the MD5 hex digest identifying *paper*.

        The digest is computed over the raw ``title``, ``doi`` and ``id``
        values formatted back to back. The hash input is intentionally left
        exactly as it was so that IDs stay identical to ones generated
        previously.
        """
        content = f"{paper.get('title', '')}{paper.get('doi', '')}{paper.get('id', '')}"
        return hashlib.md5(content.encode()).hexdigest()

    @staticmethod
    def _extract_text(value, fallback_key: str) -> str:
        """Coerce a raw title/abstract value to a stripped string.

        ``None`` becomes an empty string rather than the literal ``"None"``.
        A dict is searched for a truthy ``'text'`` entry, then
        ``fallback_key``, and finally falls back to the dict's own string
        form. Any other value is converted with ``str``.
        """
        if value is None:
            return ''
        if isinstance(value, dict):
            value = value.get('text', '') or value.get(fallback_key, '') or value
        # str() also covers dict entries that are not strings themselves.
        return str(value).strip()

    @staticmethod
    def _normalize_authors(authors) -> List:
        """Return *authors* as a list.

        ``None`` yields an empty list, a list is returned unchanged, a
        tuple or set is converted to a list, and anything else (a single
        name string, a dict, ...) is wrapped as a one-element list holding
        its string form.
        """
        if authors is None:
            return []
        if isinstance(authors, list):
            return authors
        if isinstance(authors, (tuple, set, frozenset)):
            return list(authors)
        return [str(authors)]

    def _clean_paper(self, paper: Dict) -> Optional[Dict]:
        """Clean and standardise a single paper record.

        Args:
            paper: Raw paper dict. ``title`` and ``abstract`` may be plain
                values or dicts; ``authors`` may be a list, a single
                string, a dict or ``None``.

        Returns:
            The standardised paper dict, or ``None`` when the stripped
            abstract is shorter than ``MIN_ABSTRACT_LENGTH`` characters.
        """
        abstract = self._extract_text(paper.get('abstract', ''), 'abstract')
        if len(abstract) < self.MIN_ABSTRACT_LENGTH:
            return None

        title = self._extract_text(paper.get('title', ''), 'title')
        authors = self._normalize_authors(paper.get('authors', []))

        return {
            'id': self._generate_paper_id(paper),
            'title': title,
            'abstract': abstract,
            'authors': authors,
            'source': paper.get('source', ''),
            'domain': paper.get('domain', 'general_medical'),
            'publication_date': paper.get('publication_date', ''),
            'journal': paper.get('journal', ''),
            'doi': paper.get('doi', ''),
            'pdf_link': paper.get('pdf_link', ''),
            'content_for_embedding': f"Title: {title}\nAbstract: {abstract}",
        }