Spaces:

paulhemb
/

MedSearchPro

Running

App Files Files Community

MedSearchPro / processing /paper_processor.py

paulhemb

Initial Backend Deployment

1367957 18 days ago

raw

history blame contribute delete

2.95 kB

	# processing/paper_processor.py
	import hashlib
	import json
	from typing import Dict, List, Optional


	class PaperProcessor:
	    """Clean, de-duplicate and standardize paper records for vector storage.

	    Each paper is identified by an MD5 digest of its title, DOI and source
	    id. IDs of accepted papers are remembered on the instance, so the same
	    paper is never emitted twice across calls to ``process_papers``.
	    """

	    # Abstracts shorter than this many characters (after stripping) are
	    # considered too short to be useful and the paper is rejected.
	    MIN_ABSTRACT_LENGTH = 100

	    def __init__(self):
	        # IDs of every paper that process_papers has accepted so far.
	        self.processed_papers = set()

	    def process_papers(self, papers: List[Dict]) -> List[Dict]:
	        """Process and clean papers for vector storage.

	        Args:
	            papers: Raw paper records. ``None`` or an empty list yields
	                an empty result.

	        Returns:
	            Cleaned records, in input order, for papers that were not
	            accepted before and that passed cleaning.
	        """
	        processed = []

	        for paper in papers or []:
	            paper_id = self._generate_paper_id(paper)

	            # Skip anything accepted earlier in this call or a previous one.
	            if paper_id in self.processed_papers:
	                continue

	            cleaned_paper = self._clean_paper(paper)
	            if cleaned_paper:
	                processed.append(cleaned_paper)
	                # Only accepted papers are remembered; a rejected paper is
	                # re-evaluated if it shows up again.
	                self.processed_papers.add(paper_id)

	        return processed

	    def _generate_paper_id(self, paper: Dict) -> str:
	        """Return the hex MD5 digest of the paper's title, DOI and id.

	        The three raw values are concatenated without a separator and
	        missing keys count as empty strings. The digest serves only as a
	        de-duplication fingerprint, not as a security measure. The format
	        is intentionally left untouched so previously issued IDs stay valid.
	        """
	        content = f"{paper.get('title', '')}{paper.get('doi', '')}{paper.get('id', '')}"
	        return hashlib.md5(content.encode()).hexdigest()

	    @staticmethod
	    def _extract_text(raw, *dict_keys: str) -> str:
	        """Coerce a field that may be a str, dict or ``None`` to stripped text.

	        For a dict, the first truthy value found under ``dict_keys`` is
	        used; if none is present the dict's string form is the fallback.
	        ``None`` becomes an empty string rather than the text ``"None"``.
	        """
	        if raw is None:
	            return ''
	        if isinstance(raw, dict):
	            for key in dict_keys:
	                candidate = raw.get(key)
	                if candidate:
	                    # str() guards against non-string payloads in the dict.
	                    return str(candidate).strip()
	        return str(raw).strip()

	    @staticmethod
	    def _normalize_authors(authors) -> List:
	        """Return ``authors`` as a list, whatever shape it arrived in."""
	        if authors is None:
	            return []
	        if isinstance(authors, str):
	            return [authors]
	        if isinstance(authors, (list, tuple)):
	            return list(authors)
	        # Dicts and any other single value are kept as one stringified entry.
	        return [str(authors)]

	    def _clean_paper(self, paper: Dict) -> Optional[Dict]:
	        """Clean and standardize paper data.

	        Args:
	            paper: Raw paper record. ``abstract`` and ``title`` may be
	                strings or dicts holding the text under ``'text'`` (or
	                under ``'abstract'`` / ``'title'`` respectively).

	        Returns:
	            The standardized record, or ``None`` when the stripped abstract
	            is shorter than ``MIN_ABSTRACT_LENGTH`` characters.
	        """
	        abstract = self._extract_text(paper.get('abstract'), 'text', 'abstract')
	        if len(abstract) < self.MIN_ABSTRACT_LENGTH:
	            return None

	        title = self._extract_text(paper.get('title'), 'text', 'title')
	        authors = self._normalize_authors(paper.get('authors'))

	        return {
	            'id': self._generate_paper_id(paper),
	            'title': title,
	            'abstract': abstract,
	            'authors': authors,
	            'source': paper.get('source', ''),
	            'domain': paper.get('domain', 'general_medical'),
	            'publication_date': paper.get('publication_date', ''),
	            'journal': paper.get('journal', ''),
	            'doi': paper.get('doi', ''),
	            'pdf_link': paper.get('pdf_link', ''),
	            'content_for_embedding': f"Title: {title}\nAbstract: {abstract}",
	        }