Spaces:
Running
Running
| # chat/summarizer.py | |
| """ | |
| Advanced multi-document summarization with cross-paper synthesis | |
| Identifies consensus, contradictions, and trends across multiple papers | |
| """ | |
| from typing import List, Dict, Any, Tuple | |
| from llm.llm_provider import GrokLLM | |
| from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter | |
| import re | |
class MultiDocumentSummarizer:
    """Synthesize findings across multiple research papers.

    Rather than simply concatenating abstracts, this summarizer surfaces
    consensus, contradictions, and recurring themes across a paper set.
    """

    def __init__(self, llm=None):
        # Reuse a caller-supplied LLM when given; otherwise create our own.
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()
| def summarize_research(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]: | |
| """ | |
| Generate comprehensive multi-document summary | |
| Returns consistent dict format that other components expect | |
| """ | |
| print(f"π Summarizing {len(papers)} papers for query: '{query}'") | |
| # Filter relevant papers if we have too many | |
| if len(papers) > 15: | |
| print(f"π Filtering {len(papers)} papers to top 15 most relevant") | |
| papers = papers[:15] | |
| try: | |
| # Generate comprehensive summary | |
| summary_prompt = self.prompts.multi_document_summary(papers, query, domain) | |
| response = self.llm.generate( | |
| summary_prompt, | |
| system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"], | |
| temperature=0.1, | |
| max_tokens=4000 | |
| ) | |
| # Extract key insights programmatically | |
| key_insights = self._extract_key_insights(response, papers) | |
| consensus_analysis = self._analyze_consensus(papers) | |
| # Return consistent format that RAG engine expects | |
| return { | |
| "summary": response, # Main summary field | |
| "comprehensive_summary": response, # Alternative field | |
| "key_insights": key_insights, | |
| "consensus_analysis": consensus_analysis, | |
| "papers_analyzed": len(papers), | |
| "papers_cited": [self.formatter.format_citation(paper, i + 1) for i, paper in enumerate(papers)], | |
| "query": query, | |
| "domain": domain | |
| } | |
| except Exception as e: | |
| print(f"β Summarization error: {e}") | |
| return self._create_fallback_summary(papers, query, domain) | |
| def _extract_key_insights(self, summary: str, papers: List[Dict]) -> List[Dict[str, str]]: | |
| """Extract structured key insights from summary text""" | |
| insights = [] | |
| # Look for key findings patterns in the summary | |
| patterns = [ | |
| r"\d+\.\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n\d+\.|\n\*\*|$))", | |
| r"-?\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n-|\n\*\*|$))", | |
| r"([A-Z][^.:!?]+:[^.:!?]+)[.:!?]\s*([^.:!?]+[.:!?])" | |
| ] | |
| for pattern in patterns: | |
| matches = re.findall(pattern, summary, re.IGNORECASE | re.DOTALL) | |
| for match in matches: | |
| if len(match) == 2: | |
| insight_type, insight_text = match | |
| insights.append({ | |
| "type": insight_type.strip(), | |
| "description": insight_text.strip(), | |
| "confidence": "high" if any(keyword in insight_type.lower() for keyword in | |
| ["consensus", "proven", "established"]) else "medium" | |
| }) | |
| # If no structured insights found, create from summary sentences | |
| if not insights: | |
| sentences = re.split(r'[.!?]+', summary) | |
| for sentence in sentences[:5]: # Top 5 sentences | |
| sentence = sentence.strip() | |
| if len(sentence) > 50 and any( | |
| keyword in sentence.lower() for keyword in ["find", "show", "demonstrate", "conclude"]): | |
| insights.append({ | |
| "type": "Key Finding", | |
| "description": sentence, | |
| "confidence": "medium" | |
| }) | |
| return insights[:10] # Limit to top 10 insights | |
| def _analyze_consensus(self, papers: List[Dict]) -> Dict[str, Any]: | |
| """Analyze consensus and contradictions across papers""" | |
| if len(papers) < 2: | |
| return {"consensus_level": "insufficient_data", "contradictions": []} | |
| # Simple consensus analysis based on abstract content | |
| consensus_keywords = {} | |
| all_keywords = set() | |
| for paper in papers: | |
| abstract = paper.get('abstract', '').lower() | |
| # Extract meaningful keywords (simplified) | |
| words = re.findall(r'\b[a-z]{5,15}\b', abstract) | |
| for word in words: | |
| if word not in ['which', 'their', 'about', 'using', 'method', 'study']: | |
| consensus_keywords[word] = consensus_keywords.get(word, 0) + 1 | |
| all_keywords.add(word) | |
| # Find common keywords (appearing in multiple papers) | |
| common_keywords = {word: count for word, count in consensus_keywords.items() | |
| if count >= max(2, len(papers) * 0.3)} # At least 30% of papers | |
| # Analyze potential contradictions (simplified) | |
| contradictions = [] | |
| if len(papers) >= 3: | |
| # Look for papers with opposing conclusions (simplified heuristic) | |
| positive_terms = ['improve', 'better', 'superior', 'effective', 'accurate'] | |
| negative_terms = ['limit', 'challenge', 'worse', 'ineffective', 'inaccurate'] | |
| for i, paper1 in enumerate(papers): | |
| abstract1 = paper1.get('abstract', '').lower() | |
| paper1_pos = sum(1 for term in positive_terms if term in abstract1) | |
| paper1_neg = sum(1 for term in negative_terms if term in abstract1) | |
| for j, paper2 in enumerate(papers[i + 1:], i + 1): | |
| abstract2 = paper2.get('abstract', '').lower() | |
| paper2_pos = sum(1 for term in positive_terms if term in abstract2) | |
| paper2_neg = sum(1 for term in negative_terms if term in abstract2) | |
| # Simple contradiction detection | |
| if (paper1_pos > paper2_pos + 2 and paper2_neg > paper1_neg + 2) or \ | |
| (paper2_pos > paper1_pos + 2 and paper1_neg > paper2_neg + 2): | |
| contradictions.append({ | |
| "paper1": paper1.get('title', f'Paper {i + 1}'), | |
| "paper2": paper2.get('title', f'Paper {j + 1}'), | |
| "nature": "differing_conclusions" | |
| }) | |
| return { | |
| "consensus_level": "high" if len(common_keywords) > 10 else "medium" if len(common_keywords) > 5 else "low", | |
| "common_themes": list(common_keywords.keys())[:10], | |
| "contradictions": contradictions[:5], | |
| "papers_analyzed": len(papers) | |
| } | |
| def _create_fallback_summary(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]: | |
| """Create a basic fallback summary when LLM fails""" | |
| print("π Using fallback summary method") | |
| # Extract basic information from papers | |
| titles = [paper.get('title', 'Unknown') for paper in papers] | |
| sources = list(set(paper.get('source', 'Unknown') for paper in papers)) | |
| domains = list(set(paper.get('domain', 'Unknown') for paper in papers)) | |
| basic_summary = f""" | |
| Research Summary for: {query} | |
| Domain: {domain} | |
| Analyzed {len(papers)} papers from sources including: {', '.join(sources[:3])} | |
| Key Papers: | |
| {chr(10).join(f'- {title}' for title in titles[:5])} | |
| Note: Detailed analysis unavailable due to technical limitations. | |
| Please try refining your search query or using fewer papers. | |
| """ | |
| return { | |
| "summary": basic_summary, # Main field | |
| "comprehensive_summary": basic_summary, # Alternative field | |
| "key_insights": [{"type": "Technical Note", "description": "Basic summary only - LLM analysis unavailable", | |
| "confidence": "low"}], | |
| "consensus_analysis": {"consensus_level": "unknown", "common_themes": [], "contradictions": []}, | |
| "papers_analyzed": len(papers), | |
| "papers_cited": [self.formatter.format_citation(paper, i + 1) for i, paper in enumerate(papers)], | |
| "query": query, | |
| "domain": domain, | |
| "fallback_used": True | |
| } | |
| def generate_executive_summary(self, summary_data) -> str: | |
| """ | |
| Generate a concise executive summary from detailed analysis | |
| Handles both string and dict inputs for compatibility | |
| """ | |
| try: | |
| # Handle both string and dict inputs for backward compatibility | |
| if isinstance(summary_data, str): | |
| # If we get a string, create a basic executive summary | |
| prompt = f"Create a concise executive summary (2-3 paragraphs) from this research summary:\n\n{summary_data}" | |
| return self.llm.generate( | |
| prompt, | |
| system_message="Create professional executive summaries for research papers", | |
| max_tokens=400 | |
| ) | |
| elif isinstance(summary_data, dict): | |
| # If we get a dict (preferred), use the structured data | |
| insights = summary_data.get('key_insights', []) | |
| consensus = summary_data.get('consensus_analysis', {}) | |
| papers_analyzed = summary_data.get('papers_analyzed', 0) | |
| domain = summary_data.get('domain', 'unknown domain') | |
| # Build executive summary from structured data | |
| executive_summary = f"**Executive Summary**\n\n" | |
| executive_summary += f"Based on analysis of {papers_analyzed} papers in {domain}:\n\n" | |
| # Add top insights | |
| if insights: | |
| executive_summary += "**Key Findings:**\n" | |
| for insight in insights[:3]: | |
| executive_summary += f"β’ {insight.get('description', 'No description')}\n" | |
| # Add consensus level | |
| if consensus.get('consensus_level') != 'unknown': | |
| executive_summary += f"\n**Consensus Level:** {consensus['consensus_level'].title()}\n" | |
| # Add common themes if available | |
| if consensus.get('common_themes'): | |
| executive_summary += f"\n**Common Themes:** {', '.join(consensus['common_themes'][:5])}\n" | |
| return executive_summary | |
| else: | |
| return "Executive summary unavailable - invalid input format" | |
| except Exception as e: | |
| print(f"β Executive summary generation failed: {e}") | |
| return "Executive summary unavailable due to technical error" | |
# Quick test
def test_summarizer():
    """Smoke-test MultiDocumentSummarizer against two sample papers."""
    print("π§ͺ Testing Multi-Document Summarizer")
    print("=" * 50)

    sample_papers = [
        {
            'title': 'Deep Learning for Medical Image Analysis',
            'authors': ['Smith J', 'Johnson A', 'Brown K'],
            'abstract': 'We demonstrate that convolutional neural networks significantly improve accuracy in medical image segmentation tasks. Our method achieves 95% accuracy on the BRATS dataset, outperforming traditional methods by 15%.',
            'source': 'Nature Medicine',
            'domain': 'medical_imaging',
            'publication_date': '2024-01-15'
        },
        {
            'title': 'Transformers in Radiology',
            'authors': ['Lee K', 'Chen R', 'Wang L'],
            'abstract': 'This study shows that transformer architectures provide better context understanding in radiology images compared to CNNs. However, they require more computational resources and larger datasets for training.',
            'source': 'Radiology AI Journal',
            'domain': 'medical_imaging',
            'publication_date': '2024-02-20'
        },
    ]

    summarizer = MultiDocumentSummarizer()
    try:
        result = summarizer.summarize_research(
            sample_papers,
            "Latest advances in AI for medical imaging",
            "medical_imaging"
        )
        print("β Summary generated successfully")
        print(f"π Papers analyzed: {result['papers_analyzed']}")
        print(f"π‘ Key insights: {len(result['key_insights'])}")
        print(f"π€ Consensus level: {result['consensus_analysis']['consensus_level']}")

        # Executive summary from the structured dict (preferred path).
        executive = summarizer.generate_executive_summary(result)
        print(f"\nπ Executive Summary:\n{executive}")

        # Executive summary from a plain string (backward-compatibility path).
        executive_str = summarizer.generate_executive_summary(result['summary'])
        print(f"\nπ Executive Summary (from string):\n{executive_str}")
    except Exception as e:
        print(f"β Summarization test failed: {e}")


if __name__ == "__main__":
    test_summarizer()