# MedSearchPro / chat / summarizer.py
# paulhemb's picture
# Initial Backend Deployment
# 1367957
# chat/summarizer.py
"""
Advanced multi-document summarization with cross-paper synthesis
Identifies consensus, contradictions, and trends across multiple papers
"""
from typing import List, Dict, Any, Tuple
from llm.llm_provider import GrokLLM
from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter
import re
class MultiDocumentSummarizer:
    """
    Advanced summarizer that synthesizes information across multiple research papers
    Goes beyond simple aggregation to identify patterns and insights

    The narrative summary is produced by the LLM; key insights, consensus
    level and contradictions are derived locally with regex/keyword heuristics.
    """

    # Maximum number of papers forwarded to the LLM in a single request.
    MAX_PAPERS = 15

    # Patterns that pull (heading, body) pairs out of the LLM's text:
    # numbered bold headings, bulleted/plain bold headings, "Label: text" sentences.
    _INSIGHT_PATTERNS = (
        r"\d+\.\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n\d+\.|\n\*\*|$))",
        r"-?\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n-|\n\*\*|$))",
        r"([A-Z][^.:!?]+:[^.:!?]+)[.:!?]\s*([^.:!?]+[.:!?])",
    )
    _HIGH_CONFIDENCE_KEYWORDS = ("consensus", "proven", "established")
    _FINDING_KEYWORDS = ("find", "show", "demonstrate", "conclude")

    # Words ignored when looking for themes shared between abstracts.
    _STOPWORDS = frozenset(['which', 'their', 'about', 'using', 'method', 'study'])
    _POSITIVE_TERMS = ('improve', 'better', 'superior', 'effective', 'accurate')
    _NEGATIVE_TERMS = ('limit', 'challenge', 'worse', 'ineffective', 'inaccurate')

    def __init__(self, llm=None):
        """Store the LLM client (a default GrokLLM if none is given) and prompt helpers."""
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def summarize_research(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]:
        """
        Generate comprehensive multi-document summary
        Returns consistent dict format that other components expect

        Args:
            papers: Paper dicts; only the first ``MAX_PAPERS`` are used.
            query: The user's research question.
            domain: Research domain label passed through to the prompt and result.

        Returns:
            Dict with ``summary``, ``comprehensive_summary``, ``key_insights``,
            ``consensus_analysis``, ``papers_analyzed``, ``papers_cited``,
            ``query`` and ``domain``. If anything in the pipeline raises, the
            result of ``_create_fallback_summary`` is returned instead.
        """
        print(f"πŸ“š Summarizing {len(papers)} papers for query: '{query}'")

        # Plain truncation: presumably the caller passes papers already
        # ordered by relevance -- nothing is re-ranked here.
        if len(papers) > self.MAX_PAPERS:
            print(f"πŸ”„ Filtering {len(papers)} papers to top {self.MAX_PAPERS} most relevant")
            papers = papers[:self.MAX_PAPERS]

        try:
            # Generate comprehensive summary
            summary_prompt = self.prompts.multi_document_summary(papers, query, domain)
            response = self.llm.generate(
                summary_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.1,
                max_tokens=4000
            )

            # Extract key insights programmatically
            key_insights = self._extract_key_insights(response, papers)
            consensus_analysis = self._analyze_consensus(papers)

            # Return consistent format that RAG engine expects
            return {
                "summary": response,  # Main summary field
                "comprehensive_summary": response,  # Alternative field
                "key_insights": key_insights,
                "consensus_analysis": consensus_analysis,
                "papers_analyzed": len(papers),
                "papers_cited": [self.formatter.format_citation(paper, i + 1)
                                 for i, paper in enumerate(papers)],
                "query": query,
                "domain": domain
            }
        except Exception as e:
            # Deliberate best-effort boundary: any failure degrades to the
            # LLM-free fallback rather than propagating to the caller.
            print(f"❌ Summarization error: {e}")
            return self._create_fallback_summary(papers, query, domain)

    def _extract_key_insights(self, summary: str, papers: List[Dict]) -> List[Dict[str, str]]:
        """
        Extract structured key insights from summary text

        Args:
            summary: The LLM-generated summary text.
            papers: Unused; kept for interface compatibility.

        Returns:
            Up to 10 dicts with ``type``, ``description`` and ``confidence``.
        """
        insights = []
        # The heading patterns overlap, so the same (type, description) pair
        # can be matched more than once; keep only the first occurrence.
        seen = set()

        for pattern in self._INSIGHT_PATTERNS:
            for insight_type, insight_text in re.findall(pattern, summary, re.IGNORECASE | re.DOTALL):
                insight_type = insight_type.strip()
                insight_text = insight_text.strip()
                if (insight_type, insight_text) in seen:
                    continue
                seen.add((insight_type, insight_text))

                heading = insight_type.lower()
                is_high = any(keyword in heading for keyword in self._HIGH_CONFIDENCE_KEYWORDS)
                insights.append({
                    "type": insight_type,
                    "description": insight_text,
                    "confidence": "high" if is_high else "medium"
                })

        # If no structured insights found, create from summary sentences
        if not insights:
            for sentence in re.split(r'[.!?]+', summary)[:5]:  # Top 5 sentences
                sentence = sentence.strip()
                if len(sentence) > 50 and any(
                        keyword in sentence.lower() for keyword in self._FINDING_KEYWORDS):
                    insights.append({
                        "type": "Key Finding",
                        "description": sentence,
                        "confidence": "medium"
                    })

        return insights[:10]  # Limit to top 10 insights

    @staticmethod
    def _count_terms(text: str, terms) -> int:
        """Count how many of ``terms`` begin a word in ``text``.

        Anchoring at a word start still matches inflections ("improves",
        "limitations") but stops "effective" from matching inside
        "ineffective" and "accurate" inside "inaccurate".
        """
        return sum(1 for term in terms if re.search(r'\b' + re.escape(term), text))

    def _analyze_consensus(self, papers: List[Dict]) -> Dict[str, Any]:
        """
        Analyze consensus and contradictions across papers

        Args:
            papers: Paper dicts; the ``abstract`` and ``title`` keys are read.

        Returns:
            Dict with ``consensus_level``, ``common_themes`` (at most 10),
            ``contradictions`` (at most 5) and ``papers_analyzed``.
        """
        paper_count = len(papers)
        if paper_count < 2:
            return {
                "consensus_level": "insufficient_data",
                "common_themes": [],
                "contradictions": [],
                "papers_analyzed": paper_count
            }

        # A missing or None abstract is treated as empty text.
        abstracts = [str(paper.get('abstract') or '').lower() for paper in papers]

        # Document frequency: each keyword counts once per paper, so a term
        # repeated inside one abstract is not mistaken for a shared theme.
        paper_frequency = {}
        for abstract in abstracts:
            words = re.findall(r'\b[a-z]{5,15}\b', abstract)
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # which keeps the order of common_themes deterministic.
            for word in dict.fromkeys(words):
                if word not in self._STOPWORDS:
                    paper_frequency[word] = paper_frequency.get(word, 0) + 1

        # Find common keywords (appearing in multiple papers)
        threshold = max(2, paper_count * 0.3)  # At least 30% of papers
        common_keywords = [word for word, count in paper_frequency.items() if count >= threshold]

        # Analyze potential contradictions (simplified)
        contradictions = []
        if paper_count >= 3:
            # Score every abstract once instead of once per pair.
            scores = [
                (self._count_terms(abstract, self._POSITIVE_TERMS),
                 self._count_terms(abstract, self._NEGATIVE_TERMS))
                for abstract in abstracts
            ]
            for i, (pos1, neg1) in enumerate(scores):
                for j in range(i + 1, paper_count):
                    pos2, neg2 = scores[j]
                    # One paper clearly more positive while the other is
                    # clearly more negative (margin of more than two terms).
                    if (pos1 > pos2 + 2 and neg2 > neg1 + 2) or \
                            (pos2 > pos1 + 2 and neg1 > neg2 + 2):
                        contradictions.append({
                            "paper1": papers[i].get('title', f'Paper {i + 1}'),
                            "paper2": papers[j].get('title', f'Paper {j + 1}'),
                            "nature": "differing_conclusions"
                        })

        if len(common_keywords) > 10:
            consensus_level = "high"
        elif len(common_keywords) > 5:
            consensus_level = "medium"
        else:
            consensus_level = "low"

        return {
            "consensus_level": consensus_level,
            "common_themes": common_keywords[:10],
            "contradictions": contradictions[:5],
            "papers_analyzed": paper_count
        }

    def _create_fallback_summary(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]:
        """
        Create a basic fallback summary when LLM fails

        Returns the same keys as ``summarize_research`` plus
        ``fallback_used: True``.
        """
        print("πŸ”„ Using fallback summary method")

        # Extract basic information from papers
        titles = [paper.get('title', 'Unknown') for paper in papers]
        # Unique sources in first-seen order (set order varies between runs);
        # coerced to str so a None source cannot break the join below.
        sources = list(dict.fromkeys(str(paper.get('source') or 'Unknown') for paper in papers))

        basic_summary = "\n".join([
            "",
            f"Research Summary for: {query}",
            f"Domain: {domain}",
            f"Analyzed {len(papers)} papers from sources including: {', '.join(sources[:3])}",
            "Key Papers:",
            "\n".join(f'- {title}' for title in titles[:5]),
            "Note: Detailed analysis unavailable due to technical limitations.",
            "Please try refining your search query or using fewer papers.",
            "",
        ])

        return {
            "summary": basic_summary,  # Main field
            "comprehensive_summary": basic_summary,  # Alternative field
            "key_insights": [{"type": "Technical Note",
                              "description": "Basic summary only - LLM analysis unavailable",
                              "confidence": "low"}],
            "consensus_analysis": {"consensus_level": "unknown", "common_themes": [], "contradictions": []},
            "papers_analyzed": len(papers),
            "papers_cited": [self.formatter.format_citation(paper, i + 1)
                             for i, paper in enumerate(papers)],
            "query": query,
            "domain": domain,
            "fallback_used": True
        }

    def generate_executive_summary(self, summary_data) -> str:
        """
        Generate a concise executive summary from detailed analysis
        Handles both string and dict inputs for compatibility

        Args:
            summary_data: Either raw summary text (condensed by the LLM) or a
                result dict in the shape returned by ``summarize_research``
                (formatted locally, no LLM call).

        Returns:
            The executive summary text, or a fixed "unavailable" message for
            unsupported input types or when generation raises.
        """
        try:
            # Handle both string and dict inputs for backward compatibility
            if isinstance(summary_data, str):
                # If we get a string, create a basic executive summary
                prompt = f"Create a concise executive summary (2-3 paragraphs) from this research summary:\n\n{summary_data}"
                return self.llm.generate(
                    prompt,
                    system_message="Create professional executive summaries for research papers",
                    max_tokens=400
                )

            if not isinstance(summary_data, dict):
                return "Executive summary unavailable - invalid input format"

            # If we get a dict (preferred), use the structured data
            insights = summary_data.get('key_insights') or []
            consensus = summary_data.get('consensus_analysis') or {}
            papers_analyzed = summary_data.get('papers_analyzed', 0)
            domain = summary_data.get('domain', 'unknown domain')

            # Build executive summary from structured data
            parts = [
                "**Executive Summary**\n\n",
                f"Based on analysis of {papers_analyzed} papers in {domain}:\n\n",
            ]

            # Add top insights
            if insights:
                parts.append("**Key Findings:**\n")
                for insight in insights[:3]:
                    parts.append(f"β€’ {insight.get('description', 'No description')}\n")

            # Add consensus level. A missing level is skipped rather than
            # indexed, which used to raise KeyError and discard the summary.
            consensus_level = consensus.get('consensus_level')
            if consensus_level and consensus_level != 'unknown':
                parts.append(f"\n**Consensus Level:** {consensus_level.title()}\n")

            # Add common themes if available
            common_themes = consensus.get('common_themes')
            if common_themes:
                parts.append(f"\n**Common Themes:** {', '.join(common_themes[:5])}\n")

            return "".join(parts)
        except Exception as e:
            print(f"❌ Executive summary generation failed: {e}")
            return "Executive summary unavailable due to technical error"
# Quick test
def test_summarizer():
    """Smoke-test the summarizer end to end on two sample imaging papers.

    Prints progress to stdout; a failure inside the summarization calls is
    reported with a message rather than raised.
    """
    print("πŸ§ͺ Testing Multi-Document Summarizer")
    print("=" * 50)

    # Two hand-written papers from the same domain with differing emphasis.
    cnn_paper = {
        'title': 'Deep Learning for Medical Image Analysis',
        'authors': ['Smith J', 'Johnson A', 'Brown K'],
        'abstract': 'We demonstrate that convolutional neural networks significantly improve accuracy in medical image segmentation tasks. Our method achieves 95% accuracy on the BRATS dataset, outperforming traditional methods by 15%.',
        'source': 'Nature Medicine',
        'domain': 'medical_imaging',
        'publication_date': '2024-01-15'
    }
    transformer_paper = {
        'title': 'Transformers in Radiology',
        'authors': ['Lee K', 'Chen R', 'Wang L'],
        'abstract': 'This study shows that transformer architectures provide better context understanding in radiology images compared to CNNs. However, they require more computational resources and larger datasets for training.',
        'source': 'Radiology AI Journal',
        'domain': 'medical_imaging',
        'publication_date': '2024-02-20'
    }
    sample_papers = [cnn_paper, transformer_paper]

    engine = MultiDocumentSummarizer()

    try:
        result = engine.summarize_research(
            sample_papers,
            "Latest advances in AI for medical imaging",
            "medical_imaging"
        )
        print("βœ… Summary generated successfully")
        print(f"πŸ“Š Papers analyzed: {result['papers_analyzed']}")
        print(f"πŸ’‘ Key insights: {len(result['key_insights'])}")
        print(f"🀝 Consensus level: {result['consensus_analysis']['consensus_level']}")

        # Structured (dict) input: formatted locally from the result fields.
        from_dict = engine.generate_executive_summary(result)
        print(f"\nπŸ“‹ Executive Summary:\n{from_dict}")

        # Plain-text input: the backward-compatible path.
        from_text = engine.generate_executive_summary(result['summary'])
        print(f"\nπŸ“‹ Executive Summary (from string):\n{from_text}")
    except Exception as err:
        print(f"❌ Summarization test failed: {err}")
# Script entry point: run the smoke test only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    test_summarizer()