# chat/summarizer.py
"""
Advanced multi-document summarization with cross-paper synthesis
Identifies consensus, contradictions, and trends across multiple papers
"""
from typing import List, Dict, Any, Tuple

from llm.llm_provider import GrokLLM
from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter
import re


class MultiDocumentSummarizer:
    """
    Advanced summarizer that synthesizes information across multiple research papers
    Goes beyond simple aggregation to identify patterns and insights
    """

    # Words too generic to count as shared research themes across abstracts.
    _STOPWORDS = frozenset({'which', 'their', 'about', 'using', 'method', 'study'})

    def __init__(self, llm=None):
        """
        Args:
            llm: Optional pre-built LLM client; a default GrokLLM is created
                 when none is supplied so the instance can be shared.
        """
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def summarize_research(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]:
        """
        Generate comprehensive multi-document summary
        Returns consistent dict format that other components expect

        Args:
            papers: Paper dicts (expects keys like 'title', 'abstract', 'source').
            query:  The user research question driving the summary.
            domain: Research domain label passed through to prompts/output.

        Returns:
            Dict with 'summary', 'key_insights', 'consensus_analysis',
            'papers_analyzed', 'papers_cited', 'query', 'domain'. Falls back
            to a basic template summary if the LLM call fails.
        """
        print(f"📚 Summarizing {len(papers)} papers for query: '{query}'")

        # Filter relevant papers if we have too many — keeps the prompt within
        # a manageable context size (assumes papers arrive pre-ranked by
        # relevance; TODO confirm with the retrieval layer).
        if len(papers) > 15:
            print(f"🔄 Filtering {len(papers)} papers to top 15 most relevant")
            papers = papers[:15]

        try:
            # Generate comprehensive summary via the LLM
            summary_prompt = self.prompts.multi_document_summary(papers, query, domain)
            response = self.llm.generate(
                summary_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.1,  # low temperature: favor factual consistency
                max_tokens=4000
            )

            # Extract key insights programmatically (regex over the LLM text)
            key_insights = self._extract_key_insights(response, papers)
            consensus_analysis = self._analyze_consensus(papers)

            # Return consistent format that RAG engine expects
            return {
                "summary": response,  # Main summary field
                "comprehensive_summary": response,  # Alternative field
                "key_insights": key_insights,
                "consensus_analysis": consensus_analysis,
                "papers_analyzed": len(papers),
                "papers_cited": [self.formatter.format_citation(paper, i + 1)
                                 for i, paper in enumerate(papers)],
                "query": query,
                "domain": domain
            }

        except Exception as e:
            # Best-effort fallback: never propagate LLM failures to callers.
            print(f"❌ Summarization error: {e}")
            return self._create_fallback_summary(papers, query, domain)

    def _extract_key_insights(self, summary: str, papers: List[Dict]) -> List[Dict[str, str]]:
        """Extract structured key insights from summary text

        Tries markdown-style patterns first (numbered/bulleted **bold** items),
        then falls back to harvesting likely "finding" sentences.
        Returns at most 10 insight dicts with 'type', 'description', 'confidence'.
        """
        insights = []

        # Look for key findings patterns in the summary
        patterns = [
            # "1. **Title** ... body" numbered bold items
            r"\d+\.\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n\d+\.|\n\*\*|$))",
            # "- **Title** ... body" bulleted bold items
            r"-?\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n-|\n\*\*|$))",
            # "Heading: detail." prose-style pairs
            r"([A-Z][^.:!?]+:[^.:!?]+)[.:!?]\s*([^.:!?]+[.:!?])"
        ]

        for pattern in patterns:
            matches = re.findall(pattern, summary, re.IGNORECASE | re.DOTALL)
            for match in matches:
                if len(match) == 2:
                    insight_type, insight_text = match
                    insights.append({
                        "type": insight_type.strip(),
                        "description": insight_text.strip(),
                        # Strong-consensus wording upgrades confidence
                        "confidence": "high" if any(
                            keyword in insight_type.lower()
                            for keyword in ["consensus", "proven", "established"]
                        ) else "medium"
                    })

        # If no structured insights found, create from summary sentences
        if not insights:
            sentences = re.split(r'[.!?]+', summary)
            for sentence in sentences[:5]:  # Top 5 sentences
                sentence = sentence.strip()
                if len(sentence) > 50 and any(
                        keyword in sentence.lower()
                        for keyword in ["find", "show", "demonstrate", "conclude"]):
                    insights.append({
                        "type": "Key Finding",
                        "description": sentence,
                        "confidence": "medium"
                    })

        return insights[:10]  # Limit to top 10 insights

    def _analyze_consensus(self, papers: List[Dict]) -> Dict[str, Any]:
        """Analyze consensus and contradictions across papers

        Heuristic only: keyword overlap across abstracts approximates common
        themes; opposing positive/negative term counts approximate
        contradictions. Requires at least 2 papers.
        """
        if len(papers) < 2:
            return {"consensus_level": "insufficient_data", "contradictions": []}

        # Simple consensus analysis based on abstract content
        consensus_keywords = {}

        for paper in papers:
            abstract = paper.get('abstract', '').lower()
            # Extract meaningful keywords (simplified: mid-length words only)
            words = re.findall(r'\b[a-z]{5,15}\b', abstract)
            for word in words:
                if word not in self._STOPWORDS:
                    consensus_keywords[word] = consensus_keywords.get(word, 0) + 1

        # Find common keywords (appearing in multiple papers)
        common_keywords = {word: count for word, count in consensus_keywords.items()
                           if count >= max(2, len(papers) * 0.3)}  # At least 30% of papers

        # Analyze potential contradictions (simplified)
        contradictions = []
        if len(papers) >= 3:
            # Look for papers with opposing conclusions (simplified heuristic)
            positive_terms = ['improve', 'better', 'superior', 'effective', 'accurate']
            negative_terms = ['limit', 'challenge', 'worse', 'ineffective', 'inaccurate']

            for i, paper1 in enumerate(papers):
                abstract1 = paper1.get('abstract', '').lower()
                paper1_pos = sum(1 for term in positive_terms if term in abstract1)
                paper1_neg = sum(1 for term in negative_terms if term in abstract1)

                for j, paper2 in enumerate(papers[i + 1:], i + 1):
                    abstract2 = paper2.get('abstract', '').lower()
                    paper2_pos = sum(1 for term in positive_terms if term in abstract2)
                    paper2_neg = sum(1 for term in negative_terms if term in abstract2)

                    # Simple contradiction detection: one paper strongly positive
                    # where the other is strongly negative (margin of 2)
                    if (paper1_pos > paper2_pos + 2 and paper2_neg > paper1_neg + 2) or \
                            (paper2_pos > paper1_pos + 2 and paper1_neg > paper2_neg + 2):
                        contradictions.append({
                            "paper1": paper1.get('title', f'Paper {i + 1}'),
                            "paper2": paper2.get('title', f'Paper {j + 1}'),
                            "nature": "differing_conclusions"
                        })

        return {
            "consensus_level": "high" if len(common_keywords) > 10
            else "medium" if len(common_keywords) > 5 else "low",
            "common_themes": list(common_keywords.keys())[:10],
            "contradictions": contradictions[:5],
            "papers_analyzed": len(papers)
        }

    def _create_fallback_summary(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]:
        """Create a basic fallback summary when LLM fails

        Produces the same dict shape as summarize_research() plus a
        'fallback_used' flag, built purely from paper metadata.
        """
        print("🔄 Using fallback summary method")

        # Extract basic information from papers
        titles = [paper.get('title', 'Unknown') for paper in papers]
        sources = list(set(paper.get('source', 'Unknown') for paper in papers))

        basic_summary = f"""
Research Summary for: {query}
Domain: {domain}

Analyzed {len(papers)} papers from sources including: {', '.join(sources[:3])}

Key Papers:
{chr(10).join(f'- {title}' for title in titles[:5])}

Note: Detailed analysis unavailable due to technical limitations.
Please try refining your search query or using fewer papers.
"""

        return {
            "summary": basic_summary,  # Main field
            "comprehensive_summary": basic_summary,  # Alternative field
            "key_insights": [{"type": "Technical Note",
                              "description": "Basic summary only - LLM analysis unavailable",
                              "confidence": "low"}],
            "consensus_analysis": {"consensus_level": "unknown", "common_themes": [],
                                   "contradictions": []},
            "papers_analyzed": len(papers),
            "papers_cited": [self.formatter.format_citation(paper, i + 1)
                             for i, paper in enumerate(papers)],
            "query": query,
            "domain": domain,
            "fallback_used": True
        }

    def generate_executive_summary(self, summary_data) -> str:
        """
        Generate a concise executive summary from detailed analysis
        Handles both string and dict inputs for compatibility
        """
        try:
            # Handle both string and dict inputs for backward compatibility
            if isinstance(summary_data, str):
                # If we get a string, create a basic executive summary via the LLM
                prompt = f"Create a concise executive summary (2-3 paragraphs) from this research summary:\n\n{summary_data}"
                return self.llm.generate(
                    prompt,
                    system_message="Create professional executive summaries for research papers",
                    max_tokens=400
                )

            elif isinstance(summary_data, dict):
                # If we get a dict (preferred), use the structured data
                insights = summary_data.get('key_insights', [])
                consensus = summary_data.get('consensus_analysis', {})
                papers_analyzed = summary_data.get('papers_analyzed', 0)
                domain = summary_data.get('domain', 'unknown domain')

                # Build executive summary from structured data
                executive_summary = "**Executive Summary**\n\n"
                executive_summary += f"Based on analysis of {papers_analyzed} papers in {domain}:\n\n"

                # Add top insights
                if insights:
                    executive_summary += "**Key Findings:**\n"
                    for insight in insights[:3]:
                        executive_summary += f"• {insight.get('description', 'No description')}\n"

                # Add consensus level — fetch once with a default so a missing
                # key cannot raise KeyError (was: .get() check then [] access)
                consensus_level = consensus.get('consensus_level', 'unknown')
                if consensus_level != 'unknown':
                    executive_summary += f"\n**Consensus Level:** {consensus_level.title()}\n"

                # Add common themes if available
                if consensus.get('common_themes'):
                    executive_summary += f"\n**Common Themes:** {', '.join(consensus['common_themes'][:5])}\n"

                return executive_summary

            else:
                return "Executive summary unavailable - invalid input format"

        except Exception as e:
            print(f"❌ Executive summary generation failed: {e}")
            return "Executive summary unavailable due to technical error"


# Quick test
def test_summarizer():
    """Test the multi-document summarizer"""
    print("🧪 Testing Multi-Document Summarizer")
    print("=" * 50)

    test_papers = [
        {
            'title': 'Deep Learning for Medical Image Analysis',
            'authors': ['Smith J', 'Johnson A', 'Brown K'],
            'abstract': 'We demonstrate that convolutional neural networks significantly improve accuracy in medical image segmentation tasks. Our method achieves 95% accuracy on the BRATS dataset, outperforming traditional methods by 15%.',
            'source': 'Nature Medicine',
            'domain': 'medical_imaging',
            'publication_date': '2024-01-15'
        },
        {
            'title': 'Transformers in Radiology',
            'authors': ['Lee K', 'Chen R', 'Wang L'],
            'abstract': 'This study shows that transformer architectures provide better context understanding in radiology images compared to CNNs. However, they require more computational resources and larger datasets for training.',
            'source': 'Radiology AI Journal',
            'domain': 'medical_imaging',
            'publication_date': '2024-02-20'
        }
    ]

    summarizer = MultiDocumentSummarizer()

    try:
        summary = summarizer.summarize_research(
            test_papers,
            "Latest advances in AI for medical imaging",
            "medical_imaging"
        )

        print(f"✅ Summary generated successfully")
        print(f"📊 Papers analyzed: {summary['papers_analyzed']}")
        print(f"💡 Key insights: {len(summary['key_insights'])}")
        print(f"🤝 Consensus level: {summary['consensus_analysis']['consensus_level']}")

        # Test executive summary with dict input
        executive = summarizer.generate_executive_summary(summary)
        print(f"\n📋 Executive Summary:\n{executive}")

        # Test executive summary with string input (backward compatibility)
        executive_str = summarizer.generate_executive_summary(summary['summary'])
        print(f"\n📋 Executive Summary (from string):\n{executive_str}")

    except Exception as e:
        print(f"❌ Summarization test failed: {e}")


if __name__ == "__main__":
    test_summarizer()