Spaces:
Running
Running
| # chat/summarizer.py | |
| """ | |
| Advanced multi-document summarization with cross-paper synthesis | |
| Identifies consensus, contradictions, and trends across multiple papers | |
| """ | |
| from typing import List, Dict, Any, Tuple | |
| from llm.llm_provider import GrokLLM | |
| from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter | |
| import re | |
class MultiDocumentSummarizer:
    """Synthesize findings across multiple research papers.

    Rather than simply concatenating abstracts, this summarizer surfaces
    consensus, contradictions, and recurring themes across a paper set.
    """

    def __init__(self, llm=None):
        # Reuse a caller-supplied LLM when given; otherwise create our own.
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()
| def summarize_research(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]: | |
| """ | |
| Generate comprehensive multi-document summary | |
| Returns consistent dict format that other components expect | |
| """ | |
| print(f"π Summarizing {len(papers)} papers for query: '{query}'") | |
| # Filter relevant papers if we have too many | |
| if len(papers) > 15: | |
| print(f"π Filtering {len(papers)} papers to top 15 most relevant") | |
| papers = papers[:15] | |
| try: | |
| # Generate comprehensive summary | |
| summary_prompt = self.prompts.multi_document_summary(papers, query, domain) | |
| response = self.llm.generate( | |
| summary_prompt, | |
| system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"], | |
| temperature=0.1, | |
| max_tokens=4000 | |
| ) | |
| # Extract key insights programmatically | |
| key_insights = self._extract_key_insights(response, papers) | |
| consensus_analysis = self._analyze_consensus(papers) | |
| # Return consistent format that RAG engine expects | |
| return { | |
| "summary": response, # Main summary field | |
| "comprehensive_summary": response, # Alternative field | |
| "key_insights": key_insights, | |
| "consensus_analysis": consensus_analysis, | |
| "papers_analyzed": len(papers), | |
| "papers_cited": [self.formatter.format_citation(paper, i + 1) for i, paper in enumerate(papers)], | |
| "query": query, | |
| "domain": domain | |
| } | |
| except Exception as e: | |
| print(f"β Summarization error: {e}") | |
| return self._create_fallback_summary(papers, query, domain) | |
| def _extract_key_insights(self, summary: str, papers: List[Dict]) -> List[Dict[str, str]]: | |
| """Extract structured key insights from summary text""" | |
| insights = [] | |
| # Look for key findings patterns in the summary | |
| patterns = [ | |
| r"\d+\.\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n\d+\.|\n\*\*|$))", | |
| r"-?\s*\*\*([^*]+)\*\*[^\n]*\n([^*]+?(?=\n-|\n\*\*|$))", | |
| r"([A-Z][^.:!?]+:[^.:!?]+)[.:!?]\s*([^.:!?]+[.:!?])" | |
| ] | |
| for pattern in patterns: | |
| matches = re.findall(pattern, summary, re.IGNORECASE | re.DOTALL) | |
| for match in matches: | |
| if len(match) == 2: | |
| insight_type, insight_text = match | |
| insights.append({ | |
| "type": insight_type.strip(), | |
| "description": insight_text.strip(), | |
| "confidence": "high" if any(keyword in insight_type.lower() for keyword in | |
| ["consensus", "proven", "established"]) else "medium" | |
| }) | |
| # If no structured insights found, create from summary sentences | |
| if not insights: | |
| sentences = re.split(r'[.!?]+', summary) | |
| for sentence in sentences[:5]: # Top 5 sentences | |
| sentence = sentence.strip() | |
| if len(sentence) > 50 and any( | |
| keyword in sentence.lower() for keyword in ["find", "show", "demonstrate", "conclude"]): | |
| insights.append({ | |
| "type": "Key Finding", | |
| "description": sentence, | |
| "confidence": "medium" | |
| }) | |
| return insights[:10] # Limit to top 10 insights | |
| def _analyze_consensus(self, papers: List[Dict]) -> Dict[str, Any]: | |
| """Analyze consensus and contradictions across papers""" | |
| if len(papers) < 2: | |
| return {"consensus_level": "insufficient_data", "contradictions": []} | |
| # Simple consensus analysis based on abstract content | |
| consensus_keywords = {} | |
| all_keywords = set() | |
| for paper in papers: | |
| abstract = paper.get('abstract', '').lower() | |
| # Extract meaningful keywords (simplified) | |
| words = re.findall(r'\b[a-z]{5,15}\b', abstract) | |
| for word in words: | |
| if word not in ['which', 'their', 'about', 'using', 'method', 'study']: | |
| consensus_keywords[word] = consensus_keywords.get(word, 0) + 1 | |
| all_keywords.add(word) | |
| # Find common keywords (appearing in multiple papers) | |
| common_keywords = {word: count for word, count in consensus_keywords.items() | |
| if count >= max(2, len(papers) * 0.3)} # At least 30% of papers | |
| # Analyze potential contradictions (simplified) | |
| contradictions = [] | |
| if len(papers) >= 3: | |
| # Look for papers with opposing conclusions (simplified heuristic) | |
| positive_terms = ['improve', 'better', 'superior', 'effective', 'accurate'] | |
| negative_terms = ['limit', 'challenge', 'worse', 'ineffective', 'inaccurate'] | |
| for i, paper1 in enumerate(papers): | |
| abstract1 = paper1.get('abstract', '').lower() | |
| paper1_pos = sum(1 for term in positive_terms if term in abstract1) | |
| paper1_neg = sum(1 for term in negative_terms if term in abstract1) | |
| for j, paper2 in enumerate(papers[i + 1:], i + 1): | |
| abstract2 = paper2.get('abstract', '').lower() | |
| paper2_pos = sum(1 for term in positive_terms if term in abstract2) | |
| paper2_neg = sum(1 for term in negative_terms if term in abstract2) | |
| # Simple contradiction detection | |
| if (paper1_pos > paper2_pos + 2 and paper2_neg > paper1_neg + 2) or \ | |
| (paper2_pos > paper1_pos + 2 and paper1_neg > paper2_neg + 2): | |
| contradictions.append({ | |
| "paper1": paper1.get('title', f'Paper {i + 1}'), | |
| "paper2": paper2.get('title', f'Paper {j + 1}'), | |
| "nature": "differing_conclusions" | |
| }) | |
| return { | |
| "consensus_level": "high" if len(common_keywords) > 10 else "medium" if len(common_keywords) > 5 else "low", | |
| "common_themes": list(common_keywords.keys())[:10], | |
| "contradictions": contradictions[:5], | |
| "papers_analyzed": len(papers) | |
| } | |
| def _create_fallback_summary(self, papers: List[Dict], query: str, domain: str) -> Dict[str, Any]: | |
| """Create a basic fallback summary when LLM fails""" | |
| print("π Using fallback summary method") | |
| # Extract basic information from papers | |
| titles = [paper.get('title', 'Unknown') for paper in papers] | |
| sources = list(set(paper.get('source', 'Unknown') for paper in papers)) | |
| domains = list(set(paper.get('domain', 'Unknown') for paper in papers)) | |
| basic_summary = f""" | |
| Research Summary for: {query} | |
| Domain: {domain} | |
| Analyzed {len(papers)} papers from sources including: {', '.join(sources[:3])} | |
| Key Papers: | |
| {chr(10).join(f'- {title}' for title in titles[:5])} | |
| Note: Detailed analysis unavailable due to technical limitations. | |
| Please try refining your search query or using fewer papers. | |
| """ | |
| return { | |
| "summary": basic_summary, # Main field | |
| "comprehensive_summary": basic_summary, # Alternative field | |
| "key_insights": [{"type": "Technical Note", "description": "Basic summary only - LLM analysis unavailable", | |
| "confidence": "low"}], | |
| "consensus_analysis": {"consensus_level": "unknown", "common_themes": [], "contradictions": []}, | |
| "papers_analyzed": len(papers), | |
| "papers_cited": [self.formatter.format_citation(paper, i + 1) for i, paper in enumerate(papers)], | |
| "query": query, | |
| "domain": domain, | |
| "fallback_used": True | |
| } | |
| def generate_executive_summary(self, summary_data) -> str: | |
| """ | |
| Generate a concise executive summary from detailed analysis | |
| Handles both string and dict inputs for compatibility | |
| """ | |
| try: | |
| # Handle both string and dict inputs for backward compatibility | |
| if isinstance(summary_data, str): | |
| # If we get a string, create a basic executive summary | |
| prompt = f"Create a concise executive summary (2-3 paragraphs) from this research summary:\n\n{summary_data}" | |
| return self.llm.generate( | |
| prompt, | |
| system_message="Create professional executive summaries for research papers", | |
| max_tokens=400 | |
| ) | |
| elif isinstance(summary_data, dict): | |
| # If we get a dict (preferred), use the structured data | |
| insights = summary_data.get('key_insights', []) | |
| consensus = summary_data.get('consensus_analysis', {}) | |
| papers_analyzed = summary_data.get('papers_analyzed', 0) | |
| domain = summary_data.get('domain', 'unknown domain') | |
| # Build executive summary from structured data | |
| executive_summary = f"**Executive Summary**\n\n" | |
| executive_summary += f"Based on analysis of {papers_analyzed} papers in {domain}:\n\n" | |
| # Add top insights | |
| if insights: | |
| executive_summary += "**Key Findings:**\n" | |
| for insight in insights[:3]: | |
| executive_summary += f"β’ {insight.get('description', 'No description')}\n" | |
| # Add consensus level | |
| if consensus.get('consensus_level') != 'unknown': | |
| executive_summary += f"\n**Consensus Level:** {consensus['consensus_level'].title()}\n" | |
| # Add common themes if available | |
| if consensus.get('common_themes'): | |
| executive_summary += f"\n**Common Themes:** {', '.join(consensus['common_themes'][:5])}\n" | |
| return executive_summary | |
| else: | |
| return "Executive summary unavailable - invalid input format" | |
| except Exception as e: | |
| print(f"β Executive summary generation failed: {e}") | |
| return "Executive summary unavailable due to technical error" | |
# Quick test
def test_summarizer():
    """Smoke-test MultiDocumentSummarizer against two sample papers."""
    print("π§ͺ Testing Multi-Document Summarizer")
    print("=" * 50)

    sample_papers = [
        {
            'title': 'Deep Learning for Medical Image Analysis',
            'authors': ['Smith J', 'Johnson A', 'Brown K'],
            'abstract': 'We demonstrate that convolutional neural networks significantly improve accuracy in medical image segmentation tasks. Our method achieves 95% accuracy on the BRATS dataset, outperforming traditional methods by 15%.',
            'source': 'Nature Medicine',
            'domain': 'medical_imaging',
            'publication_date': '2024-01-15'
        },
        {
            'title': 'Transformers in Radiology',
            'authors': ['Lee K', 'Chen R', 'Wang L'],
            'abstract': 'This study shows that transformer architectures provide better context understanding in radiology images compared to CNNs. However, they require more computational resources and larger datasets for training.',
            'source': 'Radiology AI Journal',
            'domain': 'medical_imaging',
            'publication_date': '2024-02-20'
        },
    ]

    summarizer = MultiDocumentSummarizer()
    try:
        result = summarizer.summarize_research(
            sample_papers,
            "Latest advances in AI for medical imaging",
            "medical_imaging"
        )
        print("β Summary generated successfully")
        print(f"π Papers analyzed: {result['papers_analyzed']}")
        print(f"π‘ Key insights: {len(result['key_insights'])}")
        print(f"π€ Consensus level: {result['consensus_analysis']['consensus_level']}")

        # Executive summary from the structured dict (preferred path).
        executive = summarizer.generate_executive_summary(result)
        print(f"\nπ Executive Summary:\n{executive}")

        # Executive summary from a plain string (backward-compatibility path).
        executive_str = summarizer.generate_executive_summary(result['summary'])
        print(f"\nπ Executive Summary (from string):\n{executive_str}")
    except Exception as e:
        print(f"β Summarization test failed: {e}")


if __name__ == "__main__":
    test_summarizer()