# chat/single_paper_summarizer.py
from typing import Dict, Any, Tuple, List, Optional
from llm.llm_provider import GrokLLM
import re
import json
import statistics
from datetime import datetime


class SinglePaperSummarizer:
    """Enhanced clinical paper summarizer with user context awareness"""

    def __init__(self, model: str = "gpt-oss-120b"):
        self.llm = GrokLLM(model=model)

    def summarize_paper(self, paper: Dict[str, Any], query: Optional[str] = None,
                        user_context: str = "general") -> Dict[str, Any]:
        """
        Generate comprehensive clinical summary of a single paper

        Args:
            paper: Dictionary with paper metadata
            query: Optional user query about the paper
            user_context: User context (clinician, researcher, student, administrator, general)

        Returns:
            Dict with enhanced clinical summary and structured analysis
        """
        # Extract paper details
        title = paper.get('title', 'Unknown Title')
        abstract = paper.get('abstract', '')
        authors = paper.get('authors', [])
        publication_date = paper.get('publication_date', '')
        source = paper.get('source', 'Unknown Source')
        citations = paper.get('citations', 0)
        paper_id = paper.get('id', '')

        print(f"📄 Summarizing paper for {user_context}: {title[:50]}...")

        # Format authors
        if authors and isinstance(authors, list):
            if len(authors) <= 3:
                author_str = ', '.join(authors)
            else:
                author_str = f"{authors[0]} et al. ({len(authors)} authors)"
        else:
            author_str = "Unknown"

        # Create enhanced clinical prompt
        prompt = self._create_clinical_summarization_prompt(
            title, abstract, author_str, publication_date, source,
            citations, query, user_context
        )

        # Generate enhanced summary
        system_msg = self._get_clinical_system_message(user_context)

        try:
            enhanced_summary = self.llm.generate(
                prompt,
                system_message=system_msg,
                max_tokens=2000
            )

            # Extract key metrics and sections
            key_metrics = self._extract_paper_metrics(abstract)
            clinical_relevance = self._assess_paper_clinical_relevance(
                abstract, title, user_context
            )
            structured_analysis = self._extract_enhanced_sections(enhanced_summary)

            # Generate quick clinical bottom line
            quick_summary = self._generate_quick_clinical_summary(
                title, abstract, user_context
            )

            # Calculate clinical confidence
            confidence = self._calculate_clinical_confidence(
                paper, key_metrics, clinical_relevance
            )

            return {
                "success": True,
                "paper_id": paper_id,
                "paper_title": title,
                "authors": authors,
                "publication_date": publication_date,
                "source": source,
                "citations": citations,
                "enhanced_summary": enhanced_summary,
                "quick_summary": quick_summary,
                "structured_analysis": structured_analysis,
                "key_metrics": key_metrics,
                "clinical_relevance": clinical_relevance,
                "user_context": user_context,
                "query_context": query,
                "summary_type": "single_paper_enhanced",
                "confidence": confidence,
                "summary_length": len(enhanced_summary),
                "analysis_timestamp": datetime.now().isoformat()
            }

        except Exception as e:
            print(f"❌ Enhanced summarization failed: {e}")
            # Fallback to basic summary
            return self._generate_fallback_summary(
                paper, query, user_context, str(e)
            )

    def _create_clinical_summarization_prompt(self, title, abstract, authors, date,
                                              source, citations, query, user_context):
        """Create specialized clinical summarization prompt"""

        base_prompt = f"""Create a comprehensive clinical analysis of this research paper for a {user_context}:

**PAPER METADATA:**
- Title: {title}
- Authors: {authors}
- Publication Date: {date}
- Source: {source}
- Citations: {citations if citations else 'Not available'}

**ABSTRACT:**
{abstract}

**USER CONTEXT:** {user_context}"""

        if query:
            base_prompt += f"""

**SPECIFIC QUESTION:** {query}

Please focus your analysis on answering this specific clinical question about the paper."""
        else:
            # f-string so {user_context} is actually interpolated
            base_prompt += f"""

**ANALYSIS REQUESTED:** Provide a comprehensive clinical analysis of this paper tailored to a {user_context}."""

        base_prompt += f"""

**STRUCTURE YOUR ANALYSIS FOR A {user_context.upper()}:**

## 🎯 **Clinical Bottom Line** (1-2 sentences)
*What is the single most important clinical takeaway?*

## 📊 **Study Design & Methodology**
- Study type (RCT, cohort, case-control, etc.)
- Sample size and population
- Key interventions/techniques
- Follow-up duration (if applicable)
- Statistical methods

## 📈 **Key Findings with Clinical Data**
- Primary outcomes with effect sizes
- Secondary outcomes
- Statistical significance (p-values, CIs)
- Subgroup analyses
- Adverse events/safety data

## 🏥 **Clinical Implications for {user_context}**
*Tailor this section specifically for a {user_context}:*
- How does this change practice/decision-making?
- Which patients benefit most?
- When should this be implemented?
- What are the immediate applications?

## ⚠️ **Limitations & Cautions**
- Study design limitations
- Population generalizability
- Potential biases
- Conflicts of interest
- Funding sources

## 🔬 **Research Implications**
- Mechanism/biology insights
- Future research directions
- Unanswered questions
- Replication needs

## 💡 **Clinical Recommendations**
*Actionable recommendations for {user_context}:*
1.
2.
3.

**Include specific numbers, effect sizes, and confidence intervals from the abstract.**
**Use clinical terminology appropriate for a {user_context}.**
**Highlight what's novel and what confirms existing knowledge.**"""

        return base_prompt

    def _get_clinical_system_message(self, user_context: str) -> str:
        """Get system message tailored to user context"""

        system_messages = {
            "clinician": """You are an expert clinical researcher analyzing papers for practicing physicians.
Focus on:
1. Clinical applicability and patient impact
2. Evidence strength for decision-making
3. Practical implementation in clinical workflow
4. Risk-benefit analysis for patients
5. Immediate vs. future clinical implications
Be evidence-based, practical, and action-oriented.""",

            "researcher": """You are a senior research scientist analyzing papers for academic researchers.
Focus on:
1. Methodological rigor and innovation
2. Statistical analysis quality
3. Biological/mechanistic insights
4. Contribution to field knowledge
5. Research gaps and future directions
Be critical, detailed, and forward-looking.""",

            "student": """You are a medical educator explaining papers to students.
Focus on:
1. Clear, simplified explanations
2. Key learning points
3. Clinical relevance context
4. Foundational concepts
5. Study design basics
Be educational, structured, and encouraging.""",

            "administrator": """You are a healthcare administrator analyzing papers for system leaders.
Focus on:
1. Cost-effectiveness and ROI
2. Implementation feasibility
3. Workflow integration
4. Resource requirements
5. Regulatory/compliance aspects
Be practical, data-driven, and strategic."""
        }

        return system_messages.get(
            user_context,
            """You are a medical research expert analyzing papers.
Provide comprehensive, evidence-based analyses that are:
1. Accurate and precise
2. Well-structured and clear
3. Clinically relevant
4. Transparent about evidence quality
5. Actionable for different stakeholders"""
        )

    def _extract_enhanced_sections(self, summary: str) -> Dict[str, str]:
        """Extract structured sections from enhanced summary"""

        sections = {
            "clinical_bottom_line": "",
            "study_design_methodology": "",
            "key_findings": "",
            "clinical_implications": "",
            "limitations_cautions": "",
            "research_implications": "",
            "clinical_recommendations": ""
        }

        # Try to extract sections by headings
        section_patterns = {
            "clinical_bottom_line": [
                r"clinical bottom line", r"takeaway", r"key message",
                r"🎯", r"bottom line"
            ],
            "study_design_methodology": [
                r"study design", r"methodology", r"methods",
                r"experimental design", r"📊"
            ],
            "key_findings": [
                r"key findings", r"results", r"findings",
                r"outcomes", r"📈"
            ],
            "clinical_implications": [
                r"clinical implications", r"clinical relevance",
                r"practice implications", r"🏥"
            ],
            "limitations_cautions": [
                r"limitations", r"cautions", r"weaknesses",
                r"biases", r"⚠️"
            ],
            "research_implications": [
                r"research implications", r"future research",
                r"research directions", r"🔬"
            ],
            "clinical_recommendations": [
                r"clinical recommendations", r"recommendations",
                r"action items", r"💡"
            ]
        }

        lines = summary.split('\n')
        current_section = None
        section_content = []

        for line in lines:
            line_lower = line.lower().strip()

            # Check for new section
            for section, patterns in section_patterns.items():
                if any(re.search(pattern, line_lower) for pattern in patterns):
                    # Save previous section
                    if current_section and section_content:
                        sections[current_section] = '\n'.join(section_content)
                    # Start new section
                    current_section = section
                    section_content = []
                    break
            else:
                # Add content to current section if not empty
                if current_section and line.strip():
                    section_content.append(line)

        # Save last section
        if current_section and section_content:
            sections[current_section] = '\n'.join(section_content)

        return sections

    def _extract_paper_metrics(self, abstract: str) -> Dict[str, Any]:
        """Extract key clinical metrics from paper abstract"""

        metrics = {
            "sample_size": None,
            "statistical_significance": [],
            "effect_sizes": [],
            "confidence_intervals": [],
            "adverse_events": [],
            "follow_up": None
        }

        abstract_lower = abstract.lower()

        # Extract sample size
        sample_patterns = [
            r'n\s*=\s*(\d+,?\d*)',
            r'sample of (\d+,?\d*)',
            r'(\d+,?\d*)\s*participants',
            r'(\d+,?\d*)\s*patients',
            r'(\d+,?\d*)\s*subjects'
        ]
        for pattern in sample_patterns:
            match = re.search(pattern, abstract_lower)
            if match:
                metrics["sample_size"] = match.group(1).replace(',', '')
                break

        # Extract p-values
        p_value_matches = re.findall(
            r'p\s*[<≤=]\s*0?\.\d+(?:e[+-]?\d+)?',
            abstract_lower,
            re.IGNORECASE
        )
        metrics["statistical_significance"] = p_value_matches[:5]

        # Extract effect sizes
        effect_patterns = [
            (r'HR\s*[=]\s*[\d\.]+', "Hazard Ratio"),
            (r'OR\s*[=]\s*[\d\.]+', "Odds Ratio"),
            (r'RR\s*[=]\s*[\d\.]+', "Relative Risk"),
            (r'ARR\s*[=]\s*[\d\.]+%?', "Absolute Risk Reduction"),
            (r'NNT\s*[=]\s*[\d\.]+', "Number Needed to Treat")
        ]
        for pattern, label in effect_patterns:
            matches = re.findall(pattern, abstract_lower, re.IGNORECASE)
            for match in matches:
                metrics["effect_sizes"].append(f"{label}: {match}")

        # Extract confidence intervals
        ci_matches = re.findall(
            r'\d+\.?\d*%\s*CI\s*[\[\(].*?[\]\)]',
            abstract_lower,
            re.IGNORECASE
        )
        metrics["confidence_intervals"] = ci_matches[:3]

        # Extract follow-up duration (lowercase 'fu' to match the lowercased abstract)
        follow_up_matches = re.findall(
            r'(\d+(?:\.\d+)?)\s*(?:year|month|week|day)s?\s*(?:follow-up|follow up|fu)',
            abstract_lower
        )
        if follow_up_matches:
            metrics["follow_up"] = follow_up_matches[0]

        # Extract adverse events
        ae_keywords = ['adverse event', 'side effect', 'complication', 'toxicity', 'safety']
        for keyword in ae_keywords:
            if keyword in abstract_lower:
                metrics["adverse_events"].append(keyword)

        # Count metric richness
        metrics["metric_richness"] = sum(
            1 for key in ['sample_size', 'follow_up'] if metrics[key] is not None
        ) + sum(len(metrics[key]) for key in [
            'statistical_significance', 'effect_sizes',
            'confidence_intervals', 'adverse_events'
        ])

        return metrics

    def _assess_paper_clinical_relevance(self, abstract: str, title: str,
                                         user_context: str) -> Dict[str, Any]:
        """Assess clinical relevance of paper for specific user context"""

        # Check for clinical endpoints
        clinical_keywords = {
            "high_impact": ['survival', 'mortality', 'cure', 'prevention', 'morbidity'],
            "medium_impact": ['symptom', 'recovery', 'function', 'quality of life', 'qol'],
            "low_impact": ['feasibility', 'pilot', 'mechanism', 'proof of concept'],
            "clinical_study": ['trial', 'cohort', 'case-control', 'observational', 'randomized']
        }

        abstract_lower = abstract.lower()
        title_lower = title.lower()

        # Calculate relevance score
        score = 0

        # Impact level
        for keyword in clinical_keywords["high_impact"]:
            if keyword in abstract_lower or keyword in title_lower:
                score += 3
        for keyword in clinical_keywords["medium_impact"]:
            if keyword in abstract_lower or keyword in title_lower:
                score += 2
        for keyword in clinical_keywords["low_impact"]:
            if keyword in abstract_lower or keyword in title_lower:
                score += 1

        # Study design
        for keyword in clinical_keywords["clinical_study"]:
            if keyword in abstract_lower:
                score += 2

        # Adjust for user context
        context_multipliers = {
            "clinician": 1.3,
            "researcher": 1.1,
            "student": 1.0,
            "administrator": 1.2,
            "general": 1.0
        }
        score = min(10, score * context_multipliers.get(user_context, 1.0))

        # Determine relevance level
        if score >= 8:
            relevance_level = "High"
            applicability = "Ready for clinical consideration"
        elif score >= 5:
            relevance_level = "Medium"
            applicability = "Promising but requires validation"
        elif score >= 3:
            relevance_level = "Low"
            applicability = "Preliminary evidence"
        else:
            relevance_level = "Very Low"
            applicability = "Primarily theoretical/research"

        # Check study design (keywords lowercased to match abstract_lower)
        study_design = "Unknown"
        for design in ['randomized controlled trial', 'rct', 'prospective cohort',
                       'retrospective cohort', 'case-control', 'systematic review']:
            if design in abstract_lower:
                study_design = design
                break

        return {
            "clinical_impact": relevance_level,
            "score": round(score, 1),
            "applicability": applicability,
            "study_design": study_design,
            "key_strengths": self._identify_strengths(abstract_lower),
            "main_limitations": self._identify_limitations(abstract_lower)
        }

    def _identify_strengths(self, abstract: str) -> List[str]:
        """Identify study strengths from abstract"""
        strengths = []

        if 'randomized' in abstract or 'rct' in abstract:
            strengths.append("Randomized controlled trial design")
        if 'prospective' in abstract:
            strengths.append("Prospective design")
        if 'multicenter' in abstract or 'multi-center' in abstract:
            strengths.append("Multi-center study")
        if 'large sample' in abstract or 'n > 1000' in abstract:
            strengths.append("Large sample size")
        if 'long-term' in abstract:
            strengths.append("Long-term follow-up")
        if 'blinded' in abstract:
            strengths.append("Blinded assessment")

        return strengths[:3] if strengths else ["Standard study design"]

    def _identify_limitations(self, abstract: str) -> List[str]:
        """Identify study limitations from abstract"""
        limitations = []

        limitation_phrases = [
            'limitation', 'limited by', 'caution', 'constraint',
            'small sample', 'retrospective', 'single center', 'short-term',
            'observational', 'cannot determine', 'further research',
            'larger studies', 'validate'
        ]

        sentences = re.split(r'[.!?]+', abstract)
        for sentence in sentences:
            if any(phrase in sentence.lower() for phrase in limitation_phrases):
                limitations.append(sentence.strip())

        return limitations[:3] if limitations else ["Standard study limitations apply"]

    def _calculate_clinical_confidence(self, paper: Dict, metrics: Dict,
                                       relevance: Dict) -> float:
        """Calculate confidence score for clinical paper summary"""
        confidence = 0.5  # Base confidence

        # Abstract quality
        abstract = paper.get('abstract', '')
        if len(abstract) > 800:
            confidence += 0.2
        elif len(abstract) > 400:
            confidence += 0.1

        # Source reliability
        source = paper.get('source', '').lower()
        if any(journal in source for journal in ['nejm', 'lancet', 'jama', 'bmj']):
            confidence += 0.15
        elif 'pubmed' in source:
            confidence += 0.1
        elif 'arxiv' in source:
            confidence += 0.05

        # Recency
        if paper.get('publication_date'):
            try:
                pub_year = int(str(paper['publication_date'])[:4])
                current_year = datetime.now().year
                if current_year - pub_year <= 2:
                    confidence += 0.1
                elif current_year - pub_year <= 5:
                    confidence += 0.05
            except (ValueError, TypeError):
                pass

        # Citations
        citations = paper.get('citations', 0)
        if citations > 100:
            confidence += 0.05
        elif citations > 20:
            confidence += 0.03

        # Metric richness
        metric_score = metrics.get("metric_richness", 0)
        confidence += min(0.1, metric_score * 0.02)

        # Clinical relevance
        relevance_score = relevance.get("score", 0)
        confidence += min(0.1, relevance_score * 0.01)

        return min(1.0, max(0.3, confidence))

    def _generate_quick_clinical_summary(self, title: str, abstract: str,
                                         user_context: str) -> str:
        """Generate a quick 2-sentence clinical summary"""

        prompt = f"""Create a 2-sentence clinical summary of this paper for a {user_context}:

Title: {title}
Key content: {abstract[:800]}

First sentence: Main clinical finding.
Second sentence: Clinical implication for {user_context}.

Be extremely concise and action-oriented."""

        try:
            summary = self.llm.generate(prompt, max_tokens=150)
            return summary.strip()
        except Exception:
            # Fallback
            title_snippet = title.split(':')[0] if ':' in title else title[:50]
            return f"""1. Study shows promising results in {title_snippet}.
2. Consider for {self._infer_application(abstract, user_context)}."""

    def _infer_application(self, abstract: str, user_context: str) -> str:
        """Infer clinical application from abstract"""
        abstract_lower = abstract.lower()

        if user_context == "clinician":
            if 'treatment' in abstract_lower:
                return "treatment decisions"
            elif 'diagnosis' in abstract_lower:
                return "diagnostic workup"
            elif 'screening' in abstract_lower:
                return "screening protocols"
            else:
                return "clinical consideration"
        elif user_context == "researcher":
            if 'mechanism' in abstract_lower:
                return "mechanistic studies"
            elif 'novel' in abstract_lower:
                return "innovation validation"
            else:
                return "further research"

        return "appropriate applications"

    def _generate_fallback_summary(self, paper: Dict, query: str,
                                   user_context: str, error: str) -> Dict[str, Any]:
        """Generate fallback summary when enhanced summary fails"""
        title = paper.get('title', '')
        abstract = paper.get('abstract', '')

        return {
            "success": False,
            "paper_title": title,
            "error": f"Enhanced summarization failed: {error}",
            "enhanced_summary": (
                f"# 📄 Basic Paper Summary\n\n"
                f"**Title:** {title}\n\n"
                f"**Abstract:**\n{abstract[:1500]}...\n\n"
                f"*Note: Enhanced clinical analysis unavailable. Please try again.*"
            ),
            "quick_summary": f"Basic summary of {title[:50]}...",
            "user_context": user_context,
            "query_context": query,
            "confidence": 0.4
        }

    def generate_quick_summary(self, paper: Dict[str, Any],
                               user_context: str = "general") -> str:
        """Generate a quick clinical summary (legacy method for compatibility)"""
        return self._generate_quick_clinical_summary(
            paper.get('title', ''),
            paper.get('abstract', ''),
            user_context
        )

    def summarize_multiple_papers(self, papers: List[Dict], query: Optional[str] = None,
                                  user_context: str = "general") -> Dict[str, Any]:
        """Generate comparative summary of multiple papers"""

        if not papers:
            return {"error": "No papers provided"}

        summaries = []
        for paper in papers[:5]:  # Limit to 5 papers
            summary = self.summarize_paper(paper, query, user_context)
            summaries.append(summary)

        # Generate comparative analysis
        comparative = self._generate_comparative_analysis(summaries, user_context)

        return {
            "success": True,
            "paper_count": len(papers),
            "individual_summaries": summaries,
            "comparative_analysis": comparative,
            "user_context": user_context
        }

    def _generate_comparative_analysis(self, summaries: List[Dict],
                                       user_context: str) -> str:
        """Generate comparative analysis of multiple papers"""

        if len(summaries) < 2:
            return "Single paper analysis only"

        prompt = f"""Compare these {len(summaries)} papers for a {user_context}:

"""
        for i, summary in enumerate(summaries, 1):
            prompt += f"""Paper {i}: {summary.get('paper_title', 'Unknown')}
Clinical Impact: {summary.get('clinical_relevance', {}).get('clinical_impact', 'Unknown')}
Key Finding: {summary.get('quick_summary', '')[:100]}

"""

        prompt += f"""Provide a comparative analysis focusing on:
1. Consistency of findings
2. Evidence strength across papers
3. Clinical implications for {user_context}
4. Research gaps identified

Format as a concise clinical comparison."""

        try:
            return self.llm.generate(prompt, max_tokens=800)
        except Exception:
            return "Comparative analysis unavailable"
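

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module API).
# Assumes `llm.llm_provider.GrokLLM` is importable and configured with valid
# credentials for the default "gpt-oss-120b" model; the paper dict below is a
# hypothetical example that simply mirrors the fields summarize_paper() reads
# (id, title, abstract, authors, publication_date, source, citations).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_paper = {
        "id": "example-001",  # hypothetical identifier
        "title": "Example RCT of Drug X in Heart Failure",
        "abstract": (
            "In this randomized controlled trial (n=1,204), Drug X reduced "
            "all-cause mortality (HR = 0.78, 95% CI [0.65, 0.93], p < 0.01) "
            "over 2 years of follow-up. Adverse events were comparable between arms."
        ),
        "authors": ["A. Author", "B. Author", "C. Author", "D. Author"],
        "publication_date": "2024-05-01",
        "source": "PubMed",
        "citations": 42,
    }

    summarizer = SinglePaperSummarizer()
    result = summarizer.summarize_paper(
        example_paper,
        query="Does Drug X improve survival?",
        user_context="clinician",
    )

    # Print a few returned fields; the keys present depend on whether the LLM
    # call succeeded or the fallback summary was returned.
    print(result.get("quick_summary", ""))
    print(result.get("key_metrics", {}))
    print(result.get("clinical_relevance", {}))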