Spaces:
Sleeping
Sleeping
| # chat/single_paper_summarizer.py | |
| from typing import Dict, Any, Tuple, List, Optional | |
| from llm.llm_provider import GrokLLM | |
| import re | |
| import json | |
| import statistics | |
| from datetime import datetime | |
class SinglePaperSummarizer:
    """Enhanced clinical paper summarizer with user context awareness.

    Combines LLM-generated narrative summaries with regex-based heuristics
    that mine a paper's abstract for sample sizes, p-values, effect sizes,
    confidence intervals and follow-up duration, then scores clinical
    relevance and overall confidence for a given audience (clinician,
    researcher, student, administrator, or general).
    """

    def __init__(self, model: str = "gpt-oss-120b"):
        """Create a summarizer backed by the given LLM model name.

        NOTE(review): GrokLLM is a project-local provider; assumed to expose
        generate(prompt, system_message=..., max_tokens=...) -> str.
        """
        self.llm = GrokLLM(model=model)

    def summarize_paper(self,
                        paper: Dict[str, Any],
                        query: Optional[str] = None,
                        user_context: str = "general") -> Dict[str, Any]:
        """
        Generate comprehensive clinical summary of a single paper.

        Args:
            paper: Dictionary with paper metadata (title, abstract, authors,
                publication_date, source, citations, id — all optional).
            query: Optional user question to focus the analysis on.
            user_context: User context (clinician, researcher, student,
                administrator, general).

        Returns:
            Dict with enhanced clinical summary and structured analysis;
            on LLM failure, a fallback dict with "success": False.
        """
        # Extract paper details — every field is optional in the input dict
        title = paper.get('title', 'Unknown Title')
        abstract = paper.get('abstract', '')
        authors = paper.get('authors', [])
        publication_date = paper.get('publication_date', '')
        source = paper.get('source', 'Unknown Source')
        citations = paper.get('citations', 0)
        paper_id = paper.get('id', '')

        print(f"📄 Summarizing paper for {user_context}: {title[:50]}...")

        # Format authors: list up to three names, otherwise "First et al. (N authors)"
        if authors and isinstance(authors, list):
            if len(authors) <= 3:
                author_str = ', '.join(authors)
            else:
                author_str = f"{authors[0]} et al. ({len(authors)} authors)"
        else:
            author_str = "Unknown"

        # Create enhanced clinical prompt
        prompt = self._create_clinical_summarization_prompt(
            title, abstract, author_str, publication_date,
            source, citations, query, user_context
        )

        # Generate enhanced summary with a context-specific system message
        system_msg = self._get_clinical_system_message(user_context)
        try:
            enhanced_summary = self.llm.generate(
                prompt,
                system_message=system_msg,
                max_tokens=2000
            )

            # Extract key metrics and sections from the abstract / LLM output
            key_metrics = self._extract_paper_metrics(abstract)
            clinical_relevance = self._assess_paper_clinical_relevance(
                abstract, title, user_context
            )
            structured_analysis = self._extract_enhanced_sections(enhanced_summary)

            # Generate quick clinical bottom line (second, short LLM call)
            quick_summary = self._generate_quick_clinical_summary(
                title, abstract, user_context
            )

            # Calculate clinical confidence from metadata + heuristics
            confidence = self._calculate_clinical_confidence(
                paper, key_metrics, clinical_relevance
            )

            return {
                "success": True,
                "paper_id": paper_id,
                "paper_title": title,
                "authors": authors,
                "publication_date": publication_date,
                "source": source,
                "citations": citations,
                "enhanced_summary": enhanced_summary,
                "quick_summary": quick_summary,
                "structured_analysis": structured_analysis,
                "key_metrics": key_metrics,
                "clinical_relevance": clinical_relevance,
                "user_context": user_context,
                "query_context": query,
                "summary_type": "single_paper_enhanced",
                "confidence": confidence,
                "summary_length": len(enhanced_summary),
                "analysis_timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            print(f"❌ Enhanced summarization failed: {e}")
            # Fallback to basic summary so callers always get a dict back
            return self._generate_fallback_summary(
                paper, query, user_context, str(e)
            )

    def _create_clinical_summarization_prompt(self, title: str, abstract: str,
                                              authors: str, date: str,
                                              source: str, citations: Any,
                                              query: Optional[str],
                                              user_context: str) -> str:
        """Create specialized clinical summarization prompt.

        Builds the metadata header, then either a query-focused section or a
        general analysis request, then the fixed section structure the LLM
        is asked to follow.
        """
        base_prompt = f"""Create a comprehensive clinical analysis of this research paper for a {user_context}:

**PAPER METADATA:**
- Title: {title}
- Authors: {authors}
- Publication Date: {date}
- Source: {source}
- Citations: {citations if citations else 'Not available'}

**ABSTRACT:**
{abstract}

**USER CONTEXT:** {user_context}"""

        if query:
            base_prompt += f"""

**SPECIFIC QUESTION:** {query}

Please focus your analysis on answering this specific clinical question about the paper."""
        else:
            # BUGFIX: this branch was a plain string, so "{user_context}" was
            # emitted literally into the prompt; it must be an f-string.
            base_prompt += f"""

**ANALYSIS REQUESTED:**
Provide a comprehensive clinical analysis of this paper tailored to a {user_context}."""

        base_prompt += f"""

**STRUCTURE YOUR ANALYSIS FOR A {user_context.upper()}:**

## 🎯 **Clinical Bottom Line** (1-2 sentences)
*What is the single most important clinical takeaway?*

## 📊 **Study Design & Methodology**
- Study type (RCT, cohort, case-control, etc.)
- Sample size and population
- Key interventions/techniques
- Follow-up duration (if applicable)
- Statistical methods

## 📈 **Key Findings with Clinical Data**
- Primary outcomes with effect sizes
- Secondary outcomes
- Statistical significance (p-values, CIs)
- Subgroup analyses
- Adverse events/safety data

## 🏥 **Clinical Implications for {user_context}**
*Tailor this section specifically for a {user_context}:*
- How does this change practice/decision-making?
- Which patients benefit most?
- When should this be implemented?
- What are the immediate applications?

## ⚠️ **Limitations & Cautions**
- Study design limitations
- Population generalizability
- Potential biases
- Conflicts of interest
- Funding sources

## 🔬 **Research Implications**
- Mechanism/biology insights
- Future research directions
- Unanswered questions
- Replication needs

## 💡 **Clinical Recommendations**
*Actionable recommendations for {user_context}:*
1.
2.
3.

**Include specific numbers, effect sizes, and confidence intervals from the abstract.**
**Use clinical terminology appropriate for a {user_context}.**
**Highlight what's novel and what confirms existing knowledge.**"""

        return base_prompt

    def _get_clinical_system_message(self, user_context: str) -> str:
        """Get system message tailored to user context.

        Falls back to a generic medical-research-expert message for any
        context not in the mapping (e.g. "general").
        """
        system_messages = {
            "clinician": """You are an expert clinical researcher analyzing papers for practicing physicians.
Focus on:
1. Clinical applicability and patient impact
2. Evidence strength for decision-making
3. Practical implementation in clinical workflow
4. Risk-benefit analysis for patients
5. Immediate vs. future clinical implications
Be evidence-based, practical, and action-oriented.""",
            "researcher": """You are a senior research scientist analyzing papers for academic researchers.
Focus on:
1. Methodological rigor and innovation
2. Statistical analysis quality
3. Biological/mechanistic insights
4. Contribution to field knowledge
5. Research gaps and future directions
Be critical, detailed, and forward-looking.""",
            "student": """You are a medical educator explaining papers to students.
Focus on:
1. Clear, simplified explanations
2. Key learning points
3. Clinical relevance context
4. Foundational concepts
5. Study design basics
Be educational, structured, and encouraging.""",
            "administrator": """You are a healthcare administrator analyzing papers for system leaders.
Focus on:
1. Cost-effectiveness and ROI
2. Implementation feasibility
3. Workflow integration
4. Resource requirements
5. Regulatory/compliance aspects
Be practical, data-driven, and strategic."""
        }
        return system_messages.get(
            user_context,
            """You are a medical research expert analyzing papers.
Provide comprehensive, evidence-based analyses that are:
1. Accurate and precise
2. Well-structured and clear
3. Clinically relevant
4. Transparent about evidence quality
5. Actionable for different stakeholders"""
        )

    def _extract_enhanced_sections(self, summary: str) -> Dict[str, str]:
        """Extract structured sections from enhanced summary.

        Scans the LLM output line by line; a line matching any heading
        pattern starts a new section, and subsequent non-blank lines are
        accumulated as that section's content. Sections never matched stay
        empty strings.
        """
        sections = {
            "clinical_bottom_line": "",
            "study_design_methodology": "",
            "key_findings": "",
            "clinical_implications": "",
            "limitations_cautions": "",
            "research_implications": "",
            "clinical_recommendations": ""
        }

        # Heading patterns (matched case-insensitively against each line);
        # emoji markers mirror the prompt's section headers.
        section_patterns = {
            "clinical_bottom_line": [
                r"clinical bottom line", r"takeaway", r"key message",
                r"🎯", r"bottom line"
            ],
            "study_design_methodology": [
                r"study design", r"methodology", r"methods",
                r"experimental design", r"📊"
            ],
            "key_findings": [
                r"key findings", r"results", r"findings",
                r"outcomes", r"📈"
            ],
            "clinical_implications": [
                r"clinical implications", r"clinical relevance",
                r"practice implications", r"🏥"
            ],
            "limitations_cautions": [
                r"limitations", r"cautions", r"weaknesses",
                r"biases", r"⚠️"
            ],
            "research_implications": [
                r"research implications", r"future research",
                r"research directions", r"🔬"
            ],
            "clinical_recommendations": [
                r"clinical recommendations", r"recommendations",
                r"action items", r"💡"
            ]
        }

        lines = summary.split('\n')
        current_section = None
        section_content = []

        for line in lines:
            line_lower = line.lower().strip()
            # Check whether this line starts a new section
            for section, patterns in section_patterns.items():
                if any(re.search(pattern, line_lower) for pattern in patterns):
                    # Save previous section before switching
                    if current_section and section_content:
                        sections[current_section] = '\n'.join(section_content)
                    # Start new section
                    current_section = section
                    section_content = []
                    break
            else:
                # No heading matched: add content to current section if not empty
                if current_section and line.strip():
                    section_content.append(line)

        # Save last section
        if current_section and section_content:
            sections[current_section] = '\n'.join(section_content)

        return sections

    def _extract_paper_metrics(self, abstract: str) -> Dict[str, Any]:
        """Extract key clinical metrics from paper abstract.

        Pure regex heuristics over the lowercased abstract; all matches are
        best-effort and may miss unconventional phrasing.
        """
        metrics = {
            "sample_size": None,
            "statistical_significance": [],
            "effect_sizes": [],
            "confidence_intervals": [],
            "adverse_events": [],
            "follow_up": None
        }

        abstract_lower = abstract.lower()

        # Extract sample size — first pattern that matches wins
        sample_patterns = [
            r'n\s*=\s*(\d+,?\d*)',
            r'sample of (\d+,?\d*)',
            r'(\d+,?\d*)\s*participants',
            r'(\d+,?\d*)\s*patients',
            r'(\d+,?\d*)\s*subjects'
        ]
        for pattern in sample_patterns:
            match = re.search(pattern, abstract_lower)
            if match:
                metrics["sample_size"] = match.group(1).replace(',', '')
                break

        # Extract p-values (keep at most 5)
        p_value_matches = re.findall(
            r'p\s*[<≤=]\s*0?\.\d+(?:e[+-]?\d+)?',
            abstract_lower,
            re.IGNORECASE
        )
        metrics["statistical_significance"] = p_value_matches[:5]

        # Extract effect sizes, labeled by measure type
        effect_patterns = [
            (r'HR\s*[=]\s*[\d\.]+', "Hazard Ratio"),
            (r'OR\s*[=]\s*[\d\.]+', "Odds Ratio"),
            (r'RR\s*[=]\s*[\d\.]+', "Relative Risk"),
            (r'ARR\s*[=]\s*[\d\.]+%?', "Absolute Risk Reduction"),
            (r'NNT\s*[=]\s*[\d\.]+', "Number Needed to Treat")
        ]
        for pattern, label in effect_patterns:
            matches = re.findall(pattern, abstract_lower, re.IGNORECASE)
            for match in matches:
                metrics["effect_sizes"].append(f"{label}: {match}")

        # Extract confidence intervals (keep at most 3)
        ci_matches = re.findall(
            r'\d+\.?\d*%\s*CI\s*[\[\(].*?[\]\)]',
            abstract_lower,
            re.IGNORECASE
        )
        metrics["confidence_intervals"] = ci_matches[:3]

        # Extract follow-up duration (numeric part only)
        follow_up_matches = re.findall(
            r'(\d+(?:\.\d+)?)\s*(?:year|month|week|day)s?\s*(?:follow-up|follow up|FU)',
            abstract_lower
        )
        if follow_up_matches:
            metrics["follow_up"] = follow_up_matches[0]

        # Flag which adverse-event keywords are mentioned at all
        ae_keywords = ['adverse event', 'side effect', 'complication', 'toxicity', 'safety']
        for keyword in ae_keywords:
            if keyword in abstract_lower:
                metrics["adverse_events"].append(keyword)

        # Count metric richness: scalar fields found + total list entries
        metrics["metric_richness"] = sum(
            1 for key in ['sample_size', 'follow_up']
            if metrics[key] is not None
        ) + sum(len(metrics[key]) for key in [
            'statistical_significance',
            'effect_sizes',
            'confidence_intervals',
            'adverse_events'
        ])

        return metrics

    def _assess_paper_clinical_relevance(self, abstract: str, title: str,
                                         user_context: str) -> Dict[str, Any]:
        """Assess clinical relevance of paper for specific user context.

        Scores keyword hits in title/abstract (capped at 10 after a
        per-context multiplier) and maps the score to a relevance level.
        """
        # Keyword buckets used for scoring
        clinical_keywords = {
            "high_impact": ['survival', 'mortality', 'cure', 'prevention', 'morbidity'],
            "medium_impact": ['symptom', 'recovery', 'function', 'quality of life', 'qol'],
            "low_impact": ['feasibility', 'pilot', 'mechanism', 'proof of concept'],
            "clinical_study": ['trial', 'cohort', 'case-control', 'observational', 'randomized']
        }

        abstract_lower = abstract.lower()
        title_lower = title.lower()

        # Calculate relevance score
        score = 0

        # Impact level: 3 / 2 / 1 points per keyword hit by tier
        for keyword in clinical_keywords["high_impact"]:
            if keyword in abstract_lower or keyword in title_lower:
                score += 3
        for keyword in clinical_keywords["medium_impact"]:
            if keyword in abstract_lower or keyword in title_lower:
                score += 2
        for keyword in clinical_keywords["low_impact"]:
            if keyword in abstract_lower or keyword in title_lower:
                score += 1

        # Study design keywords (abstract only)
        for keyword in clinical_keywords["clinical_study"]:
            if keyword in abstract_lower:
                score += 2

        # Adjust for user context, cap at 10
        context_multipliers = {
            "clinician": 1.3,
            "researcher": 1.1,
            "student": 1.0,
            "administrator": 1.2,
            "general": 1.0
        }
        score = min(10, score * context_multipliers.get(user_context, 1.0))

        # Determine relevance level from the capped score
        if score >= 8:
            relevance_level = "High"
            applicability = "Ready for clinical consideration"
        elif score >= 5:
            relevance_level = "Medium"
            applicability = "Promising but requires validation"
        elif score >= 3:
            relevance_level = "Low"
            applicability = "Preliminary evidence"
        else:
            relevance_level = "Very Low"
            applicability = "Primarily theoretical/research"

        # Identify study design.
        # BUGFIX: candidates are compared against the lowercased abstract, so
        # they must be lowercase themselves ('RCT' could never match before).
        study_design = "Unknown"
        for design in ['randomized controlled trial', 'rct', 'prospective cohort',
                       'retrospective cohort', 'case-control', 'systematic review']:
            if design in abstract_lower:
                study_design = design
                break

        return {
            "clinical_impact": relevance_level,
            "score": round(score, 1),
            "applicability": applicability,
            "study_design": study_design,
            "key_strengths": self._identify_strengths(abstract_lower),
            "main_limitations": self._identify_limitations(abstract_lower)
        }

    def _identify_strengths(self, abstract: str) -> List[str]:
        """Identify study strengths from abstract.

        Expects an already-lowercased abstract; returns at most three
        strengths, or a generic placeholder when none are detected.
        """
        strengths = []
        if 'randomized' in abstract or 'rct' in abstract:
            strengths.append("Randomized controlled trial design")
        if 'prospective' in abstract:
            strengths.append("Prospective design")
        if 'multicenter' in abstract or 'multi-center' in abstract:
            strengths.append("Multi-center study")
        if 'large sample' in abstract or 'n > 1000' in abstract:
            strengths.append("Large sample size")
        if 'long-term' in abstract:
            strengths.append("Long-term follow-up")
        if 'blinded' in abstract:
            strengths.append("Blinded assessment")
        return strengths[:3] if strengths else ["Standard study design"]

    def _identify_limitations(self, abstract: str) -> List[str]:
        """Identify study limitations from abstract.

        Returns up to three sentences containing limitation-signalling
        phrases, or a generic placeholder when none are found.
        """
        limitations = []
        limitation_phrases = [
            'limitation', 'limited by', 'caution', 'constraint',
            'small sample', 'retrospective', 'single center',
            'short-term', 'observational', 'cannot determine',
            'further research', 'larger studies', 'validate'
        ]
        sentences = re.split(r'[.!?]+', abstract)
        for sentence in sentences:
            if any(phrase in sentence.lower() for phrase in limitation_phrases):
                limitations.append(sentence.strip())
        return limitations[:3] if limitations else ["Standard study limitations apply"]

    def _calculate_clinical_confidence(self, paper: Dict,
                                       metrics: Dict,
                                       relevance: Dict) -> float:
        """Calculate confidence score for clinical paper summary.

        Starts at 0.5 and adds bonuses for abstract length, source
        reliability, recency, citations, metric richness and relevance;
        result is clamped to [0.3, 1.0].
        """
        confidence = 0.5  # Base confidence

        # Abstract quality: longer abstracts give the heuristics more to work with
        abstract = paper.get('abstract', '')
        if len(abstract) > 800:
            confidence += 0.2
        elif len(abstract) > 400:
            confidence += 0.1

        # Source reliability: major journals > PubMed > preprints
        source = paper.get('source', '').lower()
        if any(journal in source for journal in ['nejm', 'lancet', 'jama', 'bmj']):
            confidence += 0.15
        elif 'pubmed' in source:
            confidence += 0.1
        elif 'arxiv' in source:
            confidence += 0.05

        # Recency: publication year parsed from the first 4 characters
        if paper.get('publication_date'):
            try:
                pub_year = int(str(paper['publication_date'])[:4])
                current_year = datetime.now().year
                if current_year - pub_year <= 2:
                    confidence += 0.1
                elif current_year - pub_year <= 5:
                    confidence += 0.05
            except (ValueError, TypeError):
                # Unparseable date: simply skip the recency bonus
                pass

        # Citations
        citations = paper.get('citations', 0)
        if citations > 100:
            confidence += 0.05
        elif citations > 20:
            confidence += 0.03

        # Metric richness (capped contribution of 0.1)
        metric_score = metrics.get("metric_richness", 0)
        confidence += min(0.1, metric_score * 0.02)

        # Clinical relevance (capped contribution of 0.1)
        relevance_score = relevance.get("score", 0)
        confidence += min(0.1, relevance_score * 0.01)

        return min(1.0, max(0.3, confidence))

    def _generate_quick_clinical_summary(self, title: str, abstract: str,
                                         user_context: str) -> str:
        """Generate a quick 2-sentence clinical summary via a short LLM call.

        Falls back to a template built from the title when the LLM call fails.
        """
        prompt = f"""Create a 2-sentence clinical summary of this paper for a {user_context}:

Title: {title}
Key content: {abstract[:800]}

First sentence: Main clinical finding.
Second sentence: Clinical implication for {user_context}.
Be extremely concise and action-oriented."""
        try:
            summary = self.llm.generate(prompt, max_tokens=150)
            return summary.strip()
        except Exception:
            # Fallback: derive a snippet from the title (text before ':' if any)
            title_snippet = title.split(':')[0] if ':' in title else title[:50]
            return f"""1. Study shows promising results in {title_snippet}.
2. Consider for {self._infer_application(abstract, user_context)}."""

    def _infer_application(self, abstract: str, user_context: str) -> str:
        """Infer clinical application from abstract keywords, per audience."""
        abstract_lower = abstract.lower()
        if user_context == "clinician":
            if 'treatment' in abstract_lower:
                return "treatment decisions"
            elif 'diagnosis' in abstract_lower:
                return "diagnostic workup"
            elif 'screening' in abstract_lower:
                return "screening protocols"
            else:
                return "clinical consideration"
        elif user_context == "researcher":
            if 'mechanism' in abstract_lower:
                return "mechanistic studies"
            elif 'novel' in abstract_lower:
                return "innovation validation"
            else:
                return "further research"
        return "appropriate applications"

    def _generate_fallback_summary(self, paper: Dict, query: Optional[str],
                                   user_context: str, error: str) -> Dict[str, Any]:
        """Generate fallback summary when enhanced summary fails.

        Returns a minimal dict ("success": False) echoing the raw abstract
        so the caller still has something to display.
        """
        title = paper.get('title', '')
        abstract = paper.get('abstract', '')
        return {
            "success": False,
            "paper_title": title,
            "error": f"Enhanced summarization failed: {error}",
            "enhanced_summary": f"""# 📄 Basic Paper Summary\n\n**Title:** {title}\n\n**Abstract:**\n{abstract[:1500]}...\n\n*Note: Enhanced clinical analysis unavailable. Please try again.*""",
            "quick_summary": f"Basic summary of {title[:50]}...",
            "user_context": user_context,
            "query_context": query,
            "confidence": 0.4
        }

    def generate_quick_summary(self, paper: Dict[str, Any],
                               user_context: str = "general") -> str:
        """Generate a quick clinical summary (legacy method for compatibility)."""
        return self._generate_quick_clinical_summary(
            paper.get('title', ''),
            paper.get('abstract', ''),
            user_context
        )

    def summarize_multiple_papers(self, papers: List[Dict],
                                  query: Optional[str] = None,
                                  user_context: str = "general") -> Dict[str, Any]:
        """Generate comparative summary of multiple papers.

        Summarizes at most the first five papers individually, then produces
        a cross-paper comparative analysis.
        """
        if not papers:
            return {"error": "No papers provided"}

        summaries = []
        for paper in papers[:5]:  # Limit to 5 papers to bound LLM cost
            summary = self.summarize_paper(paper, query, user_context)
            summaries.append(summary)

        # Generate comparative analysis across the individual summaries
        comparative = self._generate_comparative_analysis(summaries, user_context)

        return {
            "success": True,
            "paper_count": len(papers),
            "individual_summaries": summaries,
            "comparative_analysis": comparative,
            "user_context": user_context
        }

    def _generate_comparative_analysis(self, summaries: List[Dict],
                                       user_context: str) -> str:
        """Generate comparative analysis of multiple papers via the LLM."""
        if len(summaries) < 2:
            return "Single paper analysis only"

        prompt = f"""Compare these {len(summaries)} papers for a {user_context}:

"""
        for i, summary in enumerate(summaries, 1):
            prompt += f"""Paper {i}: {summary.get('paper_title', 'Unknown')}
Clinical Impact: {summary.get('clinical_relevance', {}).get('clinical_impact', 'Unknown')}
Key Finding: {summary.get('quick_summary', '')[:100]}

"""
        prompt += f"""Provide a comparative analysis focusing on:
1. Consistency of findings
2. Evidence strength across papers
3. Clinical implications for {user_context}
4. Research gaps identified
Format as a concise clinical comparison."""

        try:
            return self.llm.generate(prompt, max_tokens=800)
        except Exception:
            return "Comparative analysis unavailable"