# chat/single_paper_summarizer.py
from typing import Dict, Any, List, Optional
from llm.llm_provider import GrokLLM
import re
from datetime import datetime
class SinglePaperSummarizer:
"""Enhanced clinical paper summarizer with user context awareness"""
def __init__(self, model: str = "gpt-oss-120b"):
self.llm = GrokLLM(model=model)
    def summarize_paper(self,
                        paper: Dict[str, Any],
                        query: Optional[str] = None,
                        user_context: str = "general") -> Dict[str, Any]:
"""
Generate comprehensive clinical summary of a single paper
Args:
paper: Dictionary with paper metadata
query: Optional user query about the paper
user_context: User context (clinician, researcher, student, administrator, general)
Returns:
Dict with enhanced clinical summary and structured analysis
"""
# Extract paper details
title = paper.get('title', 'Unknown Title')
abstract = paper.get('abstract', '')
authors = paper.get('authors', [])
publication_date = paper.get('publication_date', '')
source = paper.get('source', 'Unknown Source')
citations = paper.get('citations', 0)
paper_id = paper.get('id', '')
print(f"📄 Summarizing paper for {user_context}: {title[:50]}...")
# Format authors
if authors and isinstance(authors, list):
if len(authors) <= 3:
author_str = ', '.join(authors)
else:
author_str = f"{authors[0]} et al. ({len(authors)} authors)"
else:
author_str = "Unknown"
# Create enhanced clinical prompt
prompt = self._create_clinical_summarization_prompt(
title, abstract, author_str, publication_date,
source, citations, query, user_context
)
# Generate enhanced summary
system_msg = self._get_clinical_system_message(user_context)
try:
enhanced_summary = self.llm.generate(
prompt,
system_message=system_msg,
max_tokens=2000
)
# Extract key metrics and sections
key_metrics = self._extract_paper_metrics(abstract)
clinical_relevance = self._assess_paper_clinical_relevance(
abstract, title, user_context
)
structured_analysis = self._extract_enhanced_sections(enhanced_summary)
# Generate quick clinical bottom line
quick_summary = self._generate_quick_clinical_summary(
title, abstract, user_context
)
# Calculate clinical confidence
confidence = self._calculate_clinical_confidence(
paper, key_metrics, clinical_relevance
)
return {
"success": True,
"paper_id": paper_id,
"paper_title": title,
"authors": authors,
"publication_date": publication_date,
"source": source,
"citations": citations,
"enhanced_summary": enhanced_summary,
"quick_summary": quick_summary,
"structured_analysis": structured_analysis,
"key_metrics": key_metrics,
"clinical_relevance": clinical_relevance,
"user_context": user_context,
"query_context": query,
"summary_type": "single_paper_enhanced",
"confidence": confidence,
"summary_length": len(enhanced_summary),
"analysis_timestamp": datetime.now().isoformat()
}
except Exception as e:
print(f"❌ Enhanced summarization failed: {e}")
# Fallback to basic summary
return self._generate_fallback_summary(
paper, query, user_context, str(e)
)
def _create_clinical_summarization_prompt(self, title, abstract, authors, date,
source, citations, query, user_context):
"""Create specialized clinical summarization prompt"""
base_prompt = f"""Create a comprehensive clinical analysis of this research paper for a {user_context}:
**PAPER METADATA:**
- Title: {title}
- Authors: {authors}
- Publication Date: {date}
- Source: {source}
- Citations: {citations if citations else 'Not available'}
**ABSTRACT:**
{abstract}
**USER CONTEXT:** {user_context}"""
if query:
base_prompt += f"""
**SPECIFIC QUESTION:** {query}
Please focus your analysis on answering this specific clinical question about the paper."""
else:
            base_prompt += f"""
**ANALYSIS REQUESTED:**
Provide a comprehensive clinical analysis of this paper tailored to a {user_context}."""
base_prompt += f"""
**STRUCTURE YOUR ANALYSIS FOR A {user_context.upper()}:**
## 🎯 **Clinical Bottom Line** (1-2 sentences)
*What is the single most important clinical takeaway?*
## 📊 **Study Design & Methodology**
- Study type (RCT, cohort, case-control, etc.)
- Sample size and population
- Key interventions/techniques
- Follow-up duration (if applicable)
- Statistical methods
## 📈 **Key Findings with Clinical Data**
- Primary outcomes with effect sizes
- Secondary outcomes
- Statistical significance (p-values, CIs)
- Subgroup analyses
- Adverse events/safety data
## 🏥 **Clinical Implications for {user_context}**
*Tailor this section specifically for a {user_context}:*
- How does this change practice/decision-making?
- Which patients benefit most?
- When should this be implemented?
- What are the immediate applications?
## ⚠️ **Limitations & Cautions**
- Study design limitations
- Population generalizability
- Potential biases
- Conflicts of interest
- Funding sources
## 🔬 **Research Implications**
- Mechanism/biology insights
- Future research directions
- Unanswered questions
- Replication needs
## 💡 **Clinical Recommendations**
*Actionable recommendations for {user_context}:*
1.
2.
3.
**Include specific numbers, effect sizes, and confidence intervals from the abstract.**
**Use clinical terminology appropriate for a {user_context}.**
**Highlight what's novel and what confirms existing knowledge.**"""
return base_prompt
def _get_clinical_system_message(self, user_context: str) -> str:
"""Get system message tailored to user context"""
system_messages = {
"clinician": """You are an expert clinical researcher analyzing papers for practicing physicians.
Focus on:
1. Clinical applicability and patient impact
2. Evidence strength for decision-making
3. Practical implementation in clinical workflow
4. Risk-benefit analysis for patients
5. Immediate vs. future clinical implications
Be evidence-based, practical, and action-oriented.""",
"researcher": """You are a senior research scientist analyzing papers for academic researchers.
Focus on:
1. Methodological rigor and innovation
2. Statistical analysis quality
3. Biological/mechanistic insights
4. Contribution to field knowledge
5. Research gaps and future directions
Be critical, detailed, and forward-looking.""",
"student": """You are a medical educator explaining papers to students.
Focus on:
1. Clear, simplified explanations
2. Key learning points
3. Clinical relevance context
4. Foundational concepts
5. Study design basics
Be educational, structured, and encouraging.""",
"administrator": """You are a healthcare administrator analyzing papers for system leaders.
Focus on:
1. Cost-effectiveness and ROI
2. Implementation feasibility
3. Workflow integration
4. Resource requirements
5. Regulatory/compliance aspects
Be practical, data-driven, and strategic."""
}
return system_messages.get(
user_context,
"""You are a medical research expert analyzing papers.
Provide comprehensive, evidence-based analyses that are:
1. Accurate and precise
2. Well-structured and clear
3. Clinically relevant
4. Transparent about evidence quality
5. Actionable for different stakeholders"""
)
def _extract_enhanced_sections(self, summary: str) -> Dict[str, str]:
"""Extract structured sections from enhanced summary"""
sections = {
"clinical_bottom_line": "",
"study_design_methodology": "",
"key_findings": "",
"clinical_implications": "",
"limitations_cautions": "",
"research_implications": "",
"clinical_recommendations": ""
}
# Try to extract sections by headings
section_patterns = {
"clinical_bottom_line": [
r"clinical bottom line", r"takeaway", r"key message",
r"🎯", r"bottom line"
],
"study_design_methodology": [
r"study design", r"methodology", r"methods",
r"experimental design", r"📊"
],
"key_findings": [
r"key findings", r"results", r"findings",
r"outcomes", r"📈"
],
"clinical_implications": [
r"clinical implications", r"clinical relevance",
r"practice implications", r"🏥"
],
"limitations_cautions": [
r"limitations", r"cautions", r"weaknesses",
r"biases", r"⚠️"
],
"research_implications": [
r"research implications", r"future research",
r"research directions", r"🔬"
],
"clinical_recommendations": [
r"clinical recommendations", r"recommendations",
r"action items", r"💡"
]
}
lines = summary.split('\n')
current_section = None
section_content = []
for line in lines:
line_lower = line.lower().strip()
# Check for new section
for section, patterns in section_patterns.items():
if any(re.search(pattern, line_lower) for pattern in patterns):
# Save previous section
if current_section and section_content:
sections[current_section] = '\n'.join(section_content)
# Start new section
current_section = section
section_content = []
break
else:
# Add content to current section if not empty
if current_section and line.strip():
section_content.append(line)
# Save last section
if current_section and section_content:
sections[current_section] = '\n'.join(section_content)
return sections
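    # Example of the matching behavior: a summary line such as
    # "## 🎯 **Clinical Bottom Line**" followed by prose lands that prose in
    # sections["clinical_bottom_line"]. Headings are matched as regexes against
    # each lowercased line, so either the emoji or the phrase triggers a
    # section switch; the heading line itself is not stored as content.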
def _extract_paper_metrics(self, abstract: str) -> Dict[str, Any]:
"""Extract key clinical metrics from paper abstract"""
metrics = {
"sample_size": None,
"statistical_significance": [],
"effect_sizes": [],
"confidence_intervals": [],
"adverse_events": [],
"follow_up": None
}
abstract_lower = abstract.lower()
# Extract sample size
sample_patterns = [
r'n\s*=\s*(\d+,?\d*)',
r'sample of (\d+,?\d*)',
r'(\d+,?\d*)\s*participants',
r'(\d+,?\d*)\s*patients',
r'(\d+,?\d*)\s*subjects'
]
for pattern in sample_patterns:
match = re.search(pattern, abstract_lower)
if match:
metrics["sample_size"] = match.group(1).replace(',', '')
break
# Extract p-values
p_value_matches = re.findall(
r'p\s*[<≤=]\s*0?\.\d+(?:e[+-]?\d+)?',
abstract_lower,
re.IGNORECASE
)
metrics["statistical_significance"] = p_value_matches[:5]
# Extract effect sizes
effect_patterns = [
(r'HR\s*[=]\s*[\d\.]+', "Hazard Ratio"),
(r'OR\s*[=]\s*[\d\.]+', "Odds Ratio"),
(r'RR\s*[=]\s*[\d\.]+', "Relative Risk"),
(r'ARR\s*[=]\s*[\d\.]+%?', "Absolute Risk Reduction"),
(r'NNT\s*[=]\s*[\d\.]+', "Number Needed to Treat")
]
for pattern, label in effect_patterns:
matches = re.findall(pattern, abstract_lower, re.IGNORECASE)
for match in matches:
metrics["effect_sizes"].append(f"{label}: {match}")
# Extract confidence intervals
ci_matches = re.findall(
r'\d+\.?\d*%\s*CI\s*[\[\(].*?[\]\)]',
abstract_lower,
re.IGNORECASE
)
metrics["confidence_intervals"] = ci_matches[:3]
# Extract follow-up duration
follow_up_matches = re.findall(
r'(\d+(?:\.\d+)?)\s*(?:year|month|week|day)s?\s*(?:follow-up|follow up|FU)',
abstract_lower
)
if follow_up_matches:
metrics["follow_up"] = follow_up_matches[0]
# Extract adverse events
ae_keywords = ['adverse event', 'side effect', 'complication', 'toxicity', 'safety']
for keyword in ae_keywords:
if keyword in abstract_lower:
metrics["adverse_events"].append(keyword)
# Count metric richness
metrics["metric_richness"] = sum(
1 for key in ['sample_size', 'follow_up']
if metrics[key] is not None
) + sum(len(metrics[key]) for key in [
'statistical_significance',
'effect_sizes',
'confidence_intervals',
'adverse_events'
])
return metrics
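    # Illustrative extraction (hypothetical abstract text, not a real paper):
    # for "A total of 1,234 patients ... (HR = 0.72; 95% CI [0.61, 0.85];
    # p < 0.001) over 5 years follow-up", the patterns above yield roughly:
    #   sample_size "1234", effect_sizes ["Hazard Ratio: hr = 0.72"],
    #   statistical_significance ["p < 0.001"],
    #   confidence_intervals ["95% ci [0.61, 0.85]"], follow_up "5",
    #   and metric_richness 5.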
def _assess_paper_clinical_relevance(self, abstract: str, title: str,
user_context: str) -> Dict[str, Any]:
"""Assess clinical relevance of paper for specific user context"""
# Check for clinical endpoints
clinical_keywords = {
"high_impact": ['survival', 'mortality', 'cure', 'prevention', 'morbidity'],
"medium_impact": ['symptom', 'recovery', 'function', 'quality of life', 'qol'],
"low_impact": ['feasibility', 'pilot', 'mechanism', 'proof of concept'],
"clinical_study": ['trial', 'cohort', 'case-control', 'observational', 'randomized']
}
abstract_lower = abstract.lower()
title_lower = title.lower()
# Calculate relevance score
score = 0
# Impact level
for keyword in clinical_keywords["high_impact"]:
if keyword in abstract_lower or keyword in title_lower:
score += 3
for keyword in clinical_keywords["medium_impact"]:
if keyword in abstract_lower or keyword in title_lower:
score += 2
for keyword in clinical_keywords["low_impact"]:
if keyword in abstract_lower or keyword in title_lower:
score += 1
# Study design
for keyword in clinical_keywords["clinical_study"]:
if keyword in abstract_lower:
score += 2
# Adjust for user context
context_multipliers = {
"clinician": 1.3,
"researcher": 1.1,
"student": 1.0,
"administrator": 1.2,
"general": 1.0
}
score = min(10, score * context_multipliers.get(user_context, 1.0))
# Determine relevance level
if score >= 8:
relevance_level = "High"
applicability = "Ready for clinical consideration"
elif score >= 5:
relevance_level = "Medium"
applicability = "Promising but requires validation"
elif score >= 3:
relevance_level = "Low"
applicability = "Preliminary evidence"
else:
relevance_level = "Very Low"
applicability = "Primarily theoretical/research"
# Check study design
study_design = "Unknown"
        for design in ['randomized controlled trial', 'rct', 'prospective cohort',
                       'retrospective cohort', 'case-control', 'systematic review']:
if design in abstract_lower:
study_design = design
break
return {
"clinical_impact": relevance_level,
"score": round(score, 1),
"applicability": applicability,
"study_design": study_design,
"key_strengths": self._identify_strengths(abstract_lower),
"main_limitations": self._identify_limitations(abstract_lower)
}
def _identify_strengths(self, abstract: str) -> List[str]:
"""Identify study strengths from abstract"""
strengths = []
if 'randomized' in abstract or 'rct' in abstract:
strengths.append("Randomized controlled trial design")
if 'prospective' in abstract:
strengths.append("Prospective design")
if 'multicenter' in abstract or 'multi-center' in abstract:
strengths.append("Multi-center study")
if 'large sample' in abstract or 'n > 1000' in abstract:
strengths.append("Large sample size")
if 'long-term' in abstract:
strengths.append("Long-term follow-up")
if 'blinded' in abstract:
strengths.append("Blinded assessment")
return strengths[:3] if strengths else ["Standard study design"]
def _identify_limitations(self, abstract: str) -> List[str]:
"""Identify study limitations from abstract"""
limitations = []
limitation_phrases = [
'limitation', 'limited by', 'caution', 'constraint',
'small sample', 'retrospective', 'single center',
'short-term', 'observational', 'cannot determine',
'further research', 'larger studies', 'validate'
]
sentences = re.split(r'[.!?]+', abstract)
for sentence in sentences:
if any(phrase in sentence.lower() for phrase in limitation_phrases):
limitations.append(sentence.strip())
return limitations[:3] if limitations else ["Standard study limitations apply"]
def _calculate_clinical_confidence(self, paper: Dict,
metrics: Dict,
relevance: Dict) -> float:
"""Calculate confidence score for clinical paper summary"""
confidence = 0.5 # Base confidence
# Abstract quality
abstract = paper.get('abstract', '')
if len(abstract) > 800:
confidence += 0.2
elif len(abstract) > 400:
confidence += 0.1
# Source reliability
source = paper.get('source', '').lower()
if any(journal in source for journal in ['nejm', 'lancet', 'jama', 'bmj']):
confidence += 0.15
elif 'pubmed' in source:
confidence += 0.1
elif 'arxiv' in source:
confidence += 0.05
# Recency
if paper.get('publication_date'):
try:
pub_year = int(str(paper['publication_date'])[:4])
current_year = datetime.now().year
if current_year - pub_year <= 2:
confidence += 0.1
elif current_year - pub_year <= 5:
confidence += 0.05
            except (ValueError, TypeError):
                pass
# Citations
citations = paper.get('citations', 0)
if citations > 100:
confidence += 0.05
elif citations > 20:
confidence += 0.03
# Metric richness
metric_score = metrics.get("metric_richness", 0)
confidence += min(0.1, metric_score * 0.02)
# Clinical relevance
relevance_score = relevance.get("score", 0)
confidence += min(0.1, relevance_score * 0.01)
return min(1.0, max(0.3, confidence))
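    # Worked example (hypothetical inputs): a 900-character abstract (+0.2)
    # in NEJM (+0.15), published within the last two years (+0.1), with 150
    # citations (+0.05), metric_richness 5 (+0.1, capped) and relevance score
    # 8 (+0.08) gives 0.5 + 0.68 = 1.18, clamped to the 1.0 ceiling.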
def _generate_quick_clinical_summary(self, title: str, abstract: str,
user_context: str) -> str:
"""Generate a quick 2-sentence clinical summary"""
prompt = f"""Create a 2-sentence clinical summary of this paper for a {user_context}:
Title: {title}
Key content: {abstract[:800]}
First sentence: Main clinical finding.
Second sentence: Clinical implication for {user_context}.
Be extremely concise and action-oriented."""
try:
summary = self.llm.generate(prompt, max_tokens=150)
return summary.strip()
        except Exception:
            # Fallback when the LLM call fails
title_snippet = title.split(':')[0] if ':' in title else title[:50]
return f"""1. Study shows promising results in {title_snippet}.
2. Consider for {self._infer_application(abstract, user_context)}."""
def _infer_application(self, abstract: str, user_context: str) -> str:
"""Infer clinical application from abstract"""
abstract_lower = abstract.lower()
if user_context == "clinician":
if 'treatment' in abstract_lower:
return "treatment decisions"
elif 'diagnosis' in abstract_lower:
return "diagnostic workup"
elif 'screening' in abstract_lower:
return "screening protocols"
else:
return "clinical consideration"
elif user_context == "researcher":
if 'mechanism' in abstract_lower:
return "mechanistic studies"
elif 'novel' in abstract_lower:
return "innovation validation"
else:
return "further research"
return "appropriate applications"
def _generate_fallback_summary(self, paper: Dict, query: str,
user_context: str, error: str) -> Dict[str, Any]:
"""Generate fallback summary when enhanced summary fails"""
title = paper.get('title', '')
abstract = paper.get('abstract', '')
return {
"success": False,
"paper_title": title,
"error": f"Enhanced summarization failed: {error}",
"enhanced_summary": f"""# 📄 Basic Paper Summary\n\n**Title:** {title}\n\n**Abstract:**\n{abstract[:1500]}...\n\n*Note: Enhanced clinical analysis unavailable. Please try again.*""",
"quick_summary": f"Basic summary of {title[:50]}...",
"user_context": user_context,
"query_context": query,
"confidence": 0.4
}
def generate_quick_summary(self, paper: Dict[str, Any],
user_context: str = "general") -> str:
"""Generate a quick clinical summary (legacy method for compatibility)"""
return self._generate_quick_clinical_summary(
paper.get('title', ''),
paper.get('abstract', ''),
user_context
)
    def summarize_multiple_papers(self, papers: List[Dict],
                                  query: Optional[str] = None,
                                  user_context: str = "general") -> Dict[str, Any]:
"""Generate comparative summary of multiple papers"""
if not papers:
return {"error": "No papers provided"}
summaries = []
for paper in papers[:5]: # Limit to 5 papers
summary = self.summarize_paper(paper, query, user_context)
summaries.append(summary)
# Generate comparative analysis
comparative = self._generate_comparative_analysis(summaries, user_context)
return {
"success": True,
"paper_count": len(papers),
"individual_summaries": summaries,
"comparative_analysis": comparative,
"user_context": user_context
}
def _generate_comparative_analysis(self, summaries: List[Dict],
user_context: str) -> str:
"""Generate comparative analysis of multiple papers"""
if len(summaries) < 2:
return "Single paper analysis only"
prompt = f"""Compare these {len(summaries)} papers for a {user_context}:
"""
for i, summary in enumerate(summaries, 1):
prompt += f"""Paper {i}: {summary.get('paper_title', 'Unknown')}
Clinical Impact: {summary.get('clinical_relevance', {}).get('clinical_impact', 'Unknown')}
Key Finding: {summary.get('quick_summary', '')[:100]}
"""
prompt += f"""Provide a comparative analysis focusing on:
1. Consistency of findings
2. Evidence strength across papers
3. Clinical implications for {user_context}
4. Research gaps identified
Format as a concise clinical comparison."""
try:
return self.llm.generate(prompt, max_tokens=800)
        except Exception:
            return "Comparative analysis unavailable"