# MedSearchPro / chat / gap_analyzer.py
# chat/gap_analyzer.py
"""
Advanced research gap identification and opportunity analysis
Identifies under-explored areas and future research directions
"""
from typing import List, Dict, Any, Tuple
from llm.llm_provider import GrokLLM
from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter
from datetime import datetime
import re
class ResearchGapAnalyzer:
    """
    Advanced analyzer that identifies research gaps and opportunities
    across multiple papers in a domain.

    The comprehensive narrative analysis is delegated to an LLM; the private
    helper methods are deterministic keyword/regex heuristics, so structured
    output can still be produced when the LLM call fails (see
    _create_fallback_gap_analysis).
    """

    # Keyword maps for the coverage / trend heuristics. Declared once at
    # class level instead of being rebuilt inside the per-paper loops.
    _MEDICAL_SUBTOPICS = {
        'diagnosis': ['diagnosis', 'detection', 'classification', 'screening'],
        'treatment': ['treatment', 'therapy', 'intervention', 'management'],
        'prognosis': ['prognosis', 'prediction', 'outcome', 'survival'],
        'prevention': ['prevention', 'risk', 'screening', 'early detection'],
        'mechanism': ['mechanism', 'pathway', 'biology', 'molecular']
    }
    _METHOD_KEYWORDS = {
        'deep_learning': ['deep learning', 'neural network', 'cnn', 'transformer', 'lstm'],
        'machine_learning': ['machine learning', 'random forest', 'svm', 'xgboost', 'clustering'],
        'statistical': ['statistical', 'regression', 'correlation', 'anova', 'hypothesis'],
        'clinical_trial': ['clinical trial', 'randomized', 'controlled study', 'cohort'],
        'review': ['review', 'systematic review', 'meta-analysis', 'literature review']
    }

    def __init__(self, llm=None):
        """
        Args:
            llm: optional shared LLM client; a GrokLLM is created when omitted.
        """
        # NOTE(review): "model" looks like a placeholder model id — confirm
        # the intended Grok model name.
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def analyze_gaps(self, papers: List[Dict], domain: str, time_frame: str = "recent") -> Dict[str, Any]:
        """
        Analyze research papers to identify gaps and opportunities.

        Args:
            papers: paper dicts (heuristics read the 'title' and 'abstract' keys).
            domain: research domain label, echoed into the prompt and result.
            time_frame: free-text time-frame hint passed to the prompt.

        Returns:
            Dict with the LLM narrative plus structured gap / coverage / trend
            data; on any LLM failure a heuristic-only fallback result is
            returned instead (marked with "fallback_used").
        """
        print(f"πŸ” Analyzing research gaps in {domain} from {len(papers)} papers")
        # Bound prompt size: keep only the first 20 papers, which are
        # assumed to be pre-sorted by relevance by the caller.
        if len(papers) > 20:
            print(f"πŸ”„ Filtering {len(papers)} papers to top 20 most relevant")
            papers = papers[:20]
        try:
            # Generate comprehensive gap analysis via the LLM
            gap_prompt = self.prompts.research_gap_analysis(papers, domain, time_frame)
            response = self.llm.generate(
                gap_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.2,  # Slightly higher for creative gap identification
                max_tokens=3500
            )
            # Enrich the narrative with deterministic structured extractions
            structured_gaps = self._extract_structured_gaps(response)
            domain_coverage = self._analyze_domain_coverage(papers, domain)
            methodological_trends = self._analyze_methodological_trends(papers)
            return {
                "comprehensive_analysis": response,
                "structured_gaps": structured_gaps,
                "domain_coverage": domain_coverage,
                "methodological_trends": methodological_trends,
                "domain": domain,
                "time_frame": time_frame,
                "papers_analyzed": len(papers),
                "analysis_timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            # Best-effort degradation: log and fall back to heuristics only
            print(f"❌ Gap analysis error: {e}")
            return self._create_fallback_gap_analysis(papers, domain, time_frame)

    def _extract_structured_gaps(self, analysis: str) -> List[Dict[str, Any]]:
        """Extract structured gap records from the LLM's free-text analysis.

        Finds sentences matching gap-indicating regex patterns and enriches
        each with a type, priority, candidate research questions and a
        potential-impact rating.
        """
        gaps: List[Dict[str, Any]] = []
        # Patterns for different types of gaps; each match is a sentence
        # (anything up to the next ., ! or ?)
        gap_patterns = [
            # Under-explored areas
            r"(?:under.explored|understudied|rarely studied|less explored)[^.!?]*[.!?]",
            # Methodological gaps
            r"(?:methodological|methodology)[^.!?]*(?:gap|limitation|challenge)[^.!?]*[.!?]",
            # Data gaps
            r"(?:data|dataset)[^.!?]*(?:lack|missing|limited|insufficient)[^.!?]*[.!?]",
            # Clinical gaps
            r"(?:clinical|translation)[^.!?]*(?:gap|barrier|challenge)[^.!?]*[.!?]"
        ]
        # Extract gap sentences
        gap_sentences: List[str] = []
        for pattern in gap_patterns:
            gap_sentences.extend(re.findall(pattern, analysis, re.IGNORECASE))
        # Categorize and structure gaps (capped at 15 to bound output size)
        for sentence in gap_sentences[:15]:
            gaps.append({
                "description": sentence.strip(),
                "type": self._categorize_gap(sentence),
                "priority": self._assess_gap_priority(sentence),
                "research_questions": self._generate_research_questions(sentence),
                "potential_impact": self._assess_potential_impact(sentence)
            })
        return gaps

    def _categorize_gap(self, gap_sentence: str) -> str:
        """Classify a gap sentence into a coarse type via keyword matching.

        First matching bucket wins; order is methodological > data >
        clinical_translation > theoretical > general.
        """
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['method', 'approach', 'technique', 'algorithm']):
            return "methodological"
        elif any(term in gap_sentence for term in ['data', 'dataset', 'sample', 'population']):
            return "data"
        elif any(term in gap_sentence for term in ['clinical', 'patient', 'treatment', 'diagnosis']):
            return "clinical_translation"
        elif any(term in gap_sentence for term in ['theory', 'concept', 'framework', 'model']):
            return "theoretical"
        else:
            return "general"

    def _assess_gap_priority(self, gap_sentence: str) -> str:
        """Map urgency keywords in a gap sentence to high/medium/low priority."""
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['critical', 'urgent', 'essential', 'fundamental']):
            return "high"
        elif any(term in gap_sentence for term in ['important', 'significant', 'major']):
            return "medium"
        else:
            return "low"

    def _generate_research_questions(self, gap_sentence: str) -> List[str]:
        """Generate up to five canned research questions for a gap sentence.

        Topic-specific questions (method / data / clinical) come first,
        followed by general ones; the list is truncated to five.
        """
        questions: List[str] = []
        gap_sentence = gap_sentence.lower()
        # Simple question generation based on gap type
        if 'method' in gap_sentence:
            questions.extend([
                "What novel methods could address this limitation?",
                "How can existing methods be improved for this application?",
                "What comparative evaluation is needed for different approaches?"
            ])
        if 'data' in gap_sentence:
            questions.extend([
                "What new datasets need to be collected or created?",
                "How can data limitations be overcome through augmentation or synthesis?",
                "What data sharing initiatives would benefit this area?"
            ])
        if 'clinical' in gap_sentence:
            questions.extend([
                "What clinical validation studies are needed?",
                "How can these findings be translated to clinical practice?",
                "What are the barriers to clinical adoption and how can they be addressed?"
            ])
        # Add general research questions
        questions.extend([
            "What experimental design would best address this gap?",
            "How can interdisciplinary approaches contribute to solving this?",
            "What metrics should be used to evaluate progress in this area?"
        ])
        return questions[:5]  # Limit to 5 questions

    def _assess_potential_impact(self, gap_sentence: str) -> str:
        """Rate the potential impact of closing a gap from its wording."""
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['transform', 'revolutionize', 'breakthrough', 'paradigm']):
            return "transformative"
        elif any(term in gap_sentence for term in ['significant', 'major', 'substantial', 'important']):
            return "high"
        elif any(term in gap_sentence for term in ['moderate', 'valuable', 'useful']):
            return "medium"
        else:
            return "low"

    def _analyze_domain_coverage(self, papers: List[Dict], domain: str) -> Dict[str, Any]:
        """Count sub-topic coverage and flag under-represented sub-topics.

        A sub-topic counts as covered by a paper when any of its keywords
        appears in the paper's title or abstract. Sub-topics covered by less
        than 10% of papers are reported as coverage gaps.

        Note: `domain` is currently unused; kept for interface stability.
        """
        sub_topics: Dict[str, int] = {}
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            for subtopic, keywords in self._MEDICAL_SUBTOPICS.items():
                if any(keyword in text for keyword in keywords):
                    sub_topics[subtopic] = sub_topics.get(subtopic, 0) + 1
        # Identify coverage gaps. sub_topics is empty when papers is empty,
        # so the division below never runs with total_papers == 0.
        total_papers = len(papers)
        coverage_gaps = []
        for subtopic, count in sub_topics.items():
            coverage_ratio = count / total_papers
            if coverage_ratio < 0.1:  # Less than 10% coverage
                coverage_gaps.append({
                    'subtopic': subtopic,
                    'coverage': f"{count}/{total_papers} papers",
                    'coverage_ratio': coverage_ratio,
                    'priority': 'high' if coverage_ratio < 0.05 else 'medium'
                })
        return {
            'subtopic_distribution': sub_topics,
            'coverage_gaps': coverage_gaps,
            'total_subtopics_covered': len(sub_topics)
        }

    def _analyze_methodological_trends(self, papers: List[Dict]) -> Dict[str, Any]:
        """Tally method families per paper and flag single-method dominance.

        A paper may match (and be counted under) several method families; a
        paper matching none is counted as 'other'. A family used by over 60%
        of papers is reported as a methodological bias.
        """
        methods = {
            'deep_learning': 0,
            'machine_learning': 0,
            'statistical': 0,
            'clinical_trial': 0,
            'review': 0,
            'other': 0
        }
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            method_found = False
            for method_type, keywords in self._METHOD_KEYWORDS.items():
                if any(keyword in text for keyword in keywords):
                    methods[method_type] += 1
                    method_found = True
            if not method_found:
                methods['other'] += 1
        # Identify methodological biases; the emptiness guard fixes a
        # ZeroDivisionError previously raised for an empty paper list.
        total_papers = len(papers)
        methodological_biases = []
        if total_papers:
            for method_type, count in methods.items():
                if method_type != 'other':
                    ratio = count / total_papers
                    if ratio > 0.6:  # Over 60% dominance
                        methodological_biases.append({
                            'method': method_type,
                            'dominance': f"{count}/{total_papers} papers",
                            'ratio': ratio,
                            'concern': 'high' if ratio > 0.8 else 'medium'
                        })
        return {
            'method_distribution': methods,
            'methodological_biases': methodological_biases,
            'most_common_method': max(methods, key=methods.get)
        }

    def _create_fallback_gap_analysis(self, papers: List[Dict], domain: str, time_frame: str) -> Dict[str, Any]:
        """Build a heuristic-only result when the LLM analysis fails.

        Mirrors the analyze_gaps result shape and adds "fallback_used": True.
        """
        print("πŸ”„ Using fallback gap analysis method")
        domain_coverage = self._analyze_domain_coverage(papers, domain)
        methodological_trends = self._analyze_methodological_trends(papers)
        basic_analysis = f"""
Basic Gap Analysis for: {domain}
Time Frame: {time_frame}
Papers Analyzed: {len(papers)}
Domain Coverage:
- Subtopics covered: {domain_coverage['total_subtopics_covered']}
- Coverage gaps: {len(domain_coverage['coverage_gaps'])}
Methodological Trends:
- Most common method: {methodological_trends['most_common_method']}
- Methodological biases: {len(methodological_trends['methodological_biases'])}
Note: Detailed AI analysis unavailable. Consider more specific search terms.
"""
        return {
            "comprehensive_analysis": basic_analysis,
            "structured_gaps": [],
            "domain_coverage": domain_coverage,
            "methodological_trends": methodological_trends,
            "domain": domain,
            "time_frame": time_frame,
            "papers_analyzed": len(papers),
            "analysis_timestamp": datetime.now().isoformat(),
            "fallback_used": True
        }

    def generate_gap_summary(self, gap_analysis: Dict[str, Any]) -> str:
        """Render a concise Markdown summary of a gap-analysis result.

        Args:
            gap_analysis: a dict as returned by analyze_gaps (must contain
                'domain' and 'papers_analyzed'; the structured sections are
                optional).

        Returns:
            Markdown string with at most 3 high-priority gaps, 3 coverage
            gaps, and 2 methodological biases.
        """
        structured_gaps = gap_analysis.get('structured_gaps', [])
        domain_coverage = gap_analysis.get('domain_coverage', {})
        methodological_trends = gap_analysis.get('methodological_trends', {})
        summary = f"**Research Gap Summary - {gap_analysis['domain']}**\n\n"
        summary += f"Based on analysis of {gap_analysis['papers_analyzed']} papers:\n\n"
        # Key gaps
        if structured_gaps:
            high_priority_gaps = [gap for gap in structured_gaps if gap['priority'] == 'high']
            summary += f"**High Priority Gaps ({len(high_priority_gaps)}):**\n"
            for gap in high_priority_gaps[:3]:
                summary += f"β€’ {gap['description'][:100]}...\n"
        # Coverage gaps
        coverage_gaps = domain_coverage.get('coverage_gaps', [])
        if coverage_gaps:
            summary += f"\n**Domain Coverage Gaps ({len(coverage_gaps)}):**\n"
            for gap in coverage_gaps[:3]:
                summary += f"β€’ {gap['subtopic']} (only {gap['coverage']})\n"
        # Methodological biases
        methodological_biases = methodological_trends.get('methodological_biases', [])
        if methodological_biases:
            summary += f"\n**Methodological Biases ({len(methodological_biases)}):**\n"
            for bias in methodological_biases[:2]:
                summary += f"β€’ {bias['method']} dominates ({bias['dominance']})\n"
        return summary
# Quick test
def test_gap_analyzer():
"""Test the research gap analyzer"""
print("πŸ§ͺ Testing Research Gap Analyzer")
print("=" * 50)
test_papers = [
{
'title': 'Deep Learning for Alzheimer Diagnosis',
'authors': ['Smith J', 'Johnson A'],
'abstract': 'We apply convolutional neural networks to MRI data for Alzheimer disease diagnosis. Our method achieves 95% accuracy on a dataset of 500 patients.',
'source': 'Nature Medicine',
'domain': 'medical_imaging',
'publication_date': '2024-01-15'
},
{
'title': 'Transformer Networks in Medical Imaging',
'authors': ['Lee K', 'Chen R'],
'abstract': 'This study explores transformer architectures for various medical imaging tasks including classification and segmentation.',
'source': 'IEEE TMI',
'domain': 'medical_imaging',
'publication_date': '2024-02-20'
},
{
'title': 'Review of AI in Radiology',
'authors': ['Brown T', 'Wilson S'],
'abstract': 'Systematic review of artificial intelligence applications in radiology, covering 150 studies from 2010-2023.',
'source': 'Radiology',
'domain': 'medical_imaging',
'publication_date': '2023-12-10'
}
]
analyzer = ResearchGapAnalyzer()
try:
gap_analysis = analyzer.analyze_gaps(
test_papers,
"medical_imaging",
"recent"
)
print(f"βœ… Gap analysis generated successfully")
print(f"πŸ“Š Papers analyzed: {gap_analysis['papers_analyzed']}")
print(f"πŸ” Structured gaps identified: {len(gap_analysis['structured_gaps'])}")
print(f"🎯 Coverage gaps: {len(gap_analysis['domain_coverage']['coverage_gaps'])}")
summary = analyzer.generate_gap_summary(gap_analysis)
print(f"\nπŸ“‹ Gap Summary:\n{summary}")
except Exception as e:
print(f"❌ Gap analysis test failed: {e}")
if __name__ == "__main__":
test_gap_analyzer()