# MedSearchPro / chat / gap_analyzer.py
# chat/gap_analyzer.py
"""
Advanced research gap identification and opportunity analysis
Identifies under-explored areas and future research directions
"""
from typing import List, Dict, Any, Tuple
from llm.llm_provider import GrokLLM
from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter
from datetime import datetime
import re
class ResearchGapAnalyzer:
    """
    Advanced analyzer that identifies research gaps and opportunities
    across multiple papers in a domain.

    The comprehensive narrative analysis is delegated to an LLM; the private
    helper methods are deterministic keyword/regex heuristics, so structured
    output can still be produced when the LLM call fails (see
    _create_fallback_gap_analysis).
    """

    # Keyword maps for the coverage / trend heuristics. Declared once at
    # class level instead of being rebuilt inside the per-paper loops.
    _MEDICAL_SUBTOPICS = {
        'diagnosis': ['diagnosis', 'detection', 'classification', 'screening'],
        'treatment': ['treatment', 'therapy', 'intervention', 'management'],
        'prognosis': ['prognosis', 'prediction', 'outcome', 'survival'],
        'prevention': ['prevention', 'risk', 'screening', 'early detection'],
        'mechanism': ['mechanism', 'pathway', 'biology', 'molecular']
    }
    _METHOD_KEYWORDS = {
        'deep_learning': ['deep learning', 'neural network', 'cnn', 'transformer', 'lstm'],
        'machine_learning': ['machine learning', 'random forest', 'svm', 'xgboost', 'clustering'],
        'statistical': ['statistical', 'regression', 'correlation', 'anova', 'hypothesis'],
        'clinical_trial': ['clinical trial', 'randomized', 'controlled study', 'cohort'],
        'review': ['review', 'systematic review', 'meta-analysis', 'literature review']
    }

    def __init__(self, llm=None):
        """
        Args:
            llm: optional shared LLM client; a GrokLLM is created when omitted.
        """
        # NOTE(review): "model" looks like a placeholder model id — confirm
        # the intended Grok model name.
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def analyze_gaps(self, papers: List[Dict], domain: str, time_frame: str = "recent") -> Dict[str, Any]:
        """
        Analyze research papers to identify gaps and opportunities.

        Args:
            papers: paper dicts (heuristics read the 'title' and 'abstract' keys).
            domain: research domain label, echoed into the prompt and result.
            time_frame: free-text time-frame hint passed to the prompt.

        Returns:
            Dict with the LLM narrative plus structured gap / coverage / trend
            data; on any LLM failure a heuristic-only fallback result is
            returned instead (marked with "fallback_used").
        """
        print(f"πŸ” Analyzing research gaps in {domain} from {len(papers)} papers")
        # Bound prompt size: keep only the first 20 papers, which are
        # assumed to be pre-sorted by relevance by the caller.
        if len(papers) > 20:
            print(f"πŸ”„ Filtering {len(papers)} papers to top 20 most relevant")
            papers = papers[:20]
        try:
            # Generate comprehensive gap analysis via the LLM
            gap_prompt = self.prompts.research_gap_analysis(papers, domain, time_frame)
            response = self.llm.generate(
                gap_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.2,  # Slightly higher for creative gap identification
                max_tokens=3500
            )
            # Enrich the narrative with deterministic structured extractions
            structured_gaps = self._extract_structured_gaps(response)
            domain_coverage = self._analyze_domain_coverage(papers, domain)
            methodological_trends = self._analyze_methodological_trends(papers)
            return {
                "comprehensive_analysis": response,
                "structured_gaps": structured_gaps,
                "domain_coverage": domain_coverage,
                "methodological_trends": methodological_trends,
                "domain": domain,
                "time_frame": time_frame,
                "papers_analyzed": len(papers),
                "analysis_timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            # Best-effort degradation: log and fall back to heuristics only
            print(f"❌ Gap analysis error: {e}")
            return self._create_fallback_gap_analysis(papers, domain, time_frame)

    def _extract_structured_gaps(self, analysis: str) -> List[Dict[str, Any]]:
        """Extract structured gap records from the LLM's free-text analysis.

        Finds sentences matching gap-indicating regex patterns and enriches
        each with a type, priority, candidate research questions and a
        potential-impact rating.
        """
        gaps: List[Dict[str, Any]] = []
        # Patterns for different types of gaps; each match is a sentence
        # (anything up to the next ., ! or ?)
        gap_patterns = [
            # Under-explored areas
            r"(?:under.explored|understudied|rarely studied|less explored)[^.!?]*[.!?]",
            # Methodological gaps
            r"(?:methodological|methodology)[^.!?]*(?:gap|limitation|challenge)[^.!?]*[.!?]",
            # Data gaps
            r"(?:data|dataset)[^.!?]*(?:lack|missing|limited|insufficient)[^.!?]*[.!?]",
            # Clinical gaps
            r"(?:clinical|translation)[^.!?]*(?:gap|barrier|challenge)[^.!?]*[.!?]"
        ]
        # Extract gap sentences
        gap_sentences: List[str] = []
        for pattern in gap_patterns:
            gap_sentences.extend(re.findall(pattern, analysis, re.IGNORECASE))
        # Categorize and structure gaps (capped at 15 to bound output size)
        for sentence in gap_sentences[:15]:
            gaps.append({
                "description": sentence.strip(),
                "type": self._categorize_gap(sentence),
                "priority": self._assess_gap_priority(sentence),
                "research_questions": self._generate_research_questions(sentence),
                "potential_impact": self._assess_potential_impact(sentence)
            })
        return gaps

    def _categorize_gap(self, gap_sentence: str) -> str:
        """Classify a gap sentence into a coarse type via keyword matching.

        First matching bucket wins; order is methodological > data >
        clinical_translation > theoretical > general.
        """
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['method', 'approach', 'technique', 'algorithm']):
            return "methodological"
        elif any(term in gap_sentence for term in ['data', 'dataset', 'sample', 'population']):
            return "data"
        elif any(term in gap_sentence for term in ['clinical', 'patient', 'treatment', 'diagnosis']):
            return "clinical_translation"
        elif any(term in gap_sentence for term in ['theory', 'concept', 'framework', 'model']):
            return "theoretical"
        else:
            return "general"

    def _assess_gap_priority(self, gap_sentence: str) -> str:
        """Map urgency keywords in a gap sentence to high/medium/low priority."""
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['critical', 'urgent', 'essential', 'fundamental']):
            return "high"
        elif any(term in gap_sentence for term in ['important', 'significant', 'major']):
            return "medium"
        else:
            return "low"

    def _generate_research_questions(self, gap_sentence: str) -> List[str]:
        """Generate up to five canned research questions for a gap sentence.

        Topic-specific questions (method / data / clinical) come first,
        followed by general ones; the list is truncated to five.
        """
        questions: List[str] = []
        gap_sentence = gap_sentence.lower()
        # Simple question generation based on gap type
        if 'method' in gap_sentence:
            questions.extend([
                "What novel methods could address this limitation?",
                "How can existing methods be improved for this application?",
                "What comparative evaluation is needed for different approaches?"
            ])
        if 'data' in gap_sentence:
            questions.extend([
                "What new datasets need to be collected or created?",
                "How can data limitations be overcome through augmentation or synthesis?",
                "What data sharing initiatives would benefit this area?"
            ])
        if 'clinical' in gap_sentence:
            questions.extend([
                "What clinical validation studies are needed?",
                "How can these findings be translated to clinical practice?",
                "What are the barriers to clinical adoption and how can they be addressed?"
            ])
        # Add general research questions
        questions.extend([
            "What experimental design would best address this gap?",
            "How can interdisciplinary approaches contribute to solving this?",
            "What metrics should be used to evaluate progress in this area?"
        ])
        return questions[:5]  # Limit to 5 questions

    def _assess_potential_impact(self, gap_sentence: str) -> str:
        """Rate the potential impact of closing a gap from its wording."""
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['transform', 'revolutionize', 'breakthrough', 'paradigm']):
            return "transformative"
        elif any(term in gap_sentence for term in ['significant', 'major', 'substantial', 'important']):
            return "high"
        elif any(term in gap_sentence for term in ['moderate', 'valuable', 'useful']):
            return "medium"
        else:
            return "low"

    def _analyze_domain_coverage(self, papers: List[Dict], domain: str) -> Dict[str, Any]:
        """Count sub-topic coverage and flag under-represented sub-topics.

        A sub-topic counts as covered by a paper when any of its keywords
        appears in the paper's title or abstract. Sub-topics covered by less
        than 10% of papers are reported as coverage gaps.

        Note: `domain` is currently unused; kept for interface stability.
        """
        sub_topics: Dict[str, int] = {}
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            for subtopic, keywords in self._MEDICAL_SUBTOPICS.items():
                if any(keyword in text for keyword in keywords):
                    sub_topics[subtopic] = sub_topics.get(subtopic, 0) + 1
        # Identify coverage gaps. sub_topics is empty when papers is empty,
        # so the division below never runs with total_papers == 0.
        total_papers = len(papers)
        coverage_gaps = []
        for subtopic, count in sub_topics.items():
            coverage_ratio = count / total_papers
            if coverage_ratio < 0.1:  # Less than 10% coverage
                coverage_gaps.append({
                    'subtopic': subtopic,
                    'coverage': f"{count}/{total_papers} papers",
                    'coverage_ratio': coverage_ratio,
                    'priority': 'high' if coverage_ratio < 0.05 else 'medium'
                })
        return {
            'subtopic_distribution': sub_topics,
            'coverage_gaps': coverage_gaps,
            'total_subtopics_covered': len(sub_topics)
        }

    def _analyze_methodological_trends(self, papers: List[Dict]) -> Dict[str, Any]:
        """Tally method families per paper and flag single-method dominance.

        A paper may match (and be counted under) several method families; a
        paper matching none is counted as 'other'. A family used by over 60%
        of papers is reported as a methodological bias.
        """
        methods = {
            'deep_learning': 0,
            'machine_learning': 0,
            'statistical': 0,
            'clinical_trial': 0,
            'review': 0,
            'other': 0
        }
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            method_found = False
            for method_type, keywords in self._METHOD_KEYWORDS.items():
                if any(keyword in text for keyword in keywords):
                    methods[method_type] += 1
                    method_found = True
            if not method_found:
                methods['other'] += 1
        # Identify methodological biases; the emptiness guard fixes a
        # ZeroDivisionError previously raised for an empty paper list.
        total_papers = len(papers)
        methodological_biases = []
        if total_papers:
            for method_type, count in methods.items():
                if method_type != 'other':
                    ratio = count / total_papers
                    if ratio > 0.6:  # Over 60% dominance
                        methodological_biases.append({
                            'method': method_type,
                            'dominance': f"{count}/{total_papers} papers",
                            'ratio': ratio,
                            'concern': 'high' if ratio > 0.8 else 'medium'
                        })
        return {
            'method_distribution': methods,
            'methodological_biases': methodological_biases,
            'most_common_method': max(methods, key=methods.get)
        }

    def _create_fallback_gap_analysis(self, papers: List[Dict], domain: str, time_frame: str) -> Dict[str, Any]:
        """Build a heuristic-only result when the LLM analysis fails.

        Mirrors the analyze_gaps result shape and adds "fallback_used": True.
        """
        print("πŸ”„ Using fallback gap analysis method")
        domain_coverage = self._analyze_domain_coverage(papers, domain)
        methodological_trends = self._analyze_methodological_trends(papers)
        basic_analysis = f"""
Basic Gap Analysis for: {domain}
Time Frame: {time_frame}
Papers Analyzed: {len(papers)}
Domain Coverage:
- Subtopics covered: {domain_coverage['total_subtopics_covered']}
- Coverage gaps: {len(domain_coverage['coverage_gaps'])}
Methodological Trends:
- Most common method: {methodological_trends['most_common_method']}
- Methodological biases: {len(methodological_trends['methodological_biases'])}
Note: Detailed AI analysis unavailable. Consider more specific search terms.
"""
        return {
            "comprehensive_analysis": basic_analysis,
            "structured_gaps": [],
            "domain_coverage": domain_coverage,
            "methodological_trends": methodological_trends,
            "domain": domain,
            "time_frame": time_frame,
            "papers_analyzed": len(papers),
            "analysis_timestamp": datetime.now().isoformat(),
            "fallback_used": True
        }

    def generate_gap_summary(self, gap_analysis: Dict[str, Any]) -> str:
        """Render a concise Markdown summary of a gap-analysis result.

        Args:
            gap_analysis: a dict as returned by analyze_gaps (must contain
                'domain' and 'papers_analyzed'; the structured sections are
                optional).

        Returns:
            Markdown string with at most 3 high-priority gaps, 3 coverage
            gaps, and 2 methodological biases.
        """
        structured_gaps = gap_analysis.get('structured_gaps', [])
        domain_coverage = gap_analysis.get('domain_coverage', {})
        methodological_trends = gap_analysis.get('methodological_trends', {})
        summary = f"**Research Gap Summary - {gap_analysis['domain']}**\n\n"
        summary += f"Based on analysis of {gap_analysis['papers_analyzed']} papers:\n\n"
        # Key gaps
        if structured_gaps:
            high_priority_gaps = [gap for gap in structured_gaps if gap['priority'] == 'high']
            summary += f"**High Priority Gaps ({len(high_priority_gaps)}):**\n"
            for gap in high_priority_gaps[:3]:
                summary += f"β€’ {gap['description'][:100]}...\n"
        # Coverage gaps
        coverage_gaps = domain_coverage.get('coverage_gaps', [])
        if coverage_gaps:
            summary += f"\n**Domain Coverage Gaps ({len(coverage_gaps)}):**\n"
            for gap in coverage_gaps[:3]:
                summary += f"β€’ {gap['subtopic']} (only {gap['coverage']})\n"
        # Methodological biases
        methodological_biases = methodological_trends.get('methodological_biases', [])
        if methodological_biases:
            summary += f"\n**Methodological Biases ({len(methodological_biases)}):**\n"
            for bias in methodological_biases[:2]:
                summary += f"β€’ {bias['method']} dominates ({bias['dominance']})\n"
        return summary
# Quick test
def test_gap_analyzer():
"""Test the research gap analyzer"""
print("πŸ§ͺ Testing Research Gap Analyzer")
print("=" * 50)
test_papers = [
{
'title': 'Deep Learning for Alzheimer Diagnosis',
'authors': ['Smith J', 'Johnson A'],
'abstract': 'We apply convolutional neural networks to MRI data for Alzheimer disease diagnosis. Our method achieves 95% accuracy on a dataset of 500 patients.',
'source': 'Nature Medicine',
'domain': 'medical_imaging',
'publication_date': '2024-01-15'
},
{
'title': 'Transformer Networks in Medical Imaging',
'authors': ['Lee K', 'Chen R'],
'abstract': 'This study explores transformer architectures for various medical imaging tasks including classification and segmentation.',
'source': 'IEEE TMI',
'domain': 'medical_imaging',
'publication_date': '2024-02-20'
},
{
'title': 'Review of AI in Radiology',
'authors': ['Brown T', 'Wilson S'],
'abstract': 'Systematic review of artificial intelligence applications in radiology, covering 150 studies from 2010-2023.',
'source': 'Radiology',
'domain': 'medical_imaging',
'publication_date': '2023-12-10'
}
]
analyzer = ResearchGapAnalyzer()
try:
gap_analysis = analyzer.analyze_gaps(
test_papers,
"medical_imaging",
"recent"
)
print(f"βœ… Gap analysis generated successfully")
print(f"πŸ“Š Papers analyzed: {gap_analysis['papers_analyzed']}")
print(f"πŸ” Structured gaps identified: {len(gap_analysis['structured_gaps'])}")
print(f"🎯 Coverage gaps: {len(gap_analysis['domain_coverage']['coverage_gaps'])}")
summary = analyzer.generate_gap_summary(gap_analysis)
print(f"\nπŸ“‹ Gap Summary:\n{summary}")
except Exception as e:
print(f"❌ Gap analysis test failed: {e}")
if __name__ == "__main__":
test_gap_analyzer()