# chat/gap_analyzer.py
"""
Advanced research gap identification and opportunity analysis
Identifies under-explored areas and future research directions
"""

from typing import List, Dict, Any, Tuple
from llm.llm_provider import GrokLLM
from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter
from datetime import datetime
import re


class ResearchGapAnalyzer:
    """
    Advanced analyzer that identifies research gaps and opportunities
    across multiple papers in a domain.

    Combines an LLM-generated narrative analysis with deterministic
    keyword-based heuristics (sub-topic coverage, methodological trends)
    so that a useful — if shallower — result is still produced when the
    LLM call fails.
    """

    # Sentence-level regexes for spotting gap descriptions in the LLM's
    # analysis text. Compiled once at class-definition time instead of
    # being rebuilt on every _extract_structured_gaps() call.
    _GAP_PATTERNS = [
        re.compile(p, re.IGNORECASE)
        for p in (
            # Under-explored areas
            r"(?:under.explored|understudied|rarely studied|less explored)[^.!?]*[.!?]",
            # Methodological gaps
            r"(?:methodological|methodology)[^.!?]*(?:gap|limitation|challenge)[^.!?]*[.!?]",
            # Data gaps
            r"(?:data|dataset)[^.!?]*(?:lack|missing|limited|insufficient)[^.!?]*[.!?]",
            # Clinical gaps
            r"(?:clinical|translation)[^.!?]*(?:gap|barrier|challenge)[^.!?]*[.!?]",
        )
    ]

    # Keyword map used to bucket papers into medical sub-topics.
    # Hoisted to a class constant: it is loop-invariant and was previously
    # rebuilt once per paper inside _analyze_domain_coverage().
    _MEDICAL_SUBTOPICS: Dict[str, List[str]] = {
        'diagnosis': ['diagnosis', 'detection', 'classification', 'screening'],
        'treatment': ['treatment', 'therapy', 'intervention', 'management'],
        'prognosis': ['prognosis', 'prediction', 'outcome', 'survival'],
        'prevention': ['prevention', 'risk', 'screening', 'early detection'],
        'mechanism': ['mechanism', 'pathway', 'biology', 'molecular'],
    }

    # Keyword map for classifying each paper's methodology.
    _METHOD_KEYWORDS: Dict[str, List[str]] = {
        'deep_learning': ['deep learning', 'neural network', 'cnn', 'transformer', 'lstm'],
        'machine_learning': ['machine learning', 'random forest', 'svm', 'xgboost', 'clustering'],
        'statistical': ['statistical', 'regression', 'correlation', 'anova', 'hypothesis'],
        'clinical_trial': ['clinical trial', 'randomized', 'controlled study', 'cohort'],
        'review': ['review', 'systematic review', 'meta-analysis', 'literature review'],
    }

    def __init__(self, llm=None):
        """
        Args:
            llm: Optional pre-built LLM client; when None a GrokLLM is
                 created so callers can share one instance across analyzers.
        """
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def analyze_gaps(self, papers: List[Dict], domain: str,
                     time_frame: str = "recent") -> Dict[str, Any]:
        """
        Analyze research papers to identify gaps and opportunities.

        Args:
            papers: Paper dicts; 'title' and 'abstract' keys are used by the
                    heuristic passes. Capped at the 20 most relevant
                    (assumes the list is already relevance-ordered —
                    TODO confirm with callers).
            domain: Human-readable research domain label.
            time_frame: Free-text time-frame label forwarded to the prompt.

        Returns:
            Dict with the narrative analysis, structured gaps, coverage and
            trend breakdowns, plus metadata. On any LLM failure a fallback
            dict with "fallback_used": True is returned instead of raising.
        """
        print(f"๐Ÿ” Analyzing research gaps in {domain} from {len(papers)} papers")

        # Filter to most relevant papers if too many (keeps the prompt small)
        if len(papers) > 20:
            print(f"๐Ÿ”„ Filtering {len(papers)} papers to top 20 most relevant")
            papers = papers[:20]

        try:
            # Generate comprehensive gap analysis via the LLM
            gap_prompt = self.prompts.research_gap_analysis(papers, domain, time_frame)
            response = self.llm.generate(
                gap_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.2,  # Slightly higher for creative gap identification
                max_tokens=3500
            )

            # Extract structured gaps and opportunities
            structured_gaps = self._extract_structured_gaps(response)
            domain_coverage = self._analyze_domain_coverage(papers, domain)
            methodological_trends = self._analyze_methodological_trends(papers)

            return {
                "comprehensive_analysis": response,
                "structured_gaps": structured_gaps,
                "domain_coverage": domain_coverage,
                "methodological_trends": methodological_trends,
                "domain": domain,
                "time_frame": time_frame,
                "papers_analyzed": len(papers),
                "analysis_timestamp": datetime.now().isoformat()
            }

        except Exception as e:
            # Best-effort degradation: never let an LLM/prompt failure
            # propagate to the caller.
            print(f"โŒ Gap analysis error: {e}")
            return self._create_fallback_gap_analysis(papers, domain, time_frame)

    def _extract_structured_gaps(self, analysis: str) -> List[Dict[str, Any]]:
        """Extract structured gap information from analysis text.

        Scans the narrative for sentences matching the pre-compiled gap
        patterns and turns each (up to 15) into a structured record with
        type, priority, suggested questions and impact estimate.
        """
        # Extract gap sentences using the class-level compiled patterns
        gap_sentences = []
        for pattern in self._GAP_PATTERNS:
            gap_sentences.extend(pattern.findall(analysis))

        # Categorize and structure gaps
        gaps = []
        for sentence in gap_sentences[:15]:  # Limit to top 15
            gaps.append({
                "description": sentence.strip(),
                "type": self._categorize_gap(sentence),
                "priority": self._assess_gap_priority(sentence),
                "research_questions": self._generate_research_questions(sentence),
                "potential_impact": self._assess_potential_impact(sentence)
            })

        return gaps

    def _categorize_gap(self, gap_sentence: str) -> str:
        """Categorize the type of research gap by keyword matching."""
        gap_sentence = gap_sentence.lower()

        if any(term in gap_sentence for term in ['method', 'approach', 'technique', 'algorithm']):
            return "methodological"
        elif any(term in gap_sentence for term in ['data', 'dataset', 'sample', 'population']):
            return "data"
        elif any(term in gap_sentence for term in ['clinical', 'patient', 'treatment', 'diagnosis']):
            return "clinical_translation"
        elif any(term in gap_sentence for term in ['theory', 'concept', 'framework', 'model']):
            return "theoretical"
        else:
            return "general"

    def _assess_gap_priority(self, gap_sentence: str) -> str:
        """Assess the priority level ('high'/'medium'/'low') of a research gap."""
        gap_sentence = gap_sentence.lower()

        if any(term in gap_sentence for term in ['critical', 'urgent', 'essential', 'fundamental']):
            return "high"
        elif any(term in gap_sentence for term in ['important', 'significant', 'major']):
            return "medium"
        else:
            return "low"

    def _generate_research_questions(self, gap_sentence: str) -> List[str]:
        """Generate up to 5 potential research questions from a gap description.

        Keyword-triggered questions come first, so the three generic
        questions appended at the end may be truncated by the [:5] cap.
        """
        questions = []
        gap_sentence = gap_sentence.lower()

        # Simple question generation based on gap type
        if 'method' in gap_sentence:
            questions.extend([
                "What novel methods could address this limitation?",
                "How can existing methods be improved for this application?",
                "What comparative evaluation is needed for different approaches?"
            ])

        if 'data' in gap_sentence:
            questions.extend([
                "What new datasets need to be collected or created?",
                "How can data limitations be overcome through augmentation or synthesis?",
                "What data sharing initiatives would benefit this area?"
            ])

        if 'clinical' in gap_sentence:
            questions.extend([
                "What clinical validation studies are needed?",
                "How can these findings be translated to clinical practice?",
                "What are the barriers to clinical adoption and how can they be addressed?"
            ])

        # Add general research questions
        questions.extend([
            "What experimental design would best address this gap?",
            "How can interdisciplinary approaches contribute to solving this?",
            "What metrics should be used to evaluate progress in this area?"
        ])

        return questions[:5]  # Limit to 5 questions

    def _assess_potential_impact(self, gap_sentence: str) -> str:
        """Assess the potential impact of addressing the gap (keyword-based)."""
        gap_sentence = gap_sentence.lower()

        if any(term in gap_sentence for term in ['transform', 'revolutionize', 'breakthrough', 'paradigm']):
            return "transformative"
        elif any(term in gap_sentence for term in ['significant', 'major', 'substantial', 'important']):
            return "high"
        elif any(term in gap_sentence for term in ['moderate', 'valuable', 'useful']):
            return "medium"
        else:
            return "low"

    def _analyze_domain_coverage(self, papers: List[Dict], domain: str) -> Dict[str, Any]:
        """Analyze sub-topic coverage and distribution within the domain.

        A paper counts toward a sub-topic when any of that sub-topic's
        keywords appears in its title or abstract; a sub-topic covered by
        fewer than 10% of papers is flagged as a coverage gap.
        """
        # Extract sub-topics from paper titles and abstracts
        sub_topics: Dict[str, int] = {}

        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            for subtopic, keywords in self._MEDICAL_SUBTOPICS.items():
                if any(keyword in text for keyword in keywords):
                    sub_topics[subtopic] = sub_topics.get(subtopic, 0) + 1

        # Identify coverage gaps. sub_topics is empty when papers is empty,
        # so the division below never sees total_papers == 0.
        total_papers = len(papers)
        coverage_gaps = []
        for subtopic, count in sub_topics.items():
            coverage_ratio = count / total_papers
            if coverage_ratio < 0.1:  # Less than 10% coverage
                coverage_gaps.append({
                    'subtopic': subtopic,
                    'coverage': f"{count}/{total_papers} papers",
                    'coverage_ratio': coverage_ratio,
                    'priority': 'high' if coverage_ratio < 0.05 else 'medium'
                })

        return {
            'subtopic_distribution': sub_topics,
            'coverage_gaps': coverage_gaps,
            'total_subtopics_covered': len(sub_topics)
        }

    def _analyze_methodological_trends(self, papers: List[Dict]) -> Dict[str, Any]:
        """Analyze methodological trends and biases.

        A paper can count toward several method categories (e.g. a review
        of deep-learning studies increments both); a method covering more
        than 60% of papers is flagged as a potential bias.
        """
        methods = {
            'deep_learning': 0,
            'machine_learning': 0,
            'statistical': 0,
            'clinical_trial': 0,
            'review': 0,
            'other': 0
        }

        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            method_found = False
            for method_type, keywords in self._METHOD_KEYWORDS.items():
                if any(keyword in text for keyword in keywords):
                    methods[method_type] += 1
                    method_found = True
            if not method_found:
                methods['other'] += 1

        # Identify methodological biases.
        # Guard: with an empty paper list the pre-seeded zero counts would
        # otherwise cause a ZeroDivisionError below (this path is reachable
        # from the fallback analysis, which must never raise).
        total_papers = len(papers)
        methodological_biases = []
        if total_papers > 0:
            for method_type, count in methods.items():
                if method_type != 'other':
                    ratio = count / total_papers
                    if ratio > 0.6:  # Over 60% dominance
                        methodological_biases.append({
                            'method': method_type,
                            'dominance': f"{count}/{total_papers} papers",
                            'ratio': ratio,
                            'concern': 'high' if ratio > 0.8 else 'medium'
                        })

        return {
            'method_distribution': methods,
            'methodological_biases': methodological_biases,
            # Ties resolve to the first key in insertion order.
            'most_common_method': max(methods, key=methods.get)
        }

    def _create_fallback_gap_analysis(self, papers: List[Dict], domain: str,
                                      time_frame: str) -> Dict[str, Any]:
        """Create a basic, LLM-free gap analysis when the LLM call fails.

        Returns the same dict shape as analyze_gaps() plus
        "fallback_used": True so callers can detect degradation.
        """
        print("๐Ÿ”„ Using fallback gap analysis method")

        domain_coverage = self._analyze_domain_coverage(papers, domain)
        methodological_trends = self._analyze_methodological_trends(papers)

        basic_analysis = f"""
Basic Gap Analysis for: {domain}
Time Frame: {time_frame}
Papers Analyzed: {len(papers)}

Domain Coverage:
- Subtopics covered: {domain_coverage['total_subtopics_covered']}
- Coverage gaps: {len(domain_coverage['coverage_gaps'])}

Methodological Trends:
- Most common method: {methodological_trends['most_common_method']}
- Methodological biases: {len(methodological_trends['methodological_biases'])}

Note: Detailed AI analysis unavailable. Consider more specific search terms.
"""

        return {
            "comprehensive_analysis": basic_analysis,
            "structured_gaps": [],
            "domain_coverage": domain_coverage,
            "methodological_trends": methodological_trends,
            "domain": domain,
            "time_frame": time_frame,
            "papers_analyzed": len(papers),
            "analysis_timestamp": datetime.now().isoformat(),
            "fallback_used": True
        }

    def generate_gap_summary(self, gap_analysis: Dict[str, Any]) -> str:
        """Generate a concise markdown summary of research gaps.

        Args:
            gap_analysis: A dict as produced by analyze_gaps(); requires
                          'domain' and 'papers_analyzed' keys.
        """
        structured_gaps = gap_analysis.get('structured_gaps', [])
        domain_coverage = gap_analysis.get('domain_coverage', {})
        methodological_trends = gap_analysis.get('methodological_trends', {})

        summary = f"**Research Gap Summary - {gap_analysis['domain']}**\n\n"
        summary += f"Based on analysis of {gap_analysis['papers_analyzed']} papers:\n\n"

        # Key gaps (top 3 high-priority only)
        if structured_gaps:
            high_priority_gaps = [gap for gap in structured_gaps if gap['priority'] == 'high']
            summary += f"**High Priority Gaps ({len(high_priority_gaps)}):**\n"
            for gap in high_priority_gaps[:3]:
                summary += f"โ€ข {gap['description'][:100]}...\n"

        # Coverage gaps (top 3)
        coverage_gaps = domain_coverage.get('coverage_gaps', [])
        if coverage_gaps:
            summary += f"\n**Domain Coverage Gaps ({len(coverage_gaps)}):**\n"
            for gap in coverage_gaps[:3]:
                summary += f"โ€ข {gap['subtopic']} (only {gap['coverage']})\n"

        # Methodological biases (top 2)
        methodological_biases = methodological_trends.get('methodological_biases', [])
        if methodological_biases:
            summary += f"\n**Methodological Biases ({len(methodological_biases)}):**\n"
            for bias in methodological_biases[:2]:
                summary += f"โ€ข {bias['method']} dominates ({bias['dominance']})\n"

        return summary


# Quick test
def test_gap_analyzer():
    """Test the research gap analyzer with a small in-memory paper set."""
    print("๐Ÿงช Testing Research Gap Analyzer")
    print("=" * 50)

    test_papers = [
        {
            'title': 'Deep Learning for Alzheimer Diagnosis',
            'authors': ['Smith J', 'Johnson A'],
            'abstract': 'We apply convolutional neural networks to MRI data for Alzheimer disease diagnosis. Our method achieves 95% accuracy on a dataset of 500 patients.',
            'source': 'Nature Medicine',
            'domain': 'medical_imaging',
            'publication_date': '2024-01-15'
        },
        {
            'title': 'Transformer Networks in Medical Imaging',
            'authors': ['Lee K', 'Chen R'],
            'abstract': 'This study explores transformer architectures for various medical imaging tasks including classification and segmentation.',
            'source': 'IEEE TMI',
            'domain': 'medical_imaging',
            'publication_date': '2024-02-20'
        },
        {
            'title': 'Review of AI in Radiology',
            'authors': ['Brown T', 'Wilson S'],
            'abstract': 'Systematic review of artificial intelligence applications in radiology, covering 150 studies from 2010-2023.',
            'source': 'Radiology',
            'domain': 'medical_imaging',
            'publication_date': '2023-12-10'
        }
    ]

    analyzer = ResearchGapAnalyzer()

    try:
        gap_analysis = analyzer.analyze_gaps(test_papers, "medical_imaging", "recent")

        print(f"โœ… Gap analysis generated successfully")
        print(f"๐Ÿ“Š Papers analyzed: {gap_analysis['papers_analyzed']}")
        print(f"๐Ÿ” Structured gaps identified: {len(gap_analysis['structured_gaps'])}")
        print(f"๐ŸŽฏ Coverage gaps: {len(gap_analysis['domain_coverage']['coverage_gaps'])}")

        summary = analyzer.generate_gap_summary(gap_analysis)
        print(f"\n๐Ÿ“‹ Gap Summary:\n{summary}")

    except Exception as e:
        print(f"โŒ Gap analysis test failed: {e}")


if __name__ == "__main__":
    test_gap_analyzer()