Spaces:
Running
Running
| # chat/gap_analyzer.py | |
| """ | |
| Advanced research gap identification and opportunity analysis | |
| Identifies under-explored areas and future research directions | |
| """ | |
| from typing import List, Dict, Any, Tuple | |
| from llm.llm_provider import GrokLLM | |
| from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter | |
| from datetime import datetime | |
| import re | |
class ResearchGapAnalyzer:
    """
    Advanced analyzer that identifies research gaps and opportunities
    across multiple papers in a domain.

    An LLM produces the narrative gap analysis; the keyword/regex helpers in
    this class derive structured gap records, domain-coverage statistics,
    methodological trends, and a deterministic fallback report used when the
    LLM call fails.
    """

    # Sub-topic keyword map for domain-coverage analysis.  Defined once at
    # class level instead of being rebuilt inside the per-paper loop.
    MEDICAL_SUBTOPICS: Dict[str, List[str]] = {
        'diagnosis': ['diagnosis', 'detection', 'classification', 'screening'],
        'treatment': ['treatment', 'therapy', 'intervention', 'management'],
        'prognosis': ['prognosis', 'prediction', 'outcome', 'survival'],
        'prevention': ['prevention', 'risk', 'screening', 'early detection'],
        'mechanism': ['mechanism', 'pathway', 'biology', 'molecular']
    }

    # Method-family keyword map for methodological-trend analysis.
    METHOD_KEYWORDS: Dict[str, List[str]] = {
        'deep_learning': ['deep learning', 'neural network', 'cnn', 'transformer', 'lstm'],
        'machine_learning': ['machine learning', 'random forest', 'svm', 'xgboost', 'clustering'],
        'statistical': ['statistical', 'regression', 'correlation', 'anova', 'hypothesis'],
        'clinical_trial': ['clinical trial', 'randomized', 'controlled study', 'cohort'],
        'review': ['review', 'systematic review', 'meta-analysis', 'literature review']
    }

    def __init__(self, llm=None):
        """
        Args:
            llm: Optional shared LLM client.  A new GrokLLM instance is
                 created when none is supplied.
        """
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def analyze_gaps(self, papers: List[Dict], domain: str, time_frame: str = "recent") -> Dict[str, Any]:
        """
        Analyze research papers to identify gaps and opportunities.

        Args:
            papers: Paper dicts (reads 'title' and 'abstract'; other keys optional).
            domain: Human-readable research domain label.
            time_frame: Free-text description of the period analyzed.

        Returns:
            Dict with the LLM narrative plus structured gaps, coverage and
            trend data.  On any LLM failure a keyword-only fallback report
            (flagged with "fallback_used": True) is returned instead.
        """
        print(f"π Analyzing research gaps in {domain} from {len(papers)} papers")
        # Cap prompt size: keep only the first 20 papers (input order is
        # assumed to be relevance-ranked by the caller — TODO confirm).
        if len(papers) > 20:
            print(f"π Filtering {len(papers)} papers to top 20 most relevant")
            papers = papers[:20]
        try:
            # Generate the comprehensive narrative gap analysis via the LLM.
            gap_prompt = self.prompts.research_gap_analysis(papers, domain, time_frame)
            response = self.llm.generate(
                gap_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.2,  # Slightly higher for creative gap identification
                max_tokens=3500
            )
            # Derive structured views from the narrative and the raw papers.
            structured_gaps = self._extract_structured_gaps(response)
            domain_coverage = self._analyze_domain_coverage(papers, domain)
            methodological_trends = self._analyze_methodological_trends(papers)
            return {
                "comprehensive_analysis": response,
                "structured_gaps": structured_gaps,
                "domain_coverage": domain_coverage,
                "methodological_trends": methodological_trends,
                "domain": domain,
                "time_frame": time_frame,
                "papers_analyzed": len(papers),
                "analysis_timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            # Best-effort design: never propagate LLM/network failures.
            print(f"β Gap analysis error: {e}")
            return self._create_fallback_gap_analysis(papers, domain, time_frame)

    def _extract_structured_gaps(self, analysis: str) -> List[Dict[str, Any]]:
        """Extract structured gap records from the narrative analysis text."""
        gaps: List[Dict[str, Any]] = []
        # Sentence-level regexes for different flavors of gap statements.
        gap_patterns = [
            # Under-explored areas
            r"(?:under.explored|understudied|rarely studied|less explored)[^.!?]*[.!?]",
            # Methodological gaps
            r"(?:methodological|methodology)[^.!?]*(?:gap|limitation|challenge)[^.!?]*[.!?]",
            # Data gaps
            r"(?:data|dataset)[^.!?]*(?:lack|missing|limited|insufficient)[^.!?]*[.!?]",
            # Clinical gaps
            r"(?:clinical|translation)[^.!?]*(?:gap|barrier|challenge)[^.!?]*[.!?]"
        ]
        # Collect matching sentences.  NOTE: a sentence matching several
        # patterns may appear more than once; kept for backward compatibility.
        gap_sentences: List[str] = []
        for pattern in gap_patterns:
            gap_sentences.extend(re.findall(pattern, analysis, re.IGNORECASE))
        # Categorize and structure the gaps.
        for sentence in gap_sentences[:15]:  # Limit to top 15
            gaps.append({
                "description": sentence.strip(),
                "type": self._categorize_gap(sentence),
                "priority": self._assess_gap_priority(sentence),
                "research_questions": self._generate_research_questions(sentence),
                "potential_impact": self._assess_potential_impact(sentence)
            })
        return gaps

    def _categorize_gap(self, gap_sentence: str) -> str:
        """Categorize the type of research gap via keyword matching."""
        gap_sentence = gap_sentence.lower()
        # Order matters: the first matching category wins.
        if any(term in gap_sentence for term in ['method', 'approach', 'technique', 'algorithm']):
            return "methodological"
        elif any(term in gap_sentence for term in ['data', 'dataset', 'sample', 'population']):
            return "data"
        elif any(term in gap_sentence for term in ['clinical', 'patient', 'treatment', 'diagnosis']):
            return "clinical_translation"
        elif any(term in gap_sentence for term in ['theory', 'concept', 'framework', 'model']):
            return "theoretical"
        else:
            return "general"

    def _assess_gap_priority(self, gap_sentence: str) -> str:
        """Assess the priority level ('high'/'medium'/'low') of a research gap."""
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['critical', 'urgent', 'essential', 'fundamental']):
            return "high"
        elif any(term in gap_sentence for term in ['important', 'significant', 'major']):
            return "medium"
        else:
            return "low"

    def _generate_research_questions(self, gap_sentence: str) -> List[str]:
        """Generate up to five potential research questions for a gap description."""
        questions: List[str] = []
        gap_sentence = gap_sentence.lower()
        # Simple template-based question generation keyed on gap type.
        if 'method' in gap_sentence:
            questions.extend([
                "What novel methods could address this limitation?",
                "How can existing methods be improved for this application?",
                "What comparative evaluation is needed for different approaches?"
            ])
        if 'data' in gap_sentence:
            questions.extend([
                "What new datasets need to be collected or created?",
                "How can data limitations be overcome through augmentation or synthesis?",
                "What data sharing initiatives would benefit this area?"
            ])
        if 'clinical' in gap_sentence:
            questions.extend([
                "What clinical validation studies are needed?",
                "How can these findings be translated to clinical practice?",
                "What are the barriers to clinical adoption and how can they be addressed?"
            ])
        # Always append general research questions as filler.
        questions.extend([
            "What experimental design would best address this gap?",
            "How can interdisciplinary approaches contribute to solving this?",
            "What metrics should be used to evaluate progress in this area?"
        ])
        return questions[:5]  # Limit to 5 questions

    def _assess_potential_impact(self, gap_sentence: str) -> str:
        """Assess the potential impact of addressing the gap."""
        gap_sentence = gap_sentence.lower()
        if any(term in gap_sentence for term in ['transform', 'revolutionize', 'breakthrough', 'paradigm']):
            return "transformative"
        elif any(term in gap_sentence for term in ['significant', 'major', 'substantial', 'important']):
            return "high"
        elif any(term in gap_sentence for term in ['moderate', 'valuable', 'useful']):
            return "medium"
        else:
            return "low"

    def _analyze_domain_coverage(self, papers: List[Dict], domain: str) -> Dict[str, Any]:
        """Analyze sub-topic coverage and distribution within the domain."""
        # Count how many papers touch each known sub-topic.
        sub_topics: Dict[str, int] = {}
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            for subtopic, keywords in self.MEDICAL_SUBTOPICS.items():
                if any(keyword in text for keyword in keywords):
                    sub_topics[subtopic] = sub_topics.get(subtopic, 0) + 1
        # Identify coverage gaps (guard avoids division by zero on empty input).
        total_papers = len(papers)
        coverage_gaps: List[Dict[str, Any]] = []
        if total_papers:
            for subtopic, count in sub_topics.items():
                coverage_ratio = count / total_papers
                if coverage_ratio < 0.1:  # Less than 10% coverage
                    coverage_gaps.append({
                        'subtopic': subtopic,
                        'coverage': f"{count}/{total_papers} papers",
                        'coverage_ratio': coverage_ratio,
                        'priority': 'high' if coverage_ratio < 0.05 else 'medium'
                    })
        return {
            'subtopic_distribution': sub_topics,
            'coverage_gaps': coverage_gaps,
            'total_subtopics_covered': len(sub_topics)
        }

    def _analyze_methodological_trends(self, papers: List[Dict]) -> Dict[str, Any]:
        """Analyze methodological trends and over-represented method families."""
        methods = {
            'deep_learning': 0,
            'machine_learning': 0,
            'statistical': 0,
            'clinical_trial': 0,
            'review': 0,
            'other': 0
        }
        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            # A paper may count toward several families; 'other' only when none match.
            method_found = False
            for method_type, keywords in self.METHOD_KEYWORDS.items():
                if any(keyword in text for keyword in keywords):
                    methods[method_type] += 1
                    method_found = True
            if not method_found:
                methods['other'] += 1
        # Identify methodological biases.  The guard fixes a ZeroDivisionError
        # the original raised when called with an empty paper list.
        total_papers = len(papers)
        methodological_biases: List[Dict[str, Any]] = []
        if total_papers:
            for method_type, count in methods.items():
                if method_type != 'other':
                    ratio = count / total_papers
                    if ratio > 0.6:  # Over 60% dominance
                        methodological_biases.append({
                            'method': method_type,
                            'dominance': f"{count}/{total_papers} papers",
                            'ratio': ratio,
                            'concern': 'high' if ratio > 0.8 else 'medium'
                        })
        return {
            'method_distribution': methods,
            'methodological_biases': methodological_biases,
            # max() on ties returns the first key in insertion order.
            'most_common_method': max(methods, key=methods.get)
        }

    def _create_fallback_gap_analysis(self, papers: List[Dict], domain: str, time_frame: str) -> Dict[str, Any]:
        """Create a basic keyword-only gap analysis when the LLM fails."""
        print("π Using fallback gap analysis method")
        domain_coverage = self._analyze_domain_coverage(papers, domain)
        methodological_trends = self._analyze_methodological_trends(papers)
        basic_analysis = f"""
Basic Gap Analysis for: {domain}
Time Frame: {time_frame}
Papers Analyzed: {len(papers)}
Domain Coverage:
- Subtopics covered: {domain_coverage['total_subtopics_covered']}
- Coverage gaps: {len(domain_coverage['coverage_gaps'])}
Methodological Trends:
- Most common method: {methodological_trends['most_common_method']}
- Methodological biases: {len(methodological_trends['methodological_biases'])}
Note: Detailed AI analysis unavailable. Consider more specific search terms.
"""
        return {
            "comprehensive_analysis": basic_analysis,
            "structured_gaps": [],
            "domain_coverage": domain_coverage,
            "methodological_trends": methodological_trends,
            "domain": domain,
            "time_frame": time_frame,
            "papers_analyzed": len(papers),
            "analysis_timestamp": datetime.now().isoformat(),
            "fallback_used": True
        }

    def generate_gap_summary(self, gap_analysis: Dict[str, Any]) -> str:
        """Generate a concise markdown summary of research gaps.

        Args:
            gap_analysis: Result dict produced by analyze_gaps(); must contain
                'domain' and 'papers_analyzed' keys.

        Returns:
            Markdown-formatted summary string.
        """
        structured_gaps = gap_analysis.get('structured_gaps', [])
        domain_coverage = gap_analysis.get('domain_coverage', {})
        methodological_trends = gap_analysis.get('methodological_trends', {})
        summary = f"**Research Gap Summary - {gap_analysis['domain']}**\n\n"
        summary += f"Based on analysis of {gap_analysis['papers_analyzed']} papers:\n\n"
        # Key gaps (top 3 high-priority only).
        if structured_gaps:
            high_priority_gaps = [gap for gap in structured_gaps if gap['priority'] == 'high']
            summary += f"**High Priority Gaps ({len(high_priority_gaps)}):**\n"
            for gap in high_priority_gaps[:3]:
                summary += f"β’ {gap['description'][:100]}...\n"
        # Coverage gaps (top 3).
        coverage_gaps = domain_coverage.get('coverage_gaps', [])
        if coverage_gaps:
            summary += f"\n**Domain Coverage Gaps ({len(coverage_gaps)}):**\n"
            for gap in coverage_gaps[:3]:
                summary += f"β’ {gap['subtopic']} (only {gap['coverage']})\n"
        # Methodological biases (top 2).
        methodological_biases = methodological_trends.get('methodological_biases', [])
        if methodological_biases:
            summary += f"\n**Methodological Biases ({len(methodological_biases)}):**\n"
            for bias in methodological_biases[:2]:
                summary += f"β’ {bias['method']} dominates ({bias['dominance']})\n"
        return summary
# Quick test
def test_gap_analyzer():
    """Run the gap analyzer end-to-end against a small hand-built paper set."""
    print("π§ͺ Testing Research Gap Analyzer")
    print("=" * 50)

    def make_paper(title, authors, abstract, source, date):
        # All fixture papers belong to the medical_imaging domain.
        return {
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'source': source,
            'domain': 'medical_imaging',
            'publication_date': date,
        }

    sample_papers = [
        make_paper(
            'Deep Learning for Alzheimer Diagnosis',
            ['Smith J', 'Johnson A'],
            'We apply convolutional neural networks to MRI data for Alzheimer disease diagnosis. Our method achieves 95% accuracy on a dataset of 500 patients.',
            'Nature Medicine',
            '2024-01-15',
        ),
        make_paper(
            'Transformer Networks in Medical Imaging',
            ['Lee K', 'Chen R'],
            'This study explores transformer architectures for various medical imaging tasks including classification and segmentation.',
            'IEEE TMI',
            '2024-02-20',
        ),
        make_paper(
            'Review of AI in Radiology',
            ['Brown T', 'Wilson S'],
            'Systematic review of artificial intelligence applications in radiology, covering 150 studies from 2010-2023.',
            'Radiology',
            '2023-12-10',
        ),
    ]

    analyzer = ResearchGapAnalyzer()
    try:
        gap_analysis = analyzer.analyze_gaps(sample_papers, "medical_imaging", "recent")
        print("β Gap analysis generated successfully")
        print(f"π Papers analyzed: {gap_analysis['papers_analyzed']}")
        print(f"π Structured gaps identified: {len(gap_analysis['structured_gaps'])}")
        print(f"π― Coverage gaps: {len(gap_analysis['domain_coverage']['coverage_gaps'])}")
        print(f"\nπ Gap Summary:\n{analyzer.generate_gap_summary(gap_analysis)}")
    except Exception as e:
        print(f"β Gap analysis test failed: {e}")


if __name__ == "__main__":
    test_gap_analyzer()