# chat/gap_analyzer.py
"""
Advanced research gap identification and opportunity analysis
Identifies under-explored areas and future research directions
"""

from typing import List, Dict, Any, Tuple
from llm.llm_provider import GrokLLM
from llm.prompt_templates import MedicalResearchPrompts, ResponseFormatter
from datetime import datetime
import re


class ResearchGapAnalyzer:
    """
    Advanced analyzer that identifies research gaps and opportunities
    across multiple papers in a domain.

    Combines an LLM-generated narrative analysis with deterministic
    keyword-based heuristics (sub-topic coverage, methodological trends)
    so that a useful — if shallower — result is still produced when the
    LLM call fails.
    """

    # Sentence-level regexes for spotting gap descriptions in the LLM's
    # analysis text. Compiled once at class-definition time instead of
    # being rebuilt on every _extract_structured_gaps() call.
    _GAP_PATTERNS = [
        re.compile(p, re.IGNORECASE)
        for p in (
            # Under-explored areas
            r"(?:under.explored|understudied|rarely studied|less explored)[^.!?]*[.!?]",
            # Methodological gaps
            r"(?:methodological|methodology)[^.!?]*(?:gap|limitation|challenge)[^.!?]*[.!?]",
            # Data gaps
            r"(?:data|dataset)[^.!?]*(?:lack|missing|limited|insufficient)[^.!?]*[.!?]",
            # Clinical gaps
            r"(?:clinical|translation)[^.!?]*(?:gap|barrier|challenge)[^.!?]*[.!?]",
        )
    ]

    # Keyword map used to bucket papers into medical sub-topics.
    # Hoisted to a class constant: it is loop-invariant and was previously
    # rebuilt once per paper inside _analyze_domain_coverage().
    _MEDICAL_SUBTOPICS: Dict[str, List[str]] = {
        'diagnosis': ['diagnosis', 'detection', 'classification', 'screening'],
        'treatment': ['treatment', 'therapy', 'intervention', 'management'],
        'prognosis': ['prognosis', 'prediction', 'outcome', 'survival'],
        'prevention': ['prevention', 'risk', 'screening', 'early detection'],
        'mechanism': ['mechanism', 'pathway', 'biology', 'molecular'],
    }

    # Keyword map for classifying each paper's methodology.
    _METHOD_KEYWORDS: Dict[str, List[str]] = {
        'deep_learning': ['deep learning', 'neural network', 'cnn', 'transformer', 'lstm'],
        'machine_learning': ['machine learning', 'random forest', 'svm', 'xgboost', 'clustering'],
        'statistical': ['statistical', 'regression', 'correlation', 'anova', 'hypothesis'],
        'clinical_trial': ['clinical trial', 'randomized', 'controlled study', 'cohort'],
        'review': ['review', 'systematic review', 'meta-analysis', 'literature review'],
    }

    def __init__(self, llm=None):
        """
        Args:
            llm: Optional pre-built LLM client; when None a GrokLLM is
                 created so callers can share one instance across analyzers.
        """
        self.llm = llm or GrokLLM(model="model")  # Use shared LLM
        self.prompts = MedicalResearchPrompts()
        self.formatter = ResponseFormatter()

    def analyze_gaps(self, papers: List[Dict], domain: str,
                     time_frame: str = "recent") -> Dict[str, Any]:
        """
        Analyze research papers to identify gaps and opportunities.

        Args:
            papers: Paper dicts; 'title' and 'abstract' keys are used by the
                    heuristic passes. Capped at the 20 most relevant
                    (assumes the list is already relevance-ordered —
                    TODO confirm with callers).
            domain: Human-readable research domain label.
            time_frame: Free-text time-frame label forwarded to the prompt.

        Returns:
            Dict with the narrative analysis, structured gaps, coverage and
            trend breakdowns, plus metadata. On any LLM failure a fallback
            dict with "fallback_used": True is returned instead of raising.
        """
        print(f"๐Ÿ” Analyzing research gaps in {domain} from {len(papers)} papers")

        # Filter to most relevant papers if too many (keeps the prompt small)
        if len(papers) > 20:
            print(f"๐Ÿ”„ Filtering {len(papers)} papers to top 20 most relevant")
            papers = papers[:20]

        try:
            # Generate comprehensive gap analysis via the LLM
            gap_prompt = self.prompts.research_gap_analysis(papers, domain, time_frame)
            response = self.llm.generate(
                gap_prompt,
                system_message=self.prompts.SYSTEM_MESSAGES["research_analyst"],
                temperature=0.2,  # Slightly higher for creative gap identification
                max_tokens=3500
            )

            # Extract structured gaps and opportunities
            structured_gaps = self._extract_structured_gaps(response)
            domain_coverage = self._analyze_domain_coverage(papers, domain)
            methodological_trends = self._analyze_methodological_trends(papers)

            return {
                "comprehensive_analysis": response,
                "structured_gaps": structured_gaps,
                "domain_coverage": domain_coverage,
                "methodological_trends": methodological_trends,
                "domain": domain,
                "time_frame": time_frame,
                "papers_analyzed": len(papers),
                "analysis_timestamp": datetime.now().isoformat()
            }

        except Exception as e:
            # Best-effort degradation: never let an LLM/prompt failure
            # propagate to the caller.
            print(f"โŒ Gap analysis error: {e}")
            return self._create_fallback_gap_analysis(papers, domain, time_frame)

    def _extract_structured_gaps(self, analysis: str) -> List[Dict[str, Any]]:
        """Extract structured gap information from analysis text.

        Scans the narrative for sentences matching the pre-compiled gap
        patterns and turns each (up to 15) into a structured record with
        type, priority, suggested questions and impact estimate.
        """
        # Extract gap sentences using the class-level compiled patterns
        gap_sentences = []
        for pattern in self._GAP_PATTERNS:
            gap_sentences.extend(pattern.findall(analysis))

        # Categorize and structure gaps
        gaps = []
        for sentence in gap_sentences[:15]:  # Limit to top 15
            gaps.append({
                "description": sentence.strip(),
                "type": self._categorize_gap(sentence),
                "priority": self._assess_gap_priority(sentence),
                "research_questions": self._generate_research_questions(sentence),
                "potential_impact": self._assess_potential_impact(sentence)
            })

        return gaps

    def _categorize_gap(self, gap_sentence: str) -> str:
        """Categorize the type of research gap by keyword matching."""
        gap_sentence = gap_sentence.lower()

        if any(term in gap_sentence for term in ['method', 'approach', 'technique', 'algorithm']):
            return "methodological"
        elif any(term in gap_sentence for term in ['data', 'dataset', 'sample', 'population']):
            return "data"
        elif any(term in gap_sentence for term in ['clinical', 'patient', 'treatment', 'diagnosis']):
            return "clinical_translation"
        elif any(term in gap_sentence for term in ['theory', 'concept', 'framework', 'model']):
            return "theoretical"
        else:
            return "general"

    def _assess_gap_priority(self, gap_sentence: str) -> str:
        """Assess the priority level ('high'/'medium'/'low') of a research gap."""
        gap_sentence = gap_sentence.lower()

        if any(term in gap_sentence for term in ['critical', 'urgent', 'essential', 'fundamental']):
            return "high"
        elif any(term in gap_sentence for term in ['important', 'significant', 'major']):
            return "medium"
        else:
            return "low"

    def _generate_research_questions(self, gap_sentence: str) -> List[str]:
        """Generate up to 5 potential research questions from a gap description.

        Keyword-triggered questions come first, so the three generic
        questions appended at the end may be truncated by the [:5] cap.
        """
        questions = []
        gap_sentence = gap_sentence.lower()

        # Simple question generation based on gap type
        if 'method' in gap_sentence:
            questions.extend([
                "What novel methods could address this limitation?",
                "How can existing methods be improved for this application?",
                "What comparative evaluation is needed for different approaches?"
            ])

        if 'data' in gap_sentence:
            questions.extend([
                "What new datasets need to be collected or created?",
                "How can data limitations be overcome through augmentation or synthesis?",
                "What data sharing initiatives would benefit this area?"
            ])

        if 'clinical' in gap_sentence:
            questions.extend([
                "What clinical validation studies are needed?",
                "How can these findings be translated to clinical practice?",
                "What are the barriers to clinical adoption and how can they be addressed?"
            ])

        # Add general research questions
        questions.extend([
            "What experimental design would best address this gap?",
            "How can interdisciplinary approaches contribute to solving this?",
            "What metrics should be used to evaluate progress in this area?"
        ])

        return questions[:5]  # Limit to 5 questions

    def _assess_potential_impact(self, gap_sentence: str) -> str:
        """Assess the potential impact of addressing the gap (keyword-based)."""
        gap_sentence = gap_sentence.lower()

        if any(term in gap_sentence for term in ['transform', 'revolutionize', 'breakthrough', 'paradigm']):
            return "transformative"
        elif any(term in gap_sentence for term in ['significant', 'major', 'substantial', 'important']):
            return "high"
        elif any(term in gap_sentence for term in ['moderate', 'valuable', 'useful']):
            return "medium"
        else:
            return "low"

    def _analyze_domain_coverage(self, papers: List[Dict], domain: str) -> Dict[str, Any]:
        """Analyze sub-topic coverage and distribution within the domain.

        A paper counts toward a sub-topic when any of that sub-topic's
        keywords appears in its title or abstract; a sub-topic covered by
        fewer than 10% of papers is flagged as a coverage gap.
        """
        # Extract sub-topics from paper titles and abstracts
        sub_topics: Dict[str, int] = {}

        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            for subtopic, keywords in self._MEDICAL_SUBTOPICS.items():
                if any(keyword in text for keyword in keywords):
                    sub_topics[subtopic] = sub_topics.get(subtopic, 0) + 1

        # Identify coverage gaps. sub_topics is empty when papers is empty,
        # so the division below never sees total_papers == 0.
        total_papers = len(papers)
        coverage_gaps = []
        for subtopic, count in sub_topics.items():
            coverage_ratio = count / total_papers
            if coverage_ratio < 0.1:  # Less than 10% coverage
                coverage_gaps.append({
                    'subtopic': subtopic,
                    'coverage': f"{count}/{total_papers} papers",
                    'coverage_ratio': coverage_ratio,
                    'priority': 'high' if coverage_ratio < 0.05 else 'medium'
                })

        return {
            'subtopic_distribution': sub_topics,
            'coverage_gaps': coverage_gaps,
            'total_subtopics_covered': len(sub_topics)
        }

    def _analyze_methodological_trends(self, papers: List[Dict]) -> Dict[str, Any]:
        """Analyze methodological trends and biases.

        A paper can count toward several method categories (e.g. a review
        of deep-learning studies increments both); a method covering more
        than 60% of papers is flagged as a potential bias.
        """
        methods = {
            'deep_learning': 0,
            'machine_learning': 0,
            'statistical': 0,
            'clinical_trial': 0,
            'review': 0,
            'other': 0
        }

        for paper in papers:
            text = f"{paper.get('title', '')} {paper.get('abstract', '')}".lower()
            method_found = False
            for method_type, keywords in self._METHOD_KEYWORDS.items():
                if any(keyword in text for keyword in keywords):
                    methods[method_type] += 1
                    method_found = True
            if not method_found:
                methods['other'] += 1

        # Identify methodological biases.
        # Guard: with an empty paper list the pre-seeded zero counts would
        # otherwise cause a ZeroDivisionError below (this path is reachable
        # from the fallback analysis, which must never raise).
        total_papers = len(papers)
        methodological_biases = []
        if total_papers > 0:
            for method_type, count in methods.items():
                if method_type != 'other':
                    ratio = count / total_papers
                    if ratio > 0.6:  # Over 60% dominance
                        methodological_biases.append({
                            'method': method_type,
                            'dominance': f"{count}/{total_papers} papers",
                            'ratio': ratio,
                            'concern': 'high' if ratio > 0.8 else 'medium'
                        })

        return {
            'method_distribution': methods,
            'methodological_biases': methodological_biases,
            # Ties resolve to the first key in insertion order.
            'most_common_method': max(methods, key=methods.get)
        }

    def _create_fallback_gap_analysis(self, papers: List[Dict], domain: str,
                                      time_frame: str) -> Dict[str, Any]:
        """Create a basic, LLM-free gap analysis when the LLM call fails.

        Returns the same dict shape as analyze_gaps() plus
        "fallback_used": True so callers can detect degradation.
        """
        print("๐Ÿ”„ Using fallback gap analysis method")

        domain_coverage = self._analyze_domain_coverage(papers, domain)
        methodological_trends = self._analyze_methodological_trends(papers)

        basic_analysis = f"""
Basic Gap Analysis for: {domain}
Time Frame: {time_frame}
Papers Analyzed: {len(papers)}

Domain Coverage:
- Subtopics covered: {domain_coverage['total_subtopics_covered']}
- Coverage gaps: {len(domain_coverage['coverage_gaps'])}

Methodological Trends:
- Most common method: {methodological_trends['most_common_method']}
- Methodological biases: {len(methodological_trends['methodological_biases'])}

Note: Detailed AI analysis unavailable. Consider more specific search terms.
"""

        return {
            "comprehensive_analysis": basic_analysis,
            "structured_gaps": [],
            "domain_coverage": domain_coverage,
            "methodological_trends": methodological_trends,
            "domain": domain,
            "time_frame": time_frame,
            "papers_analyzed": len(papers),
            "analysis_timestamp": datetime.now().isoformat(),
            "fallback_used": True
        }

    def generate_gap_summary(self, gap_analysis: Dict[str, Any]) -> str:
        """Generate a concise markdown summary of research gaps.

        Args:
            gap_analysis: A dict as produced by analyze_gaps(); requires
                          'domain' and 'papers_analyzed' keys.
        """
        structured_gaps = gap_analysis.get('structured_gaps', [])
        domain_coverage = gap_analysis.get('domain_coverage', {})
        methodological_trends = gap_analysis.get('methodological_trends', {})

        summary = f"**Research Gap Summary - {gap_analysis['domain']}**\n\n"
        summary += f"Based on analysis of {gap_analysis['papers_analyzed']} papers:\n\n"

        # Key gaps (top 3 high-priority only)
        if structured_gaps:
            high_priority_gaps = [gap for gap in structured_gaps if gap['priority'] == 'high']
            summary += f"**High Priority Gaps ({len(high_priority_gaps)}):**\n"
            for gap in high_priority_gaps[:3]:
                summary += f"โ€ข {gap['description'][:100]}...\n"

        # Coverage gaps (top 3)
        coverage_gaps = domain_coverage.get('coverage_gaps', [])
        if coverage_gaps:
            summary += f"\n**Domain Coverage Gaps ({len(coverage_gaps)}):**\n"
            for gap in coverage_gaps[:3]:
                summary += f"โ€ข {gap['subtopic']} (only {gap['coverage']})\n"

        # Methodological biases (top 2)
        methodological_biases = methodological_trends.get('methodological_biases', [])
        if methodological_biases:
            summary += f"\n**Methodological Biases ({len(methodological_biases)}):**\n"
            for bias in methodological_biases[:2]:
                summary += f"โ€ข {bias['method']} dominates ({bias['dominance']})\n"

        return summary


# Quick test
def test_gap_analyzer():
    """Test the research gap analyzer with a small in-memory paper set."""
    print("๐Ÿงช Testing Research Gap Analyzer")
    print("=" * 50)

    test_papers = [
        {
            'title': 'Deep Learning for Alzheimer Diagnosis',
            'authors': ['Smith J', 'Johnson A'],
            'abstract': 'We apply convolutional neural networks to MRI data for Alzheimer disease diagnosis. Our method achieves 95% accuracy on a dataset of 500 patients.',
            'source': 'Nature Medicine',
            'domain': 'medical_imaging',
            'publication_date': '2024-01-15'
        },
        {
            'title': 'Transformer Networks in Medical Imaging',
            'authors': ['Lee K', 'Chen R'],
            'abstract': 'This study explores transformer architectures for various medical imaging tasks including classification and segmentation.',
            'source': 'IEEE TMI',
            'domain': 'medical_imaging',
            'publication_date': '2024-02-20'
        },
        {
            'title': 'Review of AI in Radiology',
            'authors': ['Brown T', 'Wilson S'],
            'abstract': 'Systematic review of artificial intelligence applications in radiology, covering 150 studies from 2010-2023.',
            'source': 'Radiology',
            'domain': 'medical_imaging',
            'publication_date': '2023-12-10'
        }
    ]

    analyzer = ResearchGapAnalyzer()

    try:
        gap_analysis = analyzer.analyze_gaps(test_papers, "medical_imaging", "recent")

        print(f"โœ… Gap analysis generated successfully")
        print(f"๐Ÿ“Š Papers analyzed: {gap_analysis['papers_analyzed']}")
        print(f"๐Ÿ” Structured gaps identified: {len(gap_analysis['structured_gaps'])}")
        print(f"๐ŸŽฏ Coverage gaps: {len(gap_analysis['domain_coverage']['coverage_gaps'])}")

        summary = analyzer.generate_gap_summary(gap_analysis)
        print(f"\n๐Ÿ“‹ Gap Summary:\n{summary}")

    except Exception as e:
        print(f"โŒ Gap analysis test failed: {e}")


if __name__ == "__main__":
    test_gap_analyzer()