from __future__ import annotations

import gradio as gr
import pandas as pd
import numpy as np
import json
import plotly.graph_objects as go
from typing import Dict, List, Optional
import gc
import traceback

# Import evaluation modules
from evaluator_module import AetherScoreEvaluator
from visualizer_module import EvaluationVisualizer
from report_generator import ReportGenerator

# --- Global Components & Storage ---
# Lazily-created singletons: each name stays None until its get_* accessor
# builds the instance on first use.
evaluator = visualizer = report_gen = None

# In-memory storage for explainability feature
evaluation_storage: Dict[str, Dict] = dict()

def get_evaluator():
    """Return the shared AetherScoreEvaluator, constructing it on the first call."""
    global evaluator
    if evaluator is not None:
        return evaluator
    evaluator = AetherScoreEvaluator()
    return evaluator

def get_visualizer():
    """Return the shared EvaluationVisualizer, constructing it on the first call."""
    global visualizer
    if visualizer is not None:
        return visualizer
    visualizer = EvaluationVisualizer()
    return visualizer

def get_report_generator():
    """Return the shared ReportGenerator, constructing it on the first call."""
    global report_gen
    if report_gen is not None:
        return report_gen
    report_gen = ReportGenerator()
    return report_gen

# Custom stylesheet handed to gr.Blocks(css=...) when the interface is built.
# It sets the container font family and defines a gradient ".metric-card" class.
custom_css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    margin: 10px 0;
}
"""

def _blank_figure(title: str) -> go.Figure:
    """Return an empty Plotly figure that displays only *title*."""
    figure = go.Figure()
    figure.update_layout(title=title)
    return figure


def process_single_evaluation(
    prompt: str,
    response: str,
    expected_answer: Optional[str] = None,
    agent_name: str = "Agent-1",
    task_type: str = "general"
    ) -> tuple[Dict, go.Figure, go.Figure, str]:
    """Evaluate one prompt/response pair and build its charts and explanation.

    Args:
        prompt: The prompt that was given to the agent.
        response: The agent's response to evaluate.
        expected_answer: Optional reference answer. ``None`` or blank text
            means "no reference" and is forwarded to the evaluator as ``None``.
        agent_name: Label passed to the chart builders.
        task_type: Forwarded to the evaluator's ``evaluate_single``.

    Returns:
        ``(scores, spider_chart, score_bars, explanation)``. Float scores are
        formatted to three decimals and every other value is stringified.
        On any failure the scores dict is empty, both figures are titled
        placeholders, and the last element carries the error message; this
        function does not raise.
    """
    try:
        # Missing and whitespace-only text are both treated as "not provided".
        if not (prompt and prompt.strip()) or not (response and response.strip()):
            placeholder = _blank_figure("No data to display")
            return {}, placeholder, placeholder, "Please provide both prompt and response."

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()

        # expected_answer may be None (its declared default), so guard before
        # calling .strip(); the original unstripped text is what gets forwarded.
        has_expected = bool(expected_answer and expected_answer.strip())
        eval_result = eval_instance.evaluate_single(
            prompt=prompt,
            response=response,
            expected_answer=expected_answer if has_expected else None,
            task_type=task_type
        )

        scores = eval_result.get("scores", {})

        # An overall score of exactly 0 is reported as a failed evaluation.
        if not scores or scores.get('overall_score', 0) == 0:
            placeholder = _blank_figure("Evaluation failed")
            return {}, placeholder, placeholder, "Evaluation failed. Please check your inputs."

        # Charts are best-effort: a plotting failure must not discard the scores.
        try:
            spider_chart = vis_instance.create_spider_chart(scores, agent_name)
            score_bars = vis_instance.create_score_bars(scores, agent_name)
        except Exception as viz_error:
            print(f"Visualization error: {viz_error}")
            spider_chart = score_bars = _blank_figure("Visualization failed")

        # The explanation is best-effort too; the failure text is shown instead.
        try:
            explanation = eval_instance.generate_explanation(scores)
        except Exception as exp_error:
            explanation = f"Explanation generation failed: {str(exp_error)}"

        scores_display = {
            key: f"{value:.3f}" if isinstance(value, float) else str(value)
            for key, value in scores.items()
        }

        return scores_display, spider_chart, score_bars, explanation

    except Exception as e:
        # UI boundary: report the error in the output widgets instead of raising.
        error_msg = f"Single evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        placeholder = _blank_figure("Error occurred")
        return {}, placeholder, placeholder, error_msg

def process_batch_evaluation(
    file_input,
    evaluation_mode: str = "comprehensive"
) -> tuple[go.Figure, go.Figure, go.Figure, str, pd.DataFrame]:
    """Evaluate every record of an uploaded JSON/JSONL file and summarise the results.

    Args:
        file_input: The uploaded file, either a filesystem path string or an
            object exposing the path through ``.name``. ``None`` means nothing
            was uploaded.
        evaluation_mode: Forwarded as ``mode`` to the evaluator's
            ``evaluate_batch``.

    Returns:
        ``(heatmap, distribution, trends, report, leaderboard)``. When the
        input is missing or invalid, or evaluation fails, the three figures are
        an empty placeholder, ``report`` carries the message, and
        ``leaderboard`` is an empty DataFrame; this function does not raise.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    empty_df = pd.DataFrame()

    def failure(message: str):
        """Build the five-element result used for every early exit."""
        return empty_fig, empty_fig, empty_fig, message, empty_df

    try:
        if file_input is None:
            return failure("Please upload a file.")

        # Accept both a plain path string and a file-like object with `.name`.
        path = file_input if isinstance(file_input, str) else file_input.name
        lowered = path.lower()  # match extensions case-insensitively

        try:
            if lowered.endswith('.json'):
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            elif lowered.endswith('.jsonl'):
                data = []
                with open(path, 'r', encoding='utf-8') as f:
                    for line_num, line in enumerate(f, 1):
                        if not line.strip():
                            continue
                        # A malformed line is logged and skipped so one bad
                        # record does not discard the whole upload.
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as jde:
                            print(f"JSON error on line {line_num}: {jde}")
            else:
                return failure("Unsupported file format. Please upload JSON or JSONL.")
        except Exception as file_error:
            return failure(f"File reading error: {str(file_error)}")

        if not data:
            return failure("No valid data found in file.")

        # A .json file may hold a single record object rather than an array;
        # iterating a dict would walk its keys, so wrap it in a list.
        if isinstance(data, dict):
            data = [data]
        elif not isinstance(data, list):
            return failure("No valid items found. Each item must have 'prompt' and 'response' fields.")

        # Keep only dict records that carry every required field.
        required_fields = ['prompt', 'response']
        valid_data = []
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Item {i} is not a dictionary, skipping")
                continue
            if all(field in item for field in required_fields):
                valid_data.append(item)
            else:
                print(f"Item {i} missing required fields, skipping")

        if not valid_data:
            return failure("No valid items found. Each item must have 'prompt' and 'response' fields.")

        print(f"Processing {len(valid_data)} valid items...")

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()

        results = eval_instance.evaluate_batch(valid_data, mode=evaluation_mode)

        if not results:
            return failure("Batch evaluation produced no results.")

        print(f"Successfully evaluated {len(results)} items")

        # Each output is produced independently so one failure only blanks
        # its own widget.
        try:
            heatmap = vis_instance.create_evaluation_heatmap(results)
        except Exception as e:
            print(f"Heatmap creation failed: {e}")
            heatmap = empty_fig

        try:
            distribution = vis_instance.create_score_distribution(results)
        except Exception as e:
            print(f"Distribution creation failed: {e}")
            distribution = empty_fig

        try:
            trends = vis_instance.create_performance_trends(results)
        except Exception as e:
            print(f"Trends creation failed: {e}")
            trends = empty_fig

        try:
            report = report_instance.generate_batch_report(results)
        except Exception as e:
            print(f"Report generation failed: {e}")
            report = f"Report generation failed: {str(e)}"

        try:
            leaderboard = create_leaderboard(results)
        except Exception as e:
            print(f"Leaderboard creation failed: {e}")
            leaderboard = empty_df

        # Cleanup
        gc.collect()

        return heatmap, distribution, trends, report, leaderboard

    except Exception as e:
        # UI boundary: report the error in the output widgets instead of raising.
        error_msg = f"Batch evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return failure(error_msg)

def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
    """Rank agents by mean score and return the ranking as a DataFrame.

    Per-agent score lists come from the evaluator's
    ``get_agent_scores_from_results``. Entries that are not int/float, or are
    NaN, are ignored, and agents left with no usable scores are omitted.

    Args:
        results: Evaluation results as produced by the batch evaluator.

    Returns:
        A DataFrame with columns Rank, Agent, Avg Score, Max Score, Min Score,
        Std Dev and Evaluations, sorted by descending average score, with the
        four statistic columns formatted as three-decimal strings. An empty
        frame with the same columns is returned when there is nothing to rank
        or when any error occurs.
    """
    header = ['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']

    def blank_board() -> pd.DataFrame:
        # Same column layout as a populated leaderboard, but with no rows.
        return pd.DataFrame(columns=header)

    def as_text(value):
        return f"{value:.3f}" if pd.notna(value) else "N/A"

    try:
        if not results:
            return blank_board()

        per_agent = get_evaluator().get_agent_scores_from_results(results)
        if not per_agent:
            return blank_board()

        rows = []
        for agent, raw in per_agent.items():
            if not raw:
                # Nothing recorded for this agent.
                continue

            usable = [x for x in raw if isinstance(x, (int, float)) and not np.isnan(x)]
            if not usable:
                continue

            rows.append({
                'Rank': 0,  # placeholder, assigned after sorting
                'Agent': str(agent),
                'Avg Score': np.mean(usable),
                'Max Score': np.max(usable),
                'Min Score': np.min(usable),
                'Std Dev': np.std(usable) if len(usable) > 1 else 0.0,
                'Evaluations': len(usable),
            })

        if not rows:
            return blank_board()

        # Best average first; ranks then follow the sorted row order.
        board = pd.DataFrame(rows).sort_values('Avg Score', ascending=False)
        board['Rank'] = range(1, len(board) + 1)

        for column in ('Avg Score', 'Max Score', 'Min Score', 'Std Dev'):
            if column in board.columns:
                board[column] = board[column].apply(as_text)

        return board

    except Exception as e:
        print(f"Leaderboard creation error: {e}")
        return blank_board()

def compare_agents(
    agent1_file,
    agent2_file,
) -> tuple[go.Figure, go.Figure, go.Figure, str]:
    """Evaluate two agents' uploaded records and build comparison charts and a report.

    Args:
        agent1_file: First agent's JSON/JSONL upload, either a filesystem path
            string or an object exposing the path through ``.name``.
        agent2_file: Second agent's upload, in the same form.

    Returns:
        ``(comparison_chart, performance_delta, radar_comparison, report)``.
        When the input is missing or invalid, or evaluation fails, the three
        figures are an empty placeholder and ``report`` carries the message;
        this function does not raise.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")

    # Same record requirements as the batch-evaluation handler.
    required_fields = ('prompt', 'response')

    def load_agent_data(file):
        """Read one upload and return its usable records as a list of dicts."""
        # Accept both a plain path string and a file-like object with `.name`.
        path = file if isinstance(file, str) else file.name
        lowered = path.lower()  # match extensions case-insensitively
        try:
            if lowered.endswith('.json'):
                with open(path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            elif lowered.endswith('.jsonl'):
                with open(path, 'r', encoding='utf-8') as f:
                    data = [json.loads(line) for line in f if line.strip()]
            else:
                raise ValueError("Unsupported file format")
        except Exception as e:
            raise ValueError(f"Error loading file {path}: {str(e)}") from e

        # A .json file may hold a single record object rather than an array.
        if isinstance(data, dict):
            data = [data]
        if not isinstance(data, list):
            return []
        return [
            item for item in data
            if isinstance(item, dict) and all(field in item for field in required_fields)
        ]

    try:
        if not agent1_file or not agent2_file:
            return empty_fig, empty_fig, empty_fig, "Please upload files for both agents."

        # Load and validate first so bad uploads fail before the evaluator is built.
        agent1_data = load_agent_data(agent1_file)
        agent2_data = load_agent_data(agent2_file)

        if not agent1_data or not agent2_data:
            return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data."

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()

        agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
        agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")

        if not agent1_results or not agent2_results:
            return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents."

        # Each output is produced independently so one failure only blanks
        # its own widget.
        try:
            comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison chart creation failed: {e}")
            comparison_chart = empty_fig

        try:
            performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results)
        except Exception as e:
            print(f"Performance difference chart creation failed: {e}")
            performance_diff = empty_fig

        try:
            statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Statistical analysis chart creation failed: {e}")
            statistical_analysis = empty_fig

        try:
            comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison report generation failed: {e}")
            comparison_report = f"Comparison report generation failed: {str(e)}"

        return comparison_chart, performance_diff, statistical_analysis, comparison_report

    except Exception as e:
        # UI boundary: report the error in the output widgets instead of raising.
        error_msg = f"Agent comparison failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg

# --- Gradio Interface Setup ---

def create_gradio_interface():
    """Create and return the Gradio interface.

    Builds a ``gr.Blocks`` app with four tabs (single evaluation, batch
    evaluation, agent comparison, and help) and wires each tab's button to
    its handler: ``process_single_evaluation``, ``process_batch_evaluation``
    and ``compare_agents``.

    Returns:
        The constructed ``gr.Blocks`` object. It is not launched here.
    """
    
    with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo:
        
        gr.Markdown("""
        # 🎯 AetherScore Evaluation Dashboard
        
        Advanced AI response evaluation system with comprehensive metrics and visualizations.
        """)
        
        with gr.Tabs():
            # Single Evaluation Tab
            with gr.TabItem("🔍 Single Evaluation"):
                with gr.Row():
                    # Left column: inputs for one prompt/response pair
                    with gr.Column(scale=1):
                        prompt_input = gr.Textbox(
                            label="Prompt", 
                            placeholder="Enter the prompt/question here...",
                            lines=3
                        )
                        response_input = gr.Textbox(
                            label="AI Response", 
                            placeholder="Enter the AI response to evaluate...",
                            lines=5
                        )
                        expected_input = gr.Textbox(
                            label="Expected Answer (Optional)", 
                            placeholder="Enter expected answer for accuracy comparison...",
                            lines=2
                        )
                        with gr.Row():
                            agent_name_input = gr.Textbox(
                                label="Agent Name", 
                                value="Agent-1",
                                scale=1
                            )
                            task_type_input = gr.Dropdown(
                                label="Task Type",
                                choices=["general", "reasoning", "creative", "factual"],
                                value="general",
                                scale=1
                            )
                        evaluate_btn = gr.Button("🔍 Evaluate", variant="primary")
                    
                    # Right column: score and explanation outputs
                    with gr.Column(scale=2):
                        scores_display = gr.JSON(label="📊 Evaluation Scores")
                        explanation_output = gr.Textbox(
                            label="💡 Detailed Explanation", 
                            lines=4,
                            interactive=False
                        )
                
                with gr.Row():
                    spider_chart = gr.Plot(label="🕸️ Performance Spider Chart")
                    score_bars = gr.Plot(label="📊 Score Breakdown")
                
                # Output order matches the handler's return tuple:
                # (scores, spider chart, score bars, explanation)
                evaluate_btn.click(
                    fn=process_single_evaluation,
                    inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input],
                    outputs=[scores_display, spider_chart, score_bars, explanation_output]
                )
            
            # Batch Evaluation Tab
            with gr.TabItem("📁 Batch Evaluation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        file_input = gr.File(
                            label="Upload Evaluation Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                        eval_mode = gr.Dropdown(
                            label="Evaluation Mode",
                            choices=["comprehensive", "fast"],
                            value="comprehensive"
                        )
                        batch_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary")
                    
                    with gr.Column(scale=2):
                        batch_report = gr.Textbox(
                            label="📋 Evaluation Report",
                            lines=8,
                            interactive=False
                        )
                
                with gr.Row():
                    heatmap_plot = gr.Plot(label="🔥 Performance Heatmap")
                    distribution_plot = gr.Plot(label="📈 Score Distribution")
                
                with gr.Row():
                    trends_plot = gr.Plot(label="📊 Performance Trends")
                    leaderboard_df = gr.Dataframe(label="🏆 Leaderboard")
                
                # Output order matches the handler's return tuple:
                # (heatmap, distribution, trends, report, leaderboard)
                batch_btn.click(
                    fn=process_batch_evaluation,
                    inputs=[file_input, eval_mode],
                    outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df]
                )
            
            # Agent Comparison Tab
            with gr.TabItem("⚔️ Agent Comparison"):
                with gr.Row():
                    with gr.Column():
                        agent1_file = gr.File(
                            label="Agent 1 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                    with gr.Column():
                        agent2_file = gr.File(
                            label="Agent 2 Data", 
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                
                compare_btn = gr.Button("🔍 Compare Agents", variant="primary")
                
                with gr.Row():
                    comparison_report = gr.Textbox(
                        label="📊 Comparison Report",
                        lines=10,
                        interactive=False
                    )
                
                with gr.Row():
                    comparison_chart = gr.Plot(label="📊 Agent Comparison")
                    performance_diff = gr.Plot(label="📈 Performance Delta")
                
                with gr.Row():
                    radar_comparison = gr.Plot(label="🕸️ Radar Comparison")
                
                # Output order matches the handler's return tuple:
                # (comparison chart, performance delta, radar comparison, report)
                compare_btn.click(
                    fn=compare_agents,
                    inputs=[agent1_file, agent2_file],
                    outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report]
                )
            
            # Help & Documentation Tab
            with gr.TabItem("❓ Help & Documentation"):
                gr.Markdown("""
                ## 📖 How to Use AetherScore
                
                ### Single Evaluation
                1. Enter your prompt and AI response
                2. Optionally provide an expected answer for accuracy comparison
                3. Choose agent name and task type
                4. Click "Evaluate" to get comprehensive scores
                
                ### Batch Evaluation
                1. Upload a JSON/JSONL file with evaluation data
                2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id`
                3. Choose evaluation mode and start processing
                4. View results in charts and leaderboard
                
                ### Agent Comparison
                1. Upload evaluation data files for two different agents
                2. Click "Compare Agents" to see detailed performance analysis
                3. Review comparison charts and statistical analysis
                
                ### Evaluation Metrics
                - **Instruction Following**: How well the response follows prompt constraints
                - **Hallucination Score**: Detection of fabricated or unverified information  
                - **Assumption Control**: Management of uncertain or speculative content
                - **Coherence**: Logical flow and consistency of the response
                - **Accuracy**: Similarity to expected answer (when provided)
                - **Overall Score**: Weighted combination of all metrics
                
                ### Data Format Example
                ```json
                {
                    "prompt": "Explain quantum computing",
                    "response": "Quantum computing uses quantum bits...",
                    "expected_answer": "Quantum computing leverages quantum mechanics...",
                    "agent_name": "GPT-4",
                    "task_id": "task_001",
                    "task_type": "factual"
                }
                ```
                """)
    
    return demo

# Build the dashboard and serve it when this file is run as a script
if __name__ == "__main__":
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo = create_gradio_interface()
    demo.launch(**launch_options)