"""
EDA Report Generation Tools
Generates comprehensive HTML reports using ydata-profiling.
"""

import os
import warnings
from pathlib import Path
from typing import Dict, Any, Optional
import polars as pl

# Silence the warnings emitted during ydata-profiling's multiprocessing
# cleanup. Two independent "ignore" filters are registered:
#   1. UserWarning instances raised from modules matching "multiprocessing";
#   2. any warning whose message matches ".*resource_tracker.*".
warnings.filterwarnings(action="ignore", module="multiprocessing", category=UserWarning)
warnings.filterwarnings(action="ignore", message=".*resource_tracker.*")


def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report"
) -> Dict[str, Any]:
    """
    Generate an HTML profiling report using ydata-profiling (formerly pandas-profiling).

    In full mode the report is built with ``explorative=True`` and Pearson,
    Spearman, phi_k and Cramér's V correlations (Kendall is disabled because it
    is slow on large datasets). In minimal mode samples, correlations, missing
    diagrams, duplicates and interactions are all switched off.

    Large inputs are handled automatically. When the file size or row count
    exceeds the configured thresholds and ``minimal`` was not requested, the
    data is randomly sampled (fixed seed) and minimal mode is forced. Files
    larger than twice the size threshold always use minimal mode.

    Environment variables:
        YDATA_MAX_ROWS: row-count threshold that triggers sampling (default 100000)
        YDATA_MAX_SIZE_MB: file-size threshold in MB (default 50)
        YDATA_SAMPLE_SIZE: number of rows kept when sampling (default 100000)

    Args:
        file_path: Path to the dataset; ``.csv`` and ``.parquet`` are supported
            (extension matched case-insensitively)
        output_path: Where to save the HTML report
        minimal: If True, generates faster minimal report (useful for large datasets)
        title: Title for the report

    Returns:
        Dict describing the outcome. This function does not raise; every
        failure is reported through the returned dict.

        On success: ``success=True``, ``report_path``, ``message`` and a
        ``statistics`` dict (``dataset_size``, ``variable_types``,
        ``data_quality``, ``report_config``). Statistics describe the frame
        that was actually profiled, i.e. the sample when sampling happened;
        ``report_config.sampled`` and ``report_config.source_rows`` expose that.
        ``data_quality.duplicate_rows`` is ``None`` when duplicates cannot be
        computed because the data contains unhashable values.

        On failure: ``success=False``, ``error`` and ``error_type``
        (``"MissingDependency"`` when ydata-profiling/pandas cannot be
        imported, otherwise the exception class name).
    """
    try:
        # Validate the cheap things first so bad input is reported as such,
        # regardless of whether the heavy profiling dependency is installed.
        file_path = os.fspath(file_path)
        output_path = os.fspath(output_path)
        lowered_path = file_path.lower()
        if lowered_path.endswith(".csv"):
            file_format = "csv"
        elif lowered_path.endswith(".parquet"):
            file_format = "parquet"
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Only the dependency imports are guarded by ImportError. ImportErrors
        # raised later (e.g. pandas lacking a parquet engine) must not be
        # misreported as "ydata-profiling not installed".
        try:
            from ydata_profiling import ProfileReport
            import pandas as pd
        except ImportError:
            return {
                "success": False,
                "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
                "error_type": "MissingDependency"
            }

        # Read dataset (ydata-profiling requires pandas)
        if file_format == "csv":
            df = pd.read_csv(file_path)
        else:
            df = pd.read_parquet(file_path)

        # Auto-optimize for large datasets to prevent memory crashes
        source_rows = len(df)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Thresholds are environment-driven so low-memory deployments can
        # lower them without code changes.
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))

        # Automatic sampling only when dataset exceeds thresholds and the
        # caller did not already ask for minimal mode.
        sampled = False
        exceeds_limits = file_size_mb > max_size_threshold or source_rows > max_rows_threshold
        if exceeds_limits and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            print(f"📊 Large dataset detected: {source_rows:,} rows, {file_size_mb:.1f}MB")
            # Skip the sample when it would keep every row anyway: it would
            # only copy and shuffle an already-large frame.
            if sample_size < source_rows:
                print(f"⚡ Sampling to {sample_size:,} rows for memory efficiency...")
                df = df.sample(n=sample_size, random_state=42)
                sampled = True
            minimal = True  # Force minimal mode for large files

        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True
            print(f"⚡ Using minimal profiling mode (file size: {file_size_mb:.1f}MB)")

        # Create the report's parent directory. A bare filename is written to
        # the current directory, so there is nothing to create in that case.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Configure profile based on minimal flag
        if minimal:
            # Minimal mode: faster for large datasets, less memory
            profile = ProfileReport(
                df,
                title=title,
                minimal=True,
                explorative=False,
                samples=None,  # Disable sample display to save memory
                correlations=None,  # Skip correlations in minimal mode
                missing_diagrams=None,  # Skip missing diagrams
                duplicates=None,  # Skip duplicate analysis
                interactions=None  # Skip interactions
            )
        else:
            # Full mode: comprehensive analysis
            profile = ProfileReport(
                df,
                title=title,
                explorative=True,
                correlations={
                    "pearson": {"calculate": True},
                    "spearman": {"calculate": True},
                    "kendall": {"calculate": False},  # Slow for large datasets
                    "phi_k": {"calculate": True},
                    "cramers": {"calculate": True},
                }
            )

        # Generate HTML report
        profile.to_file(output_path)

        # Extract key statistics. pandas reductions return numpy scalars, so
        # everything is converted to built-in types to keep the result
        # JSON-serializable.
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0.0
        try:
            duplicate_rows = int(df.duplicated().sum())
        except TypeError:
            # Unhashable cell values (e.g. lists) cannot be de-duplicated. The
            # report is already written, so do not turn that into a failure.
            duplicate_rows = None

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(float(missing_pct), 2),
                    "duplicate_rows": duplicate_rows
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title,
                    "sampled": sampled,
                    "source_rows": source_rows
                }
            }
        }

    except Exception as e:
        # Tool boundary: callers expect a result dict, never an exception.
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }