#!/usr/bin/env python3
"""
IPAD VAD Training Interface on HuggingFace Spaces with ZeroGPU
Updated version with integrated training infrastructure
"""
# IMPORTANT: Clear Python cache first to avoid loading stale modules
import shutil
from pathlib import Path

for pycache in Path('.').rglob('__pycache__'):
    shutil.rmtree(pycache, ignore_errors=True)
for pyc in Path('.').rglob('*.pyc'):
    pyc.unlink(missing_ok=True)
print("๐Ÿงน Cache cleared - loading fresh modules")

import gradio as gr
import torch
import os
import json
from datetime import datetime
import zipfile
from huggingface_hub import hf_hub_download, HfApi
import subprocess
import sys
from typing import Optional, Dict

# Import training infrastructure
from train_hf import IPADTrainer
from dataset import download_and_extract_dataset, DEVICE_NAMES, SYNTHETIC_DEVICES

import spaces  # ZeroGPU decorator

# Global state
DATASET_PATH = None  # Path to the extracted dataset once downloaded, else None
CHECKPOINT_DIR = Path("./checkpoints")
CHECKPOINT_DIR.mkdir(exist_ok=True)


def _ensure_dataset() -> Path:
    """Return the dataset path, downloading/extracting it first if needed.

    Caches the result in the module-level ``DATASET_PATH`` so subsequent
    calls are no-ops. Propagates any exception raised by
    ``download_and_extract_dataset``.
    """
    global DATASET_PATH
    if DATASET_PATH is None or not DATASET_PATH.exists():
        DATASET_PATH = download_and_extract_dataset(cache_dir="./cache")
    return DATASET_PATH


def _latest_checkpoint(device_name: str):
    """Return ``(checkpoints, latest)`` for one device.

    ``checkpoints`` is the list of ``{device_name}_*.pth`` files in
    ``CHECKPOINT_DIR``; ``latest`` is the most recently modified one,
    or ``None`` when no checkpoint exists yet.
    """
    checkpoints = list(CHECKPOINT_DIR.glob(f"{device_name}_*.pth"))
    latest = max(checkpoints, key=lambda p: p.stat().st_mtime) if checkpoints else None
    return checkpoints, latest


def setup_dataset(progress=gr.Progress()) -> str:
    """Download and extract IPAD dataset from HF Hub.

    Returns a human-readable status string for the Gradio Textbox;
    errors are reported in the string rather than raised.
    """
    global DATASET_PATH
    progress(0, desc="Downloading dataset...")

    # Skip the download entirely when a previous call already fetched it
    if DATASET_PATH and DATASET_PATH.exists():
        return f"โœ… Dataset already available at {DATASET_PATH}"

    try:
        DATASET_PATH = download_and_extract_dataset(cache_dir="./cache")
        progress(1.0, desc="Complete!")
        return f"โœ… Dataset downloaded and extracted to {DATASET_PATH}\n๐Ÿ“Š Ready for training!"
    except Exception as e:
        return f"โŒ Error: {str(e)}"


@spaces.GPU(duration=60)  # Request GPU for 1 minute
def quick_gpu_test() -> Dict:
    """Quick test to verify GPU access and model loading.

    Loads the Video Swin Transformer, runs one dummy forward pass and
    reports output shapes plus GPU memory usage. Always returns a dict
    (never raises) so the Gradio JSON component can display failures too.
    """
    try:
        from IPAD.model.video_swin_transformer import VST

        # Check GPU
        gpu_available = torch.cuda.is_available()
        gpu_name = torch.cuda.get_device_name(0) if gpu_available else "None"

        if not gpu_available:
            return {
                "status": "โš ๏ธ Warning",
                "message": "No GPU available",
                "gpu_available": False,
                "gpu_name": "None"
            }

        # Load model
        model = VST(mem_dim=2000, shrink_thres=0.0025)
        model = model.cuda()

        # Create dummy input
        dummy_input = torch.randn(1, 3, 16, 256, 256).cuda()

        # Forward pass
        with torch.no_grad():
            output = model(dummy_input)

        result = {
            "status": "โœ… Success",
            "message": "GPU test passed!",
            "gpu_available": True,
            "gpu_name": gpu_name,
            "output_shape": str(output['output'].shape),
            "attention_shape": str(output['att'].shape),
            "period_shape": str(output['recon_index'].shape),
            "memory_allocated_gb": f"{torch.cuda.memory_allocated() / 1e9:.2f}",
            "memory_reserved_gb": f"{torch.cuda.memory_reserved() / 1e9:.2f}"
        }

        # Free GPU memory so the test does not keep VRAM occupied after it
        # finishes (stats above were captured before cleanup on purpose).
        del output, dummy_input, model
        torch.cuda.empty_cache()

        return result
    except Exception as e:
        return {
            "status": "โŒ Error",
            "message": str(e),
            "gpu_available": torch.cuda.is_available(),
            "gpu_name": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"
        }


@spaces.GPU(duration=3600)  # Request GPU for 1 hour
def train_quick_baseline(
    device_name: str = "S01",
    epochs: int = 10,
    batch_size: int = 4,
    lr: float = 1e-4,
    progress=gr.Progress()
) -> str:
    """Quick baseline training (10 epochs for testing).

    Auto-downloads the dataset when missing, trains with wandb and HF
    upload disabled, and returns a formatted status report string.
    """
    # Auto-download dataset if not available
    progress(0, desc="Checking dataset...")
    try:
        dataset_path = _ensure_dataset()
    except Exception as e:
        return f"โŒ Error downloading dataset: {str(e)}"

    progress(0, desc="Initializing trainer...")
    try:
        # Create trainer
        trainer = IPADTrainer(
            device_name=device_name,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            mem_dim=2000,
            checkpoint_dir=str(CHECKPOINT_DIR),
            wandb_project=None,  # Disable wandb for quick test
            hf_repo=None  # Disable auto-upload for quick test
        )

        progress(0.1, desc="Loading dataset...")

        # Train
        trainer.train(str(dataset_path))

        progress(1.0, desc="Training complete!")

        # Get latest checkpoint
        _, latest_checkpoint = _latest_checkpoint(device_name)

        result = f"""
โœ… Quick baseline training complete!

๐Ÿ“Š Configuration:
- Device: {device_name}
- Epochs: {epochs}
- Batch Size: {batch_size}
- Learning Rate: {lr}

๐Ÿ’พ Checkpoint:
- {latest_checkpoint.name if latest_checkpoint else 'No checkpoint saved'}

๐ŸŽฏ Next Steps:
1. Review training metrics
2. Run full 200-epoch training
3. Evaluate on test set
"""
        return result
    except Exception as e:
        return f"โŒ Training failed: {str(e)}\n\nPlease check the logs for details."


@spaces.GPU(duration=7200)  # Request GPU for 2 hours
def train_full_baseline(
    device_name: str = "S01",
    epochs: int = 200,
    batch_size: int = 4,
    lr: float = 1e-4,
    mem_dim: int = 2000,
    enable_wandb: bool = False,
    enable_hf_upload: bool = True,
    progress=gr.Progress()
) -> str:
    """Full baseline training (200 epochs).

    Optionally logs to WandB and uploads checkpoints to the HF Hub.
    Like the quick path, the dataset is auto-downloaded when missing
    (previously this returned an error instead — inconsistent with
    ``train_quick_baseline``).
    """
    progress(0, desc="Checking dataset...")
    try:
        dataset_path = _ensure_dataset()
    except Exception as e:
        return f"โŒ Error downloading dataset: {str(e)}"

    progress(0, desc="Initializing full training...")
    try:
        # Create trainer
        trainer = IPADTrainer(
            device_name=device_name,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            mem_dim=mem_dim,
            checkpoint_dir=str(CHECKPOINT_DIR),
            wandb_project="ipad-vad" if enable_wandb else None,
            hf_repo="MSherbinii/ipad-vad-checkpoints" if enable_hf_upload else None
        )

        progress(0.05, desc="Loading dataset...")

        # Train
        trainer.train(str(dataset_path))

        progress(1.0, desc="Training complete!")

        # Get final checkpoint
        checkpoints, latest_checkpoint = _latest_checkpoint(device_name)

        result = f"""
โœ… Full baseline training complete!

๐Ÿ“Š Configuration:
- Device: {device_name}
- Epochs: {epochs}
- Batch Size: {batch_size}
- Learning Rate: {lr}
- Memory Dimension: {mem_dim}

๐Ÿ’พ Checkpoints:
- Total saved: {len(checkpoints)}
- Latest: {latest_checkpoint.name if latest_checkpoint else 'None'}

โ˜๏ธ HuggingFace Hub:
- {'โœ… Uploaded to MSherbinii/ipad-vad-checkpoints' if enable_hf_upload else 'โŒ Upload disabled'}

๐Ÿ“ˆ WandB Logging:
- {'โœ… Logged to ipad-vad project' if enable_wandb else 'โŒ Logging disabled'}

๐ŸŽฏ Expected Performance:
- Target AUC for {device_name}: Check baseline results table
- Paper baseline avg: 68.6%
"""
        return result
    except Exception as e:
        return f"โŒ Training failed: {str(e)}\n\nPlease check the logs for details."
def list_checkpoints() -> str:
    """List all saved checkpoints.

    Returns a Markdown bullet list of every ``*.pth`` file in
    ``CHECKPOINT_DIR`` with its size in MB and last-modified timestamp,
    or a placeholder message when none exist.
    """
    checkpoints = sorted(CHECKPOINT_DIR.glob("*.pth"))

    if not checkpoints:
        return "๐Ÿ“ No checkpoints found"

    result = "๐Ÿ’พ **Available Checkpoints:**\n\n"
    for ckpt in checkpoints:
        # File size in MB and human-readable modification time per checkpoint
        size_mb = ckpt.stat().st_size / (1024 * 1024)
        modified = datetime.fromtimestamp(ckpt.stat().st_mtime).strftime("%Y-%m-%d %H:%M")
        result += f"- `{ckpt.name}` ({size_mb:.1f} MB, modified {modified})\n"

    return result


# Gradio Interface
# Tabs: Setup (dataset download + GPU smoke test), Quick Test (short run),
# Full Training (paper-length run), Checkpoints (file listing), Documentation.
with gr.Blocks(title="IPAD VAD Training on ZeroGPU", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ๐Ÿญ IPAD: Industrial Process Anomaly Detection Training")
    gr.Markdown("Train video anomaly detection models on ZeroGPU with the IPAD dataset")

    with gr.Tab("๐Ÿ“ฅ Setup"):
        # One-time dataset download (result is cached by setup_dataset)
        gr.Markdown("## 1๏ธโƒฃ Download Dataset from HF Hub")
        gr.Markdown("Downloads the 8.3GB IPAD dataset. **This only needs to be done once** - the dataset is cached.")

        download_btn = gr.Button("๐Ÿ“ฅ Download Dataset", variant="primary", size="lg")
        download_output = gr.Textbox(label="Download Status", lines=4)
        download_btn.click(setup_dataset, outputs=download_output)

        gr.Markdown("---")
        # GPU smoke test — requires no dataset, just a ZeroGPU allocation
        gr.Markdown("## 2๏ธโƒฃ Test GPU Access")
        gr.Markdown("Verify that ZeroGPU is working and the model loads correctly. **No dataset required.**")

        test_btn = gr.Button("๐Ÿงช Run GPU Test", variant="secondary")
        test_output = gr.JSON(label="GPU Test Results")
        test_btn.click(quick_gpu_test, outputs=test_output)

    with gr.Tab("โšก Quick Test (10 epochs)"):
        # Short training run to validate the end-to-end pipeline
        gr.Markdown("## Quick Baseline Test")
        gr.Markdown("Train for 10 epochs to verify everything works. Takes ~10-15 minutes.")

        with gr.Row():
            quick_device = gr.Dropdown(
                choices=SYNTHETIC_DEVICES,
                value="S01",
                label="Device"
            )
            quick_epochs = gr.Slider(5, 50, value=10, step=5, label="Epochs")

        with gr.Row():
            quick_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
            quick_lr = gr.Number(value=1e-4, label="Learning Rate", precision=6)

        quick_train_btn = gr.Button("๐Ÿš€ Start Quick Training", variant="primary", size="lg")
        quick_output = gr.Textbox(label="Training Results", lines=15)

        quick_train_btn.click(
            train_quick_baseline,
            inputs=[quick_device, quick_epochs, quick_batch, quick_lr],
            outputs=quick_output
        )

    with gr.Tab("๐ŸŽฏ Full Training (200 epochs)"):
        # Paper-length training run with optional WandB logging and HF upload
        gr.Markdown("## Full Baseline Training")
        gr.Markdown("Complete 200-epoch training to match paper results. Takes ~2-3 hours.")

        with gr.Row():
            full_device = gr.Dropdown(
                choices=SYNTHETIC_DEVICES,
                value="S01",
                label="Training Device"
            )
            full_epochs = gr.Slider(50, 300, value=200, step=10, label="Epochs")

        with gr.Row():
            full_batch = gr.Slider(1, 8, value=4, step=1, label="Batch Size")
            full_lr = gr.Number(value=1e-4, label="Learning Rate", precision=6)

        with gr.Row():
            full_mem_dim = gr.Slider(500, 2000, value=2000, step=100, label="Memory Dimension")
            full_wandb = gr.Checkbox(value=False, label="Enable WandB Logging")
            full_hf_upload = gr.Checkbox(value=True, label="Upload to HF Hub")

        full_train_btn = gr.Button("๐Ÿš€ Start Full Training", variant="primary", size="lg")
        full_output = gr.Textbox(label="Training Results", lines=20)

        full_train_btn.click(
            train_full_baseline,
            inputs=[full_device, full_epochs, full_batch, full_lr, full_mem_dim, full_wandb, full_hf_upload],
            outputs=full_output
        )

    with gr.Tab("๐Ÿ’พ Checkpoints"):
        gr.Markdown("## Checkpoint Management")

        refresh_btn = gr.Button("๐Ÿ”„ Refresh Checkpoint List")
        # Initial value computed once at UI build time; refresh button re-runs it
        checkpoint_list = gr.Markdown(value=list_checkpoints())
        refresh_btn.click(list_checkpoints, outputs=checkpoint_list)

        gr.Markdown("### Checkpoint Info")
        gr.Markdown("""
- Checkpoints are saved every 10 epochs
- Best model (lowest val loss) is automatically selected
- Files are in PyTorch `.pth` format
- Can be loaded with `torch.load(checkpoint_path)`
""")

    with gr.Tab("๐Ÿ“Š Documentation"):
        gr.Markdown("""
## IPAD VAD Training Guide

### Quick Start
1. **Download Dataset**: Go to "Setup" tab and download the IPAD dataset (once)
2. **GPU Test**: Verify GPU access in "Setup" tab
3. **Quick Test**: Train for 10 epochs in "Quick Test" tab to verify setup
4. **Full Training**: Launch 200-epoch training in "Full Training" tab

### Hardware
- **GPU**: NVIDIA H200 (via ZeroGPU)
- **VRAM**: 80GB HBM3
- **Duration**: 1-2 hours per full training session

### Model Architecture
- **Encoder**: Video Swin Transformer (768-dim features)
- **Memory**: 2000-dimensional learnable memory bank
- **Period Module**: 200-class temporal position classifier
- **Decoder**: I3D-based 3D decoder

### Expected Baseline Results (200 epochs)

| Device | AUC (%) | Device | AUC (%) |
|--------|---------|--------|---------|
| S01 | 69.5 | S07 | 60.6 |
| S02 | 63.9 | S08 | 85.6 |
| S03 | 70.6 | S09 | 71.2 |
| S04 | 58.3 | S10 | 62.2 |
| S05 | 86.2 | S11 | 60.9 |
| S06 | 61.2 | S12 | 67.1 |
| **Avg** | **68.6** | | |

### Training Configuration
- **Batch Size**: 4 (default, can increase with more VRAM)
- **Learning Rate**: 1e-4 (Adam optimizer)
- **Clip Length**: 16 frames
- **Frame Size**: 256ร—256 pixels
- **Mixed Precision**: FP16 (automatic)

### Loss Function
```
Total Loss = Reconstruction Loss + 0.0002 ร— Entropy Loss + 0.02 ร— Period Loss
```

### Resources
- [Paper](https://arxiv.org/abs/2404.15033)
- [Dataset](https://huggingface.co/datasets/MSherbinii/ipad-industrial-anomaly)
- [Original Code](https://github.com/LJF1113/IPAD)
- [Checkpoints](https://huggingface.co/MSherbinii/ipad-vad-checkpoints)

### Next Steps (SOTA Improvements)
After baseline reproduction:
1. **Modern Transformer**: Replace Video Swin โ†’ MViTv2 (+2-4% AUC)
2. **Diffusion Decoder**: Add diffusion-based reconstruction (+3-5% AUC)
3. **Enhanced Memory**: GWN regularization (+1-3% AUC)

**Target**: 75-80% average AUC (vs 68.6% baseline)
""")


if __name__ == "__main__":
    # Auto-start training if flag file exists.
    # The flag file is a JSON config, e.g. {"device": "S01", "epochs": 10};
    # it is deleted after reading so the run does not repeat on restart.
    autostart_flag = Path("./AUTOSTART_TRAINING")
    if autostart_flag.exists():
        print("๐Ÿš€ AUTO-START: Training flag detected, starting training...")
        try:
            # Read configuration from flag file
            config = json.loads(autostart_flag.read_text())
            device = config.get("device", "S01")
            epochs = config.get("epochs", 10)

            print(f"๐Ÿ“Š Configuration: Device={device}, Epochs={epochs}")

            # Remove flag to prevent re-running on every restart
            autostart_flag.unlink()

            # Download dataset first
            print("๐Ÿ“ฅ Downloading dataset...")
            DATASET_PATH = download_and_extract_dataset(cache_dir="./cache")
            print(f"โœ… Dataset ready at {DATASET_PATH}")

            # Start training in background thread so demo.launch() below
            # can still bring up the web UI while training runs.
            import threading

            def run_training():
                # Runs the short training pipeline with default batch size / lr
                print(f"๐Ÿ‹๏ธ Starting training on {device} for {epochs} epochs...")
                result = train_quick_baseline(device, epochs, 4, 1e-4)
                print(f"๐Ÿ“Š Training result:\n{result}")

            training_thread = threading.Thread(target=run_training, daemon=True)
            training_thread.start()
            print("โœ… Training started in background!")
        except Exception as e:
            print(f"โŒ Auto-start failed: {e}")

    demo.launch(server_name="0.0.0.0", server_port=7860)