"""
RoboMind VLA — Task 12: tests.py

End-to-end tests and verification for the entire pipeline.
Runs on Modal CPU for data checks, GPU for model checks.

Usage:
    modal run tests.py
"""

from __future__ import annotations

import json

import modal

# Container image shared by every remote test function below: Debian slim
# with Python 3.11 and the pinned ML stack.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install(
        "numpy<2",
        "torch==2.4.0",
        "torchvision==0.19.0",
        "transformers==4.40.0",
        "peft==0.11.1",
        "accelerate==0.30.1",
        "datasets",
        "pillow",
        "huggingface_hub",
    )
    # Build-time step: write a stub `flash_attn` package into site-packages.
    # The stub consists of an empty `__init__.py` and a
    # `flash_attn_interface.py` whose two functions raise NotImplementedError.
    # NOTE(review): presumably this lets code that imports `flash_attn`
    # (e.g. the model's remote code) import successfully without the compiled
    # extension — confirm against the model code.
    # The site-packages path is hard-coded for Python 3.11 and must stay in
    # sync with `python_version` above.
    .run_commands(
        "python -c \""
        "import os, sys; "
        "d = os.path.join(sys.prefix, 'lib/python3.11/site-packages/flash_attn'); "
        "os.makedirs(d, exist_ok=True); "
        "open(os.path.join(d, '__init__.py'), 'w').write(''); "
        "open(os.path.join(d, 'flash_attn_interface.py'), 'w').write("
        "'def flash_attn_func(*a, **kw): raise NotImplementedError\\n"
        "def flash_attn_varlen_func(*a, **kw): raise NotImplementedError\\n'); "
        "print('flash_attn stub created')\""
    )
)

# Modal app that owns all test functions in this file.
app = modal.App("robomind-tests")
# Named persistent volume; created on first use if it does not exist yet.
# Functions that need it mount it explicitly via `volumes={"/data": volume}`.
volume = modal.Volume.from_name("robomind-data", create_if_missing=True)


@app.function(image=image, secrets=[modal.Secret.from_name("huggingface-secret")], timeout=300)
def test_data_repos():
    """Test 1: Verify HF data repos exist and have correct structure.

    Lists the files of the rollouts repo, the judge dataset repo and the LoRA
    adapter repo on the Hugging Face Hub and records basic structural facts
    about each one.

    Returns:
        dict with keys "rollouts", "dataset" and "adapter". Each value is a
        dict containing "exists" plus either per-repo details (file counts,
        presence flags, the "repo_type" that resolved) or an "error" string
        when the repo could not be listed.
    """
    import os

    from huggingface_hub import HfApi, login

    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    api = HfApi()

    def list_files(repo_id, repo_types):
        """Return (files, repo_type) for the first repo type that resolves.

        `list_repo_files` looks up a *model* repo unless `repo_type` is given,
        so dataset repos must be queried with `repo_type="dataset"`.
        """
        last_error = None
        for repo_type in repo_types:
            try:
                return api.list_repo_files(repo_id, repo_type=repo_type), repo_type
            except Exception as e:  # boundary: try the next type, re-raise the last
                last_error = e
        raise last_error

    results = {}

    # Check robomind-rollouts. The repo type is not visible from this file, so
    # keep the original lookup (model) and fall back to a dataset repo.
    try:
        files, repo_type = list_files("mitvho09/robomind-rollouts", ("model", "dataset"))
        mp4_count = sum(1 for f in files if f.endswith(".mp4"))
        has_metadata = "metadata.jsonl" in files
        results["rollouts"] = {
            "exists": True,
            "repo_type": repo_type,
            "mp4_files": mp4_count,
            "has_metadata": has_metadata,
            "total_files": len(files),
        }
        print(f"[test] rollouts: {mp4_count} mp4s, metadata={has_metadata}")
    except Exception as e:
        results["rollouts"] = {"exists": False, "error": str(e)}
        print(f"[test] rollouts FAILED: {e}")

    # Check robomind-loco-judge-dataset. Test 2 loads this repo with
    # `datasets.load_dataset`, so it is a dataset repo; query that type first.
    try:
        files, repo_type = list_files(
            "mitvho09/robomind-loco-judge-dataset", ("dataset", "model")
        )
        results["dataset"] = {
            "exists": True,
            "repo_type": repo_type,
            "total_files": len(files),
        }
        print(f"[test] dataset: {len(files)} files")
    except Exception as e:
        results["dataset"] = {"exists": False, "error": str(e)}
        print(f"[test] dataset FAILED: {e}")

    # Check adapter (a model repo: it is loaded with PeftModel.from_pretrained).
    try:
        files, repo_type = list_files("mitvho09/robomind-minicpm-loco-lora", ("model",))
        has_adapter = "adapter_model.safetensors" in files
        has_config = "adapter_config.json" in files
        results["adapter"] = {
            "exists": True,
            "repo_type": repo_type,
            "has_adapter_weights": has_adapter,
            "has_config": has_config,
            "total_files": len(files),
        }
        print(f"[test] adapter: weights={has_adapter}, config={has_config}")
    except Exception as e:
        results["adapter"] = {"exists": False, "error": str(e)}
        print(f"[test] adapter FAILED: {e}")

    return results


@app.function(
    image=image,
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=300,
)
def test_dataset_structure():
    """Test 2: Verify dataset has correct schema and content.

    Loads the "train" split of the judge dataset and records which required
    columns are present, the distinct envs/tiers, per-sample image-count
    statistics and the union of keys found in the `target_json` payloads.

    Returns:
        dict with "n_samples", one "has_<col>" flag per required column and
        "checks_completed". When the split is non-empty and all required
        columns exist it also contains "envs", "tiers", "min_images",
        "max_images", "avg_images" and "target_json_keys".

    Raises:
        Whatever `load_dataset` raises if the dataset cannot be loaded, and
        `json.JSONDecodeError` if a `target_json` value is not valid JSON.
    """
    import os

    from datasets import load_dataset
    from huggingface_hub import login

    # Authenticate the same way the other Hub-facing tests do, so that a
    # private/gated dataset repo can be loaded as well.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    ds = load_dataset("mitvho09/robomind-loco-judge-dataset", split="train")
    results = {"n_samples": len(ds)}

    required_cols = ["env", "tier", "episode_id", "images", "target_json"]
    present = set(ds.column_names)
    for col in required_cols:
        results[f"has_{col}"] = col in present

    # Without these guards a missing column raises KeyError below, and an
    # empty split breaks min()/max() and the average (division by zero).
    missing = [col for col in required_cols if col not in present]
    if missing or len(ds) == 0:
        results["checks_completed"] = False
        print(f"[test] dataset FAILED: {len(ds)} samples, missing columns: {missing}")
        return results

    envs = set(ds["env"])
    tiers = set(ds["tier"])
    results["envs"] = sorted(envs)
    results["tiers"] = sorted(tiers)

    # Single pass over the rows: collect image counts and target_json keys.
    n_images = []
    all_keys = set()
    for row in ds:
        n_images.append(len(row["images"]))
        all_keys.update(json.loads(row["target_json"]).keys())

    results["min_images"] = min(n_images)
    results["max_images"] = max(n_images)
    results["avg_images"] = sum(n_images) / len(n_images)
    results["target_json_keys"] = sorted(all_keys)
    results["checks_completed"] = True

    print(f"[test] dataset: {results['n_samples']} samples, {len(envs)} envs, {len(tiers)} tiers")
    print(f"[test] images per sample: min={results['min_images']}, max={results['max_images']}")
    print(f"[test] envs: {results['envs']}")
    print(f"[test] target keys: {results['target_json_keys']}")

    return results


@app.function(
    image=image,
    gpu="A100-40GB",
    volumes={"/data": volume},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=600,
)
def test_model_load():
    """Test 3: Verify model loads correctly with LoRA adapter.

    Loads, in order, the tokenizer, the base model and the LoRA adapter.
    The first stage that fails records `<stage>_loaded = False` together
    with `<stage>_error` and ends the test early.

    Returns:
        dict of per-stage flags ("tokenizer_loaded", "base_model_loaded",
        "adapter_loaded"), an error string for a failed stage, and
        "model_ready" = True when every stage succeeded.
    """
    import torch
    from transformers import AutoModel, AutoTokenizer
    from peft import PeftModel
    from huggingface_hub import login
    import os

    token = os.environ.get("HF_TOKEN")
    if token:
        login(token=token)

    base_repo = "openbmb/MiniCPM-V-2_6"
    adapter_repo = "mitvho09/robomind-minicpm-loco-lora"
    report = {}

    # Stage 1: tokenizer (only checked for loadability, not used afterwards).
    try:
        AutoTokenizer.from_pretrained(base_repo, trust_remote_code=True)
    except Exception as exc:
        report["tokenizer_loaded"] = False
        report["tokenizer_error"] = str(exc)
        print(f"[test] tokenizer FAILED: {exc}")
        return report
    report["tokenizer_loaded"] = True
    print("[test] tokenizer loaded")

    # Stage 2: base model in bfloat16, placed automatically on the device(s).
    try:
        backbone = AutoModel.from_pretrained(
            base_repo,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
    except Exception as exc:
        report["base_model_loaded"] = False
        report["base_model_error"] = str(exc)
        print(f"[test] base model FAILED: {exc}")
        return report
    report["base_model_loaded"] = True
    print("[test] base model loaded")

    # Stage 3: LoRA adapter on top of the base model.
    try:
        peft_model = PeftModel.from_pretrained(backbone, adapter_repo)
    except Exception as exc:
        report["adapter_loaded"] = False
        report["adapter_error"] = str(exc)
        print(f"[test] adapter FAILED: {exc}")
        return report
    report["adapter_loaded"] = True
    print("[test] LoRA adapter loaded")

    peft_model.eval()
    report["model_ready"] = True
    print("[test] model ready for inference")
    return report


@app.function(
    image=image,
    gpu="A100-40GB",
    volumes={"/data": volume},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    timeout=600,
)
def test_inference():
    """Test 4: Run a dummy inference to verify the model generates output.

    Loads the base model with the LoRA adapter, feeds it three random RGB
    frames plus the reward-model instruction, and records what comes back.

    Returns:
        dict with "response_length", "response_preview" (first 300 chars)
        and "contains_json" (True when the response contains both "{" and
        "}").

    Raises:
        Any exception from model loading or generation propagates; unlike
        Test 3 this test does not catch load failures.
    """
    import os
    import torch
    from transformers import AutoModel, AutoTokenizer
    from peft import PeftModel
    from huggingface_hub import login
    from PIL import Image
    import numpy as np

    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)

    results = {}

    tokenizer = AutoTokenizer.from_pretrained(
        "openbmb/MiniCPM-V-2_6", trust_remote_code=True
    )
    base_model = AutoModel.from_pretrained(
        "openbmb/MiniCPM-V-2_6",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(base_model, "mitvho09/robomind-minicpm-loco-lora")
    model.eval()

    # randint's upper bound is exclusive: 256 covers the full uint8 range.
    dummy_images = [
        Image.fromarray(np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8))
        for _ in range(3)
    ]

    instruction = (
        "You are RoboMind VLA, a vision-language reward model for humanoid "
        "locomotion. You are shown keyframes from a robot locomotion rollout. "
        "The robot was commanded to \"walk forward\". Analyze the rollout and "
        "respond with ONLY a JSON object with these exact keys: timestep_range, "
        "phase, command, command_followed, stability, fall_risk, gait_quality, "
        "predicted_reward, anomaly, explanation."
    )

    # Multi-image input follows the usage documented on the MiniCPM-V-2_6
    # model card: the PIL images go inside the message content, ahead of the
    # text, and `image` is None.
    # NOTE(review): the previous form (list passed as `image=` plus literal
    # "<image_NN>" text) is not a documented chat() input; confirm this
    # matches how the production inference path builds its prompt.
    user_content = [*dummy_images, instruction]

    with torch.no_grad():
        output = model.chat(
            image=None,
            msgs=[{"role": "user", "content": user_content}],
            tokenizer=tokenizer,
            max_new_tokens=256,
        )

    response = output if isinstance(output, str) else str(output)
    results["response_length"] = len(response)
    results["response_preview"] = response[:300]

    # Cheap structural check only: the response is not parsed as JSON.
    has_json = "{" in response and "}" in response
    results["contains_json"] = has_json

    print(f"[test] inference OK: {len(response)} chars")
    print(f"[test] response: {response[:300]}")
    return results


@app.function(image=image, volumes={"/data": volume}, timeout=300)
def test_volume_data():
    """Test 5: Verify Modal volume has expected data.

    The volume is mounted at /data (it must be listed in `volumes=`,
    otherwise the container has nothing at that path). Inspects the
    fine-tuning directory and checks whether the MiniCPM-V directory exists.

    Returns:
        dict with "ft_dir_exists" and "minicpm_cloned", plus, when the
        corresponding paths exist, "ft_images" (number of .jpg files),
        "train_samples" (length of train.json), "lora_output_exists" and
        "has_adapter_in_output".
    """
    import os

    results = {}

    ft_dir = "/data/ft"
    # Record the directory's presence explicitly so that a missing /data/ft
    # shows up as a failed flag instead of silently skipping every check.
    results["ft_dir_exists"] = os.path.isdir(ft_dir)
    if results["ft_dir_exists"]:
        images_dir = os.path.join(ft_dir, "images")
        if os.path.isdir(images_dir):
            n_images = sum(1 for f in os.listdir(images_dir) if f.endswith(".jpg"))
            results["ft_images"] = n_images
            print(f"[test] volume images: {n_images}")

        train_json = os.path.join(ft_dir, "train.json")
        if os.path.exists(train_json):
            with open(train_json, encoding="utf-8") as f:
                data = json.load(f)
            results["train_samples"] = len(data)
            print(f"[test] volume train.json: {len(data)} samples")

        lora_dir = os.path.join(ft_dir, "lora_output")
        if os.path.isdir(lora_dir):
            results["lora_output_exists"] = True
            has_adapter = any("adapter_model" in f for f in os.listdir(lora_dir))
            results["has_adapter_in_output"] = has_adapter
            print(f"[test] LoRA output exists, adapter={has_adapter}")

    mcpm_dir = "/data/MiniCPM-V"
    results["minicpm_cloned"] = os.path.exists(mcpm_dir)

    print(f"[test] volume data: {results}")
    return results


def _result_failed(value, key=""):
    """Return True if a (possibly nested) test result signals a failure.

    A failure is any boolean False value, or any entry stored under a key
    named "error" or ending in "_error", at any nesting depth of dicts.
    """
    if isinstance(value, dict):
        return any(_result_failed(v, str(k)) for k, v in value.items())
    if value is False:
        return True
    return key == "error" or key.endswith("_error")


@app.local_entrypoint()
def main():
    """Run all tests.

    Invokes the five remote tests in order, then prints a PASS/FAIL line per
    test and the totals. A test counts as failed when its result dict
    contains, at any depth, a False flag or an error entry.

    Returns:
        dict mapping test name to the result dict returned by that test.
    """
    print("=" * 60)
    print("RoboMind VLA — End-to-End Tests")
    print("=" * 60)

    all_results = {}

    print("\n--- Test 1: HF Data Repos ---")
    all_results["data_repos"] = test_data_repos.remote()

    print("\n--- Test 2: Dataset Structure ---")
    all_results["dataset_structure"] = test_dataset_structure.remote()

    print("\n--- Test 3: Model Load ---")
    all_results["model_load"] = test_model_load.remote()

    print("\n--- Test 4: Inference ---")
    all_results["inference"] = test_inference.remote()

    print("\n--- Test 5: Volume Data ---")
    all_results["volume_data"] = test_volume_data.remote()

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    passed = 0
    failed = 0
    for test_name, result in all_results.items():
        # Walk nested dicts too: test_data_repos reports per-repo sub-dicts,
        # whose False/"error" entries a top-level-only scan would miss.
        # Errors are detected by key, not by searching values for "error",
        # so free text such as a model response cannot trigger a failure.
        if _result_failed(result):
            status = "FAIL"
            failed += 1
        else:
            status = "PASS"
            passed += 1
        print(f"  {test_name}: {status}")

    print(f"\nTotal: {passed} passed, {failed} failed, {passed + failed} total")
    return all_results