import modal

app = modal.App("eval-finetuned-census")

# Volumes
vol_checkpoints = modal.Volume.from_name("model-checkpoints")

# Image: Same as training to ensure compatibility
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",   # This will install compatible torch/torchvision
        "pip install torchvision",   # Ensure torchvision is installed
        # Skip flash-attn - it causes OOM during build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'"
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})


@app.function(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    gpu="A10G",  # Inference can run on a smaller GPU
    timeout=600
)
def evaluate_model(questions: list):
    from unsloth import FastLanguageModel
    import torch

    print("🚀 Loading Fine-tuned Model...")

    max_seq_length = 2048
    dtype = None
    load_in_4bit = True

    # Load model + adapter
    # Note: We saved to /data/checkpoints/phi3-census-lora
    model_path = "/data/checkpoints/phi3-census-lora"

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,  # Load from local path (volume)
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

    print("✅ Model Loaded. Running Inference...")

    results = []

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""

    for q in questions:
        instruction = q["question"]
        input_context = q.get("context", "")

        inputs = tokenizer(
            [alpaca_prompt.format(instruction, input_context)],
            return_tensors="pt"
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
        response = tokenizer.batch_decode(outputs)

        # Extract response part
        full_text = response[0]
        # Simple parsing
        if "### Response:\n" in full_text:
            answer = full_text.split("### Response:\n")[1].split("<|endoftext|>")[0]
        else:
            answer = full_text

        results.append({
            "question": instruction,
            "answer": answer.strip()
        })

    return results


@app.local_entrypoint()
def main():
    questions = [
        {
            "question": "What is the population of Tokyo?",
            "context": "Context: Japan Census data."
        },
        {
            "question": "What is the average income in Osaka?",
            "context": "Context: Japan Economy & Labor data."
        },
        {
            "question": "How many households are in Hokkaido?",
            "context": "Context: Japan Census data."
        }
    ]

    answers = evaluate_model.remote(questions)

    for item in answers:
        print(f"\nQ: {item['question']}")
        print(f"A: {item['answer']}")
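
# Usage sketch (the filename below is an assumption, not from the source):
#   modal run eval_finetuned_census.py
# `modal run` invokes the @app.local_entrypoint() above; Modal builds the image,
# mounts the "model-checkpoints" volume at /data/checkpoints, runs evaluate_model()
# remotely on an A10G, and the answers are printed locally.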