import modal
import os

app = modal.App("finetune-census-phi3")

# Volumes
vol_dataset = modal.Volume.from_name("finetune-dataset")
vol_checkpoints = modal.Volume.from_name("model-checkpoints", create_if_missing=True)

# Image: Build from CUDA base to ensure compatibility
image = modal.Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10") \
    .apt_install("git") \
    .run_commands(
        "pip install --upgrade pip",
        "pip install packaging ninja psutil",
        "pip install unsloth_zoo",   # This will install compatible torch/torchvision
        "pip install torchvision",   # Ensure torchvision is installed
        # Skip flash-attn - it causes OOM during build and is optional
        "pip install xformers trl peft accelerate bitsandbytes wandb scipy huggingface_hub protobuf sentencepiece einops",
        "pip install --no-deps 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git'",
    ) \
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})


@app.function(
    image=image,
    volumes={
        "/data/dataset": vol_dataset,
        "/data/checkpoints": vol_checkpoints,
    },
    gpu="H200",      # Fastest GPU - 3-4x faster than A100
    timeout=86400,   # 24 hours
)
def finetune():
    from unsloth import FastLanguageModel
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from datasets import load_dataset
    import torch

    print("🚀 Starting Fine-tuning Job...")

    # 1. Configuration
    max_seq_length = 2048  # Can go up to 4096 for Phi-3
    dtype = None           # Auto detection
    load_in_4bit = True    # Use 4-bit quantization to reduce memory usage
    model_name = "unsloth/Phi-3-mini-4k-instruct"

    # 2. Load Model and Tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

    # 3. Add LoRA Adapters
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # Rank
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,     # Supports any value, but 0 is optimized
        bias="none",        # Supports any value, but "none" is optimized
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
        random_state=3407,
        use_rslora=False,   # Rank-stabilized LoRA
        loftq_config=None,  # LoftQ
    )

    # 4. Load Dataset
    # We generated JSONL files.
    # Format: {"instruction": ..., "input": ..., "output": ...}
    dataset = load_dataset(
        "json",
        data_files={
            "train": "/data/dataset/train.jsonl",
            "test": "/data/dataset/val.jsonl",
        },
    )

    # 5. Formatting Function (Alpaca format)
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

    def formatting_prompts_func(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]
        texts = []
        for instruction, input_text, output in zip(instructions, inputs, outputs):
            # Must add EOS_TOKEN, otherwise generation will go on forever!
            text = alpaca_prompt.format(instruction, input_text, output) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}

    dataset = dataset.map(formatting_prompts_func, batched=True)

    # 6. Training Arguments (Optimized for H200)
    training_args = TrainingArguments(
        per_device_train_batch_size=4,  # Increased for H200's 141 GB memory
        gradient_accumulation_steps=2,  # Effective batch size = 8
        warmup_steps=100,               # Increased for larger dataset
        max_steps=10000,                # ~4% of a full epoch, completes in ~90 minutes
        # num_train_epochs=1,           # Full epoch takes ~30 hours with 1.9M samples
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=100,              # Log less frequently
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",               # Disable wandb logging
        save_strategy="steps",
        save_steps=10000,               # Save a checkpoint every 10k steps
        save_total_limit=2,             # Keep only 2 checkpoints
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # Packing can make training up to 5x faster for short sequences
        args=training_args,
    )

    # 7. Train
    print("Training...")
    trainer_stats = trainer.train()

    # 8. Save Model
    print("Saving model to /data/checkpoints/phi3-census-lora...")
    model.save_pretrained("/data/checkpoints/phi3-census-lora")
    tokenizer.save_pretrained("/data/checkpoints/phi3-census-lora")

    # Optionally also export to GGUF - Unsloth supports it:
    # model.save_pretrained_gguf("/data/checkpoints/phi3-census-gguf", tokenizer, quantization_method="q4_k_m")

    # Commit volume so the checkpoints persist
    vol_checkpoints.commit()

    print("✅ Fine-tuning Complete!")


@app.local_entrypoint()
def main():
    finetune.remote()
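
# ------------------------------------------------------------------
# Usage notes (illustrative sketch, kept as comments so nothing here
# runs as part of the training job).
#
# Launch the job with the Modal CLI; "finetune_phi3.py" is a placeholder
# for whatever this file is actually named:
#
#   modal run finetune_phi3.py
#
# A minimal sketch of loading the saved LoRA adapter for inference with
# Unsloth afterwards, assuming the adapter directory written above has
# been copied locally to ./phi3-census-lora (that local path is an
# assumption, not something this script creates):
#
#   from unsloth import FastLanguageModel
#
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name="./phi3-census-lora",  # directory written by save_pretrained above
#       max_seq_length=2048,
#       load_in_4bit=True,
#   )
#   FastLanguageModel.for_inference(model)  # switch Unsloth to inference mode
# ------------------------------------------------------------------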