#!/usr/bin/env python3
"""Phase 2: SFT training on Qwen3-4B.

LoRA fine-tunes the base model on the dataset produced by phase 1, then
saves both the adapter and a merged full-weight checkpoint.
"""
import os
import time
from pathlib import Path

import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
BASE_MODEL = "Qwen/Qwen3-4B"
DATA_DIR = Path("./qwen3_pipeline/data")
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

EPOCHS = 1
BATCH_SIZE = 2
GRAD_ACCUM = 8
LR = 2e-4
MAX_SEQ_LEN = 4096

LORA_RANK = 32
LORA_ALPHA = 64
LORA_TARGETS = ["q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj"]

print("="*70)
print("PHASE 2: SFT TRAINING")
print("="*70)

# ---------------------------------------------------------------------------
# [1/4] Load model and tokenizer
# ---------------------------------------------------------------------------
print(f"\n[1/4] Loading {BASE_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as pad; labels for pad positions are masked below so this
    # does not teach the model to emit EOS over the padded tail.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # right-pad so real tokens stay left-aligned

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",
)
print(f" Model loaded")
# Guard the diagnostic so a CPU-only run does not crash here.
if torch.cuda.is_available():
    print(f" GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")

# ---------------------------------------------------------------------------
# [2/4] Apply LoRA
# ---------------------------------------------------------------------------
print(f"\n[2/4] Applying LoRA...")
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGETS,
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian",
    use_rslora=True,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Required so gradient checkpointing works with frozen base weights:
# inputs must carry requires_grad for the checkpointed segments.
model.enable_input_require_grads()

# ---------------------------------------------------------------------------
# [3/4] Load and tokenize data
# ---------------------------------------------------------------------------
print(f"\n[3/4] Loading and tokenizing data...")
dataset = load_from_disk(str(DATA_DIR / "sft"))
print(f" Dataset: {len(dataset)} samples")


def tokenize_function(examples):
    """Render chat messages to text and tokenize for causal-LM SFT.

    Args:
        examples: batched dataset slice with a ``messages`` column
            (list of chat-format message lists).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``;
        label positions corresponding to padding are set to -100 so the
        cross-entropy loss ignores them.
    """
    texts = []
    for msg in examples["messages"]:
        text = tokenizer.apply_chat_template(
            msg, tokenize=False, add_generation_prompt=False
        )
        # NOTE(review): Qwen chat templates typically already terminate
        # with <|im_end|> (the EOS token), so this append may duplicate
        # it — confirm against the actual template.
        texts.append(text + tokenizer.eos_token)

    result = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding="max_length",
        return_tensors=None,
    )

    # Build labels fresh (no aliasing of the input_ids inner lists) and
    # mask pad positions with -100. Without this, and with pad == eos,
    # the model would be trained to emit endless EOS over the padding.
    result["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(result["input_ids"], result["attention_mask"])
    ]
    return result


print(" Tokenizing...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing",
    num_proc=4,
)
print(f" Tokenized: {len(tokenized_dataset)} samples")

# ---------------------------------------------------------------------------
# [4/4] Train
# ---------------------------------------------------------------------------
print(f"\n[4/4] Training...")
steps_per_epoch = len(tokenized_dataset) // (BATCH_SIZE * GRAD_ACCUM)
total_steps = steps_per_epoch * EPOCHS
print(f" Batch size: {BATCH_SIZE}")
print(f" Grad accum: {GRAD_ACCUM}")
print(f" Effective batch: {BATCH_SIZE * GRAD_ACCUM}")
print(f" Steps per epoch: {steps_per_epoch}")
print(f" Total steps: {total_steps}")
print(f" Learning rate: {LR}")
print(f" Estimated time: ~30-40 min")

training_args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    bf16=True,
    logging_steps=10,
    save_strategy="no",      # final adapter/merged checkpoints saved manually below
    optim="adamw_torch",
    gradient_checkpointing=True,
    seed=42,
    report_to="none",
    dataloader_num_workers=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

print(f"\n{'='*70}")
print("TRAINING STARTED")
print(f"{'='*70}\n")

start = time.time()
trainer.train()
elapsed = (time.time() - start) / 60

print(f"\n{'='*70}")
print(f"✓ TRAINING COMPLETE: {elapsed:.1f} minutes")
print(f"{'='*70}")

# Save the LoRA adapter (small; keeps a re-mergeable artifact).
print(f"\nSaving model...")
adapter_path = CKPT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f" ✓ Adapter: {adapter_path}")

# Bake the LoRA deltas into the base weights for standalone inference.
print(f"\nMerging LoRA weights...")
model = model.merge_and_unload()
merged_path = CKPT_DIR / "merged"
# Persist the merged (LoRA-baked) model together with its tokenizer.
for artifact in (model, tokenizer):
    artifact.save_pretrained(str(merged_path))
print(f" ✓ Merged: {merged_path}")

# Release the big objects and return cached GPU memory before exiting.
del model, trainer
torch.cuda.empty_cache()

banner = "=" * 70
print(f"\n{banner}")
print("✓ PHASE 2 COMPLETE")
print(f"{banner}")
print(f"\nTime: {elapsed:.1f} minutes")
print(f"Cost: ~${elapsed/60 * 1.15:.2f}")
print("\n➡️ Next: python phase3_eval.py")