#!/usr/bin/env python3
"""LoRA merge + AWQ quantization script.

Run after training completes:
1. Merge the LoRA adapter into the base model.
2. AWQ-quantize the merged model (4-bit).
3. Upload the quantized model to the HuggingFace Hub.
"""

import os
import sys
import shutil
from datetime import datetime

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from awq import AutoAWQForCausalLM

# ============================================================
# Configuration
# ============================================================
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_MODEL = "hajimemat/qwen2.5-7b-glaive-fc-lora"  # trained LoRA adapter

# Output locations
MERGED_MODEL_DIR = "./merged_model"
QUANTIZED_MODEL_DIR = "./quantized_model"
OUTPUT_MODEL_ID = "hajimemat/qwen2.5-7b-glaive-fc-awq"

# AWQ quantization settings (4-bit, GEMM kernel, 128-wide groups)
AWQ_CONFIG = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}


def step1_merge_lora():
    """Step 1: merge the LoRA adapter into the base model.

    Loads BASE_MODEL and the LORA_MODEL adapter, merges the adapter
    weights into the base weights, and saves model + tokenizer to
    MERGED_MODEL_DIR.

    Returns:
        str: path to the directory containing the merged model
             (MERGED_MODEL_DIR).
    """
    print("\n" + "=" * 60)
    print("Step 1: Merging LoRA adapter to base model")
    print("=" * 60)
    print(f"Base model: {BASE_MODEL}")
    print(f"LoRA model: {LORA_MODEL}")

    # Load the base model
    print("\nLoading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Load the tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    # Apply the LoRA adapter on top of the base model
    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, LORA_MODEL)

    # Fold the adapter weights into the base weights
    print("Merging LoRA weights...")
    model = model.merge_and_unload()

    # Save merged model + tokenizer
    print(f"Saving merged model to {MERGED_MODEL_DIR}...")
    model.save_pretrained(MERGED_MODEL_DIR, safe_serialization=True)
    tokenizer.save_pretrained(MERGED_MODEL_DIR)

    # Free GPU memory before the quantization step
    del model
    del base_model
    torch.cuda.empty_cache()

    print("✅ Step 1 complete: LoRA merged")
    return MERGED_MODEL_DIR


def step2_quantize_awq(merged_model_path):
    """Step 2: AWQ-quantize the merged model to 4-bit.

    Args:
        merged_model_path (str): directory containing the merged
            full-precision model produced by step 1.

    Returns:
        str: path to the directory containing the quantized model
             (QUANTIZED_MODEL_DIR).
    """
    print("\n" + "=" * 60)
    print("Step 2: AWQ Quantization (4-bit)")
    print("=" * 60)
    print(f"Input model: {merged_model_path}")
    print(f"AWQ config: {AWQ_CONFIG}")

    # Load the merged model for quantization
    print("\nLoading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(
        merged_model_path,
        trust_remote_code=True,
        safetensors=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(merged_model_path)

    # Calibration data for quantization.
    # A handful of simple samples is sufficient here.
    calib_data = [
        "Hello, how are you today?",
        "What is the weather like?",
        "Can you help me with coding?",
        "Search for files containing docker",
        "List all repositories in the project",
        "Find the definition of the function main",
        "こんにちは、今日の天気はどうですか?",
        "プロジェクトのファイル構成を教えてください",
    ]

    # Run quantization
    print("\nQuantizing model (this may take a while)...")
    model.quantize(
        tokenizer,
        quant_config=AWQ_CONFIG,
        calib_data=calib_data,
    )

    # Save quantized model + tokenizer
    print(f"\nSaving quantized model to {QUANTIZED_MODEL_DIR}...")
    model.save_quantized(QUANTIZED_MODEL_DIR, safetensors=True)
    tokenizer.save_pretrained(QUANTIZED_MODEL_DIR)

    print("✅ Step 2 complete: AWQ quantization done")
    return QUANTIZED_MODEL_DIR


def step3_upload_to_hub(quantized_model_path):
    """Step 3: upload the quantized model to the HuggingFace Hub.

    Creates the (private) OUTPUT_MODEL_ID repository if it does not
    already exist, then uploads the whole model folder.

    Args:
        quantized_model_path (str): directory containing the quantized
            model produced by step 2.
    """
    print("\n" + "=" * 60)
    print("Step 3: Upload to HuggingFace Hub")
    print("=" * 60)
    print(f"Uploading to: {OUTPUT_MODEL_ID}")

    from huggingface_hub import HfApi, upload_folder

    api = HfApi()

    # Create the repository if it does not exist yet.
    # exist_ok=True already tolerates an existing repo; the try/except
    # keeps the pipeline going on other non-fatal repo-creation errors.
    try:
        api.create_repo(OUTPUT_MODEL_ID, private=True, exist_ok=True)
    except Exception as e:
        print(f"Note: {e}")

    # Upload the model folder
    print("Uploading files...")
    upload_folder(
        folder_path=quantized_model_path,
        repo_id=OUTPUT_MODEL_ID,
        repo_type="model",
    )

    print(f"✅ Step 3 complete: Uploaded to https://huggingface.co/{OUTPUT_MODEL_ID}")


def main():
    """Run the full merge -> quantize -> upload pipeline."""
    print("\n" + "=" * 70)
    print(" LoRA Merge + AWQ Quantization Pipeline")
    print("=" * 70)
    print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Base: {BASE_MODEL}")
    print(f"LoRA: {LORA_MODEL}")
    print(f"Output: {OUTPUT_MODEL_ID}")
    print("=" * 70)

    # GPU availability check (informational only; the pipeline still runs on CPU)
    if torch.cuda.is_available():
        print(f"\nGPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("WARNING: No GPU available, this will be slow!")

    # Step 1: merge LoRA into the base model
    merged_path = step1_merge_lora()

    # Step 2: quantize the merged model
    quantized_path = step2_quantize_awq(merged_path)

    # Step 3: upload to the Hub
    step3_upload_to_hub(quantized_path)

    # Optional cleanup of intermediate artifacts
    print("\n" + "=" * 60)
    print("Cleanup")
    print("=" * 60)
    cleanup = input("Delete intermediate files? (merged_model/) [y/N]: ").strip().lower()
    if cleanup == 'y':
        shutil.rmtree(MERGED_MODEL_DIR, ignore_errors=True)
        print("Cleaned up merged_model/")

    print("\n" + "=" * 70)
    print("🎉 Pipeline complete!")
    print(f"Model available at: https://huggingface.co/{OUTPUT_MODEL_ID}")
    print("=" * 70)


if __name__ == "__main__":
    main()