import time

from unsloth import FastLanguageModel

# =============================================================================
# --- Configuration (adjust these three paths for your setup) ---
# =============================================================================
# 1. Local path of the base model (the same one used during training).
base_model_path = (
    "/home/aifeifei/AI_Data/develop/mini_tang/modules/Qwen3-4B-Thinking-2507"
)
# 2. Path of the freshly trained "burden LoRA" adapter.
lora_path = "QiMing-Polaris-Qwen3-4B-Thinking-2507_burden_trained_lora"
# 3. Maximum sequence length (keep identical to the training run).
max_seq_length = 4096
# =============================================================================

# --- Alpaca prompt template ---
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def run_inference(model, tokenizer, instruction, input_text, title="",
                  max_new_tokens=256, device="cuda"):
    """Format an Alpaca-style prompt, run generation, and time it.

    Args:
        model: A language model exposing ``generate`` (e.g. from Unsloth).
        tokenizer: The tokenizer matching ``model``.
        instruction: Text for the "### Instruction:" slot.
        input_text: Text for the "### Input:" slot.
        title: Banner label printed before the run.
        max_new_tokens: Generation budget (default 256, same as before).
        device: Device the input tensors are moved to (default "cuda").

    Returns:
        float: Wall-clock seconds spent inside ``model.generate``.
    """
    print(f"\n{'=' * 20} {title} {'=' * 20}")

    # Build the prompt with an empty response slot for the model to fill in.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    # Time only the generation call itself.
    start_time = time.time()
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,  # KV cache must stay on for maximum speed
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    duration = time.time() - start_time

    # Decode and print only the text after the response marker.
    # partition() is robust: if the marker were ever missing from the decoded
    # output, we fall back to the full text instead of raising IndexError,
    # which is what the old split(...)[1] would have done.
    response_full = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    _, marker, tail = response_full.partition("### Response:")
    response_only = tail.strip() if marker else response_full.strip()
    print("💬 生成的回答:")
    print(response_only)
    print(f"\n🕒 生成耗时: {duration:.4f} 秒")
    return duration


# --- Main program ---
# 1. Load the base model and tokenizer (no LoRA yet).
# NOTE(review): Unsloth's docs recommend calling
# FastLanguageModel.for_inference(model) before generation for faster
# inference — confirm whether that should be added before trusting timings.
print("✅ 步骤 1/3: 正在加载基础模型 (不含 LoRA)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    dtype=None,  # auto-detect
    load_in_4bit=True,
)
print("🎉 基础模型加载完成!")
# 2. Define a single probe question used for both timing runs.
instruction = "You are a helpful assistant. Provide a concise and accurate answer."
input_text = "What is the 'Burden-based Training' method for AI models, and why is it considered innovative?"

# 3. First run: plain base model, no adapter attached.
# NOTE(review): the very first generate() also pays CUDA/kernel warmup cost,
# which biases this comparison in favor of the second run — confirm with a
# warmup pass if the numbers matter.
duration_without_lora = run_inference(
    model, tokenizer, instruction, input_text, title="⚔️ 测试1: 纯基础模型"
)

# 4. Attach the trained "burden LoRA" adapter on top of the loaded model.
# PEFT integration lets us add the adapter in place, without reloading.
print(f"\n✅ 步骤 2/3: 正在加载并融合你的“负重 LoRA”从 '{lora_path}'...")
model.load_adapter(lora_path)
print("🎉 LoRA 融合完成!")

# 5. Second run: same question, base model + LoRA adapter.
duration_with_lora = run_inference(
    model, tokenizer, instruction, input_text, title="🚀 测试2: 基础模型 + 负重 LoRA"
)

# 6. Report both timings side by side and the relative change.
print(f"\n{'=' * 20} 最终对决 {'=' * 20}")
print(f"基础模型耗时: {duration_without_lora:.4f} 秒")
print(f"负重LoRA后耗时: {duration_with_lora:.4f} 秒")

if duration_with_lora >= duration_without_lora:
    print("\n🤔 速度没有明显提升,但请关注回答质量的变化!这可能是一种“质”的飞跃!")
else:
    delta = duration_without_lora - duration_with_lora
    improvement = (delta / duration_without_lora) * 100
    print(f"\n🏆 恭喜!“负重 LoRA” 带来了 {improvement:.2f}% 的速度提升!奇迹发生了!")

print("✅ 步骤 3/3: 对比测试完成!")