import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ID of the base LLM that the LoRA adapter was trained on top of.
BASE_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# ID of the LoRA adapter applied to the base model below.
ADAPTER_MODEL_ID = "Snow2222/autotrain-fst"

print("🚀 正在加载 Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

print("🚀 正在加载 Base Model（基础模型）...")
# Pick the target device and matching precision in one place:
# float16 when CUDA is available, float32 otherwise.
if torch.cuda.is_available():
    device = "cuda"
    model_dtype = torch.float16
else:
    device = "cpu"
    model_dtype = torch.float32

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=model_dtype,
    device_map="auto",
)

# Work around the `size mismatch` error: resize the base model's token
# embeddings so the vocabulary size matches the one the LoRA adapter expects.
print("🔧 调整 vocab_size 以匹配 LoRA...")
new_vocab_size = 151665
base_model.resize_token_embeddings(new_vocab_size)

print("🚀 正在加载 LoRA 适配器...")
# Wrap the base model with the adapter, then move the result to `device`.
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_ID)
model = model.to(device)

# ======== Amplify the influence of the LoRA weights ========
# 1) Resolve the adapter name ("default" unless a custom name was used).
adapter_name = model.active_adapter or "default"

# 2) Raise lora_alpha in the adapter's config.
peft_config = model.peft_config[adapter_name]
print(f"【原始】lora_alpha: {peft_config.lora_alpha}")
peft_config.lora_alpha = 128  # larger value to experiment with
print(f"【更新后】lora_alpha: {peft_config.lora_alpha}")

# 3) Propagate the new alpha to every LoRA layer.
#    The per-layer `scaling` factor is NOT alpha itself: PEFT defines it as
#    lora_alpha / r (or lora_alpha / sqrt(r) when rank-stabilized LoRA is on).
#    Writing the raw alpha into `scaling` would over-amplify the adapter by a
#    factor of r, so the factor is recomputed from each layer's own rank.
use_rslora = getattr(peft_config, "use_rslora", False)
for module_name, module in model.named_modules():
    # LoRA layers expose `scaling` as a dict keyed by adapter name.
    layer_scaling = getattr(module, "scaling", None)
    if not isinstance(layer_scaling, dict) or adapter_name not in layer_scaling:
        continue

    # Prefer the layer's own rank (it may differ per layer); fall back to the
    # rank declared in the adapter config.
    layer_ranks = getattr(module, "r", None)
    rank = layer_ranks.get(adapter_name) if isinstance(layer_ranks, dict) else None
    if not rank:
        rank = getattr(peft_config, "r", None)
    if not rank:
        # Without a usable rank the scaling cannot be derived; leave as-is.
        continue

    divisor = rank ** 0.5 if use_rslora else rank
    layer_scaling[adapter_name] = peft_config.lora_alpha / divisor

    # Keep the layer's own alpha record consistent with the config, so any
    # later rescaling derived from it starts from the new value.
    layer_alphas = getattr(module, "lora_alpha", None)
    if isinstance(layer_alphas, dict) and adapter_name in layer_alphas:
        layer_alphas[adapter_name] = peft_config.lora_alpha
# ======== End of LoRA amplification ========

def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate the assistant's reply to a single user message.

    Args:
        message: The user's latest chat message.
        history: Previous chat turns supplied by the chat UI. Unused: only
            the current message is sent to the model.
        system_message: Instruction text placed at the start of the prompt.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The newly generated text only (the prompt is not echoed back).
    """
    print("==== 🚀 处理用户输入 ====")
    print(f"用户输入: {message}")

    # Build a simple prompt that includes the system message.
    prompt = f"{system_message}\n用户: {message}\n助手:"
    print(f"📡 处理 Prompt: {prompt}")

    # Tokenize the full prompt (not just the raw message) so the system
    # message actually reaches the model.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # UI sliders may deliver floats; generate() needs an int count.
            max_new_tokens=int(max_tokens),
            # Sampling must be enabled for temperature/top_p to take effect.
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
        )

    # The output sequence starts with the input tokens; decode only the
    # continuation so the reply does not repeat the prompt.
    generated_tokens = output[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(f"✅ 生成结果: {response}")

    return response

# Gradio UI. The extra controls are listed in the same order as the trailing
# parameters of `respond`: system message, max new tokens, temperature, top-p.
system_message_box = gr.Textbox(
    label="System message",
    value="You are a friendly Chatbot.",
)
max_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=1024,
    step=1,
    value=256,
)
temperature_slider = gr.Slider(
    label="Temperature",
    minimum=0.1,
    maximum=4.0,
    step=0.1,
    value=0.7,
)
top_p_slider = gr.Slider(
    label="Top-p",
    minimum=0.1,
    maximum=1.0,
    step=0.05,
    value=0.95,
)

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
    ],
)

# Launch the web UI only when this file is run as a script.
if __name__ == "__main__":
    print("🌍 启动 Gradio 界面...")
    demo.launch()