import time

from unsloth import FastLanguageModel

# =============================================================================
# --- Configuration (adjust these three paths for your setup) ---
# =============================================================================
# 1. Local path of the base model (the same one used during training).
base_model_path = (
    "/home/aifeifei/AI_Data/develop/mini_tang/modules/Qwen3-4B-Thinking-2507"
)
# 2. Path of the freshly trained "burden LoRA" adapter.
lora_path = "QiMing-Polaris-Qwen3-4B-Thinking-2507_burden_trained_lora"
# 3. Maximum sequence length (keep identical to the training run).
max_seq_length = 4096
# =============================================================================

# --- Alpaca prompt template ---
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def run_inference(model, tokenizer, instruction, input_text, title="",
                  max_new_tokens=256, device="cuda"):
    """Format an Alpaca-style prompt, run generation, and time it.

    Args:
        model: A language model exposing ``generate`` (e.g. from Unsloth).
        tokenizer: The tokenizer matching ``model``.
        instruction: Text for the "### Instruction:" slot.
        input_text: Text for the "### Input:" slot.
        title: Banner label printed before the run.
        max_new_tokens: Generation budget (default 256, same as before).
        device: Device the input tensors are moved to (default "cuda").

    Returns:
        float: Wall-clock seconds spent inside ``model.generate``.
    """
    print(f"\n{'=' * 20} {title} {'=' * 20}")

    # Build the prompt with an empty response slot for the model to fill in.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    # Time only the generation call itself.
    start_time = time.time()
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,  # KV cache must stay on for maximum speed
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    duration = time.time() - start_time

    # Decode and print only the text after the response marker.
    # partition() is robust: if the marker were ever missing from the decoded
    # output, we fall back to the full text instead of raising IndexError,
    # which is what the old split(...)[1] would have done.
    response_full = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    _, marker, tail = response_full.partition("### Response:")
    response_only = tail.strip() if marker else response_full.strip()
    print("💬 生成的回答:")
    print(response_only)
    print(f"\n🕒 生成耗时: {duration:.4f} 秒")
    return duration


# --- Main program ---
# 1. Load the base model and tokenizer (no LoRA yet).
# NOTE(review): Unsloth's docs recommend calling
# FastLanguageModel.for_inference(model) before generation for faster
# inference — confirm whether that should be added before trusting timings.
print("✅ 步骤 1/3: 正在加载基础模型 (不含 LoRA)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    dtype=None,  # auto-detect
    load_in_4bit=True,
)
print("🎉 基础模型加载完成!")
# 2. Define a single probe question used for both timing runs.
instruction = "You are a helpful assistant. Provide a concise and accurate answer."
input_text = "What is the 'Burden-based Training' method for AI models, and why is it considered innovative?"

# 3. First run: plain base model, no adapter attached.
# NOTE(review): the very first generate() also pays CUDA/kernel warmup cost,
# which biases this comparison in favor of the second run — confirm with a
# warmup pass if the numbers matter.
duration_without_lora = run_inference(
    model, tokenizer, instruction, input_text, title="⚔️ 测试1: 纯基础模型"
)

# 4. Attach the trained "burden LoRA" adapter on top of the loaded model.
# PEFT integration lets us add the adapter in place, without reloading.
print(f"\n✅ 步骤 2/3: 正在加载并融合你的“负重 LoRA”从 '{lora_path}'...")
model.load_adapter(lora_path)
print("🎉 LoRA 融合完成!")

# 5. Second run: same question, base model + LoRA adapter.
duration_with_lora = run_inference(
    model, tokenizer, instruction, input_text, title="🚀 测试2: 基础模型 + 负重 LoRA"
)

# 6. Report both timings side by side and the relative change.
print(f"\n{'=' * 20} 最终对决 {'=' * 20}")
print(f"基础模型耗时: {duration_without_lora:.4f} 秒")
print(f"负重LoRA后耗时: {duration_with_lora:.4f} 秒")

if duration_with_lora >= duration_without_lora:
    print("\n🤔 速度没有明显提升,但请关注回答质量的变化!这可能是一种“质”的飞跃!")
else:
    delta = duration_without_lora - duration_with_lora
    improvement = (delta / duration_without_lora) * 100
    print(f"\n🏆 恭喜!“负重 LoRA” 带来了 {improvement:.2f}% 的速度提升!奇迹发生了!")

print("✅ 步骤 3/3: 对比测试完成!")