# Fragmented-Training / 2.inference-comparison.py
# (Hugging Face file-viewer header preserved as a comment:
#  uploaded by aifeifei798, "Upload 2 files", commit 2509a70 verified,
#  raw / history / blame, 4.07 kB)
import time
from unsloth import FastLanguageModel
# =================================================================================
# --- Configuration (edit these three paths for your own setup) ---
# =================================================================================
# 1. Local path of the base model (the one used during training).
base_model_path = (
    "/home/aifeifei/AI_Data/develop/mini_tang/modules/Qwen3-4B-Thinking-2507"
)
# 2. Path of the freshly trained "burden LoRA" adapter.
lora_path = "QiMing-Polaris-Qwen3-4B-Thinking-2507_burden_trained_lora"
# 3. Maximum sequence length (keep consistent with training).
max_seq_length = 4096
# =================================================================================
# --- Alpaca prompt template (no changes needed) ---
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
EOS_TOKEN = None  # Set automatically on the first call to run_inference()
def run_inference(model, tokenizer, instruction, input_text, title=""):
    """Format an Alpaca-style prompt, generate a completion, and time it.

    Args:
        model: Causal LM exposing a HuggingFace-style ``generate`` method.
        tokenizer: Matching tokenizer (used for encoding, decoding, and
            its ``eos_token`` / ``pad_token_id`` attributes).
        instruction: Text placed in the ``### Instruction:`` slot.
        input_text: Text placed in the ``### Input:`` slot.
        title: Banner title printed before the run.

    Returns:
        float: Wall-clock generation time in seconds.
    """
    global EOS_TOKEN
    # Lazily capture the tokenizer's EOS token on the first call.
    if EOS_TOKEN is None:
        EOS_TOKEN = tokenizer.eos_token
    print(f"\n{'=' * 20} {title} {'=' * 20}")
    # Build the prompt, leaving the Response slot empty for the model to fill.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    # Time only the generation step.
    start_time = time.time()
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,  # KV cache must be on for fastest decoding
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    duration = time.time() - start_time
    # Decode and print only the model's answer.
    response_full = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # partition() is safe even if the marker is missing from the decoded text;
    # the original split("### Response:")[1] raised IndexError in that case.
    response_only = response_full.partition("### Response:")[2].strip()
    print("💬 生成的回答:")
    print(response_only)
    print(f"\n🕒 生成耗时: {duration:.4f} 秒")
    return duration
# --- Main program ---
# 1. Load the base model and tokenizer (4-bit quantized, no LoRA attached).
print("✅ 步骤 1/3: 正在加载基础模型 (不含 LoRA)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    dtype=None,  # auto-detect dtype for the current hardware
    load_in_4bit=True,
)
print("🎉 基础模型加载完成!")
# 2. Define one test question, reused verbatim for both timed runs so the
#    comparison is apples-to-apples.
instruction = "You are a helpful assistant. Provide a concise and accurate answer."
input_text = "What is the 'Burden-based Training' method for AI models, and why is it considered innovative?"
# 3. First timed run: plain base model (no LoRA).
duration_without_lora = run_inference(
    model, tokenizer, instruction, input_text, title="⚔️ 测试1: 纯基础模型"
)
# 4. Attach the trained "burden LoRA" adapter.
print(f"\n✅ 步骤 2/3: 正在加载并融合你的“负重 LoRA”从 '{lora_path}'...")
# Unsloth/PEFT allows adding an adapter directly onto an already-loaded model.
model.load_adapter(lora_path)
print("🎉 LoRA 融合完成!")
# 5. Second timed run: base model + LoRA.
duration_with_lora = run_inference(
    model, tokenizer, instruction, input_text, title="🚀 测试2: 基础模型 + 负重 LoRA"
)
# 6. Final comparison of the two wall-clock timings.
print(f"\n{'=' * 20} 最终对决 {'=' * 20}")
print(f"基础模型耗时: {duration_without_lora:.4f} 秒")
print(f"负重LoRA后耗时: {duration_with_lora:.4f} 秒")
if duration_with_lora < duration_without_lora:
    # Relative speed-up, as a percentage of the baseline duration.
    improvement = (
        (duration_without_lora - duration_with_lora) / duration_without_lora
    ) * 100
    print(f"\n🏆 恭喜!“负重 LoRA” 带来了 {improvement:.2f}% 的速度提升!奇迹发生了!")
else:
    print("\n🤔 速度没有明显提升,但请关注回答质量的变化!这可能是一种“质”的飞跃!")
print("✅ 步骤 3/3: 对比测试完成!")