|
|
import time |
|
|
|
|
|
from unsloth import FastLanguageModel |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Local directory of the base model checkpoint that is loaded first.
base_model_path = (
    "/home/aifeifei/AI_Data/develop/mini_tang/modules/Qwen3-4B-Thinking-2507"
)

# LoRA adapter that is loaded on top of the base model for the second run.
lora_path = "QiMing-Polaris-Qwen3-4B-Thinking-2507_burden_trained_lora"

# Forwarded to FastLanguageModel.from_pretrained() as max_seq_length.
max_seq_length = 4096
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Alpaca-style prompt template with three positional "{}" slots, in order:
# instruction, input, response. run_inference() fills the response slot with
# an empty string so the model continues right after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Module-level cache for the tokenizer's EOS token string; it stays None until
# run_inference() fills it in on its first call.
EOS_TOKEN = None
|
|
|
|
|
|
|
|
def run_inference(model, tokenizer, instruction, input_text, title=""):
    """Format an Alpaca prompt, generate a reply, print it and time the call.

    Args:
        model: Model object exposing ``generate()``; its ``device`` attribute
            is used when present (assumed to be a Hugging Face-style causal
            LM -- confirm against the caller).
        tokenizer: Tokenizer paired with ``model``; it is called on the
            prompt and used to decode the generated ids.
        instruction: Text placed in the "### Instruction:" slot.
        input_text: Text placed in the "### Input:" slot.
        title: Label shown in the banner printed before generation.

    Returns:
        Seconds elapsed inside ``model.generate()``. The figure covers however
        many tokens were produced (at most 256), so two runs are only directly
        comparable when their outputs have similar lengths.
    """
    global EOS_TOKEN
    if EOS_TOKEN is None:
        # Cache the EOS string at module level on the first call.
        EOS_TOKEN = tokenizer.eos_token

    print(f"\n{'=' * 20} {title} {'=' * 20}")

    # The response slot is left empty so generation starts after the marker.
    prompt = alpaca_prompt.format(instruction, input_text, "")

    # Follow the model's own device; fall back to the previous hard-coded
    # "cuda" for objects that do not expose one.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # generate() needs a concrete pad id; tokenizers without a pad token
    # report None, in which case the EOS id is used instead.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # perf_counter() is monotonic, unlike time.time(), so the measured
    # interval cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
    )
    duration = time.perf_counter() - start_time

    # Decode only the tokens after the prompt. Splitting the full text on
    # "### Response:" would cut the reply short if the model emits the marker
    # again, and would pick the wrong span if the prompt text contains it.
    generated_ids = outputs[0][prompt_length:]
    response_only = tokenizer.decode(
        generated_ids, skip_special_tokens=True
    ).strip()

    print("💬 生成的回答:")
    print(response_only)
    print(f"\n🕒 生成耗时: {duration:.4f} 秒")
    return duration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Step 1: load the base model without any adapter and time a baseline run.
print("✅ 步骤 1/3: 正在加载基础模型 (不含 LoRA)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)
print("🎉 基础模型加载完成!")

# The same instruction/input pair is reused for both timed runs.
instruction = "You are a helpful assistant. Provide a concise and accurate answer."
input_text = "What is the 'Burden-based Training' method for AI models, and why is it considered innovative?"

# NOTE(review): this is the first generate() call in the process, so it likely
# also pays one-time warm-up costs that the second run does not -- consider an
# untimed warm-up call before trusting the comparison.
duration_without_lora = run_inference(
    model, tokenizer, instruction, input_text, title="⚔️ 测试1: 纯基础模型"
)
|
|
|
|
|
|
|
|
|
|
|
# Step 2: load the LoRA adapter onto the already-loaded model and time again.
print(f"\n✅ 步骤 2/3: 正在加载并融合你的“负重 LoRA”从 '{lora_path}'...")

# NOTE(review): presumably load_adapter() attaches the adapter to the model
# rather than merging its weights into the base layers -- confirm against the
# library documentation, since the printed message says "merge".
model.load_adapter(lora_path)
print("🎉 LoRA 融合完成!")

# Second timed run, using the same instruction and input as test 1.
duration_with_lora = run_inference(
    model, tokenizer, instruction, input_text, title="🚀 测试2: 基础模型 + 负重 LoRA"
)
|
|
|
|
|
|
|
|
|
|
|
# Step 3: print both timings side by side and summarise the outcome.
print(f"\n{'=' * 20} 最终对决 {'=' * 20}")
print(f"基础模型耗时: {duration_without_lora:.4f} 秒")
print(f"负重LoRA后耗时: {duration_with_lora:.4f} 秒")

if duration_with_lora < duration_without_lora:
    # Time saved relative to the baseline, as a percentage. The branch
    # condition guarantees the baseline duration is strictly positive here.
    improvement = (
        (duration_without_lora - duration_with_lora) / duration_without_lora
    ) * 100
    print(f"\n🏆 恭喜!“负重 LoRA” 带来了 {improvement:.2f}% 的速度提升!奇迹发生了!")
else:
    # Equal or slower timings both land here.
    print("\n🤔 速度没有明显提升,但请关注回答质量的变化!这可能是一种“质”的飞跃!")

print("✅ 步骤 3/3: 对比测试完成!")
|
|
|