File size: 4,068 Bytes
2509a70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import time

from unsloth import FastLanguageModel

# =================================================================================
# --- Configuration (edit the three paths/values below for your setup) ---
# =================================================================================

# 1. Local path of the base model (the same one used during training).
base_model_path = (
    "/home/aifeifei/AI_Data/develop/mini_tang/modules/Qwen3-4B-Thinking-2507"
)

# 2. Path of the freshly trained "burden LoRA" adapter.
lora_path = "QiMing-Polaris-Qwen3-4B-Thinking-2507_burden_trained_lora"

# 3. Maximum sequence length (must match the value used during training).
max_seq_length = 4096

# =================================================================================


# --- Alpaca prompt template (do not modify) ---
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Lazily filled in by run_inference() with tokenizer.eos_token on first call.
EOS_TOKEN = None  # set automatically


def run_inference(model, tokenizer, instruction, input_text, title=""):
    """Run one timed generation pass and print the decoded answer.

    Formats *instruction*/*input_text* into the module-level Alpaca prompt,
    generates up to 256 new tokens on CUDA, prints the answer, and reports
    the wall-clock generation time.

    Args:
        model: A generation-ready language model resident on a CUDA device.
        tokenizer: The tokenizer matching *model*.
        instruction: Instruction text inserted into the prompt template.
        input_text: Input/context text inserted into the prompt template.
        title: Banner label printed before the run.

    Returns:
        float: Generation wall-clock duration in seconds.
    """
    # Cache the EOS token on first call.
    # NOTE(review): EOS_TOKEN is never read elsewhere in this file; the
    # assignment is kept only to preserve the module-level side effect.
    global EOS_TOKEN
    if EOS_TOKEN is None:
        EOS_TOKEN = tokenizer.eos_token

    print(f"\n{'=' * 20} {title} {'=' * 20}")

    # Build the prompt with an empty Response section for the model to complete.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Start timing (generation only — tokenization is excluded).
    start_time = time.time()

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,  # KV cache must stay enabled for fast decoding
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Stop timing.
    end_time = time.time()
    duration = end_time - start_time

    # Decode, then keep everything AFTER the first "### Response:" marker.
    # maxsplit=1 + [-1] keeps any later occurrences of the marker inside the
    # answer intact, and avoids an IndexError if the marker is ever absent
    # (the unbounded split()[1] previously broke in both cases).
    response_full = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    response_only = response_full.split("### Response:", 1)[-1].strip()

    print("💬 生成的回答:")
    print(response_only)
    print(f"\n🕒 生成耗时: {duration:.4f} 秒")
    return duration


# --- Main program ---

# 1. Load the base model and tokenizer (4-bit, no LoRA attached yet).
print("✅ 步骤 1/3: 正在加载基础模型 (不含 LoRA)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    dtype=None,  # auto-detect dtype
    load_in_4bit=True,
)
print("🎉 基础模型加载完成!")

# 2. Define a single test question reused by both timed runs.
instruction = "You are a helpful assistant. Provide a concise and accurate answer."
input_text = "What is the 'Burden-based Training' method for AI models, and why is it considered innovative?"

# 3. First run: base model only (no LoRA).
# NOTE(review): this first generate() call also pays one-time CUDA warm-up
# cost, which inflates the "without LoRA" timing — consider an untimed
# warm-up call before comparing the two durations.
duration_without_lora = run_inference(
    model, tokenizer, instruction, input_text, title="⚔️ 测试1: 纯基础模型"
)


# 4. Load and attach the trained "burden LoRA" adapter.
print(f"\n✅ 步骤 2/3: 正在加载并融合你的“负重 LoRA”从 '{lora_path}'...")
# Unsloth/PEFT allows attaching an adapter directly onto an already-loaded model.
model.load_adapter(lora_path)
print("🎉 LoRA 融合完成!")


# 5. Second run: base model + LoRA adapter.
duration_with_lora = run_inference(
    model, tokenizer, instruction, input_text, title="🚀 测试2: 基础模型 + 负重 LoRA"
)


# 6. Final comparison of the two wall-clock timings.
print(f"\n{'=' * 20} 最终对决 {'=' * 20}")
print(f"基础模型耗时:   {duration_without_lora:.4f} 秒")
print(f"负重LoRA后耗时: {duration_with_lora:.4f} 秒")

if duration_with_lora < duration_without_lora:
    # Relative speed-up of the LoRA run, as a percentage of the base time.
    improvement = (
        (duration_without_lora - duration_with_lora) / duration_without_lora
    ) * 100
    print(f"\n🏆 恭喜!“负重 LoRA” 带来了 {improvement:.2f}% 的速度提升!奇迹发生了!")
else:
    print("\n🤔 速度没有明显提升,但请关注回答质量的变化!这可能是一种“质”的飞跃!")

print("✅ 步骤 3/3: 对比测试完成!")