# Fragmented-Training / 2.inference-comparison.py
# (Hugging Face file-viewer header preserved as a comment:
#  uploaded by aifeifei798, "Upload 2 files", commit 2509a70 verified,
#  raw / history / blame, 4.07 kB)
import time
from unsloth import FastLanguageModel
# =================================================================================
# --- Configuration (edit these three paths for your own setup) ---
# =================================================================================
# 1. Local path of the base model (the one used during training).
base_model_path = (
    "/home/aifeifei/AI_Data/develop/mini_tang/modules/Qwen3-4B-Thinking-2507"
)
# 2. Path of the freshly trained "burden LoRA" adapter.
lora_path = "QiMing-Polaris-Qwen3-4B-Thinking-2507_burden_trained_lora"
# 3. Maximum sequence length (keep consistent with training).
max_seq_length = 4096
# =================================================================================
# --- Alpaca prompt template (no changes needed) ---
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
EOS_TOKEN = None  # Set automatically on the first call to run_inference()
def run_inference(model, tokenizer, instruction, input_text, title=""):
    """Format an Alpaca-style prompt, generate a completion, and time it.

    Args:
        model: Causal LM exposing a HuggingFace-style ``generate`` method.
        tokenizer: Matching tokenizer (used for encoding, decoding, and
            its ``eos_token`` / ``pad_token_id`` attributes).
        instruction: Text placed in the ``### Instruction:`` slot.
        input_text: Text placed in the ``### Input:`` slot.
        title: Banner title printed before the run.

    Returns:
        float: Wall-clock generation time in seconds.
    """
    global EOS_TOKEN
    # Lazily capture the tokenizer's EOS token on the first call.
    if EOS_TOKEN is None:
        EOS_TOKEN = tokenizer.eos_token
    print(f"\n{'=' * 20} {title} {'=' * 20}")
    # Build the prompt, leaving the Response slot empty for the model to fill.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    # Time only the generation step.
    start_time = time.time()
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        use_cache=True,  # KV cache must be on for fastest decoding
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    duration = time.time() - start_time
    # Decode and print only the model's answer.
    response_full = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # partition() is safe even if the marker is missing from the decoded text;
    # the original split("### Response:")[1] raised IndexError in that case.
    response_only = response_full.partition("### Response:")[2].strip()
    print("💬 生成的回答:")
    print(response_only)
    print(f"\n🕒 生成耗时: {duration:.4f} 秒")
    return duration
# --- Main program ---
# 1. Load the base model and tokenizer (4-bit quantized, no LoRA attached).
print("✅ 步骤 1/3: 正在加载基础模型 (不含 LoRA)...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_path,
    max_seq_length=max_seq_length,
    dtype=None,  # auto-detect dtype for the current hardware
    load_in_4bit=True,
)
print("🎉 基础模型加载完成!")
# 2. Define one test question, reused verbatim for both timed runs so the
#    comparison is apples-to-apples.
instruction = "You are a helpful assistant. Provide a concise and accurate answer."
input_text = "What is the 'Burden-based Training' method for AI models, and why is it considered innovative?"
# 3. First timed run: plain base model (no LoRA).
duration_without_lora = run_inference(
    model, tokenizer, instruction, input_text, title="⚔️ 测试1: 纯基础模型"
)
# 4. Attach the trained "burden LoRA" adapter.
print(f"\n✅ 步骤 2/3: 正在加载并融合你的“负重 LoRA”从 '{lora_path}'...")
# Unsloth/PEFT allows adding an adapter directly onto an already-loaded model.
model.load_adapter(lora_path)
print("🎉 LoRA 融合完成!")
# 5. Second timed run: base model + LoRA.
duration_with_lora = run_inference(
    model, tokenizer, instruction, input_text, title="🚀 测试2: 基础模型 + 负重 LoRA"
)
# 6. Final comparison of the two wall-clock timings.
print(f"\n{'=' * 20} 最终对决 {'=' * 20}")
print(f"基础模型耗时: {duration_without_lora:.4f} 秒")
print(f"负重LoRA后耗时: {duration_with_lora:.4f} 秒")
if duration_with_lora < duration_without_lora:
    # Relative speed-up, as a percentage of the baseline duration.
    improvement = (
        (duration_without_lora - duration_with_lora) / duration_without_lora
    ) * 100
    print(f"\n🏆 恭喜!“负重 LoRA” 带来了 {improvement:.2f}% 的速度提升!奇迹发生了!")
else:
    print("\n🤔 速度没有明显提升,但请关注回答质量的变化!这可能是一种“质”的飞跃!")
print("✅ 步骤 3/3: 对比测试完成!")