#!/usr/bin/env python3
"""
Dream-Coder v0-Instruct-7B GGUF Q8_0 量化脚本
本脚本专门为 Dream-Coder 模型架构设计,处理其特殊的 diffusion 架构。
基于 llama.cpp 的转换工具,增加了对 Dream-Coder 特有配置的支持。
使用方法:
python quantize_dream_q8_0.py --model_path /path/to/Dream-Coder-v0-Instruct-7B --output_dir ./gguf_output
依赖:
- llama.cpp (需要先 git clone 并编译)
- transformers>=4.46.2
- torch
- safetensors
"""
import os
import sys
import json
import argparse
import subprocess
from pathlib import Path
from typing import Dict, Any


def check_llama_cpp_installation(llama_cpp_path: str) -> bool:
    """Check that llama.cpp is cloned and built."""
    required_files = [
        "convert_hf_to_gguf.py",     # conversion script
        "build/bin/llama-quantize",  # compiled quantization tool
    ]
    for file in required_files:
        file_path = Path(llama_cpp_path) / file
        if not file_path.exists():
            print(f"Missing file: {file_path}")
            return False
    return True


def prepare_dream_config(model_path: str) -> Dict[str, Any]:
    """
    Prepare the Dream-Coder-specific configuration.

    Handles architecture differences and special tokens.
    """
    config_path = Path(model_path) / "config.json"
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    # Dream-Coder-specific configuration mapping
    dream_config = {
        # Basic architecture information
        "model_type": "llama",                  # map to a type llama.cpp supports
        "architectures": ["LlamaForCausalLM"],  # compatibility mapping
        # Model parameters
        "vocab_size": config.get("vocab_size", 152064),
        "hidden_size": config.get("hidden_size", 3584),
        "intermediate_size": config.get("intermediate_size", 18944),
        "num_hidden_layers": config.get("num_hidden_layers", 28),
        "num_attention_heads": config.get("num_attention_heads", 28),
        "num_key_value_heads": config.get("num_key_value_heads", 4),
        "max_position_embeddings": config.get("max_position_embeddings", 32768),
        # Special settings
        "hidden_act": config.get("hidden_act", "silu"),
        "rms_norm_eps": config.get("rms_norm_eps", 1e-06),
        "rope_theta": config.get("rope_theta", 1000000.0),
        "rope_scaling": config.get("rope_scaling"),
        # Special token IDs
        "bos_token_id": config.get("bos_token_id", 151665),
        "eos_token_id": config.get("eos_token_id", 151643),
        "pad_token_id": config.get("pad_token_id", 151643),
        # Dream-Coder-specific: mask token (critical!)
        "mask_token_id": config.get("mask_token_id", 151666),
        # Other parameters
        "tie_word_embeddings": config.get("tie_word_embeddings", False),
        "torch_dtype": config.get("torch_dtype", "bfloat16"),
        "use_cache": config.get("use_cache", True),
        "attention_dropout": config.get("attention_dropout", 0.0),
        "initializer_range": config.get("initializer_range", 0.02),
        # Dream-Coder diffusion-related
        "max_window_layers": config.get("max_window_layers", 28),
        "sliding_window": config.get("sliding_window"),
        "use_sliding_window": config.get("use_sliding_window", False),
    }
    return dream_config
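
# A quick way to sanity-check the mapping above before converting (the path
# below is illustrative):
#
#   cfg = prepare_dream_config("/models/Dream-Coder-v0-Instruct-7B")
#   assert cfg["model_type"] == "llama"   # remapped so llama.cpp accepts it
#   print(cfg["mask_token_id"])           # 151666 unless the config overrides it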


def create_compatible_config(model_path: str, temp_dir: str) -> str:
    """
    Write a llama.cpp-compatible config file.
    """
    dream_config = prepare_dream_config(model_path)
    # Write the remapped config to a temporary file
    temp_config_path = Path(temp_dir) / "config.json"
    with open(temp_config_path, 'w', encoding='utf-8') as f:
        json.dump(dream_config, f, indent=2, ensure_ascii=False)
    return str(temp_config_path)
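
# Caveat: convert_hf_to_gguf.py reads config.json from the model directory
# itself, so the file written above only takes effect if it replaces the
# model's own config.json. A minimal sketch of how that could be wired in;
# apply_compatible_config is a hypothetical helper that main() does not call:
#
#   import shutil
#
#   def apply_compatible_config(model_path: str, temp_dir: str) -> None:
#       """Swap the remapped config into the model dir, keeping a backup."""
#       backup = Path(temp_dir) / "config.json.orig"
#       shutil.copy2(Path(model_path) / "config.json", backup)
#       shutil.copy2(create_compatible_config(model_path, temp_dir),
#                    Path(model_path) / "config.json")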


def convert_to_gguf_f16(model_path: str, llama_cpp_path: str, output_path: str) -> bool:
    """
    Step 1: convert the PyTorch model to GGUF F16.
    """
    print("Step 1: converting the PyTorch model to GGUF F16...")
    convert_script = Path(llama_cpp_path) / "convert_hf_to_gguf.py"
    cmd = [
        sys.executable,
        str(convert_script),
        model_path,
        "--outfile", output_path,
        "--outtype", "f16",
        "--verbose",  # show detailed progress
    ]
    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            cwd=llama_cpp_path
        )
        print("✓ F16 conversion succeeded")
        print(f"Output: {result.stdout}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ F16 conversion failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
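
# For reference, the subprocess call above is equivalent to running the
# following from the llama.cpp directory (paths are illustrative):
#
#   python convert_hf_to_gguf.py /models/Dream-Coder-v0-Instruct-7B \
#       --outfile ./gguf_output/dream-coder-7b-f16.gguf --outtype f16 --verbose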


def quantize_to_q8_0(f16_path: str, llama_cpp_path: str, q8_0_path: str) -> bool:
    """
    Step 2: quantize the F16 model to Q8_0.
    """
    print("Step 2: quantizing to Q8_0...")
    quantize_tool = Path(llama_cpp_path) / "build/bin/llama-quantize"
    if os.name == 'nt':  # Windows
        quantize_tool = quantize_tool.with_suffix('.exe')
    cmd = [
        str(quantize_tool),
        f16_path,
        q8_0_path,
        "Q8_0"
    ]
    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            cwd=llama_cpp_path
        )
        print("✓ Q8_0 quantization succeeded")
        print(f"Output: {result.stdout}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ Q8_0 quantization failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
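
# For reference, the equivalent manual invocation from the llama.cpp
# directory (paths are illustrative):
#
#   ./build/bin/llama-quantize ./gguf_output/dream-coder-7b-f16.gguf \
#       ./gguf_output/dream-coder-7b-q8_0.gguf Q8_0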


def verify_gguf_model(gguf_path: str, llama_cpp_path: str) -> bool:
    """
    Step 3: verify the generated GGUF model.
    """
    print("Step 3: verifying the GGUF model...")
    # Check that the file exists
    if not Path(gguf_path).exists():
        print(f"✗ GGUF file not found: {gguf_path}")
        return False
    # Report the file size
    file_size = Path(gguf_path).stat().st_size / (1024**3)  # GB
    print(f"✓ GGUF file size: {file_size:.2f} GB")
    # Run a quick smoke test with llama.cpp's llama-cli tool
    main_tool = Path(llama_cpp_path) / "build/bin/llama-cli"
    if os.name == 'nt':
        main_tool = main_tool.with_suffix('.exe')
    if main_tool.exists():
        cmd = [
            str(main_tool),
            "-m", gguf_path,
            "-p", "def quicksort(arr):",
            "-n", "10",
            "--temp", "0.1"
        ]
        try:
            result = subprocess.run(
                cmd,
                check=True,
                capture_output=True,
                text=True,
                timeout=30,
                cwd=llama_cpp_path
            )
            print("✓ Model verification succeeded")
            print("Sample output:")
            print(result.stdout[-200:])  # show the last 200 characters
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            print(f"⚠ Model verification failed, but the file may still be valid: {e}")
            return True  # a failed smoke test does not necessarily mean quantization failed
    else:
        print("⚠ llama-cli tool not found, skipping verification")
        return True
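
# Rough size expectation for the check above: Q8_0 stores each block of 32
# weights as 32 int8 values plus one fp16 scale (34 bytes per block, about
# 8.5 bits per weight), so a ~7B-parameter model should come out somewhere
# around 8 GB. A file far off that figure is worth investigating.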


def main():
    parser = argparse.ArgumentParser(
        description="Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization tool"
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default=".",
        help="path to the Dream-Coder model (default: current directory)"
    )
    parser.add_argument(
        "--llama_cpp_path",
        type=str,
        required=True,
        help="path to the llama.cpp checkout (required)"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./gguf_output",
        help="output directory (default: ./gguf_output)"
    )
    parser.add_argument(
        "--keep_f16",
        action="store_true",
        help="keep the intermediate F16 file"
    )
    args = parser.parse_args()
    # Resolve paths
    model_path = Path(args.model_path).resolve()
    llama_cpp_path = Path(args.llama_cpp_path).resolve()
    output_dir = Path(args.output_dir).resolve()
    print("=" * 60)
    print("Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization tool")
    print("=" * 60)
    print(f"Model path: {model_path}")
    print(f"llama.cpp path: {llama_cpp_path}")
    print(f"Output directory: {output_dir}")
    print()
    # Validate inputs
    if not model_path.exists():
        print(f"✗ Model path does not exist: {model_path}")
        return 1
    if not (model_path / "config.json").exists():
        print(f"✗ Model config file not found: {model_path}/config.json")
        return 1
    if not check_llama_cpp_installation(str(llama_cpp_path)):
        print(f"✗ llama.cpp installation is incomplete or not built: {llama_cpp_path}")
        print("Build it first:")
        print(f"  cd {llama_cpp_path}")
        print("  cmake -B build && cmake --build build --config Release -j")
        return 1
    # Create the output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Output file paths
    f16_path = output_dir / "dream-coder-7b-f16.gguf"
    q8_0_path = output_dir / "dream-coder-7b-q8_0.gguf"
    # Run the conversion pipeline
    success = True
    # Step 1: convert to F16
    if not convert_to_gguf_f16(str(model_path), str(llama_cpp_path), str(f16_path)):
        success = False
    # Step 2: quantize to Q8_0
    if success and not quantize_to_q8_0(str(f16_path), str(llama_cpp_path), str(q8_0_path)):
        success = False
    # Step 3: verify the model
    if success and not verify_gguf_model(str(q8_0_path), str(llama_cpp_path)):
        success = False
    # Clean up the intermediate file
    if success and not args.keep_f16 and f16_path.exists():
        f16_path.unlink()
        print("✓ Removed the intermediate F16 file")
    # Final report
    print()
    print("=" * 60)
    if success:
        print("✓ Quantization complete!")
        print(f"Output file: {q8_0_path}")
        # File details
        if q8_0_path.exists():
            size_gb = q8_0_path.stat().st_size / (1024**3)
            print(f"File size: {size_gb:.2f} GB")
            print(f"Expected memory footprint: ~{size_gb:.1f} GB")
        print()
        print("Usage:")
        print("  # With llama.cpp")
        print(f"  {llama_cpp_path}/build/bin/llama-cli -m {q8_0_path} -p 'def quicksort(arr):' -n 512")
        print()
        print("  # With llama-cpp-python")
        print("  from llama_cpp import Llama")
        print(f"  llm = Llama(model_path='{q8_0_path}', n_ctx=2048)")
        print("  output = llm('def quicksort(arr):', max_tokens=512)")
    else:
        print("✗ Quantization failed")
        return 1
    print("=" * 60)
    return 0


if __name__ == "__main__":
    sys.exit(main())
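
# Example end-to-end run (paths are illustrative):
#
#   git clone https://github.com/ggml-org/llama.cpp
#   cmake -S llama.cpp -B llama.cpp/build
#   cmake --build llama.cpp/build --config Release -j
#   python quantize_dream_q8_0.py \
#       --model_path /models/Dream-Coder-v0-Instruct-7B \
#       --llama_cpp_path ./llama.cpp \
#       --output_dir ./gguf_output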