#!/usr/bin/env python3
"""
Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization script

This script is tailored to the Dream-Coder model architecture and handles its
diffusion-specific configuration. It wraps llama.cpp's conversion tooling and
adds support for Dream-Coder-specific settings.

Usage:
    python quantize_dream_q8_0.py --model_path /path/to/Dream-Coder-v0-Instruct-7B --output_dir ./gguf_output

Dependencies:
    - llama.cpp (clone and build it first)
    - transformers>=4.46.2
    - torch
    - safetensors
"""

import os
import sys
import json
import argparse
import subprocess
from pathlib import Path
from typing import Dict, Any


def check_llama_cpp_installation(llama_cpp_path: Path) -> bool:
    """Check that llama.cpp is present and built."""
    required_files = [
        "convert_hf_to_gguf.py",     # conversion script
        "build/bin/llama-quantize",  # compiled quantization tool
    ]

    for file in required_files:
        file_path = Path(llama_cpp_path) / file
        if not file_path.exists():
            print(f"Missing file: {file_path}")
            return False
    return True


def prepare_dream_config(model_path: str) -> Dict[str, Any]:
    """
    Prepare the Dream-Coder-specific configuration.

    Handles architecture differences and special tokens.
    """
    config_path = Path(model_path) / "config.json"
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Dream-Coder-specific configuration mapping
    dream_config = {
        # Basic architecture info
        "model_type": "llama",                  # map to a type llama.cpp supports
        "architectures": ["LlamaForCausalLM"],  # compatibility mapping

        # Model parameters
        "vocab_size": config.get("vocab_size", 152064),
        "hidden_size": config.get("hidden_size", 3584),
        "intermediate_size": config.get("intermediate_size", 18944),
        "num_hidden_layers": config.get("num_hidden_layers", 28),
        "num_attention_heads": config.get("num_attention_heads", 28),
        "num_key_value_heads": config.get("num_key_value_heads", 4),
        "max_position_embeddings": config.get("max_position_embeddings", 32768),

        # Special settings
        "hidden_act": config.get("hidden_act", "silu"),
        "rms_norm_eps": config.get("rms_norm_eps", 1e-06),
        "rope_theta": config.get("rope_theta", 1000000.0),
        "rope_scaling": config.get("rope_scaling"),

        # Special token IDs
        "bos_token_id": config.get("bos_token_id", 151665),
        "eos_token_id": config.get("eos_token_id", 151643),
        "pad_token_id": config.get("pad_token_id", 151643),

        # Dream-Coder-specific: mask token (critical!)
        "mask_token_id": config.get("mask_token_id", 151666),

        # Other parameters
        "tie_word_embeddings": config.get("tie_word_embeddings", False),
        "torch_dtype": config.get("torch_dtype", "bfloat16"),
        "use_cache": config.get("use_cache", True),
        "attention_dropout": config.get("attention_dropout", 0.0),
        "initializer_range": config.get("initializer_range", 0.02),

        # Dream-Coder diffusion-related
        "max_window_layers": config.get("max_window_layers", 28),
        "sliding_window": config.get("sliding_window"),
        "use_sliding_window": config.get("use_sliding_window", False),
    }

    return dream_config
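

# Sketch (assumption): cross-check the mask token id against the tokenizer
# artifacts shipped with the checkpoint. Dream-Coder's diffusion decoding
# depends on the mask token, so a silent mismatch would break generation.
# added_tokens.json (token string -> id) is a standard Hugging Face artifact,
# but this particular checkpoint may record the token elsewhere; treat this
# as a best-effort helper, not an authoritative check. Not called by main().
def verify_mask_token(model_path: str, expected_id: int = 151666) -> bool:
    """Best-effort check that the tokenizer defines the expected mask token id."""
    added_tokens_path = Path(model_path) / "added_tokens.json"
    if not added_tokens_path.exists():
        print("⚠ added_tokens.json not found, skipping mask token check")
        return True
    with open(added_tokens_path, 'r', encoding='utf-8') as f:
        added_tokens = json.load(f)  # maps token string -> token id
    if expected_id in added_tokens.values():
        print(f"✓ mask token id {expected_id} found in added_tokens.json")
        return True
    print(f"⚠ mask token id {expected_id} not found in added_tokens.json")
    return False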
"mask_token_id": config.get("mask_token_id", 151666), # 其他参数 "tie_word_embeddings": config.get("tie_word_embeddings", False), "torch_dtype": config.get("torch_dtype", "bfloat16"), "use_cache": config.get("use_cache", True), "attention_dropout": config.get("attention_dropout", 0.0), "initializer_range": config.get("initializer_range", 0.02), # Dream-Coder diffusion 相关 "max_window_layers": config.get("max_window_layers", 28), "sliding_window": config.get("sliding_window"), "use_sliding_window": config.get("use_sliding_window", False), } return dream_config def create_compatible_config(model_path: str, temp_dir: str) -> str: """ 创建与 llama.cpp 兼容的配置文件 """ dream_config = prepare_dream_config(model_path) # 创建临时配置文件 temp_config_path = Path(temp_dir) / "config.json" with open(temp_config_path, 'w', encoding='utf-8') as f: json.dump(dream_config, f, indent=2, ensure_ascii=False) return str(temp_config_path) def convert_to_gguf_f16(model_path: str, llama_cpp_path: str, output_path: str) -> bool: """ 第一步: 转换 PyTorch 模型到 GGUF F16 格式 """ print("步骤 1: 转换 PyTorch 模型到 GGUF F16...") convert_script = Path(llama_cpp_path) / "convert_hf_to_gguf.py" cmd = [ sys.executable, str(convert_script), model_path, "--outfile", output_path, "--outtype", "f16", "--verbose", # 显示详细信息 ] try: result = subprocess.run( cmd, check=True, capture_output=True, text=True, cwd=llama_cpp_path ) print("✓ F16 转换成功") print(f"输出: {result.stdout}") return True except subprocess.CalledProcessError as e: print(f"✗ F16 转换失败: {e}") print(f"错误输出: {e.stderr}") return False def quantize_to_q8_0(f16_path: str, llama_cpp_path: str, q8_0_path: str) -> bool: """ 第二步: 量化 F16 模型到 Q8_0 """ print("步骤 2: 量化到 Q8_0...") quantize_tool = Path(llama_cpp_path) / "build/bin/llama-quantize" if os.name == 'nt': # Windows quantize_tool = quantize_tool.with_suffix('.exe') cmd = [ str(quantize_tool), f16_path, q8_0_path, "Q8_0" ] try: result = subprocess.run( cmd, check=True, capture_output=True, text=True, cwd=llama_cpp_path ) print("✓ Q8_0 量化成功") print(f"输出: {result.stdout}") return True except subprocess.CalledProcessError as e: print(f"✗ Q8_0 量化失败: {e}") print(f"错误输出: {e.stderr}") return False def verify_gguf_model(gguf_path: str, llama_cpp_path: str) -> bool: """ 验证生成的 GGUF 模型 """ print("步骤 3: 验证 GGUF 模型...") # 检查文件是否存在 if not Path(gguf_path).exists(): print(f"✗ GGUF 文件不存在: {gguf_path}") return False # 获取文件大小 file_size = Path(gguf_path).stat().st_size / (1024**3) # GB print(f"✓ GGUF 文件大小: {file_size:.2f} GB") # 使用 llama.cpp 的 main 程序简单测试 main_tool = Path(llama_cpp_path) / "build/bin/llama-cli" if os.name == 'nt': main_tool = main_tool.with_suffix('.exe') if main_tool.exists(): cmd = [ str(main_tool), "-m", gguf_path, "-p", "def quicksort(arr):", "-n", "10", "--temp", "0.1" ] try: result = subprocess.run( cmd, check=True, capture_output=True, text=True, timeout=30, cwd=llama_cpp_path ) print("✓ 模型验证成功") print("示例输出:") print(result.stdout[-200:]) # 显示最后 200 字符 return True except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: print(f"⚠ 模型验证失败,但文件可能仍然有效: {e}") return True # 验证失败不一定意味着量化失败 else: print("⚠ 未找到 main 工具,跳过验证") return True def main(): parser = argparse.ArgumentParser( description="Dream-Coder v0-Instruct-7B GGUF Q8_0 量化工具" ) parser.add_argument( "--model_path", type=str, default=".", help="Dream-Coder 模型路径 (默认: 当前目录)" ) parser.add_argument( "--llama_cpp_path", type=str, required=True, help="llama.cpp 项目路径 (必需)" ) parser.add_argument( "--output_dir", type=str, default="./gguf_output", help="输出目录 (默认: ./gguf_output)" ) 
parser.add_argument( "--keep_f16", action="store_true", help="保留 F16 中间文件" ) args = parser.parse_args() # 路径处理 model_path = Path(args.model_path).resolve() llama_cpp_path = Path(args.llama_cpp_path).resolve() output_dir = Path(args.output_dir).resolve() print("=" * 60) print("Dream-Coder v0-Instruct-7B GGUF Q8_0 量化工具") print("=" * 60) print(f"模型路径: {model_path}") print(f"llama.cpp 路径: {llama_cpp_path}") print(f"输出目录: {output_dir}") print() # 检查输入 if not model_path.exists(): print(f"✗ 模型路径不存在: {model_path}") return 1 if not (model_path / "config.json").exists(): print(f"✗ 未找到模型配置文件: {model_path}/config.json") return 1 if not check_llama_cpp_installation(llama_cpp_path): print(f"✗ llama.cpp 安装不完整或未编译: {llama_cpp_path}") print("请先运行:") print(f" cd {llama_cpp_path}") print(" make -j$(nproc)") return 1 # 创建输出目录 output_dir.mkdir(parents=True, exist_ok=True) # 输出文件路径 f16_path = output_dir / "dream-coder-7b-f16.gguf" q8_0_path = output_dir / "dream-coder-7b-q8_0.gguf" # 执行转换流程 success = True # 步骤 1: 转换到 F16 if not convert_to_gguf_f16(str(model_path), str(llama_cpp_path), str(f16_path)): success = False # 步骤 2: 量化到 Q8_0 if success and not quantize_to_q8_0(str(f16_path), str(llama_cpp_path), str(q8_0_path)): success = False # 步骤 3: 验证模型 if success and not verify_gguf_model(str(q8_0_path), str(llama_cpp_path)): success = False # 清理中间文件 if success and not args.keep_f16 and f16_path.exists(): f16_path.unlink() print("✓ 已删除 F16 中间文件") # 结果报告 print() print("=" * 60) if success: print("✓ 量化完成!") print(f"输出文件: {q8_0_path}") # 文件信息 if q8_0_path.exists(): size_gb = q8_0_path.stat().st_size / (1024**3) print(f"文件大小: {size_gb:.2f} GB") print(f"预期内存占用: ~{size_gb:.1f} GB") print() print("使用方法:") print(f" # 使用 llama.cpp") print(f" {llama_cpp_path}/main -m {q8_0_path} -p 'def quicksort(arr):' -n 512") print() print(f" # 使用 llama-cpp-python") print(f" from llama_cpp import Llama") print(f" llm = Llama(model_path='{q8_0_path}', n_ctx=2048)") print(f" output = llm('def quicksort(arr):', max_tokens=512)") else: print("✗ 量化失败") return 1 print("=" * 60) return 0 if __name__ == "__main__": sys.exit(main())