""" |
|
|
Dream-Coder v0-Instruct-7B GGUF Q8_0 量化脚本 |
|
|
|
|
|
本脚本专门为 Dream-Coder 模型架构设计,处理其特殊的 diffusion 架构。 |
|
|
基于 llama.cpp 的转换工具,增加了对 Dream-Coder 特有配置的支持。 |
|
|
|
|
|
使用方法: |
|
|
python quantize_dream_q8_0.py --model_path /path/to/Dream-Coder-v0-Instruct-7B --output_dir ./gguf_output |
|
|
|
|
|
依赖: |
|
|
- llama.cpp (需要先 git clone 并编译) |
|
|
- transformers>=4.46.2 |
|
|
- torch |
|
|
- safetensors |
|
|
""" |
import os
import sys
import json
import argparse
import subprocess
from pathlib import Path
from typing import Dict, Any


def check_llama_cpp_installation(llama_cpp_path: str) -> bool:
    """Check that llama.cpp is present and its tools have been built."""
    required_files = [
        "convert_hf_to_gguf.py",
        "build/bin/llama-quantize",
    ]

    for file in required_files:
        file_path = Path(llama_cpp_path) / file
        # On Windows the compiled binaries carry an .exe suffix.
        if os.name == 'nt' and not file_path.suffix:
            file_path = file_path.with_suffix('.exe')
        if not file_path.exists():
            print(f"Missing file: {file_path}")
            return False

    return True


def prepare_dream_config(model_path: str) -> Dict[str, Any]:
    """
    Prepare the Dream-Coder-specific configuration.
    Handles architecture differences and special tokens.
    """
    config_path = Path(model_path) / "config.json"

    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    dream_config = {
        # Remap the architecture so llama.cpp's converter recognizes it.
        "model_type": "llama",
        "architectures": ["LlamaForCausalLM"],

        # Core model dimensions.
        "vocab_size": config.get("vocab_size", 152064),
        "hidden_size": config.get("hidden_size", 3584),
        "intermediate_size": config.get("intermediate_size", 18944),
        "num_hidden_layers": config.get("num_hidden_layers", 28),
        "num_attention_heads": config.get("num_attention_heads", 28),
        "num_key_value_heads": config.get("num_key_value_heads", 4),
        "max_position_embeddings": config.get("max_position_embeddings", 32768),

        # Activation, normalization, and RoPE settings.
        "hidden_act": config.get("hidden_act", "silu"),
        "rms_norm_eps": config.get("rms_norm_eps", 1e-06),
        "rope_theta": config.get("rope_theta", 1000000.0),
        "rope_scaling": config.get("rope_scaling"),

        # Special tokens.
        "bos_token_id": config.get("bos_token_id", 151665),
        "eos_token_id": config.get("eos_token_id", 151643),
        "pad_token_id": config.get("pad_token_id", 151643),

        # Dream-specific mask token used by the diffusion objective.
        "mask_token_id": config.get("mask_token_id", 151666),

        # Miscellaneous model settings.
        "tie_word_embeddings": config.get("tie_word_embeddings", False),
        "torch_dtype": config.get("torch_dtype", "bfloat16"),
        "use_cache": config.get("use_cache", True),
        "attention_dropout": config.get("attention_dropout", 0.0),
        "initializer_range": config.get("initializer_range", 0.02),

        # Sliding-window attention settings.
        "max_window_layers": config.get("max_window_layers", 28),
        "sliding_window": config.get("sliding_window"),
        "use_sliding_window": config.get("use_sliding_window", False),
    }

    return dream_config
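

# Minimal sanity check over the remapped config. This helper is an
# illustrative sketch and is not wired into main(); it only uses the field
# names produced by prepare_dream_config() above.
def sanity_check_dream_config(cfg: Dict[str, Any]) -> None:
    """Assert structural invariants the llama.cpp converter relies on."""
    assert cfg["model_type"] == "llama"
    assert cfg["hidden_size"] % cfg["num_attention_heads"] == 0, \
        "head_dim must be an integer"
    assert cfg["num_attention_heads"] % cfg["num_key_value_heads"] == 0, \
        "GQA requires num_attention_heads to be divisible by num_key_value_heads"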


def create_compatible_config(model_path: str, temp_dir: str) -> str:
    """
    Create a llama.cpp-compatible config file.
    """
    dream_config = prepare_dream_config(model_path)

    temp_config_path = Path(temp_dir) / "config.json"

    with open(temp_config_path, 'w', encoding='utf-8') as f:
        json.dump(dream_config, f, indent=2, ensure_ascii=False)

    return str(temp_config_path)
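

# Note: create_compatible_config() is not called by main(); it is provided for
# callers who want to patch the config before conversion. A hypothetical use:
#
#   import tempfile
#   with tempfile.TemporaryDirectory() as tmp:
#       patched = create_compatible_config("/path/to/Dream-Coder-v0-Instruct-7B", tmp)
#       # point the converter at the patched config as needed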


def convert_to_gguf_f16(model_path: str, llama_cpp_path: str, output_path: str) -> bool:
    """
    Step 1: convert the PyTorch model to GGUF F16 format.
    """
    print("Step 1: converting PyTorch model to GGUF F16...")

    convert_script = Path(llama_cpp_path) / "convert_hf_to_gguf.py"

    cmd = [
        sys.executable,
        str(convert_script),
        model_path,
        "--outfile", output_path,
        "--outtype", "f16",
        "--verbose",
    ]

    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            cwd=llama_cpp_path
        )
        print("✓ F16 conversion succeeded")
        print(f"Output: {result.stdout}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ F16 conversion failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
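

# Note: convert_hf_to_gguf.py picks its converter class from the
# "architectures" field in config.json, which is why prepare_dream_config()
# remaps Dream to "LlamaForCausalLM". If the checkpoint still carries
# Dream-specific tensor names, the converter may need a matching patch.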


def quantize_to_q8_0(f16_path: str, llama_cpp_path: str, q8_0_path: str) -> bool:
    """
    Step 2: quantize the F16 model to Q8_0.
    """
    print("Step 2: quantizing to Q8_0...")

    quantize_tool = Path(llama_cpp_path) / "build/bin/llama-quantize"
    if os.name == 'nt':
        quantize_tool = quantize_tool.with_suffix('.exe')

    cmd = [
        str(quantize_tool),
        f16_path,
        q8_0_path,
        "Q8_0"
    ]

    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            cwd=llama_cpp_path
        )
        print("✓ Q8_0 quantization succeeded")
        print(f"Output: {result.stdout}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ Q8_0 quantization failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
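

# Back-of-the-envelope size expectation for Q8_0 (an illustrative sketch, not
# used by main()): llama.cpp's Q8_0 stores each 32-weight block as 32 int8
# values plus one fp16 scale, i.e. about 8.5 bits per weight, so a ~7B-param
# model should land near 7 GB.
def estimate_q8_0_size_gb(n_params: float) -> float:
    """Estimate the Q8_0 file size in GB from a parameter count."""
    bits_per_weight = 8.5  # (32 * 8 + 16) bits per 32-weight block
    return n_params * bits_per_weight / 8 / (1024 ** 3)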


def verify_gguf_model(gguf_path: str, llama_cpp_path: str) -> bool:
    """
    Step 3: verify the generated GGUF model.
    """
    print("Step 3: verifying GGUF model...")

    if not Path(gguf_path).exists():
        print(f"✗ GGUF file does not exist: {gguf_path}")
        return False

    file_size = Path(gguf_path).stat().st_size / (1024**3)
    print(f"✓ GGUF file size: {file_size:.2f} GB")

    # Smoke-test the model with a short completion if llama-cli is available.
    main_tool = Path(llama_cpp_path) / "build/bin/llama-cli"
    if os.name == 'nt':
        main_tool = main_tool.with_suffix('.exe')

    if main_tool.exists():
        cmd = [
            str(main_tool),
            "-m", gguf_path,
            "-p", "def quicksort(arr):",
            "-n", "10",
            "--temp", "0.1"
        ]

        try:
            result = subprocess.run(
                cmd,
                check=True,
                capture_output=True,
                text=True,
                timeout=30,
                cwd=llama_cpp_path
            )
            print("✓ Model verification succeeded")
            print("Sample output:")
            print(result.stdout[-200:])
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            print(f"⚠ Model verification failed, but the file may still be valid: {e}")
            return True
    else:
        print("⚠ llama-cli tool not found, skipping verification")
        return True


def main():
    parser = argparse.ArgumentParser(
        description="Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization tool"
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default=".",
        help="Path to the Dream-Coder model (default: current directory)"
    )
    parser.add_argument(
        "--llama_cpp_path",
        type=str,
        required=True,
        help="Path to the llama.cpp checkout (required)"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./gguf_output",
        help="Output directory (default: ./gguf_output)"
    )
    parser.add_argument(
        "--keep_f16",
        action="store_true",
        help="Keep the intermediate F16 file"
    )

    args = parser.parse_args()

    model_path = Path(args.model_path).resolve()
    llama_cpp_path = Path(args.llama_cpp_path).resolve()
    output_dir = Path(args.output_dir).resolve()

    print("=" * 60)
    print("Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization tool")
    print("=" * 60)
    print(f"Model path: {model_path}")
    print(f"llama.cpp path: {llama_cpp_path}")
    print(f"Output directory: {output_dir}")
    print()

    # Validate inputs before doing any work.
    if not model_path.exists():
        print(f"✗ Model path does not exist: {model_path}")
        return 1

    if not (model_path / "config.json").exists():
        print(f"✗ Model config file not found: {model_path}/config.json")
        return 1

    if not check_llama_cpp_installation(str(llama_cpp_path)):
        print(f"✗ llama.cpp is missing files or has not been built: {llama_cpp_path}")
        print("Build it first, e.g.:")
        print(f"  cd {llama_cpp_path}")
        print("  cmake -B build && cmake --build build --config Release")
        return 1

    output_dir.mkdir(parents=True, exist_ok=True)

    f16_path = output_dir / "dream-coder-7b-f16.gguf"
    q8_0_path = output_dir / "dream-coder-7b-q8_0.gguf"

    success = True

    # Step 1: HF checkpoint -> GGUF F16.
    if not convert_to_gguf_f16(str(model_path), str(llama_cpp_path), str(f16_path)):
        success = False

    # Step 2: GGUF F16 -> Q8_0.
    if success and not quantize_to_q8_0(str(f16_path), str(llama_cpp_path), str(q8_0_path)):
        success = False

    # Step 3: smoke-test the result.
    if success and not verify_gguf_model(str(q8_0_path), str(llama_cpp_path)):
        success = False

    # Remove the intermediate file unless asked to keep it.
    if success and not args.keep_f16 and f16_path.exists():
        f16_path.unlink()
        print("✓ Removed intermediate F16 file")

    print()
    print("=" * 60)
    if success:
        print("✓ Quantization complete!")
        print(f"Output file: {q8_0_path}")

        if q8_0_path.exists():
            size_gb = q8_0_path.stat().st_size / (1024**3)
            print(f"File size: {size_gb:.2f} GB")
            print(f"Expected memory footprint: ~{size_gb:.1f} GB")

        print()
        print("Usage:")
        print("  # With llama.cpp")
        print(f"  {llama_cpp_path}/build/bin/llama-cli -m {q8_0_path} -p 'def quicksort(arr):' -n 512")
        print()
        print("  # With llama-cpp-python")
        print("  from llama_cpp import Llama")
        print(f"  llm = Llama(model_path='{q8_0_path}', n_ctx=2048)")
        print("  output = llm('def quicksort(arr):', max_tokens=512)")
    else:
        print("✗ Quantization failed")
        return 1

    print("=" * 60)
    return 0


if __name__ == "__main__": |
|
|
sys.exit(main()) |