#!/usr/bin/env python3
"""
Dream-Coder v0-Instruct-7B GGUF Q8_0 量化脚本
本脚本专门为 Dream-Coder 模型架构设计,处理其特殊的 diffusion 架构。
基于 llama.cpp 的转换工具,增加了对 Dream-Coder 特有配置的支持。
使用方法:
python quantize_dream_q8_0.py --model_path /path/to/Dream-Coder-v0-Instruct-7B --output_dir ./gguf_output
依赖:
- llama.cpp (需要先 git clone 并编译)
- transformers>=4.46.2
- torch
- safetensors
"""
import os
import sys
import json
import argparse
import subprocess
from pathlib import Path
from typing import Dict, Any
def check_llama_cpp_installation(llama_cpp_path: str) -> bool:
    """Check that llama.cpp is present and built."""
    required_files = [
        "convert_hf_to_gguf.py",    # conversion script
        "build/bin/llama-quantize"  # compiled quantization tool
    ]
    for file in required_files:
        file_path = Path(llama_cpp_path) / file
        if not file_path.exists():
            print(f"Missing file: {file_path}")
            return False
    return True
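# Note: the paths above assume a CMake build (binaries under build/bin/).
# The legacy Makefile build placed llama-quantize in the repo root instead;
# a more lenient check could accept either location. Sketch under that
# assumption (not part of the original pipeline):
#
#   for candidate in ("build/bin/llama-quantize", "llama-quantize"):
#       if (Path(llama_cpp_path) / candidate).exists():
#           break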
def prepare_dream_config(model_path: str) -> Dict[str, Any]:
    """
    Prepare the Dream-Coder-specific configuration,
    handling architecture differences and special tokens.
    """
    config_path = Path(model_path) / "config.json"
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    # Dream-Coder-specific configuration mapping
    dream_config = {
        # Basic architecture information
        "model_type": "llama",  # map to a type llama.cpp supports
        "architectures": ["LlamaForCausalLM"],  # compatibility mapping
        # Model parameters
        "vocab_size": config.get("vocab_size", 152064),
        "hidden_size": config.get("hidden_size", 3584),
        "intermediate_size": config.get("intermediate_size", 18944),
        "num_hidden_layers": config.get("num_hidden_layers", 28),
        "num_attention_heads": config.get("num_attention_heads", 28),
        "num_key_value_heads": config.get("num_key_value_heads", 4),
        "max_position_embeddings": config.get("max_position_embeddings", 32768),
        # Normalization and RoPE settings
        "hidden_act": config.get("hidden_act", "silu"),
        "rms_norm_eps": config.get("rms_norm_eps", 1e-06),
        "rope_theta": config.get("rope_theta", 1000000.0),
        "rope_scaling": config.get("rope_scaling"),
        # Special token ids
        "bos_token_id": config.get("bos_token_id", 151665),
        "eos_token_id": config.get("eos_token_id", 151643),
        "pad_token_id": config.get("pad_token_id", 151643),
        # Dream-Coder specific: the mask token (critical!)
        "mask_token_id": config.get("mask_token_id", 151666),
        # Other parameters
        "tie_word_embeddings": config.get("tie_word_embeddings", False),
        "torch_dtype": config.get("torch_dtype", "bfloat16"),
        "use_cache": config.get("use_cache", True),
        "attention_dropout": config.get("attention_dropout", 0.0),
        "initializer_range": config.get("initializer_range", 0.02),
        # Dream-Coder diffusion-related settings
        "max_window_layers": config.get("max_window_layers", 28),
        "sliding_window": config.get("sliding_window"),
        "use_sliding_window": config.get("use_sliding_window", False),
    }
    return dream_config
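# Illustrative sanity check on the derived config (an addition of this
# write-up, not part of llama.cpp or the original pipeline): it only
# confirms that the special token ids, including Dream-Coder's mask token,
# fall inside the vocabulary range before conversion.
def validate_dream_config(dream_config: Dict[str, Any]) -> None:
    """Raise ValueError if any special token id falls outside vocab_size."""
    vocab_size = dream_config["vocab_size"]
    for key in ("bos_token_id", "eos_token_id", "pad_token_id", "mask_token_id"):
        token_id = dream_config.get(key)
        if token_id is not None and not (0 <= token_id < vocab_size):
            raise ValueError(f"{key}={token_id} is outside vocab_size={vocab_size}")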
def create_compatible_config(model_path: str, temp_dir: str) -> str:
    """
    Create a llama.cpp-compatible config file.

    Note: this helper is provided for manual workflows and is not
    invoked by main(); convert_hf_to_gguf.py reads the model's own
    config.json directly.
    """
    dream_config = prepare_dream_config(model_path)
    # Write the temporary config file
    temp_config_path = Path(temp_dir) / "config.json"
    with open(temp_config_path, 'w', encoding='utf-8') as f:
        json.dump(dream_config, f, indent=2, ensure_ascii=False)
    return str(temp_config_path)
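# Sketch of how create_compatible_config could be wired in manually
# (a hypothetical workflow, untested here): copy the model directory,
# swap in the patched config, then convert the copy.
#
#   import shutil, tempfile
#   with tempfile.TemporaryDirectory() as tmp:
#       patched = Path(tmp) / "model"
#       shutil.copytree(model_path, patched)
#       shutil.copy(create_compatible_config(model_path, tmp), patched / "config.json")
#       convert_to_gguf_f16(str(patched), llama_cpp_path, output_path)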
def convert_to_gguf_f16(model_path: str, llama_cpp_path: str, output_path: str) -> bool:
    """
    Step 1: convert the PyTorch model to GGUF F16.
    """
    print("Step 1: converting the PyTorch model to GGUF F16...")
    convert_script = Path(llama_cpp_path) / "convert_hf_to_gguf.py"
    cmd = [
        sys.executable,
        str(convert_script),
        model_path,
        "--outfile", output_path,
        "--outtype", "f16",
        "--verbose",  # print detailed progress
    ]
    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            cwd=llama_cpp_path
        )
        print("✓ F16 conversion succeeded")
        print(f"Output: {result.stdout}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ F16 conversion failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
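# Design note: capture_output=True buffers the converter's full log in memory
# and only prints it once the process exits. For long conversions one could
# instead let the child process inherit stdout/stderr to stream progress live:
#
#   subprocess.run(cmd, check=True, cwd=llama_cpp_path)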
def quantize_to_q8_0(f16_path: str, llama_cpp_path: str, q8_0_path: str) -> bool:
    """
    Step 2: quantize the F16 model to Q8_0.
    """
    print("Step 2: quantizing to Q8_0...")
    quantize_tool = Path(llama_cpp_path) / "build/bin/llama-quantize"
    if os.name == 'nt':  # Windows
        quantize_tool = quantize_tool.with_suffix('.exe')
    cmd = [
        str(quantize_tool),
        f16_path,
        q8_0_path,
        "Q8_0"
    ]
    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            cwd=llama_cpp_path
        )
        print("✓ Q8_0 quantization succeeded")
        print(f"Output: {result.stdout}")
        return True
    except subprocess.CalledProcessError as e:
        print(f"✗ Q8_0 quantization failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
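# The same llama-quantize invocation supports other formats: swapping
# "Q8_0" for, e.g., "Q4_K_M" yields a smaller 4-bit K-quant file at some
# quality cost. Minimal sketch, assuming the tool path resolved above and
# a hypothetical q4_km_path output location:
#
#   cmd = [str(quantize_tool), f16_path, q4_km_path, "Q4_K_M"]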
def verify_gguf_model(gguf_path: str, llama_cpp_path: str) -> bool:
    """
    Step 3: verify the generated GGUF model.
    """
    print("Step 3: verifying the GGUF model...")
    # Check that the file exists
    if not Path(gguf_path).exists():
        print(f"✗ GGUF file not found: {gguf_path}")
        return False
    # Report the file size
    file_size = Path(gguf_path).stat().st_size / (1024**3)  # GB
    print(f"✓ GGUF file size: {file_size:.2f} GB")
    # Run a quick smoke test with llama.cpp's llama-cli
    main_tool = Path(llama_cpp_path) / "build/bin/llama-cli"
    if os.name == 'nt':
        main_tool = main_tool.with_suffix('.exe')
    if main_tool.exists():
        cmd = [
            str(main_tool),
            "-m", gguf_path,
            "-p", "def quicksort(arr):",
            "-n", "10",
            "--temp", "0.1"
        ]
        try:
            result = subprocess.run(
                cmd,
                check=True,
                capture_output=True,
                text=True,
                timeout=30,
                cwd=llama_cpp_path
            )
            print("✓ Model verification succeeded")
            print("Sample output:")
            print(result.stdout[-200:])  # show the last 200 characters
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            print(f"⚠ Model verification failed, but the file may still be valid: {e}")
            return True  # a failed smoke test does not necessarily mean quantization failed
    else:
        print("⚠ llama-cli tool not found, skipping verification")
        return True
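# Optional Python-side smoke test via llama-cpp-python (requires
# `pip install llama-cpp-python`). A minimal sketch, not invoked by main();
# the prompt and sampling settings mirror the llama-cli check above.
def smoke_test_with_python(gguf_path: str) -> None:
    """Load the GGUF file with llama-cpp-python and generate a few tokens."""
    from llama_cpp import Llama
    llm = Llama(model_path=gguf_path, n_ctx=2048, verbose=False)
    output = llm("def quicksort(arr):", max_tokens=16, temperature=0.1)
    print(output["choices"][0]["text"])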
def main():
    parser = argparse.ArgumentParser(
        description="Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization tool"
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default=".",
        help="Path to the Dream-Coder model (default: current directory)"
    )
    parser.add_argument(
        "--llama_cpp_path",
        type=str,
        required=True,
        help="Path to the llama.cpp checkout (required)"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./gguf_output",
        help="Output directory (default: ./gguf_output)"
    )
    parser.add_argument(
        "--keep_f16",
        action="store_true",
        help="Keep the intermediate F16 file"
    )
    args = parser.parse_args()
    # Resolve paths
    model_path = Path(args.model_path).resolve()
    llama_cpp_path = Path(args.llama_cpp_path).resolve()
    output_dir = Path(args.output_dir).resolve()
    print("=" * 60)
    print("Dream-Coder v0-Instruct-7B GGUF Q8_0 quantization tool")
    print("=" * 60)
    print(f"Model path: {model_path}")
    print(f"llama.cpp path: {llama_cpp_path}")
    print(f"Output directory: {output_dir}")
    print()
    # Validate inputs
    if not model_path.exists():
        print(f"✗ Model path does not exist: {model_path}")
        return 1
    if not (model_path / "config.json").exists():
        print(f"✗ Model config not found: {model_path}/config.json")
        return 1
    if not check_llama_cpp_installation(str(llama_cpp_path)):
        print(f"✗ llama.cpp is missing or not built: {llama_cpp_path}")
        print("Build it first:")
        print(f"  cd {llama_cpp_path}")
        print("  cmake -B build && cmake --build build --config Release -j")
        return 1
    # Create the output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    # Output file paths
    f16_path = output_dir / "dream-coder-7b-f16.gguf"
    q8_0_path = output_dir / "dream-coder-7b-q8_0.gguf"
    # Run the conversion pipeline
    success = True
    # Step 1: convert to F16
    if not convert_to_gguf_f16(str(model_path), str(llama_cpp_path), str(f16_path)):
        success = False
    # Step 2: quantize to Q8_0
    if success and not quantize_to_q8_0(str(f16_path), str(llama_cpp_path), str(q8_0_path)):
        success = False
    # Step 3: verify the model
    if success and not verify_gguf_model(str(q8_0_path), str(llama_cpp_path)):
        success = False
    # Clean up the intermediate file
    if success and not args.keep_f16 and f16_path.exists():
        f16_path.unlink()
        print("✓ Removed intermediate F16 file")
    # Report results
    print()
    print("=" * 60)
    if success:
        print("✓ Quantization complete!")
        print(f"Output file: {q8_0_path}")
        # File information
        if q8_0_path.exists():
            size_gb = q8_0_path.stat().st_size / (1024**3)
            print(f"File size: {size_gb:.2f} GB")
            print(f"Expected memory footprint: ~{size_gb:.1f} GB")
        print()
        print("Usage:")
        print("  # With llama.cpp")
        print(f"  {llama_cpp_path}/build/bin/llama-cli -m {q8_0_path} -p 'def quicksort(arr):' -n 512")
        print()
        print("  # With llama-cpp-python")
        print("  from llama_cpp import Llama")
        print(f"  llm = Llama(model_path='{q8_0_path}', n_ctx=2048)")
        print("  output = llm('def quicksort(arr):', max_tokens=512)")
    else:
        print("✗ Quantization failed")
        return 1
    print("=" * 60)
    return 0
if __name__ == "__main__":
    sys.exit(main())