File size: 10,271 Bytes

3afa0cd

#!/usr/bin/env python3
"""
Extract MOSS-TTS-Delay weights into three groups for the llama.cpp backend:

  1. Qwen3 backbone  → standalone Qwen3ForCausalLM (safetensors + config.json)
  2. Embedding tables → numpy .npy files
  3. LM head weights  → numpy .npy files

The Qwen3 backbone safetensors can then be converted to GGUF with
``llama.cpp/convert_hf_to_gguf.py``.

Usage::

    python scripts/extract_weights_llama_cpp.py \\
        --model OpenMOSS-Team/MOSS-TTS \\
        --output weights/extracted
"""

import argparse
import json
import logging
import shutil
from collections import defaultdict
from pathlib import Path

import numpy as np
from huggingface_hub import snapshot_download
from safetensors import safe_open
from safetensors.torch import save_file
import torch

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)


def remap_backbone_name(name: str) -> str | None:
    """Map a MossTTSDelay tensor name to Qwen3ForCausalLM convention."""
    if name.startswith("language_model."):
        return "model." + name[len("language_model."):]
    if name == "lm_heads.0.weight":
        return "lm_head.weight"
    return None


def load_source_index(model_dir: Path) -> dict:
    index_path = model_dir / "model.safetensors.index.json"
    if index_path.exists():
        with open(index_path) as f:
            return json.load(f)
    single = model_dir / "model.safetensors"
    if single.exists():
        with safe_open(str(single), framework="pt") as f:
            return {
                "metadata": {},
                "weight_map": {k: "model.safetensors" for k in f.keys()},
            }
    raise FileNotFoundError(f"No safetensors files found in {model_dir}")


def load_source_config(model_dir: Path) -> dict:
    with open(model_dir / "config.json") as f:
        return json.load(f)


def build_qwen3_config(moss_config: dict) -> dict:
    lang = dict(moss_config["language_config"])
    lang["architectures"] = ["Qwen3ForCausalLM"]
    lang["model_type"] = "qwen3"
    lang.pop("_name_or_path", None)
    lang.setdefault("torch_dtype", "bfloat16")
    lang.setdefault("transformers_version", moss_config.get("transformers_version", "4.57.1"))
    return lang


MAX_SHARD_SIZE = 5 * 1024**3


def extract(model_dir: Path, output_dir: Path) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    backbone_dir = output_dir / "qwen3_backbone"
    backbone_dir.mkdir(exist_ok=True)
    embed_dir = output_dir / "embeddings"
    embed_dir.mkdir(exist_ok=True)
    head_dir = output_dir / "lm_heads"
    head_dir.mkdir(exist_ok=True)

    moss_config = load_source_config(model_dir)
    index = load_source_index(model_dir)
    weight_map = index["weight_map"]

    lang_config = moss_config["language_config"]
    n_vq = moss_config.get("n_vq", 32)
    hidden_size = lang_config["hidden_size"]
    vocab_size = lang_config["vocab_size"]
    audio_vocab_size = moss_config.get("audio_vocab_size", 1024)

    log.info(
        "Model: hidden_size=%d, vocab_size=%d, n_vq=%d, audio_vocab_size=%d",
        hidden_size, vocab_size, n_vq, audio_vocab_size,
    )

    shard_to_tensors: dict[str, list[str]] = defaultdict(list)
    for tensor_name, shard_file in weight_map.items():
        shard_to_tensors[shard_file].append(tensor_name)

    backbone_tensors: dict[str, torch.Tensor] = {}
    backbone_size = 0
    shard_idx = 0
    saved_shards: list[str] = []
    backbone_weight_map: dict[str, str] = {}

    def flush_backbone_shard():
        nonlocal backbone_tensors, backbone_size, shard_idx
        if not backbone_tensors:
            return
        shard_idx += 1
        shard_name = f"model-{shard_idx:05d}-of-PLACEHOLDER.safetensors"
        shard_path = backbone_dir / shard_name
        log.info("  Writing backbone shard %s (%d tensors, %.2f GB)",
                 shard_name, len(backbone_tensors), backbone_size / 1e9)
        save_file(backbone_tensors, str(shard_path))
        for tname in backbone_tensors:
            backbone_weight_map[tname] = shard_name
        saved_shards.append(shard_name)
        backbone_tensors = {}
        backbone_size = 0

    sorted_shards = sorted(shard_to_tensors.keys())
    for shard_file in sorted_shards:
        tensor_names = shard_to_tensors[shard_file]
        shard_path = model_dir / shard_file
        log.info("Processing shard: %s (%d tensors)", shard_file, len(tensor_names))

        with safe_open(str(shard_path), framework="pt") as sf:
            for tname in sorted(tensor_names):
                tensor = sf.get_tensor(tname)

                if tname == "language_model.embed_tokens.weight":
                    npy_path = embed_dir / "embed_tokens.npy"
                    np.save(str(npy_path), tensor.to(torch.float16).numpy())
                    log.info("  Saved %s → %s  shape=%s", tname, npy_path.name, list(tensor.shape))

                if tname.startswith("emb_ext.") and tname.endswith(".weight"):
                    idx = int(tname.split(".")[1])
                    npy_path = embed_dir / f"emb_ext_{idx:02d}.npy"
                    np.save(str(npy_path), tensor.to(torch.float16).numpy())
                    log.info("  Saved %s → %s  shape=%s", tname, npy_path.name, list(tensor.shape))

                if tname.startswith("lm_heads.") and tname.endswith(".weight"):
                    head_idx = int(tname.split(".")[1])
                    if head_idx == 0:
                        npy_path = head_dir / "lm_head_text.npy"
                    else:
                        npy_path = head_dir / f"lm_head_audio_{head_idx - 1:02d}.npy"
                    np.save(str(npy_path), tensor.to(torch.float16).numpy())
                    log.info("  Saved %s → %s  shape=%s", tname, npy_path.name, list(tensor.shape))

                qwen_name = remap_backbone_name(tname)
                if qwen_name is not None:
                    tensor_bytes = tensor.nelement() * tensor.element_size()
                    if backbone_size + tensor_bytes > MAX_SHARD_SIZE and backbone_tensors:
                        flush_backbone_shard()
                    backbone_tensors[qwen_name] = tensor
                    backbone_size += tensor_bytes

    flush_backbone_shard()

    total_shards = len(saved_shards)
    renamed_shards = []
    for i, old_name in enumerate(saved_shards, 1):
        new_name = f"model-{i:05d}-of-{total_shards:05d}.safetensors"
        if old_name != new_name:
            (backbone_dir / old_name).rename(backbone_dir / new_name)
        renamed_shards.append(new_name)
        for tname in list(backbone_weight_map.keys()):
            if backbone_weight_map[tname] == old_name:
                backbone_weight_map[tname] = new_name

    total_size = 0
    for shard_name in renamed_shards:
        total_size += (backbone_dir / shard_name).stat().st_size

    backbone_index = {
        "metadata": {"total_size": total_size},
        "weight_map": backbone_weight_map,
    }
    if total_shards > 1:
        with open(backbone_dir / "model.safetensors.index.json", "w") as f:
            json.dump(backbone_index, f, indent=2, sort_keys=True)
        log.info("Wrote backbone index: %d shards, %.2f GB total", total_shards, total_size / 1e9)
    elif total_shards == 1:
        single = backbone_dir / renamed_shards[0]
        target = backbone_dir / "model.safetensors"
        if single != target:
            single.rename(target)
        log.info("Wrote single backbone shard: %.2f GB", total_size / 1e9)

    qwen3_config = build_qwen3_config(moss_config)
    with open(backbone_dir / "config.json", "w") as f:
        json.dump(qwen3_config, f, indent=2)
    log.info("Wrote backbone config.json")

    tokenizer_files = [
        "tokenizer.json", "tokenizer_config.json",
        "special_tokens_map.json", "added_tokens.json",
        "merges.txt", "vocab.json",
    ]
    copied = 0
    for tf in tokenizer_files:
        src = model_dir / tf
        if src.exists():
            shutil.copy2(str(src), str(backbone_dir / tf))
            copied += 1
    log.info("Copied %d tokenizer files to backbone dir", copied)

    meta = {
        "source_model": str(model_dir),
        "n_vq": n_vq,
        "hidden_size": hidden_size,
        "vocab_size": vocab_size,
        "audio_vocab_size": audio_vocab_size,
        "backbone_dir": str(backbone_dir),
        "embedding_dir": str(embed_dir),
        "lm_head_dir": str(head_dir),
        "moss_config": moss_config,
    }
    with open(output_dir / "extraction_meta.json", "w") as f:
        json.dump(meta, f, indent=2)
    log.info("Wrote extraction_meta.json")

    embed_files = sorted(embed_dir.glob("*.npy"))
    head_files = sorted(head_dir.glob("*.npy"))
    log.info("=" * 60)
    log.info("Extraction complete!")
    log.info("  Backbone:    %s (%d shards)", backbone_dir, total_shards)
    log.info("  Embeddings:  %s (%d files)", embed_dir, len(embed_files))
    log.info("  LM heads:    %s (%d files)", head_dir, len(head_files))
    log.info("=" * 60)


def main():
    parser = argparse.ArgumentParser(
        description="Extract MOSS-TTS-Delay weights for llama.cpp backend"
    )
    parser.add_argument(
        "--model", type=str, default="OpenMOSS-Team/MOSS-TTS",
        help="HuggingFace model ID or local path",
    )
    parser.add_argument(
        "--output", type=str, default="weights/extracted",
        help="Output directory for extracted weights",
    )
    parser.add_argument(
        "--cache-dir", type=str, default=None,
        help="HuggingFace cache directory for model download",
    )
    args = parser.parse_args()

    model_path = Path(args.model)
    if model_path.is_dir() and (model_path / "config.json").exists():
        model_dir = model_path
        log.info("Using local model directory: %s", model_dir)
    else:
        log.info("Downloading model from HuggingFace: %s", args.model)
        model_dir = Path(snapshot_download(
            args.model,
            cache_dir=args.cache_dir,
            ignore_patterns=["*.md", "*.py", "*.jinja", "__pycache__"],
        ))
        log.info("Model downloaded to: %s", model_dir)

    extract(model_dir, Path(args.output))


if __name__ == "__main__":
    main()