|
|
| """
|
Extract MOSS-TTS-Delay weights into three groups for the llama.cpp backend:

1. Qwen3 backbone   → standalone Qwen3ForCausalLM (safetensors + config.json)
2. Embedding tables → numpy .npy files
3. LM head weights  → numpy .npy files

The Qwen3 backbone safetensors can then be converted to GGUF with
``llama.cpp/convert_hf_to_gguf.py``.

Usage::

    python scripts/extract_weights_llama_cpp.py \\
        --model OpenMOSS-Team/MOSS-TTS \\
        --output weights/extracted
|
| """
|
|
|
| import argparse
|
| import json
|
| import logging
|
| import shutil
|
| from collections import defaultdict
|
| from pathlib import Path
|
|
|
| import numpy as np
|
| from huggingface_hub import snapshot_download
|
| from safetensors import safe_open
|
| from safetensors.torch import save_file
|
| import torch
|
|
|
# Configure root logging once at import time: INFO level, with a timestamp
# and the level name in front of every message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
|
|
|
|
|
| def remap_backbone_name(name: str) -> str | None:
|
| """Map a MossTTSDelay tensor name to Qwen3ForCausalLM convention."""
|
| if name.startswith("language_model."):
|
| return "model." + name[len("language_model."):]
|
| if name == "lm_heads.0.weight":
|
| return "lm_head.weight"
|
| return None
|
|
|
|
|
def load_source_index(model_dir: Path) -> dict:
    """Return the safetensors index for the checkpoint stored in *model_dir*.

    A sharded checkpoint ships ``model.safetensors.index.json``, which is
    returned as parsed.  For a single-file checkpoint an equivalent index is
    synthesized: empty ``metadata`` and a ``weight_map`` that points every
    tensor name at ``model.safetensors``.

    Args:
        model_dir: Directory containing the source checkpoint.

    Returns:
        A dict with ``"metadata"`` and ``"weight_map"`` entries (for the
        sharded case, whatever the index file contains).

    Raises:
        FileNotFoundError: If neither the index file nor
            ``model.safetensors`` exists in *model_dir*.
    """
    index_path = model_dir / "model.safetensors.index.json"
    if index_path.exists():
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(index_path, encoding="utf-8") as f:
            return json.load(f)

    single = model_dir / "model.safetensors"
    if single.exists():
        with safe_open(str(single), framework="pt") as f:
            return {
                "metadata": {},
                "weight_map": dict.fromkeys(f.keys(), "model.safetensors"),
            }

    raise FileNotFoundError(f"No safetensors files found in {model_dir}")
|
|
|
|
|
def load_source_config(model_dir: Path) -> dict:
    """Load and return the parsed ``config.json`` found in *model_dir*.

    Args:
        model_dir: Directory containing the source checkpoint.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If ``config.json`` is missing (raised by ``open``).
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(model_dir / "config.json", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
def build_qwen3_config(moss_config: dict) -> dict:
    """Derive a standalone Qwen3ForCausalLM config from the MOSS-TTS config.

    Starts from a shallow copy of ``moss_config["language_config"]``, forces
    the architecture and model type to Qwen3, drops ``_name_or_path``, and
    fills in ``torch_dtype`` / ``transformers_version`` only when they are
    absent.  The input mapping is left untouched.
    """
    qwen3_config = dict(moss_config["language_config"])
    qwen3_config.update(
        architectures=["Qwen3ForCausalLM"],
        model_type="qwen3",
    )
    qwen3_config.pop("_name_or_path", None)

    fallback_version = moss_config.get("transformers_version", "4.57.1")
    fallbacks = (
        ("torch_dtype", "bfloat16"),
        ("transformers_version", fallback_version),
    )
    for key, fallback in fallbacks:
        qwen3_config.setdefault(key, fallback)
    return qwen3_config
|
|
|
|
|
# Byte budget for one backbone safetensors shard (5 GiB); extract() starts a
# new shard when adding the next tensor would exceed it.
MAX_SHARD_SIZE = 5 * 1024**3
|
|
|
|
|
def extract(model_dir: Path, output_dir: Path) -> None:
    """Split a MOSS-TTS-Delay checkpoint into backbone, embeddings and LM heads.

    Three sub-directories are created under *output_dir*:

    * ``qwen3_backbone/`` – the remapped backbone as safetensors (one
      ``model.safetensors`` file, or numbered shards plus
      ``model.safetensors.index.json``), a Qwen3 ``config.json`` and any
      tokenizer files found in *model_dir*.
    * ``embeddings/`` – ``embed_tokens.npy`` and ``emb_ext_NN.npy`` (float16).
    * ``lm_heads/`` – ``lm_head_text.npy`` and ``lm_head_audio_NN.npy``
      (float16).

    ``extraction_meta.json`` summarising the run is written to *output_dir*.

    Args:
        model_dir: Directory holding the source checkpoint (``config.json``
            plus safetensors files).
        output_dir: Destination directory; created if missing.

    Raises:
        FileNotFoundError: If the source config or safetensors files are
            missing.
        KeyError: If the source config or index lacks an expected entry.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    backbone_dir = output_dir / "qwen3_backbone"
    embed_dir = output_dir / "embeddings"
    head_dir = output_dir / "lm_heads"
    for sub_dir in (backbone_dir, embed_dir, head_dir):
        sub_dir.mkdir(exist_ok=True)

    moss_config = load_source_config(model_dir)
    index = load_source_index(model_dir)
    weight_map = index["weight_map"]

    lang_config = moss_config["language_config"]
    n_vq = moss_config.get("n_vq", 32)
    hidden_size = lang_config["hidden_size"]
    vocab_size = lang_config["vocab_size"]
    audio_vocab_size = moss_config.get("audio_vocab_size", 1024)

    log.info(
        "Model: hidden_size=%d, vocab_size=%d, n_vq=%d, audio_vocab_size=%d",
        hidden_size, vocab_size, n_vq, audio_vocab_size,
    )

    # Group tensor names by the source shard that stores them so each shard
    # file is opened exactly once.
    shard_to_tensors: dict[str, list[str]] = defaultdict(list)
    for tensor_name, shard_file in weight_map.items():
        shard_to_tensors[shard_file].append(tensor_name)

    # Remove backbone weight files left behind by an earlier run.  A previous
    # run with a different shard count would otherwise leave stale
    # ``model*.safetensors`` files (or a stale index) next to the fresh ones,
    # and tools that scan the directory could pick them up.  This is done only
    # after the source config and index loaded successfully.
    stale_paths = list(backbone_dir.glob("model*.safetensors"))
    stale_paths.append(backbone_dir / "model.safetensors.index.json")
    for stale in stale_paths:
        stale.unlink(missing_ok=True)

    pending_tensors: dict[str, torch.Tensor] = {}
    pending_bytes = 0
    saved_shards: list[str] = []
    backbone_weight_map: dict[str, str] = {}

    def flush_backbone_shard() -> None:
        """Write the accumulated backbone tensors as the next shard file."""
        nonlocal pending_tensors, pending_bytes
        if not pending_tensors:
            return
        # The final shard count is unknown until every tensor has been seen,
        # so write under a placeholder name and rename afterwards.
        shard_name = f"model-{len(saved_shards) + 1:05d}-of-PLACEHOLDER.safetensors"
        log.info(" Writing backbone shard %s (%d tensors, %.2f GB)",
                 shard_name, len(pending_tensors), pending_bytes / 1e9)
        # Tag the file as a PyTorch checkpoint; Hugging Face loaders check
        # this metadata entry when reading safetensors weights.
        save_file(pending_tensors, str(backbone_dir / shard_name),
                  metadata={"format": "pt"})
        backbone_weight_map.update(dict.fromkeys(pending_tensors, shard_name))
        saved_shards.append(shard_name)
        pending_tensors = {}
        pending_bytes = 0

    def npy_destination(tname: str) -> Path | None:
        """Return the .npy path for an embedding / LM-head tensor, else None."""
        if tname == "language_model.embed_tokens.weight":
            return embed_dir / "embed_tokens.npy"
        if not tname.endswith(".weight"):
            return None
        if tname.startswith("emb_ext."):
            idx = int(tname.split(".")[1])
            return embed_dir / f"emb_ext_{idx:02d}.npy"
        if tname.startswith("lm_heads."):
            head_idx = int(tname.split(".")[1])
            if head_idx == 0:
                return head_dir / "lm_head_text.npy"
            # Head 0 is the text head, so audio heads are numbered from 0.
            return head_dir / f"lm_head_audio_{head_idx - 1:02d}.npy"
        return None

    for shard_file in sorted(shard_to_tensors):
        tensor_names = shard_to_tensors[shard_file]
        log.info("Processing shard: %s (%d tensors)", shard_file, len(tensor_names))

        with safe_open(str(model_dir / shard_file), framework="pt") as sf:
            for tname in sorted(tensor_names):
                qwen_name = remap_backbone_name(tname)
                npy_path = npy_destination(tname)
                if qwen_name is None and npy_path is None:
                    # Not written anywhere: skip reading it from disk.
                    continue

                tensor = sf.get_tensor(tname)

                if npy_path is not None:
                    np.save(str(npy_path), tensor.to(torch.float16).numpy())
                    log.info(" Saved %s → %s shape=%s",
                             tname, npy_path.name, list(tensor.shape))

                if qwen_name is not None:
                    tensor_bytes = tensor.nelement() * tensor.element_size()
                    # Start a new shard rather than exceed the size budget; a
                    # single oversized tensor still gets a shard of its own.
                    if pending_tensors and pending_bytes + tensor_bytes > MAX_SHARD_SIZE:
                        flush_backbone_shard()
                    pending_tensors[qwen_name] = tensor
                    pending_bytes += tensor_bytes

    flush_backbone_shard()

    # Give the shards their final "-of-NNNNN" names now that the count is known.
    total_shards = len(saved_shards)
    final_names = {
        old_name: f"model-{i:05d}-of-{total_shards:05d}.safetensors"
        for i, old_name in enumerate(saved_shards, 1)
    }
    for old_name, new_name in final_names.items():
        # replace() overwrites an existing target on every platform.
        (backbone_dir / old_name).replace(backbone_dir / new_name)
    renamed_shards = list(final_names.values())
    final_weight_map = {
        tname: final_names[shard_name]
        for tname, shard_name in backbone_weight_map.items()
    }

    total_size = sum((backbone_dir / name).stat().st_size for name in renamed_shards)

    if total_shards > 1:
        backbone_index = {
            "metadata": {"total_size": total_size},
            "weight_map": final_weight_map,
        }
        with open(backbone_dir / "model.safetensors.index.json", "w", encoding="utf-8") as f:
            json.dump(backbone_index, f, indent=2, sort_keys=True)
        log.info("Wrote backbone index: %d shards, %.2f GB total", total_shards, total_size / 1e9)
    elif total_shards == 1:
        # A lone shard uses the conventional un-numbered file name, no index.
        (backbone_dir / renamed_shards[0]).replace(backbone_dir / "model.safetensors")
        log.info("Wrote single backbone shard: %.2f GB", total_size / 1e9)
    else:
        log.warning("No backbone tensors found in %s; no backbone weights written", model_dir)

    qwen3_config = build_qwen3_config(moss_config)
    with open(backbone_dir / "config.json", "w", encoding="utf-8") as f:
        json.dump(qwen3_config, f, indent=2)
    log.info("Wrote backbone config.json")

    # Copy whichever tokenizer files the source checkpoint ships.
    tokenizer_files = [
        "tokenizer.json", "tokenizer_config.json",
        "special_tokens_map.json", "added_tokens.json",
        "merges.txt", "vocab.json",
    ]
    copied = 0
    for tf in tokenizer_files:
        src = model_dir / tf
        if src.exists():
            shutil.copy2(str(src), str(backbone_dir / tf))
            copied += 1
    log.info("Copied %d tokenizer files to backbone dir", copied)

    meta = {
        "source_model": str(model_dir),
        "n_vq": n_vq,
        "hidden_size": hidden_size,
        "vocab_size": vocab_size,
        "audio_vocab_size": audio_vocab_size,
        "backbone_dir": str(backbone_dir),
        "embedding_dir": str(embed_dir),
        "lm_head_dir": str(head_dir),
        "moss_config": moss_config,
    }
    with open(output_dir / "extraction_meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    log.info("Wrote extraction_meta.json")

    embed_files = sorted(embed_dir.glob("*.npy"))
    head_files = sorted(head_dir.glob("*.npy"))
    log.info("=" * 60)
    log.info("Extraction complete!")
    log.info(" Backbone: %s (%d shards)", backbone_dir, total_shards)
    log.info(" Embeddings: %s (%d files)", embed_dir, len(embed_files))
    log.info(" LM heads: %s (%d files)", head_dir, len(head_files))
    log.info("=" * 60)
|
|
|
|
|
def main():
    """Parse command-line options, locate the model, and run the extraction.

    ``--model`` is used as a local checkpoint when it names a directory that
    contains ``config.json``; otherwise it is passed to
    ``snapshot_download`` as a HuggingFace model ID.
    """
    parser = argparse.ArgumentParser(
        description="Extract MOSS-TTS-Delay weights for llama.cpp backend"
    )
    # (flag, default, help) for each string-valued option.
    option_specs = (
        ("--model", "OpenMOSS-Team/MOSS-TTS", "HuggingFace model ID or local path"),
        ("--output", "weights/extracted", "Output directory for extracted weights"),
        ("--cache-dir", None, "HuggingFace cache directory for model download"),
    )
    for flag, default, help_text in option_specs:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    args = parser.parse_args()

    candidate = Path(args.model)
    is_local_checkpoint = candidate.is_dir() and (candidate / "config.json").exists()
    if not is_local_checkpoint:
        log.info("Downloading model from HuggingFace: %s", args.model)
        snapshot_path = snapshot_download(
            args.model,
            cache_dir=args.cache_dir,
            ignore_patterns=["*.md", "*.py", "*.jinja", "__pycache__"],
        )
        model_dir = Path(snapshot_path)
        log.info("Model downloaded to: %s", model_dir)
    else:
        model_dir = candidate
        log.info("Using local model directory: %s", model_dir)

    extract(model_dir, Path(args.output))
|
|
|
|
|
# Run the command-line entry point only when executed as a script, so the
# module can be imported without triggering an extraction.
if __name__ == "__main__":
    main()
|
|
|