minpeter committed on
Commit
d9f34c1
·
verified ·
1 Parent(s): ebb29ac

Upload extract_llm.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. extract_llm.py +302 -0
extract_llm.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract text-only LLM from HyperCLOVAX-SEED-Think-32B VLM.
4
+ Converts to LLaMA-compatible format for standard inference engines.
5
+
6
+ Usage:
7
+ python extract_llm.py --input ./HyperCLOVAX-SEED-Think-32B --output ./HyperCLOVAX-SEED-Text-Think-32B
8
+
9
+ Requirements:
10
+ pip install safetensors torch tqdm
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import shutil
17
+ from pathlib import Path
18
+ from collections import defaultdict
19
+ from safetensors import safe_open
20
+ from safetensors.torch import save_file
21
+ import torch
22
+ from tqdm import tqdm
23
+
24
+
25
def load_weight_index(model_path: Path) -> dict:
    """Read and parse ``model.safetensors.index.json`` from *model_path*."""
    index_file = model_path / "model.safetensors.index.json"
    return json.loads(index_file.read_text())
30
+
31
+
32
def extract_llm_weights(model_path: Path, output_path: Path):
    """
    Extract the language-model weights from the VLM checkpoint at
    *model_path* and write them (sharded) under *output_path*.

    Key mapping — every tensor under the ``model.language_model.`` prefix
    is kept with that prefix stripped:

      - model.language_model.model.*   -> model.*
      - model.language_model.lm_head.* -> lm_head.*

    All vision-encoder and MM-projector weights are excluded.

    Returns the list of remapped tensor names that were written.
    """
    output_path.mkdir(parents=True, exist_ok=True)

    weight_index = load_weight_index(model_path)
    weight_map = weight_index["weight_map"]

    # Filter and remap LLM weights.  The original three-way branch
    # (model.*, lm_head.*, other) was redundant: stripping the prefix from
    # "model.language_model.model.X" already yields "model.X", so a single
    # prefix strip covers every case.
    prefix = "model.language_model."
    llm_weights = {}
    for key, shard_file in weight_map.items():
        if key.startswith(prefix):
            llm_weights[key[len(prefix):]] = (key, shard_file)

    print(f"Found {len(llm_weights)} LLM weight tensors")
    print(f"Excluded {len(weight_map) - len(llm_weights)} vision/projector tensors")

    # Group by source shard so each shard file is opened only once.
    shard_to_weights = defaultdict(list)
    for new_key, (old_key, shard_file) in llm_weights.items():
        shard_to_weights[shard_file].append((old_key, new_key))

    # Load all LLM tensors into CPU memory.
    all_tensors = {}
    shard_files = sorted(shard_to_weights.keys())  # dict keys are unique; no set() needed

    print(f"\nLoading weights from {len(shard_files)} shards...")
    for shard_file in tqdm(shard_files, desc="Loading shards"):
        shard_path = model_path / shard_file
        with safe_open(shard_path, framework="pt", device="cpu") as f:
            for old_key, new_key in shard_to_weights[shard_file]:
                all_tensors[new_key] = f.get_tensor(old_key)

    print(f"\nTotal tensors extracted: {len(all_tensors)}")

    total_size = sum(t.numel() * t.element_size() for t in all_tensors.values())
    print(f"Total size: {total_size / 1e9:.2f} GB")

    # Save as sharded safetensors (~5GB per shard).
    max_shard_size = 5 * 1024 * 1024 * 1024

    print("\nSaving extracted weights...")
    save_sharded_safetensors(all_tensors, output_path, max_shard_size)

    return list(all_tensors.keys())
91
+
92
+
93
def save_sharded_safetensors(tensors: dict, output_path: Path, max_shard_size: int):
    """Write *tensors* to *output_path* as sharded safetensors files plus an index."""
    # Partition tensor names (in sorted order) into buckets of at most
    # max_shard_size bytes each; a bucket is flushed before it would overflow.
    shards = []
    bucket, bucket_bytes = {}, 0
    for name in sorted(tensors):
        t = tensors[name]
        nbytes = t.numel() * t.element_size()
        if bucket and bucket_bytes + nbytes > max_shard_size:
            shards.append(bucket)
            bucket, bucket_bytes = {}, 0
        bucket[name] = t
        bucket_bytes += nbytes
    if bucket:
        shards.append(bucket)

    total_shards = len(shards)
    total_size = sum(t.numel() * t.element_size() for t in tensors.values())

    # Emit each shard and record which shard holds each tensor.
    weight_map = {}
    for idx, shard_tensors in enumerate(tqdm(shards, desc="Saving shards"), start=1):
        shard_name = f"model-{idx:05d}-of-{total_shards:05d}.safetensors"
        save_file(shard_tensors, output_path / shard_name)
        for name in shard_tensors:
            weight_map[name] = shard_name

    # Index file maps tensor name -> shard file, plus total byte size metadata.
    index = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index, f, indent=2)

    print(f"Saved {total_shards} shards to {output_path}")
140
+
141
+
142
def create_llama_config(original_config_path: Path, output_path: Path):
    """
    Build a LLaMA-compatible ``config.json`` (and ``generation_config.json``)
    from the VLM config at *original_config_path*, writing both under
    *output_path*.  Returns the LLaMA config dict.

    Note: HyperCLOVAX uses attention_multiplier ≈ 1/sqrt(head_dim)
    which matches standard LLaMA scaled dot-product attention.
    """
    vlm_config = json.loads(Path(original_config_path).read_text())
    text_cfg = vlm_config["text_config"]

    def pick(field, default):
        # Fall back to the HyperCLOVAX-SEED-Think-32B value when the
        # field is absent from the source config.
        return text_cfg.get(field, default)

    llama_config = {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": pick("attention_bias", False),
        "attention_dropout": pick("attention_dropout", 0.0),
        "bos_token_id": pick("bos_token_id", 128000),
        "eos_token_id": pick("eos_token_id", 128001),
        "head_dim": pick("head_dim", 128),
        "hidden_act": pick("hidden_act", "silu"),
        "hidden_size": pick("hidden_size", 5120),
        "initializer_range": pick("initializer_range", 0.006),
        "intermediate_size": pick("intermediate_size", 24192),
        "max_position_embeddings": pick("max_position_embeddings", 131072),
        "mlp_bias": pick("mlp_bias", False),
        "model_type": "llama",
        "num_attention_heads": pick("num_attention_heads", 40),
        "num_hidden_layers": pick("num_hidden_layers", 72),
        "num_key_value_heads": pick("num_key_value_heads", 8),
        "pad_token_id": pick("pad_token_id", 0),
        "pretraining_tp": 1,
        "rms_norm_eps": pick("rms_norm_eps", 1e-05),
        "rope_scaling": pick("rope_scaling", None),
        "rope_theta": pick("rope_theta", 50000000),
        "tie_word_embeddings": pick("tie_word_embeddings", False),
        "torch_dtype": "bfloat16",
        "transformers_version": "4.52.4",
        "use_cache": True,
        "vocab_size": pick("vocab_size", 128256),
    }

    config_path = output_path / "config.json"
    config_path.write_text(json.dumps(llama_config, indent=2))

    print(f"Saved LLaMA config to {config_path}")

    # Generation config derived from the token ids chosen above.
    gen_config = {
        "bos_token_id": llama_config["bos_token_id"],
        "eos_token_id": llama_config["eos_token_id"],
        "pad_token_id": llama_config["pad_token_id"],
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "max_length": 4096
    }
    (output_path / "generation_config.json").write_text(json.dumps(gen_config, indent=2))

    return llama_config
204
+
205
+
206
def copy_tokenizer_files(original_path: Path, output_path: Path):
    """Copy any tokenizer-related files found in *original_path* into *output_path*."""
    # Known tokenizer artifacts; only those actually present are copied.
    candidates = (
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "added_tokens.json",
        "vocab.json",
        "merges.txt",
        "chat_template.jinja"
    )

    copied = []
    for name in candidates:
        source = original_path / name
        if source.exists():
            shutil.copy2(source, output_path / name)
            copied.append(name)

    print(f"Copied tokenizer files: {copied}")
227
+
228
+
229
def main():
    """Command-line driver: validate inputs, then run the three extraction steps.

    Returns 0 on success, 1 on a missing input path or index file.
    """
    parser = argparse.ArgumentParser(
        description="Extract text-only LLM from HyperCLOVAX-SEED-Think-32B VLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example:
  # Download original VLM
  huggingface-cli download naver-hyperclovax/HyperCLOVAX-SEED-Think-32B \\
    --local-dir ./HyperCLOVAX-SEED-Think-32B

  # Extract text-only LLM
  python extract_llm.py \\
    --input ./HyperCLOVAX-SEED-Think-32B \\
    --output ./HyperCLOVAX-SEED-Text-Think-32B
"""
    )
    parser.add_argument(
        "--input", "-i",
        type=Path,
        required=True,
        help="Path to original HyperCLOVAX-SEED-Think-32B VLM"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        required=True,
        help="Output path for extracted text-only LLM"
    )
    args = parser.parse_args()

    # Fail fast if the checkpoint directory or its weight index is missing.
    if not args.input.exists():
        print(f"Error: Input path does not exist: {args.input}")
        return 1
    if not (args.input / "model.safetensors.index.json").exists():
        print(f"Error: model.safetensors.index.json not found in {args.input}")
        return 1

    rule = "=" * 60
    print(rule)
    print("HyperCLOVAX VLM → Text-only LLM Extraction")
    print(rule)
    print(f"Input: {args.input}")
    print(f"Output: {args.output}")

    print("\n[Step 1] Extracting LLM weights...")
    extract_llm_weights(args.input, args.output)

    print("\n[Step 2] Creating LLaMA-compatible config...")
    config = create_llama_config(args.input / "config.json", args.output)

    print("\n[Step 3] Copying tokenizer files...")
    copy_tokenizer_files(args.input, args.output)

    print("\n" + rule)
    print("Extraction complete!")
    print(f"Output: {args.output}")
    print(rule)

    print("\nModel summary:")
    print(" - Architecture: LlamaForCausalLM")
    print(f" - Hidden size: {config['hidden_size']}")
    print(f" - Layers: {config['num_hidden_layers']}")
    print(f" - Attention heads: {config['num_attention_heads']}")
    print(f" - KV heads: {config['num_key_value_heads']}")
    print(f" - Vocab size: {config['vocab_size']}")
    print(f" - Max context: {config['max_position_embeddings']}")

    print("\nYou can now use the model with vLLM, transformers, or other LLaMA-compatible frameworks.")
    return 0
299
+
300
+
301
+ if __name__ == "__main__":
302
+ exit(main())