File size: 7,632 Bytes
7f7a72e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
#!/usr/bin/env python3
"""Export a Manthan-T1 folder that can be uploaded to Hugging Face.
What this does:
- Copies `hf_export_stub/*` into an output directory
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3)
- Ensures `<image>` is a real special token in the tokenizer
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`, and `chat_template.jinja`
- Updates `config.json` with a correct `image_token_id` (kept equal to -200 placeholder)
Note:
- This does NOT include model weights. It's intended for placeholder-weight repo layout
(like your MicroLLaVA example). For training, you'll later save actual weights.
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import sys
from pathlib import Path
from transformers import AutoTokenizer
# Allow running this script without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
def _copytree(src: Path, dst: Path) -> None:
dst.mkdir(parents=True, exist_ok=True)
for item in src.iterdir():
s = item
d = dst / item.name
if item.is_dir():
shutil.copytree(s, d, dirs_exist_ok=True)
else:
shutil.copy2(s, d)
def _parse_args() -> argparse.Namespace:
    """Parse CLI arguments for the export script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(Path(__file__).resolve().parents[1] / "hf_export_stub"),
        help="Path to hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help="Local tokenizer directory to copy (e.g. MicroLlava-* folder). If set, no network fetch is performed.",
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help="Write randomly-initialized weights (model.safetensors) into the export dir so from_pretrained() succeeds.",
    )
    return ap.parse_args()


def _refresh_remote_code(repo_root: Path, out_dir: Path) -> None:
    """Copy remote-code python files into the export root.

    Stale copies from a previous export are removed first so the HF
    dynamic module loader never picks up outdated code. Raises
    ``SystemExit`` if a required source file is missing.
    """
    names = ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]
    # Ensure we don't keep stale remote-code python files from a previous export.
    for name in names:
        stale = out_dir / name
        if stale.exists():
            stale.unlink()
    pkg_dir = repo_root / "manthan_t1"
    for name in names:
        src = pkg_dir / name
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / name)


def _resolve_tokenizer_source(args: argparse.Namespace, cfg: dict, repo_root: Path) -> str:
    """Pick the tokenizer name/path to load from.

    Precedence (bug fixes vs. the previous version, which parsed
    ``--tokenizer_local_dir`` but never used it and let the on-disk
    candidate scan silently override an explicit ``--tokenizer``):
      1. ``--tokenizer_local_dir`` (explicit local folder, no network)
      2. ``--tokenizer`` (explicit name/path)
      3. a known on-disk tokenizer folder (avoids network dependency)
      4. config.json fields

    Raises ``SystemExit`` when nothing usable can be found.
    """
    if args.tokenizer_local_dir:
        local = Path(args.tokenizer_local_dir).expanduser().resolve()
        if not (local / "tokenizer_config.json").exists():
            raise SystemExit(f"--tokenizer_local_dir has no tokenizer_config.json: {local}")
        return str(local)
    if args.tokenizer:
        return args.tokenizer
    # Prefer an on-disk tokenizer (e.g. the attached MicroLLaVA folder) to
    # avoid any network dependency during export.
    local_tokenizer_candidates = [
        repo_root / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m",
    ]
    for cand in local_tokenizer_candidates:
        if (cand / "tokenizer_config.json").exists():
            return str(cand)
    name = (
        cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not name:
        raise SystemExit("Could not infer tokenizer_name_or_path")
    return name


def _embed_chat_template(out_dir: Path) -> None:
    """Inline ``chat_template.jinja`` into ``tokenizer_config.json``.

    HF expects the chat template as a string field of the tokenizer
    config; no-op when the stub provides no template file.
    """
    tmpl_src = out_dir / "chat_template.jinja"
    if not tmpl_src.exists():
        return
    tok_cfg_path = out_dir / "tokenizer_config.json"
    tok_cfg = (
        json.loads(tok_cfg_path.read_text(encoding="utf-8"))
        if tok_cfg_path.exists()
        else {}
    )
    tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
    tok_cfg_path.write_text(
        json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
    )


def _write_stub_weights(out_dir: Path, cfg: dict) -> None:
    """Save a tiny randomly-initialized, loadable model into *out_dir*.

    Imported lazily so a plain export avoids the heavier model imports.
    This does not download any base weights. ``save_pretrained``
    rewrites ``config.json``, so ``auto_map`` is re-added afterwards so
    AutoConfig/AutoModel can resolve the custom classes via
    ``trust_remote_code``.
    """
    from manthan_t1.configuration_manthan import ManthanConfig
    from manthan_t1.modeling_manthan import ManthanForCausalLM

    stub_cfg = ManthanConfig(
        text_model_id=None,
        vision_model_id=None,
        image_token_index=-200,
        num_image_tokens=32,
    )
    model = ManthanForCausalLM(stub_cfg)
    model.save_pretrained(out_dir, safe_serialization=True)
    saved_cfg = json.loads((out_dir / "config.json").read_text(encoding="utf-8"))
    saved_cfg["auto_map"] = cfg.get(
        "auto_map",
        {
            "AutoConfig": "configuration_manthan.ManthanConfig",
            "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
        },
    )
    (out_dir / "config.json").write_text(
        json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print("Wrote stub weights: model.safetensors")


def main() -> None:
    """Build an uploadable Manthan-T1 export folder (no trained weights)."""
    args = _parse_args()
    repo_root = Path(__file__).resolve().parents[1]

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()
    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")
    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Copy remote-code python files to export root (HF dynamic module loader expects them)
    _refresh_remote_code(repo_root, out_dir)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    tokenizer_name = _resolve_tokenizer_source(args, cfg, repo_root)
    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )
    # Ensure '<image>' exists as a real special token; remember how many were added.
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    # Some tokenizers need a pad token for batching.
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})
    # Save tokenizer files into export dir
    tok.save_pretrained(out_dir)

    _embed_chat_template(out_dir)

    # Align config fields with MicroLLaVA convention (-200 placeholder). The
    # previous setdefault() before these assignments was dead code and is gone.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200
    # For user convenience record actual tokenizer vocab id of '<image>'
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)
    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Minimal README hint
    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses `image_token_index=-200` placeholder like TinyLLaVA.\n"
        "- Tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )
    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        _write_stub_weights(out_dir, cfg)
# Run the export only when invoked as a script, not on import.
if __name__ == "__main__":
    main()
|