#!/usr/bin/env python3
"""Export a Manthan-T1 folder that can be uploaded to Hugging Face.
What this does:
- Copies `hf_export_stub/*` into an output directory
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3)
- Ensures `<image>` is a real special token in the tokenizer
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`, and `chat_template.jinja`
- Updates `config.json` with a correct `image_token_id` (kept equal to -200 placeholder)
Note:
- This does NOT include model weights. It's intended for placeholder-weight repo layout
(like your MicroLLaVA example). For training, you'll later save actual weights.
"""
from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

from transformers import AutoTokenizer

# Allow running this script without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


def _copytree(src: Path, dst: Path) -> None:
    """Recursively copy `src` into `dst`, merging with any existing contents."""
    dst.mkdir(parents=True, exist_ok=True)
    for item in src.iterdir():
        dst_item = dst / item.name
        if item.is_dir():
            shutil.copytree(item, dst_item, dirs_exist_ok=True)
        else:
            shutil.copy2(item, dst_item)


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(REPO_ROOT / "hf_export_stub"),
        help="Path to the hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help="Local tokenizer directory to copy (e.g. a MicroLlava-* folder). If set, no network fetch is performed.",
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help="Write randomly-initialized weights (model.safetensors) into the export dir so from_pretrained() succeeds.",
    )
    args = ap.parse_args()

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()
    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")

    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Ensure we don't keep stale remote-code python files from a previous export.
    for stale in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        p = out_dir / stale
        if p.exists():
            p.unlink()

    # Copy the remote-code python files to the export root (HF's dynamic module
    # loader expects them there).
    pkg_dir = REPO_ROOT / "manthan_t1"
    for fname in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        src = pkg_dir / fname
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / fname)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    tokenizer_name = (
        args.tokenizer
        or cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not tokenizer_name:
        raise SystemExit("Could not infer tokenizer_name_or_path")

    # Prefer an on-disk tokenizer (e.g. the attached MicroLLaVA folder) to avoid
    # any network dependency during export. A directory passed via
    # --tokenizer_local_dir takes precedence over the built-in candidate.
    local_tokenizer_candidates = [
        REPO_ROOT / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m",
    ]
    if args.tokenizer_local_dir:
        local_tokenizer_candidates.insert(0, Path(args.tokenizer_local_dir).expanduser().resolve())
    for cand in local_tokenizer_candidates:
        if cand.exists() and (cand / "tokenizer_config.json").exists():
            tokenizer_name = str(cand)
            break

    # local_files_only keeps the export offline; remote tokenizer names must
    # already be present in the local HF cache.
    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )

    # Ensure `<image>` exists as a real special token.
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    # Some tokenizers need a pad token for batching.
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})

    # Save the tokenizer files into the export dir.
    tok.save_pretrained(out_dir)
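
    # Quick sanity check one can run on the saved folder (a sketch; it assumes
    # the tokenizer round-trips `<image>` as a single, known token):
    #
    #   tok2 = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True)
    #   assert tok2.convert_tokens_to_ids("<image>") is not None
    #   assert len(tok2.tokenize("<image>")) == 1  # kept whole, not split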

    # If the stub shipped a chat template, inline it into tokenizer_config.json
    # (HF reads the template from a string field there).
    tmpl_src = out_dir / "chat_template.jinja"
    if tmpl_src.exists():
        tok_cfg_path = out_dir / "tokenizer_config.json"
        if tok_cfg_path.exists():
            tok_cfg = json.loads(tok_cfg_path.read_text(encoding="utf-8"))
        else:
            tok_cfg = {}
        tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
        tok_cfg_path.write_text(json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
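
    # Once inlined, a reloaded tokenizer can render chat prompts as usual
    # (a sketch; the message content below is illustrative):
    #
    #   tok2 = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True)
    #   messages = [{"role": "user", "content": "<image>\nDescribe the picture."}]
    #   prompt = tok2.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)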

    # Align config fields with the MicroLLaVA convention: -200 is a placeholder
    # index, not a tokenizer vocab id.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200
    # For user convenience, record the actual tokenizer vocab id of '<image>'.
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)
    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
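
    # The written config.json then contains, among other fields, roughly the
    # following (the two tokenizer values are illustrative, not fixed):
    #
    #   "image_token_index": -200,
    #   "image_token_id": -200,
    #   "tokenizer_image_token_id": 151669,
    #   "tokenizer_added_tokens": 1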

    # Minimal README hint for whoever uploads the folder.
    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses the `image_token_index=-200` placeholder, like TinyLLaVA.\n"
        "- The tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )
    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        # Import only when requested to avoid heavier imports for a plain export.
        from manthan_t1.configuration_manthan import ManthanConfig
        from manthan_t1.modeling_manthan import ManthanForCausalLM

        # Tiny randomly-initialized model that is loadable; this does not
        # download any base weights.
        stub_cfg = ManthanConfig(
            text_model_id=None,
            vision_model_id=None,
            image_token_index=-200,
            num_image_tokens=32,
        )
        model = ManthanForCausalLM(stub_cfg)
        model.save_pretrained(out_dir, safe_serialization=True)

        # Ensure auto_map is present so AutoConfig/AutoModel can resolve our
        # custom classes via trust_remote_code.
        saved_cfg = json.loads((out_dir / "config.json").read_text(encoding="utf-8"))
        saved_cfg["auto_map"] = cfg.get(
            "auto_map",
            {
                "AutoConfig": "configuration_manthan.ManthanConfig",
                "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
            },
        )
        (out_dir / "config.json").write_text(
            json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )
        print("Wrote stub weights: model.safetensors")


if __name__ == "__main__":
    main()