#!/usr/bin/env python3
"""Export a Manthan-T1 folder that can be uploaded to Hugging Face.

What this does:
- Copies `hf_export_stub/*` into an output directory
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3)
- Ensures `<image>` is a real special token in the tokenizer
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`,
  and `chat_template.jinja`
- Updates `config.json` with a correct `image_token_id` (kept equal to the -200
  placeholder)

Note:
- This does NOT include model weights. It is intended for a placeholder-weight
  repo layout (like the MicroLLaVA example). Training should save actual
  weights into this folder later.
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

from transformers import AutoTokenizer

# Allow running this script without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


def _copytree(src: Path, dst: Path) -> None:
    """Copy `src/*` into `dst`, merging with any existing contents."""
    dst.mkdir(parents=True, exist_ok=True)
    for item in src.iterdir():
        dest = dst / item.name
        if item.is_dir():
            shutil.copytree(item, dest, dirs_exist_ok=True)
        else:
            shutil.copy2(item, dest)


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(REPO_ROOT / "hf_export_stub"),
        help="Path to hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help=(
            "Local tokenizer directory (e.g. a MicroLlava-* folder). "
            "If set, no network fetch is performed."
        ),
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help=(
            "Write randomly-initialized weights (model.safetensors) into the "
            "export dir so from_pretrained() succeeds."
        ),
    )
    args = ap.parse_args()

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()
    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")

    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Ensure we don't keep stale remote-code python files from a previous export.
    for stale in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        p = out_dir / stale
        if p.exists():
            p.unlink()

    # Copy remote-code python files to the export root (the HF dynamic module
    # loader expects them there).
    pkg_dir = REPO_ROOT / "manthan_t1"
    for fname in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        src = pkg_dir / fname
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / fname)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    tokenizer_name = (
        args.tokenizer
        or cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not tokenizer_name:
        raise SystemExit("Could not infer tokenizer_name_or_path")

    # Prefer an on-disk tokenizer (e.g. the bundled MicroLLaVA folder) to avoid
    # any network dependency during export.
    local_tokenizer_candidates = []
    if args.tokenizer_local_dir:
        # An explicit --tokenizer_local_dir takes precedence over the built-in
        # candidates.
        local_tokenizer_candidates.append(
            Path(args.tokenizer_local_dir).expanduser().resolve()
        )
    local_tokenizer_candidates.append(
        REPO_ROOT / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m"
    )
    for cand in local_tokenizer_candidates:
        if cand.exists() and (cand / "tokenizer_config.json").exists():
            tokenizer_name = str(cand)
            break

    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )

    # Ensure the <image> special token exists; `added` is the number of tokens
    # actually added (0 if it was already present).
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    # Some tokenizers need a pad token for batching.
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})

    # Save tokenizer files into the export dir.
    tok.save_pretrained(out_dir)

    # If the stub shipped a chat template, ensure tokenizer_config.json
    # references it (HF reads the template from a string field there).
    tmpl_src = out_dir / "chat_template.jinja"
    if tmpl_src.exists():
        tok_cfg_path = out_dir / "tokenizer_config.json"
        if tok_cfg_path.exists():
            tok_cfg = json.loads(tok_cfg_path.read_text(encoding="utf-8"))
        else:
            tok_cfg = {}
        tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
        tok_cfg_path.write_text(
            json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )

    # Align config fields with the MicroLLaVA convention: -200 is a placeholder
    # marking image positions in input_ids, independent of the tokenizer vocab.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200
    # For user convenience, also record the actual tokenizer vocab id of '<image>'.
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)
    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Minimal README hint.
    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses the `image_token_index=-200` placeholder, like TinyLLaVA.\n"
        "- The tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )

    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        # Import only when requested to keep plain exports free of heavy imports.
        from manthan_t1.configuration_manthan import ManthanConfig
        from manthan_t1.modeling_manthan import ManthanForCausalLM

        # Tiny randomly-initialized model that is loadable; this does not
        # download any base weights.
        stub_cfg = ManthanConfig(
            text_model_id=None,
            vision_model_id=None,
            image_token_index=-200,
            num_image_tokens=32,
        )
        model = ManthanForCausalLM(stub_cfg)
        model.save_pretrained(out_dir, safe_serialization=True)

        # save_pretrained rewrote config.json; ensure auto_map is present so
        # AutoConfig/AutoModel can resolve our custom classes via trust_remote_code.
        saved_cfg = json.loads((out_dir / "config.json").read_text(encoding="utf-8"))
        saved_cfg["auto_map"] = cfg.get(
            "auto_map",
            {
                "AutoConfig": "configuration_manthan.ManthanConfig",
                "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
            },
        )
        (out_dir / "config.json").write_text(
            json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )
        print("Wrote stub weights: model.safetensors")


if __name__ == "__main__":
    main()
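
# Example usage (a sketch; the script path and output folder below are
# illustrative assumptions, not names fixed by this file):
#
#   python scripts/export_hf.py --out ./manthan-t1-export --write_stub_weights
#
# With --write_stub_weights, the exported folder should then load end to end
# through the standard HF remote-code path, since auto_map points at the
# copied configuration_manthan.py / modeling_manthan.py:
#
#   from transformers import AutoModelForCausalLM
#   model = AutoModelForCausalLM.from_pretrained(
#       "./manthan-t1-export", trust_remote_code=True
#   )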