#!/usr/bin/env python3
"""Export a Manthan-T1 folder that can be uploaded to Hugging Face.
What this does:
- Copies `hf_export_stub/*` into an output directory
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3)
- Ensures `<image>` is a real special token in the tokenizer
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`, and `chat_template.jinja`
- Updates `config.json` with a correct `image_token_id` (kept equal to -200 placeholder)
Note:
- This does NOT include model weights. It's intended for placeholder-weight repo layout
(like your MicroLLaVA example). For training, you'll later save actual weights.
"""
from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

from transformers import AutoTokenizer

# Allow running this script without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


def _copytree(src: Path, dst: Path) -> None:
    """Recursively copy `src` into `dst`, merging with any existing contents."""
    dst.mkdir(parents=True, exist_ok=True)
    for item in src.iterdir():
        dst_item = dst / item.name
        if item.is_dir():
            shutil.copytree(item, dst_item, dirs_exist_ok=True)
        else:
            shutil.copy2(item, dst_item)


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(REPO_ROOT / "hf_export_stub"),
        help="Path to the hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help="Local tokenizer directory to copy (e.g. a MicroLlava-* folder). If set, no network fetch is performed.",
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help="Write randomly-initialized weights (model.safetensors) into the export dir so from_pretrained() succeeds.",
    )
    args = ap.parse_args()

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()
    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")

    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Ensure we don't keep stale remote-code python files from a previous export.
    for stale in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        p = out_dir / stale
        if p.exists():
            p.unlink()

    # Copy the remote-code python files to the export root (HF's dynamic module
    # loader expects them there).
    pkg_dir = REPO_ROOT / "manthan_t1"
    for fname in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        src = pkg_dir / fname
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / fname)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    tokenizer_name = (
        args.tokenizer
        or cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not tokenizer_name:
        raise SystemExit("Could not infer tokenizer_name_or_path")

    # Prefer an on-disk tokenizer (e.g. the attached MicroLLaVA folder) to avoid
    # any network dependency during export. A directory passed via
    # --tokenizer_local_dir takes precedence over the built-in candidate.
    local_tokenizer_candidates = [
        REPO_ROOT / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m",
    ]
    if args.tokenizer_local_dir:
        local_tokenizer_candidates.insert(0, Path(args.tokenizer_local_dir).expanduser().resolve())
    for cand in local_tokenizer_candidates:
        if cand.exists() and (cand / "tokenizer_config.json").exists():
            tokenizer_name = str(cand)
            break

    # local_files_only keeps the export offline; remote tokenizer names must
    # already be present in the local HF cache.
    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )

    # Ensure `<image>` exists as a real special token.
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    # Some tokenizers need a pad token for batching.
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})

    # Save the tokenizer files into the export dir.
    tok.save_pretrained(out_dir)
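
    # Quick sanity check one can run on the saved folder (a sketch; it assumes
    # the tokenizer round-trips `<image>` as a single, known token):
    #
    #   tok2 = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True)
    #   assert tok2.convert_tokens_to_ids("<image>") is not None
    #   assert len(tok2.tokenize("<image>")) == 1  # kept whole, not split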

    # If the stub shipped a chat template, inline it into tokenizer_config.json
    # (HF reads the template from a string field there).
    tmpl_src = out_dir / "chat_template.jinja"
    if tmpl_src.exists():
        tok_cfg_path = out_dir / "tokenizer_config.json"
        if tok_cfg_path.exists():
            tok_cfg = json.loads(tok_cfg_path.read_text(encoding="utf-8"))
        else:
            tok_cfg = {}
        tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
        tok_cfg_path.write_text(json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
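
    # Once inlined, a reloaded tokenizer can render chat prompts as usual
    # (a sketch; the message content below is illustrative):
    #
    #   tok2 = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True)
    #   messages = [{"role": "user", "content": "<image>\nDescribe the picture."}]
    #   prompt = tok2.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)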

    # Align config fields with the MicroLLaVA convention: -200 is a placeholder
    # index, not a tokenizer vocab id.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200
    # For user convenience, record the actual tokenizer vocab id of '<image>'.
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)
    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
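
    # The written config.json then contains, among other fields, roughly the
    # following (the two tokenizer values are illustrative, not fixed):
    #
    #   "image_token_index": -200,
    #   "image_token_id": -200,
    #   "tokenizer_image_token_id": 151669,
    #   "tokenizer_added_tokens": 1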

    # Minimal README hint for whoever uploads the folder.
    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses the `image_token_index=-200` placeholder, like TinyLLaVA.\n"
        "- The tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )
    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        # Import only when requested to avoid heavier imports for a plain export.
        from manthan_t1.configuration_manthan import ManthanConfig
        from manthan_t1.modeling_manthan import ManthanForCausalLM

        # Tiny randomly-initialized model that is loadable; this does not
        # download any base weights.
        stub_cfg = ManthanConfig(
            text_model_id=None,
            vision_model_id=None,
            image_token_index=-200,
            num_image_tokens=32,
        )
        model = ManthanForCausalLM(stub_cfg)
        model.save_pretrained(out_dir, safe_serialization=True)

        # Ensure auto_map is present so AutoConfig/AutoModel can resolve our
        # custom classes via trust_remote_code.
        saved_cfg = json.loads((out_dir / "config.json").read_text(encoding="utf-8"))
        saved_cfg["auto_map"] = cfg.get(
            "auto_map",
            {
                "AutoConfig": "configuration_manthan.ManthanConfig",
                "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
            },
        )
        (out_dir / "config.json").write_text(
            json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )
        print("Wrote stub weights: model.safetensors")


if __name__ == "__main__":
    main()