File size: 7,632 Bytes
7f7a72e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
#!/usr/bin/env python3
"""Export a Manthan-T1 folder that can be uploaded to Hugging Face.
What this does:
- Copies `hf_export_stub/*` into an output directory
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3)
- Ensures `<image>` is a real special token in the tokenizer
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`, and `chat_template.jinja`
- Updates `config.json` with a correct `image_token_id` (kept equal to -200 placeholder)
Note:
- This does NOT include model weights. It's intended for placeholder-weight repo layout
(like your MicroLLaVA example). For training, you'll later save actual weights.
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import sys
from pathlib import Path
from transformers import AutoTokenizer
# Allow running this script without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
def _copytree(src: Path, dst: Path) -> None:
dst.mkdir(parents=True, exist_ok=True)
for item in src.iterdir():
s = item
d = dst / item.name
if item.is_dir():
shutil.copytree(s, d, dirs_exist_ok=True)
else:
shutil.copy2(s, d)
def _parse_args() -> argparse.Namespace:
    """Parse CLI arguments for the export script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(Path(__file__).resolve().parents[1] / "hf_export_stub"),
        help="Path to hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help="Local tokenizer directory to copy (e.g. MicroLlava-* folder). If set, no network fetch is performed.",
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help="Write randomly-initialized weights (model.safetensors) into the export dir so from_pretrained() succeeds.",
    )
    return ap.parse_args()


def _refresh_remote_code(repo_root: Path, out_dir: Path) -> None:
    """Copy remote-code python files into the export root.

    Stale copies from a previous export are removed first so the HF
    dynamic module loader never picks up outdated code. Raises
    ``SystemExit`` if a required source file is missing.
    """
    names = ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]
    # Ensure we don't keep stale remote-code python files from a previous export.
    for name in names:
        stale = out_dir / name
        if stale.exists():
            stale.unlink()
    pkg_dir = repo_root / "manthan_t1"
    for name in names:
        src = pkg_dir / name
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / name)


def _resolve_tokenizer_source(args: argparse.Namespace, cfg: dict, repo_root: Path) -> str:
    """Pick the tokenizer name/path to load from.

    Precedence (bug fixes vs. the previous version, which parsed
    ``--tokenizer_local_dir`` but never used it and let the on-disk
    candidate scan silently override an explicit ``--tokenizer``):
      1. ``--tokenizer_local_dir`` (explicit local folder, no network)
      2. ``--tokenizer`` (explicit name/path)
      3. a known on-disk tokenizer folder (avoids network dependency)
      4. config.json fields

    Raises ``SystemExit`` when nothing usable can be found.
    """
    if args.tokenizer_local_dir:
        local = Path(args.tokenizer_local_dir).expanduser().resolve()
        if not (local / "tokenizer_config.json").exists():
            raise SystemExit(f"--tokenizer_local_dir has no tokenizer_config.json: {local}")
        return str(local)
    if args.tokenizer:
        return args.tokenizer
    # Prefer an on-disk tokenizer (e.g. the attached MicroLLaVA folder) to
    # avoid any network dependency during export.
    local_tokenizer_candidates = [
        repo_root / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m",
    ]
    for cand in local_tokenizer_candidates:
        if (cand / "tokenizer_config.json").exists():
            return str(cand)
    name = (
        cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not name:
        raise SystemExit("Could not infer tokenizer_name_or_path")
    return name


def _embed_chat_template(out_dir: Path) -> None:
    """Inline ``chat_template.jinja`` into ``tokenizer_config.json``.

    HF expects the chat template as a string field of the tokenizer
    config; no-op when the stub provides no template file.
    """
    tmpl_src = out_dir / "chat_template.jinja"
    if not tmpl_src.exists():
        return
    tok_cfg_path = out_dir / "tokenizer_config.json"
    tok_cfg = (
        json.loads(tok_cfg_path.read_text(encoding="utf-8"))
        if tok_cfg_path.exists()
        else {}
    )
    tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
    tok_cfg_path.write_text(
        json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
    )


def _write_stub_weights(out_dir: Path, cfg: dict) -> None:
    """Save a tiny randomly-initialized, loadable model into *out_dir*.

    Imported lazily so a plain export avoids the heavier model imports.
    This does not download any base weights. ``save_pretrained``
    rewrites ``config.json``, so ``auto_map`` is re-added afterwards so
    AutoConfig/AutoModel can resolve the custom classes via
    ``trust_remote_code``.
    """
    from manthan_t1.configuration_manthan import ManthanConfig
    from manthan_t1.modeling_manthan import ManthanForCausalLM

    stub_cfg = ManthanConfig(
        text_model_id=None,
        vision_model_id=None,
        image_token_index=-200,
        num_image_tokens=32,
    )
    model = ManthanForCausalLM(stub_cfg)
    model.save_pretrained(out_dir, safe_serialization=True)
    saved_cfg = json.loads((out_dir / "config.json").read_text(encoding="utf-8"))
    saved_cfg["auto_map"] = cfg.get(
        "auto_map",
        {
            "AutoConfig": "configuration_manthan.ManthanConfig",
            "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
        },
    )
    (out_dir / "config.json").write_text(
        json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print("Wrote stub weights: model.safetensors")


def main() -> None:
    """Build an uploadable Manthan-T1 export folder (no trained weights)."""
    args = _parse_args()
    repo_root = Path(__file__).resolve().parents[1]

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()
    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")
    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Copy remote-code python files to export root (HF dynamic module loader expects them)
    _refresh_remote_code(repo_root, out_dir)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    tokenizer_name = _resolve_tokenizer_source(args, cfg, repo_root)
    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )
    # Ensure '<image>' exists as a real special token; remember how many were added.
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    # Some tokenizers need a pad token for batching.
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})
    # Save tokenizer files into export dir
    tok.save_pretrained(out_dir)

    _embed_chat_template(out_dir)

    # Align config fields with MicroLLaVA convention (-200 placeholder). The
    # previous setdefault() before these assignments was dead code and is gone.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200
    # For user convenience record actual tokenizer vocab id of '<image>'
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)
    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Minimal README hint
    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses `image_token_index=-200` placeholder like TinyLLaVA.\n"
        "- Tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )
    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        _write_stub_weights(out_dir, cfg)
# Run the export only when invoked as a script, not on import.
if __name__ == "__main__":
    main()
|