|
|
|
|
|
|
|
|
"""Export a Manthan-T1 folder that can be uploaded to Hugging Face. |
|
|
|
|
|
What this does: |
|
|
- Copies `hf_export_stub/*` into an output directory |
|
|
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3) |
|
|
- Ensures `<image>` is a real special token in the tokenizer |
|
|
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`, and `chat_template.jinja` |
|
|
- Updates `config.json` with a correct `image_token_id` (kept equal to -200 placeholder) |
|
|
|
|
|
Note: |
|
|
- This does NOT include model weights. It is intended for a placeholder-weight

repo layout (like the MicroLLaVA example); actual weights are saved later during training.
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import os |
|
|
import shutil |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
|
|
|
|
|
|
# Make the repository root importable (so `manthan_t1.*` resolves) when this
# script is run directly rather than as a module.
REPO_ROOT = Path(__file__).resolve().parents[1]

_repo_root_str = str(REPO_ROOT)
if _repo_root_str not in sys.path:
    sys.path.insert(0, _repo_root_str)
|
|
|
|
|
|
|
|
def _copytree(src: Path, dst: Path) -> None: |
|
|
dst.mkdir(parents=True, exist_ok=True) |
|
|
for item in src.iterdir(): |
|
|
s = item |
|
|
d = dst / item.name |
|
|
if item.is_dir(): |
|
|
shutil.copytree(s, d, dirs_exist_ok=True) |
|
|
else: |
|
|
shutil.copy2(s, d) |
|
|
|
|
|
|
|
|
def _resolve_tokenizer_source(args: argparse.Namespace, cfg: dict, repo_root: Path) -> str:
    """Return the tokenizer name or path to load, by decreasing precedence.

    1. ``--tokenizer_local_dir`` — explicit local folder (no hub lookup).
       Previously this flag was parsed but silently ignored; it is now honored.
    2. ``--tokenizer`` — explicit name/path from the CLI. Previously a
       hard-coded local candidate could silently override this; an explicit
       flag now wins.
    3. A known MicroLlava checkout sitting next to the repo root.
    4. Whatever ``config.json`` names (``tokenizer_name_or_path`` etc.).

    Raises SystemExit when nothing usable can be found.
    """
    if args.tokenizer_local_dir:
        local_dir = Path(args.tokenizer_local_dir).expanduser().resolve()
        if not (local_dir / "tokenizer_config.json").exists():
            raise SystemExit(f"--tokenizer_local_dir has no tokenizer_config.json: {local_dir}")
        return str(local_dir)
    if args.tokenizer:
        return args.tokenizer
    local_tokenizer_candidates = [
        repo_root / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m",
    ]
    for cand in local_tokenizer_candidates:
        if cand.exists() and (cand / "tokenizer_config.json").exists():
            return str(cand)
    name = (
        cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not name:
        raise SystemExit("Could not infer tokenizer_name_or_path")
    return name


def _embed_chat_template(out_dir: Path) -> None:
    """Inline ``chat_template.jinja`` into ``tokenizer_config.json``, if present."""
    tmpl_src = out_dir / "chat_template.jinja"
    if not tmpl_src.exists():
        return
    tok_cfg_path = out_dir / "tokenizer_config.json"
    if tok_cfg_path.exists():
        tok_cfg = json.loads(tok_cfg_path.read_text(encoding="utf-8"))
    else:
        tok_cfg = {}
    tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
    tok_cfg_path.write_text(json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")


def _write_stub_weights(out_dir: Path, cfg: dict) -> None:
    """Save randomly-initialized Manthan weights and restore ``auto_map``.

    ``save_pretrained`` rewrites config.json, so the auto_map entry (needed
    for ``trust_remote_code`` loading of the local module files) must be
    re-applied afterwards.
    """
    # Imported lazily: the heavy model modules are only needed for this path.
    from manthan_t1.configuration_manthan import ManthanConfig
    from manthan_t1.modeling_manthan import ManthanForCausalLM

    stub_cfg = ManthanConfig(
        text_model_id=None,
        vision_model_id=None,
        image_token_index=-200,
        num_image_tokens=32,
    )
    model = ManthanForCausalLM(stub_cfg)
    model.save_pretrained(out_dir, safe_serialization=True)

    cfg_path = out_dir / "config.json"
    saved_cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
    saved_cfg["auto_map"] = cfg.get(
        "auto_map",
        {
            "AutoConfig": "configuration_manthan.ManthanConfig",
            "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
        },
    )
    cfg_path.write_text(
        json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print("Wrote stub weights: model.safetensors")


def main() -> None:
    """Assemble a Hugging Face export folder for Manthan-T1 (no real weights).

    Copies the stub layout, refreshes the model code files, builds a tokenizer
    with a real ``<image>`` special token, embeds the chat template, and
    updates config.json. Optionally writes randomly-initialized weights.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(Path(__file__).resolve().parents[1] / "hf_export_stub"),
        help="Path to hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help="Local tokenizer directory to copy (e.g. MicroLlava-* folder). If set, no network fetch is performed.",
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help="Write randomly-initialized weights (model.safetensors) into the export dir so from_pretrained() succeeds.",
    )
    args = ap.parse_args()

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()
    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")

    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Computed once here (was computed twice in two places before).
    repo_root = Path(__file__).resolve().parents[1]

    # Replace any stale copies of the model code shipped with the stub by the
    # current versions from the manthan_t1 package.
    module_files = ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]
    for fname in module_files:
        stale = out_dir / fname
        if stale.exists():
            stale.unlink()
    pkg_dir = repo_root / "manthan_t1"
    for fname in module_files:
        src = pkg_dir / fname
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / fname)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    tokenizer_name = _resolve_tokenizer_source(args, cfg, repo_root)
    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )

    # Make `<image>` a genuine special token; `added` is 0 if already present.
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})
    tok.save_pretrained(out_dir)

    _embed_chat_template(out_dir)

    # Keep the -200 placeholder ids (TinyLLaVA convention), but also record
    # the token's real vocabulary id for downstream tooling.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)
    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses `image_token_index=-200` placeholder like TinyLLaVA.\n"
        "- Tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )

    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        _write_stub_weights(out_dir, cfg)
|
|
|
|
|
|
|
|
# Script entry point: run the export when invoked directly.
if __name__ == "__main__":
    main()
|
|
|