#!/usr/bin/env python3

"""Export a Manthan-T1 folder that can be uploaded to Hugging Face.

What this does:
- Copies `hf_export_stub/*` into an output directory
- Builds a tokenizer from `tokenizer_name_or_path` (defaults to Qwen3)
- Ensures `<image>` is a real special token in the tokenizer
- Writes `tokenizer_config.json`, `special_tokens_map.json`, `added_tokens.json`, and `chat_template.jinja`
- Updates `config.json`: keeps the `image_token_index`/`image_token_id` -200 placeholder and records the tokenizer's actual `<image>` vocab id

Note:
- This does NOT include model weights. It's intended for a placeholder-weight repo
  layout (like your MicroLLaVA example). For training, you'll later save actual weights.
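
Example invocation (the script filename and output path here are illustrative):

    python export_hf_stub.py --out exports/manthan-t1 --write_stub_weights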
"""

from __future__ import annotations

import argparse
import json
import shutil
import sys
from pathlib import Path

from transformers import AutoTokenizer


# Allow running this script without installing the package.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))


def _copytree(src: Path, dst: Path) -> None:
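    """Recursively copy the contents of `src` into `dst`, overwriting existing files."""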
    dst.mkdir(parents=True, exist_ok=True)
    for item in src.iterdir():
        dest = dst / item.name
        if item.is_dir():
            shutil.copytree(item, dest, dirs_exist_ok=True)
        else:
            shutil.copy2(item, dest)


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument(
        "--stub",
        default=str(REPO_ROOT / "hf_export_stub"),
        help="Path to hf_export_stub folder",
    )
    ap.add_argument(
        "--tokenizer",
        default=None,
        help="Tokenizer name/path. Defaults to config.json tokenizer_name_or_path.",
    )
    ap.add_argument(
        "--tokenizer_local_dir",
        default=None,
        help="Local tokenizer directory to load from (e.g. a MicroLlava-* folder). Takes precedence over --tokenizer; no network fetch is performed.",
    )
    ap.add_argument(
        "--write_stub_weights",
        action="store_true",
        help="Write randomly-initialized weights (model.safetensors) into the export dir so from_pretrained() succeeds.",
    )
    args = ap.parse_args()

    out_dir = Path(args.out).expanduser().resolve()
    stub_dir = Path(args.stub).expanduser().resolve()

    if not stub_dir.exists():
        raise SystemExit(f"Stub dir not found: {stub_dir}")

    out_dir.mkdir(parents=True, exist_ok=True)
    _copytree(stub_dir, out_dir)

    # Ensure we don't keep stale remote-code python files from a previous export.
    for stale in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        p = out_dir / stale
        if p.exists():
            p.unlink()

    # Copy remote-code python files to export root (HF's dynamic module loader
    # expects them next to config.json).
    pkg_dir = REPO_ROOT / "manthan_t1"
    for fname in ["configuration_manthan.py", "modeling_manthan.py", "__init__.py"]:
        src = pkg_dir / fname
        if not src.exists():
            raise SystemExit(f"Missing required source file for export: {src}")
        shutil.copy2(src, out_dir / fname)

    cfg_path = out_dir / "config.json"
    if not cfg_path.exists():
        raise SystemExit(f"config.json not found in: {out_dir}")

    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
    tokenizer_name = (
        args.tokenizer
        or cfg.get("tokenizer_name_or_path")
        or cfg.get("llm_model_name_or_path")
        or cfg.get("text_model_id")
        or cfg.get("vision_model_id")
    )
    if not tokenizer_name:
        raise SystemExit("Could not infer tokenizer_name_or_path")

    # Prefer an on-disk tokenizer (e.g. the attached MicroLLaVA folder) to avoid
    # any network dependency during export. An explicit --tokenizer_local_dir wins;
    # the bundled MicroLLaVA folder is only tried when --tokenizer was not given.
    local_tokenizer_candidates = []
    if args.tokenizer_local_dir:
        local_tokenizer_candidates.append(Path(args.tokenizer_local_dir).expanduser().resolve())
    if args.tokenizer is None:
        local_tokenizer_candidates.append(REPO_ROOT / "MicroLlava-Qwen3-0.6B-base-siglip2-so400m")
    for cand in local_tokenizer_candidates:
        if (cand / "tokenizer_config.json").exists():
            tokenizer_name = str(cand)
            break

    tok = AutoTokenizer.from_pretrained(
        tokenizer_name,
        trust_remote_code=True,
        use_fast=bool(cfg.get("tokenizer_use_fast", False)),
        local_files_only=True,
    )

    # Ensure `<image>` exists as a real special token (no-op if already present).
    added = tok.add_special_tokens({"additional_special_tokens": ["<image>"]})
    # Some tokenizers need a pad token for batching.
    if tok.pad_token_id is None and cfg.get("pad_token"):
        tok.add_special_tokens({"pad_token": cfg["pad_token"]})

    # Save tokenizer files into export dir
    tok.save_pretrained(out_dir)

    # If the stub shipped a chat template, inline it into tokenizer_config.json
    # (HF reads the chat template from that string field).
    tmpl_src = out_dir / "chat_template.jinja"
    if tmpl_src.exists():
        tok_cfg_path = out_dir / "tokenizer_config.json"
        if tok_cfg_path.exists():
            tok_cfg = json.loads(tok_cfg_path.read_text(encoding="utf-8"))
        else:
            tok_cfg = {}
        tok_cfg["chat_template"] = tmpl_src.read_text(encoding="utf-8")
        tok_cfg_path.write_text(json.dumps(tok_cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Align config fields with the MicroLLaVA convention: keep the -200 placeholder.
    cfg["image_token_index"] = -200
    cfg["image_token_id"] = -200

    # For user convenience, record the actual tokenizer vocab id of '<image>'.
    img_vocab_id = tok.convert_tokens_to_ids("<image>")
    cfg["tokenizer_image_token_id"] = int(img_vocab_id) if img_vocab_id is not None else None
    cfg["tokenizer_added_tokens"] = int(added)

    cfg_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    # Minimal README hint
    readme = out_dir / "README_EXPORT.md"
    readme.write_text(
        "Manthan-T1 export folder (stub).\n\n"
        "- `config.json` uses `image_token_index=-200` placeholder like TinyLLaVA.\n"
        "- Tokenizer contains a real `<image>` special token.\n"
        "- This folder does not include model weights; training should save weights here later.\n",
        encoding="utf-8",
    )

    print(f"Exported to: {out_dir}")

    if args.write_stub_weights:
        # Import only when requested to avoid heavier imports for plain export.
        from manthan_t1.configuration_manthan import ManthanConfig
        from manthan_t1.modeling_manthan import ManthanForCausalLM

        # A tiny, randomly-initialized model that from_pretrained() can load.
        # No base weights are downloaded.
        stub_cfg = ManthanConfig(
            text_model_id=None,
            vision_model_id=None,
            image_token_index=-200,
            num_image_tokens=32,
        )
        model = ManthanForCausalLM(stub_cfg)
        model.save_pretrained(out_dir, safe_serialization=True)

        # Ensure auto_map is present so AutoConfig/AutoModel can resolve our
        # custom classes via trust_remote_code.
        saved_cfg = json.loads((out_dir / "config.json").read_text(encoding="utf-8"))
        saved_cfg["auto_map"] = cfg.get(
            "auto_map",
            {
                "AutoConfig": "configuration_manthan.ManthanConfig",
                "AutoModelForCausalLM": "modeling_manthan.ManthanForCausalLM",
            },
        )
        (out_dir / "config.json").write_text(
            json.dumps(saved_cfg, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )

        print("Wrote stub weights: model.safetensors")


if __name__ == "__main__":
    main()