sahilmob committed on
Commit
8020510
·
verified ·
1 Parent(s): f1c78fe

docs: add config and job snapshots

Browse files
artifacts/build_two_phase_job_payload.ts ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env node
2
+
3
+ import fs from "node:fs";
4
+ import path from "node:path";
5
+
6
/**
 * Dataset and optimizer settings for one training phase.
 *
 * Only `dataset_source` is required by the type; `validateConfig` additionally
 * demands `dataset_id` for "hub" and `train_file` for "json". Defaults quoted
 * below are the ones the generated Python script applies when a key is absent.
 */
interface PhaseConfig {
  /** "hub" loads `dataset_id` by split name; "json" loads `train_file` / `eval_file`. */
  dataset_source: "hub" | "json";
  /** Dataset identifier handed to `load_dataset` when the source is "hub". */
  dataset_id?: string;
  /** Training split for the "hub" source (script default: "train"). */
  train_split?: string;
  /** Evaluation split for the "hub" source (script default: "validation"). */
  eval_split?: string;
  /** Training data file for the "json" source. */
  train_file?: string;
  /** Optional evaluation data file for the "json" source. */
  eval_file?: string;
  /** Script default: 1. */
  num_train_epochs?: number;
  /** Script default: 4. */
  per_device_train_batch_size?: number;
  /** Script default: 4. */
  gradient_accumulation_steps?: number;
  /** Script default: 2e-5. */
  learning_rate?: number;
}
18
+
19
/**
 * Complete configuration for a two-phase supervised fine-tuning job.
 *
 * The object is serialized into the generated Python script, which reads the
 * keys below. Defaults quoted in the comments are the ones that script applies
 * when a key is absent.
 */
interface TwoPhaseConfig {
  /** Model identifier used to load both the tokenizer and the base model. */
  base_model: string;
  /** When set, the phase-1 result is pushed to the Hub under this id. */
  phase1_hub_model_id?: string;
  /** Hub id the phase-2 result is pushed to (phase 2 always pushes). */
  final_hub_model_id: string;
  /** Root output directory; each phase writes to a sub-directory (default: "wish-engine-two-phase-sft"). */
  output_root?: string;
  /** Settings for the first phase (run as "phase1-focus"). */
  phase1: PhaseConfig;
  /** Settings for the second phase (run as "phase2-curriculum"). */
  phase2: PhaseConfig;
  /** LoRA hyper-parameters forwarded to `LoraConfig` for phase 1. */
  lora?: {
    /** Script default: 16. */
    r?: number;
    /** Script default: 32. */
    lora_alpha?: number;
    /** Script default: 0.05. */
    lora_dropout?: number;
    /** Script default: "none". */
    bias?: string;
    /** Script default: "CAUSAL_LM". */
    task_type?: string;
    /** Script default: ["q_proj", "v_proj"]. */
    target_modules?: string[];
  };
  /** Trainer, logging and device-placement switches shared by both phases. */
  runtime?: {
    /** Script default: "wish-engine-jssg". */
    trackio_project?: string;
    /** Run-name prefix; the phase name is appended (default: "two-phase-sft"). */
    trackio_run_name?: string;
    /** Script default: 42. */
    seed?: number;
    /** Script default: 10. */
    logging_steps?: number;
    /** Script default: 100. */
    save_steps?: number;
    /** Script default: 2. */
    save_total_limit?: number;
    /** Only used when a phase has an eval dataset (default: 100). */
    eval_steps?: number;
    /** Script default: 0.1. */
    warmup_ratio?: number;
    /** Script default: "cosine". */
    lr_scheduler_type?: string;
    /** Forwarded to the trainer config only when provided. */
    max_length?: number;
    /** Script default: true. */
    gradient_checkpointing?: boolean;
    /** Raise on chat-template failures instead of using the plain-text fallback (default: true). */
    strict_chat_template?: boolean;
    /** Make the trainer leave the model on its current device (default: true). */
    skip_trainer_model_move?: boolean;
    /** On CUDA, load with device_map {"": 0} instead of "auto" (default: true). */
    force_single_device_model?: boolean;
  };
}
51
+
52
/** Options resolved from the command line by `parseArgs`. */
interface CliOptions {
  /** Absolute path of the two-phase config JSON to read. */
  configPath: string;
  /** Absolute path the job payload JSON is written to. */
  outPath: string;
  /** Absolute path for the generated Python script, or `null` to skip writing it. */
  emitScriptPath: string | null;
  /** HF Jobs hardware flavor placed in the payload. */
  flavor: string;
  /** HF Jobs timeout placed in the payload. */
  timeout: string;
  /** When true, validate and print a summary without writing files. */
  dryRun: boolean;
}
60
+
61
// Help text for the CLI: printed by parseArgs for --help / -h and appended to
// its "Unknown option" error. The surrounding whitespace is removed by .trim().
+ const usageText = `
62
+ Usage:
63
+ npx tsx ./training/hf-jobs/build_two_phase_job_payload.ts [options]
64
+
65
+ Options:
66
+ --config <path> Two-phase config JSON path
67
+ (default: ./training/hf-jobs/two-phase-sft.hf.config.json)
68
+ --out <path> Output JSON payload path
69
+ (default: ./training/hf-jobs/two-phase-job.payload.json)
70
+ --emit-script <path> Also write generated Python training script to this path
71
+ --flavor <name> HF Jobs hardware flavor (default: a10g-large)
72
+ --timeout <dur> HF Jobs timeout (default: 4h)
73
+ --dry-run Validate and print summary without writing files
74
+ --help Show this help
75
+ `.trim();
76
+
77
/**
 * Parse command-line arguments into a fully populated {@link CliOptions}.
 *
 * Defaults point at `./training/hf-jobs/` under the current working directory,
 * flavor "a10g-large" and timeout "4h". Path options are resolved to absolute
 * paths; `--flavor` and `--timeout` are trimmed.
 *
 * `--help` / `-h` prints the usage text and exits the process with code 0.
 * Arguments that do not start with "-" are ignored.
 *
 * @param argv Arguments after the script name (e.g. `process.argv.slice(2)`).
 * @returns The resolved options.
 * @throws Error when an option is unknown, or when an option that takes a
 *   value is last on the line, is followed by a blank value, or is followed
 *   by another recognised flag.
 */
const parseArgs = (argv: string[]): CliOptions => {
  const opts: CliOptions = {
    configPath: path.resolve(
      process.cwd(),
      "training",
      "hf-jobs",
      "two-phase-sft.hf.config.json"
    ),
    outPath: path.resolve(
      process.cwd(),
      "training",
      "hf-jobs",
      "two-phase-job.payload.json"
    ),
    emitScriptPath: null,
    flavor: "a10g-large",
    timeout: "4h",
    dryRun: false,
  };

  const knownFlags = new Set([
    "--config",
    "--out",
    "--emit-script",
    "--flavor",
    "--timeout",
    "--dry-run",
    "--help",
    "-h",
  ]);

  // Fetch the value that follows the flag at `index`. Without this guard a
  // missing value either crashed inside path.resolve(undefined) or silently
  // produced an empty flavor/timeout, and `--out --dry-run` wrote to a file
  // literally named "--dry-run".
  const requireValue = (index: number): string => {
    const flag = argv[index];
    const value = argv[index + 1];
    if (typeof value !== "string" || value.trim() === "" || knownFlags.has(value)) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg === "--config") {
      opts.configPath = path.resolve(requireValue(i));
      i += 1;
    } else if (arg === "--out") {
      opts.outPath = path.resolve(requireValue(i));
      i += 1;
    } else if (arg === "--emit-script") {
      opts.emitScriptPath = path.resolve(requireValue(i));
      i += 1;
    } else if (arg === "--flavor") {
      opts.flavor = requireValue(i).trim();
      i += 1;
    } else if (arg === "--timeout") {
      opts.timeout = requireValue(i).trim();
      i += 1;
    } else if (arg === "--dry-run") {
      opts.dryRun = true;
    } else if (arg === "--help" || arg === "-h") {
      console.log(usageText);
      process.exit(0);
    } else if (arg.startsWith("-")) {
      throw new Error(`Unknown option: ${arg}\n\n${usageText}`);
    }
  }

  return opts;
};
126
+
127
/**
 * Create `dirPath`, including any missing parent directories.
 * Does nothing when the directory already exists.
 */
const ensureDir = (dirPath: string) => {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(dirPath, mkdirOptions);
};
130
+
131
/**
 * Read a UTF-8 text file and parse it as JSON.
 *
 * The parsed value is returned as `T` without any runtime validation; callers
 * are expected to validate the shape themselves.
 *
 * @param filePath Path of the JSON file.
 * @returns The parsed value, typed as `T`.
 * @throws The underlying `fs` error when the file cannot be read.
 * @throws SyntaxError naming `filePath` when the content is not valid JSON.
 */
const readJson = <T>(filePath: string): T => {
  const raw = fs.readFileSync(filePath, "utf8");
  // JSON.parse rejects a leading byte-order mark, which some editors prepend
  // to UTF-8 files, so drop it before parsing.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  try {
    return JSON.parse(text) as T;
  } catch (error: unknown) {
    // Keep the SyntaxError type but say which file was malformed.
    const detail = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(`Invalid JSON in ${filePath}: ${detail}`);
  }
};
134
+
135
/**
 * Write `value` to `filePath` as 2-space-indented JSON followed by a newline,
 * creating the parent directory first if needed.
 */
const writeJson = (filePath: string, value: unknown) => {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  const serialized = JSON.stringify(value, null, 2);
  fs.writeFileSync(filePath, `${serialized}\n`, "utf8");
};
139
+
140
/**
 * Write `value` verbatim to `filePath` as UTF-8, creating the parent
 * directory first if needed.
 */
const writeText = (filePath: string, value: string) => {
  const parentDir = path.dirname(filePath);
  ensureDir(parentDir);
  fs.writeFileSync(filePath, value, "utf8");
};
144
+
145
/**
 * Check that a parsed config has everything the generated training script
 * needs, so that mistakes surface locally instead of inside a remote job.
 *
 * Checks, in order:
 *  - the config is a plain JSON object;
 *  - `base_model`, `final_hub_model_id`, `phase1` and `phase2` are present;
 *  - `base_model` and `final_hub_model_id` are non-empty strings;
 *  - each phase is an object whose `dataset_source` is "hub" (requires
 *    `dataset_id`) or "json" (requires `train_file`).
 *
 * @param config Parsed config; typically straight from `JSON.parse`, so it is
 *   treated as untrusted despite its static type.
 * @throws Error describing the first problem found.
 */
const validateConfig = (config: TwoPhaseConfig) => {
  // `key in value` throws a raw TypeError on null/primitives, so reject those
  // up front with a readable message.
  const candidate: unknown = config;
  if (candidate === null || typeof candidate !== "object" || Array.isArray(candidate)) {
    throw new Error("Config must be a JSON object");
  }

  const required = ["base_model", "final_hub_model_id", "phase1", "phase2"] as const;
  for (const key of required) {
    if (!(key in config)) {
      throw new Error(`Missing required config key: ${key}`);
    }
  }

  // Both ids are used verbatim by the training script; an empty or non-string
  // value would only fail once the job is already running.
  const requiredStrings = ["base_model", "final_hub_model_id"] as const;
  for (const key of requiredStrings) {
    const value: unknown = config[key];
    if (typeof value !== "string" || value.trim() === "") {
      throw new Error(`${key} must be a non-empty string`);
    }
  }

  const phaseNames: Array<"phase1" | "phase2"> = ["phase1", "phase2"];
  for (const phaseName of phaseNames) {
    const phase = config[phaseName];
    if (!phase || typeof phase !== "object") {
      throw new Error(`Invalid ${phaseName} config`);
    }
    if (phase.dataset_source !== "hub" && phase.dataset_source !== "json") {
      throw new Error(`${phaseName}.dataset_source must be "hub" or "json"`);
    }
    if (phase.dataset_source === "hub" && !phase.dataset_id) {
      throw new Error(`${phaseName}.dataset_id is required when dataset_source=hub`);
    }
    if (phase.dataset_source === "json" && !phase.train_file) {
      throw new Error(`${phaseName}.train_file is required when dataset_source=json`);
    }
  }
};
170
+
171
/**
 * Build the source text of the Python training script for the given config.
 *
 * The config is serialized to JSON, base64-encoded, and interpolated into the
 * template at the `CONFIG = json.loads(base64.b64decode(...))` line, so the
 * returned script carries its own configuration.
 *
 * The template starts with a `# /// script` block listing the Python version
 * and dependencies. Its `main()` loads the tokenizer and base model, trains
 * "phase1-focus" with a LoRA config (pushing to the Hub only when
 * `phase1_hub_model_id` is set), then trains "phase2-curriculum" on the model
 * held by the first trainer with `peft_config=None`, always pushing to
 * `final_hub_model_id`.
 *
 * Everything between the backticks is emitted verbatim. `\\n` in the template
 * becomes the two-character escape `\n` in the Python source.
 *
 * NOTE(review): `load_base_model_for_training` calls
 * `gradient_checkpointing_enable()` whenever CUDA is available and does not
 * consult `runtime.gradient_checkpointing`, while `build_sft_config` does read
 * that key — confirm this is intended.
 *
 * @param config Config embedded into the script.
 * @returns The complete Python script as a string.
 */
+ const generatePythonScript = (config: TwoPhaseConfig) => {
172
// Base64 keeps the embedded JSON free of quotes and backslashes inside the
// Python string literal below (presumably the reason for the encoding).
+ const cfgB64 = Buffer.from(JSON.stringify(config), "utf8").toString("base64");
173
+
174
+ return `#!/usr/bin/env python3
175
+ # /// script
176
+ # requires-python = ">=3.10"
177
+ # dependencies = [
178
+ # "datasets>=2.19.0",
179
+ # "trl>=0.12.0",
180
+ # "peft>=0.12.0",
181
+ # "transformers>=4.45.0",
182
+ # "accelerate>=0.34.0",
183
+ # "jinja2>=3.1.0",
184
+ # "trackio",
185
+ # ]
186
+ # ///
187
+
188
+ import base64
189
+ import json
190
+ import os
191
+
192
+ import trackio
193
+ import torch
194
+ from datasets import load_dataset
195
+ from peft import LoraConfig
196
+ from trl import SFTConfig, SFTTrainer
197
+ from transformers import AutoModelForCausalLM, AutoTokenizer
198
+
199
+
200
+ CONFIG = json.loads(base64.b64decode("${cfgB64}").decode("utf-8"))
201
+
202
+
203
+ def load_phase_datasets(phase_cfg):
204
+ source = str(phase_cfg.get("dataset_source", "hub")).strip().lower()
205
+ if source == "hub":
206
+ dataset_id = phase_cfg["dataset_id"]
207
+ train_split = phase_cfg.get("train_split", "train")
208
+ eval_split = phase_cfg.get("eval_split", "validation")
209
+ train_ds = load_dataset(dataset_id, split=train_split)
210
+ eval_ds = load_dataset(dataset_id, split=eval_split) if eval_split else None
211
+ return train_ds, eval_ds
212
+ if source == "json":
213
+ train_file = phase_cfg["train_file"]
214
+ eval_file = phase_cfg.get("eval_file")
215
+ data_files = {"train": train_file}
216
+ if eval_file:
217
+ data_files["validation"] = eval_file
218
+ ds = load_dataset("json", data_files=data_files)
219
+ train_ds = ds["train"]
220
+ eval_ds = ds["validation"] if "validation" in ds else None
221
+ return train_ds, eval_ds
222
+ raise ValueError(f"Unsupported dataset_source: {source}")
223
+
224
+
225
+ def _safe_json(value):
226
+ try:
227
+ return json.dumps(value, ensure_ascii=False)
228
+ except Exception:
229
+ return str(value)
230
+
231
+
232
+ def _message_to_text(message):
233
+ if not isinstance(message, dict):
234
+ return _safe_json(message)
235
+ role = str(message.get("role", "unknown"))
236
+ if isinstance(message.get("tool_calls"), list):
237
+ return f"{role}: <tool_calls> " + _safe_json(message.get("tool_calls"))
238
+ content = message.get("content")
239
+ if isinstance(content, str):
240
+ return f"{role}: {content}"
241
+ if content is None:
242
+ return f"{role}:"
243
+ return f"{role}: " + _safe_json(content)
244
+
245
+
246
+ def _fallback_chat_render(messages):
247
+ lines = [_message_to_text(message) for message in messages]
248
+ return "\\n".join(lines).strip()
249
+
250
+
251
+ def load_base_model_for_training(model_id, runtime_cfg):
252
+ on_cuda = torch.cuda.is_available()
253
+ force_single_device_model = bool(runtime_cfg.get("force_single_device_model", True))
254
+ device_map = None
255
+ if on_cuda:
256
+ device_map = {"": 0} if force_single_device_model else "auto"
257
+ model = AutoModelForCausalLM.from_pretrained(
258
+ model_id,
259
+ trust_remote_code=True,
260
+ torch_dtype=torch.bfloat16 if on_cuda else torch.float32,
261
+ low_cpu_mem_usage=True,
262
+ device_map=device_map,
263
+ )
264
+ if hasattr(model, "config") and hasattr(model.config, "use_cache"):
265
+ model.config.use_cache = False
266
+ if on_cuda and hasattr(model, "gradient_checkpointing_enable"):
267
+ model.gradient_checkpointing_enable()
268
+ return model
269
+
270
+
271
+ def assert_no_meta_parameters(model):
272
+ meta_params = [name for name, param in model.named_parameters() if param.device.type == "meta"]
273
+ if meta_params:
274
+ sample = ", ".join(meta_params[:5])
275
+ raise RuntimeError(f"Model has {len(meta_params)} meta parameters after load: {sample}")
276
+
277
+
278
+ class StaticDeviceSFTTrainer(SFTTrainer):
279
+ def __init__(self, *args, skip_model_move=False, **kwargs):
280
+ self._skip_model_move = bool(skip_model_move)
281
+ super().__init__(*args, **kwargs)
282
+
283
+ def _move_model_to_device(self, model, device):
284
+ if self._skip_model_move:
285
+ return model
286
+ return super()._move_model_to_device(model, device)
287
+
288
+
289
+ def normalize_dataset_for_sft(dataset, tokenizer, split_name, strict_chat_template):
290
+ fallback_used = 0
291
+ render_errors = 0
292
+
293
+ def row_to_text(example):
294
+ nonlocal fallback_used
295
+ nonlocal render_errors
296
+ messages = example.get("messages")
297
+ if isinstance(messages, list):
298
+ try:
299
+ rendered = tokenizer.apply_chat_template(
300
+ messages,
301
+ tokenize=False,
302
+ add_generation_prompt=False,
303
+ )
304
+ if isinstance(rendered, str) and rendered.strip():
305
+ return {"text": rendered}
306
+ if strict_chat_template:
307
+ raise RuntimeError("chat template returned empty text")
308
+ except Exception as exc:
309
+ render_errors += 1
310
+ if strict_chat_template:
311
+ raise RuntimeError(
312
+ f"chat template failed in split={split_name}: {type(exc).__name__}: {exc}"
313
+ ) from exc
314
+ fallback_used += 1
315
+ return {"text": _fallback_chat_render(messages)}
316
+
317
+ prompt = example.get("prompt")
318
+ response = example.get("response")
319
+ completion = example.get("completion")
320
+ chosen = example.get("chosen")
321
+ text = example.get("text")
322
+
323
+ if isinstance(prompt, str) and isinstance(response, str):
324
+ return {"text": f"{prompt}\\n\\n{response}"}
325
+ if isinstance(prompt, str) and isinstance(completion, str):
326
+ return {"text": f"{prompt}\\n\\n{completion}"}
327
+ if isinstance(prompt, str) and isinstance(chosen, str):
328
+ return {"text": f"{prompt}\\n\\n{chosen}"}
329
+ if isinstance(text, str):
330
+ return {"text": text}
331
+ return {"text": _safe_json(example)}
332
+
333
+ mapped = dataset.map(
334
+ row_to_text,
335
+ desc=f"Normalize {split_name} dataset to text",
336
+ )
337
+ drop_columns = [column for column in mapped.column_names if column != "text"]
338
+ if drop_columns:
339
+ mapped = mapped.remove_columns(drop_columns)
340
+ print(
341
+ f"[normalize] split={split_name} rows={len(mapped)} "
342
+ f"fallback_used={fallback_used} render_errors={render_errors} "
343
+ f"strict_chat_template={strict_chat_template}"
344
+ )
345
+ return mapped
346
+
347
+
348
+ def build_sft_config(phase_name, phase_cfg, runtime_cfg, output_root, push_to_hub, hub_model_id, has_eval):
349
+ run_name_root = runtime_cfg.get("trackio_run_name", "two-phase-sft")
350
+ cfg_kwargs = {
351
+ "output_dir": os.path.join(output_root, phase_name),
352
+ "push_to_hub": push_to_hub,
353
+ "hub_model_id": hub_model_id if push_to_hub and hub_model_id else None,
354
+ "num_train_epochs": float(phase_cfg.get("num_train_epochs", 1)),
355
+ "per_device_train_batch_size": int(phase_cfg.get("per_device_train_batch_size", 4)),
356
+ "gradient_accumulation_steps": int(phase_cfg.get("gradient_accumulation_steps", 4)),
357
+ "learning_rate": float(phase_cfg.get("learning_rate", 2e-5)),
358
+ "logging_steps": int(runtime_cfg.get("logging_steps", 10)),
359
+ "save_strategy": "steps",
360
+ "save_steps": int(runtime_cfg.get("save_steps", 100)),
361
+ "save_total_limit": int(runtime_cfg.get("save_total_limit", 2)),
362
+ "warmup_ratio": float(runtime_cfg.get("warmup_ratio", 0.1)),
363
+ "lr_scheduler_type": str(runtime_cfg.get("lr_scheduler_type", "cosine")),
364
+ "dataset_text_field": "text",
365
+ "seed": int(runtime_cfg.get("seed", 42)),
366
+ "bf16": bool(torch.cuda.is_available()),
367
+ "gradient_checkpointing": bool(runtime_cfg.get("gradient_checkpointing", True)),
368
+ "report_to": "trackio",
369
+ "project": str(runtime_cfg.get("trackio_project", "wish-engine-jssg")),
370
+ "run_name": f"{run_name_root}-{phase_name}",
371
+ }
372
+ max_length = runtime_cfg.get("max_length")
373
+ if max_length is not None:
374
+ cfg_kwargs["max_length"] = int(max_length)
375
+ if has_eval:
376
+ cfg_kwargs["eval_strategy"] = "steps"
377
+ cfg_kwargs["eval_steps"] = int(runtime_cfg.get("eval_steps", 100))
378
+ cfg_kwargs = {k: v for k, v in cfg_kwargs.items() if v is not None}
379
+ return SFTConfig(**cfg_kwargs)
380
+
381
+
382
+ def build_lora_config(lora_cfg):
383
+ return LoraConfig(
384
+ r=int(lora_cfg.get("r", 16)),
385
+ lora_alpha=int(lora_cfg.get("lora_alpha", 32)),
386
+ lora_dropout=float(lora_cfg.get("lora_dropout", 0.05)),
387
+ bias=str(lora_cfg.get("bias", "none")),
388
+ task_type=str(lora_cfg.get("task_type", "CAUSAL_LM")),
389
+ target_modules=list(lora_cfg.get("target_modules", ["q_proj", "v_proj"])),
390
+ )
391
+
392
+
393
+ def train_phase(phase_name, model_ref_or_obj, tokenizer, phase_cfg, runtime_cfg, output_root, peft_config, push_to_hub, hub_model_id):
394
+ strict_chat_template = bool(runtime_cfg.get("strict_chat_template", True))
395
+ skip_model_move = bool(runtime_cfg.get("skip_trainer_model_move", True))
396
+ train_ds, eval_ds = load_phase_datasets(phase_cfg)
397
+ train_ds = normalize_dataset_for_sft(
398
+ train_ds,
399
+ tokenizer,
400
+ f"{phase_name}-train",
401
+ strict_chat_template=strict_chat_template,
402
+ )
403
+ if eval_ds is not None:
404
+ eval_ds = normalize_dataset_for_sft(
405
+ eval_ds,
406
+ tokenizer,
407
+ f"{phase_name}-eval",
408
+ strict_chat_template=strict_chat_template,
409
+ )
410
+ print(f"[{phase_name}] train={len(train_ds)} eval={len(eval_ds) if eval_ds is not None else 0}")
411
+ if len(train_ds) > 0:
412
+ print(f"[{phase_name}] sample_text_chars={len(train_ds[0]['text'])}")
413
+ sft_cfg = build_sft_config(
414
+ phase_name=phase_name,
415
+ phase_cfg=phase_cfg,
416
+ runtime_cfg=runtime_cfg,
417
+ output_root=output_root,
418
+ push_to_hub=push_to_hub,
419
+ hub_model_id=hub_model_id,
420
+ has_eval=eval_ds is not None,
421
+ )
422
+ trainer = StaticDeviceSFTTrainer(
423
+ model=model_ref_or_obj,
424
+ processing_class=tokenizer,
425
+ train_dataset=train_ds,
426
+ eval_dataset=eval_ds,
427
+ args=sft_cfg,
428
+ peft_config=peft_config,
429
+ skip_model_move=skip_model_move,
430
+ )
431
+ trainer.train()
432
+ trainer.save_model()
433
+ if push_to_hub:
434
+ trainer.push_to_hub()
435
+ return trainer
436
+
437
+
438
+ def main():
439
+ base_model = CONFIG["base_model"]
440
+ final_hub_model_id = CONFIG["final_hub_model_id"]
441
+ output_root = CONFIG.get("output_root", "wish-engine-two-phase-sft")
442
+ runtime_cfg = CONFIG.get("runtime", {})
443
+ phase1_cfg = CONFIG["phase1"]
444
+ phase2_cfg = CONFIG["phase2"]
445
+ lora_cfg = CONFIG.get("lora", {})
446
+ phase1_hub_model_id = CONFIG.get("phase1_hub_model_id")
447
+
448
+ print(f"[setup] base_model={base_model}")
449
+ print(f"[setup] final_hub_model_id={final_hub_model_id}")
450
+ print(f"[setup] skip_trainer_model_move={bool(runtime_cfg.get('skip_trainer_model_move', True))}")
451
+ print(f"[setup] force_single_device_model={bool(runtime_cfg.get('force_single_device_model', True))}")
452
+ tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, use_fast=True)
453
+ if tokenizer.pad_token is None and tokenizer.eos_token is not None:
454
+ tokenizer.pad_token = tokenizer.eos_token
455
+ tokenizer.padding_side = "right"
456
+
457
+ base_model_obj = load_base_model_for_training(base_model, runtime_cfg)
458
+ assert_no_meta_parameters(base_model_obj)
459
+
460
+ trainer1 = train_phase(
461
+ phase_name="phase1-focus",
462
+ model_ref_or_obj=base_model_obj,
463
+ tokenizer=tokenizer,
464
+ phase_cfg=phase1_cfg,
465
+ runtime_cfg=runtime_cfg,
466
+ output_root=output_root,
467
+ peft_config=build_lora_config(lora_cfg),
468
+ push_to_hub=bool(phase1_hub_model_id),
469
+ hub_model_id=phase1_hub_model_id,
470
+ )
471
+
472
+ assert_no_meta_parameters(trainer1.model)
473
+ if torch.cuda.is_available():
474
+ torch.cuda.empty_cache()
475
+
476
+ train_phase(
477
+ phase_name="phase2-curriculum",
478
+ model_ref_or_obj=trainer1.model,
479
+ tokenizer=tokenizer,
480
+ phase_cfg=phase2_cfg,
481
+ runtime_cfg=runtime_cfg,
482
+ output_root=output_root,
483
+ peft_config=None,
484
+ push_to_hub=True,
485
+ hub_model_id=final_hub_model_id,
486
+ )
487
+
488
+ trackio.finish()
489
+ print("[done] two-phase training complete")
490
+
491
+
492
+ if __name__ == "__main__":
493
+ main()
494
+ `;
495
+ };
496
+
497
/**
 * Wrap a generated Python script in the payload object for an `hf_jobs("uv", …)` call.
 *
 * @param pythonScript Script source placed in `parameters.script`.
 * @param options Hardware flavor and timeout copied into the job parameters.
 * @returns An object with the creation timestamp (ISO 8601), the tool and
 *   method names, the job parameters, and a ready-to-paste call snippet that
 *   embeds those parameters as pretty-printed JSON. `HF_TOKEN` is emitted as
 *   the literal placeholder "$HF_TOKEN".
 */
const buildPayload = (
  pythonScript: string,
  options: Pick<CliOptions, "flavor" | "timeout">
) => {
  const { flavor, timeout } = options;
  const parameters = {
    script: pythonScript,
    flavor,
    timeout,
    secrets: { HF_TOKEN: "$HF_TOKEN" },
  };
  const createdAt = new Date().toISOString();
  const callSnippet = `hf_jobs("uv", ${JSON.stringify(parameters, null, 2)})`;

  return {
    createdAt,
    tool: "hf_jobs",
    method: "uv",
    parameters,
    callSnippet,
  };
};
518
+
519
/**
 * CLI entry point: parse arguments, load and validate the config, generate
 * the Python script and job payload, then either print a dry-run summary or
 * write the payload (and optionally the script) to disk and print what was
 * written. All console output is pretty-printed JSON.
 */
const main = () => {
  const options = parseArgs(process.argv.slice(2));
  const config = readJson<TwoPhaseConfig>(options.configPath);
  validateConfig(config);

  const pythonScript = generatePythonScript(config);
  const payload = buildPayload(pythonScript, options);

  // Both code paths report their result the same way.
  const report = (summary: Record<string, unknown>) => {
    console.log(JSON.stringify(summary, null, 2));
  };
  const { configPath, outPath, emitScriptPath, flavor, timeout, dryRun } = options;

  if (dryRun) {
    report({
      message: "Dry run OK",
      configPath,
      flavor,
      timeout,
      hasPhase1HubPush: Boolean(config.phase1_hub_model_id),
      finalHubModelId: config.final_hub_model_id,
      phase1Source: config.phase1.dataset_source,
      phase2Source: config.phase2.dataset_source,
      outputPath: outPath,
    });
    return;
  }

  writeJson(outPath, payload);
  if (emitScriptPath) {
    writeText(emitScriptPath, pythonScript);
  }

  report({
    message: "Two-phase HF Jobs payload generated",
    configPath,
    outPath,
    emitScriptPath,
    flavor,
    timeout,
  });
};
568
+
569
// Run the CLI as soon as this file is executed.
+ main();
artifacts/job-history.snapshot.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "699b2e2952d1c53b7df7d2d4",
4
+ "created_at": "2026-02-22 16:26:17.529000+00:00",
5
+ "status": "ERROR",
6
+ "message": "Job failed with exit code: 1. Reason: Error",
7
+ "url": "https://huggingface.co/jobs/sahilmob/699b2e2952d1c53b7df7d2d4",
8
+ "flavor": "a10g-large"
9
+ },
10
+ {
11
+ "id": "699b2eda52d1c53b7df7d2d6",
12
+ "created_at": "2026-02-22 16:29:14.495000+00:00",
13
+ "status": "ERROR",
14
+ "message": "Job failed with exit code: 1. Reason: Error",
15
+ "url": "https://huggingface.co/jobs/sahilmob/699b2eda52d1c53b7df7d2d6",
16
+ "flavor": "a10g-large"
17
+ },
18
+ {
19
+ "id": "699b30551aad19adb8aacbdb",
20
+ "created_at": "2026-02-22 16:35:33.569000+00:00",
21
+ "status": "ERROR",
22
+ "message": "Job failed with exit code: 1. Reason: Error",
23
+ "url": "https://huggingface.co/jobs/sahilmob/699b30551aad19adb8aacbdb",
24
+ "flavor": "a10g-large"
25
+ },
26
+ {
27
+ "id": "699b310652d1c53b7df7d2d8",
28
+ "created_at": "2026-02-22 16:38:30.446000+00:00",
29
+ "status": "RUNNING",
30
+ "message": null,
31
+ "url": "https://huggingface.co/jobs/sahilmob/699b310652d1c53b7df7d2d8",
32
+ "flavor": "a100-large"
33
+ }
34
+ ]
artifacts/two-phase-toolcall.hf.config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "openai/gpt-oss-20b",
3
+ "phase1_hub_model_id": "sahilmob/gpt-oss-20b-toolcall-phase1-v2-strict-lora",
4
+ "final_hub_model_id": "sahilmob/gpt-oss-20b-toolcall-two-phase-v2-lora",
5
+ "output_root": "gpt-oss-20b-toolcall-two-phase-v2-sft",
6
+ "phase1": {
7
+ "dataset_source": "hub",
8
+ "dataset_id": "sahilmob/wish-engine-toolcall-next-v2-strict",
9
+ "train_split": "train",
10
+ "eval_split": "validation",
11
+ "num_train_epochs": 2,
12
+ "per_device_train_batch_size": 1,
13
+ "gradient_accumulation_steps": 16,
14
+ "learning_rate": 1.2e-5
15
+ },
16
+ "phase2": {
17
+ "dataset_source": "hub",
18
+ "dataset_id": "sahilmob/wish-engine-toolcall-next-v2",
19
+ "train_split": "train",
20
+ "eval_split": "validation",
21
+ "num_train_epochs": 1,
22
+ "per_device_train_batch_size": 1,
23
+ "gradient_accumulation_steps": 16,
24
+ "learning_rate": 8e-6
25
+ },
26
+ "lora": {
27
+ "r": 16,
28
+ "lora_alpha": 32,
29
+ "lora_dropout": 0.05,
30
+ "bias": "none",
31
+ "task_type": "CAUSAL_LM",
32
+ "target_modules": [
33
+ "q_proj",
34
+ "k_proj",
35
+ "v_proj",
36
+ "o_proj"
37
+ ]
38
+ },
39
+ "runtime": {
40
+ "trackio_project": "wish-engine-toolcalls-gpt-oss-20b",
41
+ "trackio_run_name": "two-phase-toolcall-v2-sft-gpt-oss-20b",
42
+ "seed": 42,
43
+ "logging_steps": 10,
44
+ "save_steps": 100,
45
+ "save_total_limit": 2,
46
+ "eval_steps": 50,
47
+ "warmup_ratio": 0.1,
48
+ "lr_scheduler_type": "cosine",
49
+ "max_length": 1024,
50
+ "strict_chat_template": true,
51
+ "skip_trainer_model_move": true,
52
+ "force_single_device_model": true
53
+ }
54
+ }