visproj
/

proofkit-train-scripts

Model card Files Files and versions

xet

Community

visproj commited on Jun 7

Commit

e90bc49

verified ·

1 Parent(s): beace15

Upload train_gpt_oss.py with huggingface_hub

Browse files

Files changed (1) hide show

train_gpt_oss.py +130 -0

train_gpt_oss.py ADDED Viewed

	@@ -0,0 +1,130 @@

+# /// script
+# dependencies = [
+#   "trl>=0.20,<0.24",
+#   "peft>=0.17,<0.18",
+#   "transformers>=4.55,<4.60",
+#   "accelerate>=1.7,<2",
+#   "datasets>=2.20,<4",
+#   "trackio",
+#   "kernels>=0.9,<0.10",
+# ]
+# ///
+# Deps are pinned on purpose: gpt-oss is a Mixture-of-Experts model whose
+# `kernels` lib must match `transformers`, and "latest of everything" makes them
+# clash at import. These caps are the validated pair, and are harmless for dense
+# models like Llama. See docs/FINETUNE_MODAL.md for the full story.
+"""ProofKit — fine-tune a small model (LoRA SFT) on Hugging Face Jobs.
+This script runs ON HUGGING FACE JOBS, not locally. It loads the ProofKit SFT
+dataset from the Hub, trains an attention-only LoRA adapter, and pushes it back
+to the Hub. It works for any base model; the intended HF Jobs target is a small
+dense model like meta-llama/Llama-3.2-3B-Instruct — fast and cheap on a T4, and
+the model that feeds ProofKit's GGUF / llama.cpp backend (the Llama Champion +
+Off the Grid badges). gpt-oss-20b is trained on Modal instead, where its MoE
+experts can be adapted on a bigger GPU — see scripts/modal_train_gpt_oss.py and
+docs/FINETUNE_MODAL.md.
+⚠️  The Jobs container is ephemeral — everything is deleted when the job ends.
+    `push_to_hub=True` (+ the HF_TOKEN secret) is what makes the result survive.
+Submit it from your terminal (after uploading this file to a Hub repo):
+    hf jobs uv run \\
+      --flavor a100-large \\
+      --timeout 3h \\
+      --secrets HF_TOKEN \\
+      "https://huggingface.co/visproj/proofkit-train-scripts/resolve/main/train_gpt_oss.py"
+Configuration is via environment variables (pass with `--env KEY=VALUE`):
+    BASE_MODEL    base model to tune        (default: openai/gpt-oss-20b)
+    DATASET_REPO  Hub dataset to train on   (default: visproj/proofkit-sft)
+    MODEL_REPO    Hub repo to push to       (default: visproj/proofkit-gpt-oss-20b-lora)
+    EPOCHS        training epochs           (default: 3)
+    LR            learning rate             (default: 2e-4)
+    MAX_LEN       max sequence length       (default: 1024)
+See docs/FINETUNE_HF_JOBS.md for the full runbook.
+"""
+import os
+from datasets import load_dataset
+from peft import LoraConfig, TaskType
+from trl import SFTConfig, SFTTrainer
+BASE_MODEL = os.environ.get("BASE_MODEL", "openai/gpt-oss-20b")
+DATASET_REPO = os.environ.get("DATASET_REPO", "visproj/proofkit-sft")
+MODEL_REPO = os.environ.get("MODEL_REPO", "visproj/proofkit-gpt-oss-20b-lora")
+EPOCHS = float(os.environ.get("EPOCHS", "3"))
+LR = float(os.environ.get("LR", "2e-4"))
+MAX_LEN = int(os.environ.get("MAX_LEN", "1024"))
+is_gpt_oss = "gpt-oss" in BASE_MODEL.lower()
+print(f"Base model : {BASE_MODEL}", flush=True)
+print(f"Dataset    : {DATASET_REPO}", flush=True)
+print(f"Push to    : {MODEL_REPO}", flush=True)
+dataset = load_dataset(DATASET_REPO, split="train")
+print(f"Examples   : {len(dataset)}", flush=True)
+model_init_kwargs = {
+    "attn_implementation": "eager",
+    "torch_dtype": "auto",
+    "use_cache": False,
+}
+# Only gpt-oss ships MXFP4-quantized MoE weights that need dequantizing to train.
+# Dense models (Llama, Qwen, …) must NOT get a quantization_config — applying one
+# to a non-quantized model is meaningless and can error.
+if is_gpt_oss:
+    try:
+        from transformers import Mxfp4Config
+        model_init_kwargs["quantization_config"] = Mxfp4Config(dequantize=True)
+        print("MXFP4 dequantize: on", flush=True)
+    except Exception:
+        print("MXFP4 dequantize: unavailable (training in native dtype)", flush=True)
+# Attention-only LoRA over all linear layers — the standard, reliable recipe that
+# works for any architecture (attention projections + MLP/router linears). We do
+# NOT adapt gpt-oss's fused MoE experts here: `target_parameters` would fail to
+# match on a dense model like Llama, and on gpt-oss it needs a 141 GB GPU. HF Jobs
+# is ProofKit's small-model path, so attention-only is exactly the right recipe.
+# (Expert adaptation lives in scripts/modal_train_gpt_oss.py with TUNE_EXPERTS=1.)
+lora = LoraConfig(
+    r=8,
+    lora_alpha=16,
+    lora_dropout=0.05,
+    bias="none",
+    task_type=TaskType.CAUSAL_LM,
+    target_modules="all-linear",
+)
+args = SFTConfig(
+    output_dir="proofkit-gpt-oss-20b",
+    num_train_epochs=EPOCHS,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=8,        # effective batch size = 8
+    learning_rate=LR,
+    max_length=MAX_LEN,
+    bf16=True,
+    gradient_checkpointing=True,
+    logging_steps=10,
+    save_strategy="no",                   # small run — push the final model once at the end
+    push_to_hub=True,                     # ← results survive the ephemeral container
+    hub_model_id=MODEL_REPO,
+    report_to="trackio",                  # live metrics at https://huggingface.co/<you>/trackio
+    run_name="gpt-oss-20b-lora-sft",
+    model_init_kwargs=model_init_kwargs,
+)
+trainer = SFTTrainer(
+    model=BASE_MODEL,
+    train_dataset=dataset,
+    peft_config=lora,
+    args=args,
+)
+print("Training...", flush=True)
+trainer.train()
+trainer.push_to_hub()
+print(f"Done. Adapter pushed to https://huggingface.co/{MODEL_REPO}", flush=True)