Ev3Dev committed
Commit 2bf5069 · verified · 1 Parent(s): ad39f2a

Upload folder using huggingface_hub

_dashboard_state.json CHANGED
@@ -1,29 +1,29 @@
  {
- "timestamp": 1772986999.4539518,
+ "timestamp": 1772994420.314485,
  "step": 20,
  "episode_done": false,
- "cumulative_reward": -1.2923024829860532,
- "gen_time_s": 17.14,
- "model_response_raw": "{\n\"action_type\": \"design_followup_experiment\",\n\"method\": \"\",\n\"parameters\": {\n\"candidate_target_genes\": [],\n\"suggested_wet_lab_targets\": []\n},\n\"Justification\": \"The previous attempt failed due to incomplete genomic profiling required for robust marker identification via CRISPR/Cas9 editing followed by functional readout. Without identifying specific high-confidence target genes within the identified clusters through comprehensive transcriptomic characterization first, designing a focused knockdown screen lacks the necessary molecular basis.\",\n\"Confidence\": 0.85\n}",
+ "cumulative_reward": 1.8024931955050638,
+ "gen_time_s": 23.52,
+ "model_response_raw": "{\n\"action_type\": \"design_followup_experiment\",\n\"method\": \"\",\n\"parameters\": {\n\"disease_comparison_groups\": true,\n\"treatment_condition_focus\": false,\n\"criterion_for_target_genes\": \"Differentially Expressed Between Healthy vs AD Brain Cells\"\n},\n\"Justification\": \"The primary goal remains identifying DGEs across conditions rather than targeting specific known pathways first. While 'filter_data' was previously performed but failed due to lack of validated markers from previous attempts, re-running filtering based solely on technical/quality criteria would still leave ambiguous candidates if noise pe",
  "model_thinking": "",
  "thinking_enabled": false,
  "task": {
- "problem_statement": "Validate candidate biomarker ERBB2 for breast cancer in breast tissue using single-cell RNA sequencing.",
+ "problem_statement": "Identify differentially expressed genes between Alzheimer's disease and healthy brain tissue using single-cell RNA sequencing.",
  "organism": "human",
- "tissue": "breast",
+ "tissue": "brain",
  "modality": "scRNA-seq",
  "conditions": [
  "healthy",
- "breast_cancer"
+ "Alzheimer"
  ],
- "budget_limit": 89988.7088221059,
- "time_limit_days": 160.54122628462622
+ "budget_limit": 84032.3891133686,
+ "time_limit_days": 116.86792941835967
  },
  "resources": {
  "budget_used": 28900.0,
- "budget_remaining": 61088.71,
+ "budget_remaining": 55132.39,
  "time_used_days": 18.0,
- "time_remaining_days": 142.5,
+ "time_remaining_days": 98.9,
  "samples_consumed": 0,
  "compute_hours_used": 12.0
  },
@@ -32,9 +32,9 @@
  "step_index": 1,
  "action_type": "collect_sample",
  "method": null,
- "output_summary": "Collected 6 samples (quality=0.42)",
+ "output_summary": "Collected 6 samples (quality=0.56)",
  "success": true,
- "quality_score": 0.422,
+ "quality_score": 0.562,
  "resource_cost": 5000.0,
  "time_cost_days": 7.0
  },
@@ -62,9 +62,9 @@
  "step_index": 4,
  "action_type": "prepare_library",
  "method": null,
- "output_summary": "Library prepared (complexity=0.59)",
+ "output_summary": "Library prepared (complexity=0.75)",
  "success": true,
- "quality_score": 0.595,
+ "quality_score": 0.751,
  "resource_cost": 8000.0,
  "time_cost_days": 3.0
  },
@@ -72,42 +72,42 @@
  "step_index": 5,
  "action_type": "sequence_cells",
  "method": null,
- "output_summary": "Sequenced 8578 cells, 9244 genes detected, median UMI=3090",
+ "output_summary": "Sequenced 4574 cells, 9256 genes detected, median UMI=3083",
  "success": true,
- "quality_score": 0.359,
+ "quality_score": 0.47,
  "resource_cost": 15000.0,
  "time_cost_days": 5.0
  },
  {
  "step_index": 6,
+ "action_type": "filter_data",
+ "method": null,
+ "output_summary": "Action blocked: Cannot filter data before QC",
+ "success": false,
+ "quality_score": 1.0,
+ "resource_cost": 50.0,
+ "time_cost_days": 0.2
+ },
+ {
+ "step_index": 7,
  "action_type": "run_qc",
  "method": null,
  "output_summary": "QC metrics computed",
  "success": true,
- "quality_score": 0.821,
+ "quality_score": 0.829,
  "resource_cost": 100.0,
  "time_cost_days": 0.5
  },
  {
- "step_index": 7,
+ "step_index": 8,
  "action_type": "filter_data",
  "method": null,
- "output_summary": "Filtered 8578 \u2192 7050 cells (82% retained)",
+ "output_summary": "Filtered 4574 \u2192 3996 cells (87% retained)",
  "success": true,
- "quality_score": 0.822,
+ "quality_score": 0.874,
  "resource_cost": 50.0,
  "time_cost_days": 0.2
  },
- {
- "step_index": 8,
- "action_type": "cluster_cells",
- "method": null,
- "output_summary": "Action blocked: Cannot cluster before normalisation",
- "success": false,
- "quality_score": 1.0,
- "resource_cost": 100.0,
- "time_cost_days": 0.5
- },
  {
  "step_index": 9,
  "action_type": "normalize_data",
@@ -120,11 +120,11 @@
  },
  {
  "step_index": 10,
- "action_type": "marker_selection",
+ "action_type": "cluster_cells",
  "method": null,
- "output_summary": "Action blocked: Cannot select markers without DE results",
- "success": false,
- "quality_score": 1.0,
+ "output_summary": "Found 7 clusters",
+ "success": true,
+ "quality_score": 0.79,
  "resource_cost": 100.0,
  "time_cost_days": 0.5
  },
@@ -132,20 +132,20 @@
  "step_index": 11,
  "action_type": "design_followup_experiment",
  "method": null,
- "output_summary": "Action blocked: Follow-up design without prior analysis is blocked; complete wet-lab and computational steps first",
- "success": false,
- "quality_score": 1.0,
+ "output_summary": "Follow-up experiment design proposed (evidence_signals=1)",
+ "success": true,
+ "quality_score": 0.28,
  "resource_cost": 100.0,
  "time_cost_days": 0.5
  },
  {
  "step_index": 12,
- "action_type": "cluster_cells",
+ "action_type": "synthesize_conclusion",
  "method": null,
- "output_summary": "Found 7 clusters",
- "success": true,
- "quality_score": 0.741,
- "resource_cost": 100.0,
+ "output_summary": "Action blocked: Cannot synthesise conclusion before discovering or validating markers; Cannot synthesise conclusion befo",
+ "success": false,
+ "quality_score": 1.0,
+ "resource_cost": 0.0,
  "time_cost_days": 0.5
  },
  {
@@ -160,26 +160,6 @@
  },
  {
  "step_index": 14,
- "action_type": "design_followup_experiment",
- "method": null,
- "output_summary": "Follow-up experiment design proposed (evidence_signals=1)",
- "success": true,
- "quality_score": 0.28,
- "resource_cost": 100.0,
- "time_cost_days": 0.5
- },
- {
- "step_index": 15,
- "action_type": "synthesize_conclusion",
- "method": null,
- "output_summary": "Action blocked: Cannot synthesise conclusion before discovering or validating markers; Cannot synthesise conclusion befo",
- "success": false,
- "quality_score": 1.0,
- "resource_cost": 0.0,
- "time_cost_days": 0.5
- },
- {
- "step_index": 16,
  "action_type": "synthesize_conclusion",
  "method": null,
  "output_summary": "Action blocked: Cannot synthesise conclusion before discovering or validating markers; Cannot synthesise conclusion befo",
@@ -222,8 +202,8 @@
  "Cannot synthesise conclusion before inferring pathways or mechanisms"
  ],
  "uncertainty_summary": {
- "avg_uncertainty": 0.177,
- "avg_quality": 0.804
+ "avg_uncertainty": 0.224,
+ "avg_quality": 0.814
  },
  "reward_breakdown": {
  "validity": -1.0,
@@ -249,92 +229,83 @@
  "latent": {
  "cell_populations": [
  {
- "name": "luminal_epithelial",
- "proportion": 0.433,
+ "name": "excitatory_neuron",
+ "proportion": 0.349,
  "marker_genes": [
- "KRT8",
- "KRT18",
- "EPCAM"
+ "SLC17A7",
+ "CAMK2A",
+ "NRGN"
  ],
- "state": "normal"
+ "state": "stressed"
  },
  {
- "name": "basal_epithelial",
- "proportion": 0.157,
+ "name": "inhibitory_neuron",
+ "proportion": 0.209,
  "marker_genes": [
- "KRT14",
- "KRT5",
- "TP63"
+ "GAD1",
+ "GAD2",
+ "SLC32A1"
  ],
  "state": "normal"
  },
  {
- "name": "fibroblast",
- "proportion": 0.119,
+ "name": "astrocyte",
+ "proportion": 0.211,
  "marker_genes": [
- "COL1A1",
- "COL3A1",
- "FAP"
+ "GFAP",
+ "AQP4",
+ "SLC1A3"
  ],
  "state": "quiescent"
  },
  {
- "name": "T_cell",
- "proportion": 0.105,
- "marker_genes": [
- "CD3D",
- "CD3E",
- "CD8A"
- ],
- "state": "activated"
- },
- {
- "name": "macrophage",
- "proportion": 0.096,
+ "name": "oligodendrocyte",
+ "proportion": 0.153,
  "marker_genes": [
- "CD68",
- "CD163",
- "CSF1R"
+ "MBP",
+ "PLP1",
+ "MOG"
  ],
- "state": "inflammatory"
+ "state": "myelinating"
  },
  {
- "name": "endothelial",
- "proportion": 0.09,
+ "name": "OPC",
+ "proportion": 0.078,
  "marker_genes": [
- "PECAM1",
- "VWF",
- "CDH5"
+ "PDGFRA",
+ "CSPG4",
+ "OLIG2"
  ],
- "state": "quiescent"
+ "state": "progenitor"
  }
  ],
  "true_markers": [
- "ERBB2",
- "MKI67",
- "CD274",
- "VIM"
+ "TREM2",
+ "APOE",
+ "GFAP",
+ "C1QA"
  ],
  "causal_mechanisms": [
- "ERBB2-driven proliferative signalling",
- "immune evasion via PD-L1 upregulation"
+ "TREM2-mediated microglial activation in amyloid clearance",
+ "complement-driven synaptic pruning",
+ "reactive astrogliosis amplifying neuroinflammation"
  ],
  "true_pathways": {
- "cell_cycle": 0.889,
- "PI3K_AKT_signalling": 0.803,
- "EMT": 0.757,
- "immune_checkpoint": 0.579,
- "estrogen_signalling": 0.644
+ "complement_cascade": 0.839,
+ "neuroinflammation": 0.805,
+ "amyloid_processing": 0.666,
+ "synaptic_signalling": 0.394,
+ "lipid_metabolism": 0.674
  },
- "true_de_genes_count": 9,
- "true_regulatory_network_size": 9,
+ "true_de_genes_count": 10,
+ "true_regulatory_network_size": 0,
  "confounders": {},
- "n_true_cells": 14493,
+ "n_true_cells": 7619,
  "technical": {
- "ambient_rna_fraction": 0.05700028722692205,
- "doublet_rate": 0.0716382392677839,
- "dropout_rate": 0.1822398381996976,
- "sample_quality": 0.7058144963381642,
+ "ambient_rna_fraction": 0.04108598341080635,
+ "doublet_rate": 0.045763110874719674,
+ "dropout_rate": 0.07138299827651534,
+ "sample_quality": 0.9242864922806572,
  "library_complexity": 0.8,
  "capture_efficiency": 0.6
  },
@@ -359,8 +330,8 @@
  "followup_designed": true,
  "subagent_review_requested": false,
  "conclusion_reached": false,
- "n_cells_sequenced": 8578,
- "n_cells_after_filter": 7050,
+ "n_cells_sequenced": 4574,
+ "n_cells_after_filter": 3996,
  "n_clusters_found": "7",
  "n_de_genes_found": null,
  "n_markers_found": null
colab_train_llama32_remote.py ADDED
@@ -0,0 +1,340 @@
+ """Minimal Colab entrypoint for Unsloth GRPO against a remote OpenEnv Space.
+
+ This keeps the repo's prompt formatting and action parsing logic, but builds
+ prompt states by interacting with a deployed OpenEnv Hugging Face Space instead
+ of the local in-process environment. That makes the Colab workflow match the
+ remote environment users actually want to train against.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import random
+ from typing import Any, Dict, List, Optional, Sequence
+
+ from client import BioExperimentEnv
+ import training_script as base
+
+ DEFAULT_MODEL_ID = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
+ DEFAULT_OUTPUT_DIR = "artifacts/grpo-unsloth-llama32-3b-space"
+ DEFAULT_SPACE_REPO_ID = "Ev3Dev/hackathon"
+
+
+ def hf_space_repo_to_base_url(repo_id: str) -> str:
+     """Convert `owner/space-name` to the standard `hf.space` URL."""
+     owner, space_name = repo_id.split("/", 1)
+     normalized_owner = owner.strip().lower().replace("_", "-")
+     normalized_space = space_name.strip().lower().replace("_", "-")
+     return f"https://{normalized_owner}-{normalized_space}.hf.space"
+
+
+ def require_unsloth_base():
+     # Unsloth must be imported before trl / transformers / peft.
+     import unsloth  # noqa: F401
+     import training_unsloth as unsloth_base
+
+     return unsloth_base
+
+
+ def build_argument_parser() -> argparse.ArgumentParser:
+     parser = argparse.ArgumentParser(
+         description="Train Unsloth Llama 3.2 3B on a remote OpenEnv Hugging Face Space."
+     )
+     parser.add_argument("--model-id", default=DEFAULT_MODEL_ID)
+     parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR)
+     parser.add_argument("--dataset-episodes", type=int, default=8)
+     parser.add_argument("--rollout-steps", type=int, default=6)
+     parser.add_argument(
+         "--collection-policy",
+         choices=["random", "heuristic"],
+         default="heuristic",
+     )
+     parser.add_argument("--base-url", default="")
+     parser.add_argument(
+         "--space-repo-id",
+         default=DEFAULT_SPACE_REPO_ID,
+         help="Hugging Face Space repo id, for example `Ev3Dev/hackathon`.",
+     )
+     parser.add_argument("--num-generations", type=int, default=2)
+     parser.add_argument("--max-completion-length", type=int, default=160)
+     parser.add_argument("--max-prompt-length", type=int, default=1280)
+     parser.add_argument("--max-seq-length", type=int, default=2048)
+     parser.add_argument("--per-device-train-batch-size", type=int, default=1)
+     parser.add_argument("--gradient-accumulation-steps", type=int, default=4)
+     parser.add_argument("--learning-rate", type=float, default=5e-6)
+     parser.add_argument("--num-train-epochs", type=float, default=1.0)
+     parser.add_argument("--logging-steps", type=int, default=1)
+     parser.add_argument("--save-steps", type=int, default=25)
+     parser.add_argument("--plot-metric-key", default=None)
+     parser.add_argument("--seed", type=int, default=42)
+     parser.add_argument("--dry-run", action="store_true")
+     parser.add_argument("--load-model-only", action="store_true")
+     parser.add_argument("--trust-remote-code", action="store_true")
+     parser.add_argument("--disable-4bit", action="store_true")
+     parser.add_argument("--lora-r", type=int, default=unsloth_defaults()["lora_r"])
+     parser.add_argument(
+         "--lora-alpha", type=int, default=unsloth_defaults()["lora_alpha"]
+     )
+     parser.add_argument(
+         "--lora-dropout", type=float, default=unsloth_defaults()["lora_dropout"]
+     )
+     return parser
+
+
+ def unsloth_defaults() -> Dict[str, float]:
+     return {
+         "lora_r": 16,
+         "lora_alpha": 16,
+         "lora_dropout": 0.0,
+     }
+
+
+ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
+     args = build_argument_parser().parse_args(argv)
+     if not args.base_url:
+         args.base_url = hf_space_repo_to_base_url(args.space_repo_id)
+     return args
+
+
+ def make_training_args(**overrides: Any) -> argparse.Namespace:
+     parser = build_argument_parser()
+     defaults = vars(parser.parse_args([]))
+     unknown = sorted(set(overrides) - set(defaults))
+     if unknown:
+         raise ValueError(f"Unknown training args: {', '.join(unknown)}")
+     defaults.update(overrides)
+     args = argparse.Namespace(**defaults)
+     if not getattr(args, "base_url", ""):
+         args.base_url = hf_space_repo_to_base_url(args.space_repo_id)
+     return args
+
+
+ def build_remote_prompt_examples(args: argparse.Namespace) -> List[Dict[str, str]]:
+     """Collect prompt states directly from the remote OpenEnv server."""
+     rng = random.Random(args.seed)
+     examples: List[Dict[str, str]] = []
+
+     for _episode_idx in range(args.dataset_episodes):
+         with BioExperimentEnv(base_url=args.base_url) as env:
+             result = env.reset()
+             obs = result.observation
+             history_actions: List[base.ExperimentAction] = []
+
+             for step_idx in range(args.rollout_steps):
+                 if obs.done:
+                     break
+
+                 next_action = base.build_experiment_action(
+                     action_type=base.pick_action(
+                         args.collection_policy,
+                         step_idx,
+                         [action.action_type for action in history_actions],
+                     ),
+                     discovered_markers=obs.discovered_markers,
+                     candidate_mechanisms=obs.candidate_mechanisms,
+                     conditions=obs.task.conditions,
+                 )
+                 examples.append(
+                     {
+                         "prompt": base.build_training_prompt(obs),
+                         "history_actions": json.dumps(
+                             [action.model_dump() for action in history_actions]
+                         ),
+                         "reference_action": base.action_completion_json(next_action),
+                         "problem_statement": obs.task.problem_statement,
+                         "episode_tag": f"remote-{rng.randrange(10**9):09d}",
+                     }
+                 )
+
+                 history_actions.append(next_action)
+                 result = env.step(next_action)
+                 obs = result.observation
+                 if result.done:
+                     break
+
+     return examples
+
+
+ class RemoteSpaceReward:
+     """Reward function that replays each candidate against the remote Space."""
+
+     def __init__(
+         self,
+         *,
+         base_url: str,
+         invalid_action_penalty: float = base.INVALID_ACTION_PENALTY,
+         environment_error_penalty: float = base.ENVIRONMENT_ERROR_PENALTY,
+     ) -> None:
+         self.__name__ = "remote_space_reward"
+         self.base_url = base_url
+         self.invalid_action_penalty = invalid_action_penalty
+         self.environment_error_penalty = environment_error_penalty
+
+     def __call__(
+         self,
+         completions: List[Any],
+         history_actions: Optional[List[str]] = None,
+         **_: Any,
+     ) -> List[float]:
+         history_columns = base.normalise_column(history_actions, len(completions))
+         rewards: List[float] = []
+
+         for completion, current_history in zip(completions, history_columns):
+             action = base.parse_action_completion(base.completion_to_text(completion))
+             if action is None:
+                 rewards.append(self.invalid_action_penalty)
+                 continue
+
+             try:
+                 rewards.append(self._score_remote(action, current_history))
+             except Exception:
+                 rewards.append(self.environment_error_penalty)
+
+         return rewards
+
+     def _score_remote(
+         self,
+         action: base.ExperimentAction,
+         history_actions: Optional[str],
+     ) -> float:
+         with BioExperimentEnv(base_url=self.base_url) as env:
+             result = env.reset()
+             obs = result.observation
+
+             for previous_action in base.decode_history_actions(history_actions):
+                 result = env.step(previous_action)
+                 obs = result.observation
+                 if result.done:
+                     return float(result.reward or obs.reward or 0.0)
+
+             action = base.ensure_conclusion_claims(obs, action)
+             result = env.step(action)
+             if result.reward is not None:
+                 return float(result.reward)
+             return float(result.observation.reward)
+
+
+ def run_dry_run_preview(
+     examples: Sequence[Dict[str, str]],
+     reward_fn: RemoteSpaceReward,
+     output_dir: str,
+     base_url: str,
+ ) -> None:
+     if not examples:
+         raise ValueError("No training prompts were generated for the dry run.")
+
+     sample = examples[0]
+     sample_reward = reward_fn(
+         completions=[[{"role": "assistant", "content": sample["reference_action"]}]],
+         history_actions=[sample["history_actions"]],
+     )[0]
+
+     print(f"Built {len(examples)} remote prompt states.")
+     print(f"Remote OpenEnv Space: {base_url}")
+     print(f"Output directory: {output_dir}")
+     print(f"Sample reward for reference action: {sample_reward:+.3f}")
+     print("\nSample prompt:\n")
+     print(sample["prompt"])
+
+
+ def run_training(args: argparse.Namespace) -> Dict[str, Any]:
+     random.seed(args.seed)
+     runtime = base.resolve_torch_runtime()
+     unsloth_base = require_unsloth_base()
+
+     if args.load_model_only:
+         tokenizer, model = unsloth_base.load_model_artifacts(
+             args.model_id,
+             trust_remote_code=args.trust_remote_code,
+             max_seq_length=args.max_seq_length,
+             load_in_4bit=not args.disable_4bit,
+             fast_inference=False,
+             prepare_for_inference=True,
+         )
+         return {
+             "args": args,
+             "runtime": runtime,
+             "tokenizer": tokenizer,
+             "model": model,
+         }
+
+     examples = build_remote_prompt_examples(args)
+     reward_fn = RemoteSpaceReward(base_url=args.base_url)
+
+     if args.dry_run:
+         run_dry_run_preview(examples, reward_fn, args.output_dir, args.base_url)
+         return {
+             "args": args,
+             "runtime": runtime,
+             "examples": examples,
+             "reward_fn": reward_fn,
+         }
+
+     from datasets import Dataset
+
+     FastLanguageModel = unsloth_base.patch_unsloth_grpo()
+     train_dataset = Dataset.from_list(examples)
+
+     tokenizer, model = unsloth_base.load_model_artifacts(
+         args.model_id,
+         trust_remote_code=args.trust_remote_code,
+         max_seq_length=args.max_seq_length,
+         load_in_4bit=not args.disable_4bit,
+         fast_inference=False,
+     )
+     model = unsloth_base.apply_lora_adapters(FastLanguageModel, model, args)
+
+     print(
+         f"Training runtime: device={runtime['device']} "
+         f"name={runtime['device_name']} "
+         f"dtype={runtime['dtype']} "
+         f"load_in_4bit={not args.disable_4bit}"
+     )
+     print(f"Remote OpenEnv Space: {args.base_url}")
+     print(f"Collected remote prompt states: {len(examples)}")
+
+     trainer = unsloth_base.build_unsloth_grpo_trainer(
+         model=model,
+         tokenizer=tokenizer,
+         reward_func=reward_fn,
+         train_dataset=train_dataset,
+         args=args,
+         runtime=runtime,
+     )
+     for attr in ("image_token_id", "vision_start_token_id", "vision_end_token_id"):
+         if not hasattr(trainer, attr):
+             setattr(trainer, attr, None)
+
+     trainer.train()
+     trainer.save_model(args.output_dir)
+     tokenizer.save_pretrained(args.output_dir)
+
+     plot_paths = base.save_training_plots(
+         trainer.state.log_history,
+         args.output_dir,
+         metric_key=args.plot_metric_key,
+     )
+     print("Saved training plots:")
+     for plot_name, plot_path in plot_paths.items():
+         print(f" - {plot_name}: {plot_path}")
+
+     return {
+         "args": args,
+         "runtime": runtime,
+         "examples": examples,
+         "reward_fn": reward_fn,
+         "train_dataset": train_dataset,
+         "tokenizer": tokenizer,
+         "model": model,
+         "trainer": trainer,
+         "plot_paths": plot_paths,
+     }
+
+
+ def main() -> None:
+     run_training(parse_args())
+
+
+ if __name__ == "__main__":
+     main()
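For a quick smoke test of this entrypoint without launching a full GRPO run, the module's own make_training_args/run_training API can be driven with dry_run=True, which only collects remote prompt states and scores the reference action. A sketch, assuming the repo root is on sys.path and the default Space is reachable:

# Minimal dry-run sketch; episode/step counts are kept tiny on purpose.
from colab_train_llama32_remote import make_training_args, run_training

args = make_training_args(dry_run=True, dataset_episodes=1, rollout_steps=2)
result = run_training(args)  # prints a prompt preview and a sample reward
print(len(result["examples"]), "prompt states collected")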
colab_train_unsloth.ipynb CHANGED
@@ -1,128 +1,347 @@
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Unsloth training on Colab\n",
- "\n",
- "Minimal setup: clone repo, install deps, run GRPO training with Unsloth (Qwen3-4B, 4-bit + LoRA).\n",
- "\n",
- "**Runtime**: Enable a GPU (e.g. T4) in Colab: Runtime → Change runtime type → GPU."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 1. Clone repo (set branch/tag if needed)\n",
- "REPO_URL = \"https://github.com/mhtruong1031/OpenENV-Hackathon.git\" # or your fork\n",
- "REPO_DIR = \"OpenENV-Hackathon\"\n",
- "\n",
- "!git clone --depth 1 {REPO_URL} {REPO_DIR}\n",
- "%cd {REPO_DIR}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 2. Install requirements: project + train extras + Unsloth (no-deps to keep trl>=0.29)\n",
- "!pip install -q -e \".[train]\"\n",
- "!pip install -q unsloth unsloth_zoo --no-deps\n",
- "\n",
- "# Optional: reward backends\n",
- "!pip install -q sentence-transformers gseapy 2>/dev/null || true"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 3. Unsloth must be imported before trl/transformers/peft\n",
- "import unsloth # noqa: F401\n",
- "import torch\n",
- "from pathlib import Path\n",
- "\n",
- "from training_unsloth import make_training_args, run_training\n",
- "\n",
- "print(\"CUDA:\", torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"\")\n",
- "Path(\"artifacts\").mkdir(exist_ok=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 4. Training config (small run for Colab T4)\n",
- "args = make_training_args(\n",
- " model_id=\"Qwen/Qwen3-4B-Base\",\n",
- " output_dir=\"artifacts/grpo-unsloth-qwen3-4b\",\n",
- " dataset_episodes=16,\n",
- " rollout_steps=10,\n",
- " collection_policy=\"heuristic\",\n",
- " reward_backend=\"local\",\n",
- " domain_randomise=True,\n",
- " num_generations=4,\n",
- " max_completion_length=160,\n",
- " max_prompt_length=1280,\n",
- " max_seq_length=2048,\n",
- " per_device_train_batch_size=2,\n",
- " gradient_accumulation_steps=4,\n",
- " learning_rate=5e-6,\n",
- " num_train_epochs=1.0,\n",
- " logging_steps=1,\n",
- " save_steps=25,\n",
- " trust_remote_code=True,\n",
- " dry_run=False,\n",
- " seed=42,\n",
- ")\n",
- "args"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 5. Run training\n",
- "result = run_training(args)\n",
- "print(\"Plots:\", result[\"plot_paths\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# 6. (Optional) Show loss curves\n",
- "from IPython.display import Image, display\n",
- "for name, path in result[\"plot_paths\"].items():\n",
- " display(Image(filename=path))"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "name": "python",
- "version": "3.10.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Minimal Unsloth GRPO on Colab with a remote OpenEnv Space\n",
+ "\n",
+ "This notebook is intentionally similar to the 2048 notebook pattern:\n",
+ "- training runs locally inside Colab\n",
+ "- the environment is accessed remotely through a Hugging Face Space\n",
+ "- the reward function is defined in notebook code by replaying actions against that remote env\n",
+ "- prompt / action / conclusion formatting mirrors the repo logic without importing the repo training script\n",
+ "\n",
+ "Default remote env: `Ev3Dev/hackathon`\n",
+ "\n",
+ "**Runtime**: Enable a GPU in Colab: Runtime -> Change runtime type -> GPU."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 1. Clone the repo for lightweight client / model definitions only\n",
+ "REPO_URL = \"https://github.com/mhtruong1031/OpenENV-Hackathon.git\" # or your fork\n",
+ "REPO_DIR = \"OpenENV-Hackathon\"\n",
+ "\n",
+ "!git clone --depth 1 {REPO_URL} {REPO_DIR}\n",
+ "%cd {REPO_DIR}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 2. Install only the runtime pieces needed for notebook-side training\n",
+ "!pip install -q unsloth unsloth_zoo --no-deps\n",
+ "!pip install -q \"openenv-core[core]>=0.2.0\" \"pydantic>=2\" \"numpy>=1.24.0\" \"scipy>=1.10.0\" \"datasets>=4.6.1\" \"accelerate>=1.13.0\" \"peft>=0.15.0\" \"bitsandbytes>=0.45.0\" \"matplotlib>=3.8.0\"\n",
+ "!pip install -q \"transformers>=4.57.0\" \"trl>=0.29.0\" \"torchvision>=0.20.0\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 3. Import repo reward helpers, but keep the environment remote\n",
+ "import inspect\n",
+ "import json\n",
+ "import random\n",
+ "import sys\n",
+ "from pathlib import Path\n",
+ "from typing import Any, Dict, List\n",
+ "\n",
+ "# Unsloth must be imported before trl / transformers / peft.\n",
+ "import unsloth # noqa: F401\n",
+ "import torch\n",
+ "from unsloth import FastLanguageModel, PatchFastRL\n",
+ "\n",
+ "sys.path.insert(0, str(Path.cwd()))\n",
+ "\n",
+ "from client import BioExperimentEnv\n",
+ "from models import ActionType, ExperimentAction\n",
+ "from training_script import (\n",
+ " INVALID_ACTION_PENALTY,\n",
+ " ENVIRONMENT_ERROR_PENALTY,\n",
+ " OpenEnvReward,\n",
+ " build_training_prompt,\n",
+ " build_experiment_action,\n",
+ " decode_history_actions,\n",
+ " pick_action,\n",
+ " save_training_plots,\n",
+ ")\n",
+ "\n",
+ "MAX_COMPLETION_TOKENS = 160\n",
+ "LORA_TARGET_MODULES = [\n",
+ " \"q_proj\",\n",
+ " \"k_proj\",\n",
+ " \"v_proj\",\n",
+ " \"o_proj\",\n",
+ " \"gate_proj\",\n",
+ " \"up_proj\",\n",
+ " \"down_proj\",\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def hf_space_repo_to_base_url(repo_id: str) -> str:\n",
+ " owner, space_name = repo_id.split(\"/\", 1)\n",
+ " return f\"https://{owner.lower().replace('_', '-')}-{space_name.lower().replace('_', '-')}.hf.space\"\n",
+ "\n",
+ "\n",
+ "def build_remote_prompt_examples(\n",
+ " base_url: str,\n",
+ " dataset_episodes: int,\n",
+ " rollout_steps: int,\n",
+ " seed: int,\n",
+ ") -> List[Dict[str, str]]:\n",
+ " rng = random.Random(seed)\n",
+ " examples: List[Dict[str, str]] = []\n",
+ "\n",
+ " for _ in range(dataset_episodes):\n",
+ " with BioExperimentEnv(base_url=base_url) as env:\n",
+ " result = env.reset()\n",
+ " obs = result.observation\n",
+ " history_actions: List[ExperimentAction] = []\n",
+ "\n",
+ " for step_idx in range(rollout_steps):\n",
+ " if obs.done:\n",
+ " break\n",
+ "\n",
+ " next_action = build_experiment_action(\n",
+ " action_type=pick_action(\n",
+ " \"heuristic\",\n",
+ " step_idx,\n",
+ " [action.action_type for action in history_actions],\n",
+ " ),\n",
+ " discovered_markers=obs.discovered_markers,\n",
+ " candidate_mechanisms=obs.candidate_mechanisms,\n",
+ " conditions=obs.task.conditions,\n",
+ " )\n",
+ " examples.append(\n",
+ " {\n",
+ " \"prompt\": build_training_prompt(obs),\n",
+ " \"history_actions\": json.dumps(\n",
+ " [action.model_dump() for action in history_actions]\n",
+ " ),\n",
+ " \"reference_action\": json.dumps(next_action.model_dump()),\n",
+ " \"problem_statement\": obs.task.problem_statement,\n",
+ " \"episode_tag\": f\"remote-{rng.randrange(10**9):09d}\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " history_actions.append(next_action)\n",
+ " result = env.step(next_action)\n",
+ " obs = result.observation\n",
+ " if result.done:\n",
+ " break\n",
+ "\n",
+ " return examples\n",
+ "\n",
+ "\n",
+ "def build_grpo_config(**overrides: Any):\n",
+ " from trl import GRPOConfig\n",
+ "\n",
+ " supported = set(inspect.signature(GRPOConfig.__init__).parameters)\n",
+ " config_kwargs = {\n",
+ " \"output_dir\": overrides[\"output_dir\"],\n",
+ " \"learning_rate\": overrides[\"learning_rate\"],\n",
+ " \"per_device_train_batch_size\": overrides[\"per_device_train_batch_size\"],\n",
+ " \"gradient_accumulation_steps\": overrides[\"gradient_accumulation_steps\"],\n",
+ " \"num_generations\": overrides[\"num_generations\"],\n",
+ " \"max_completion_length\": overrides[\"max_completion_length\"],\n",
+ " \"num_train_epochs\": overrides[\"num_train_epochs\"],\n",
+ " \"logging_steps\": overrides[\"logging_steps\"],\n",
+ " \"save_steps\": overrides[\"save_steps\"],\n",
+ " \"bf16\": overrides[\"bf16\"],\n",
+ " \"fp16\": overrides[\"fp16\"],\n",
+ " \"report_to\": \"none\",\n",
+ " \"remove_unused_columns\": False,\n",
+ " }\n",
+ " # Keep prompt truncation enabled. Leaving this as None can trigger\n",
+ " # an Unsloth rotary-cache shape mismatch on long GRPO prompts.\n",
+ " if \"max_prompt_length\" in supported:\n",
+ " config_kwargs[\"max_prompt_length\"] = overrides[\"max_prompt_length\"]\n",
+ " if (\n",
+ " \"max_length\" in supported\n",
+ " and \"max_prompt_length\" not in supported\n",
+ " and \"max_completion_length\" not in supported\n",
+ " ):\n",
+ " config_kwargs[\"max_length\"] = (\n",
+ " overrides[\"max_prompt_length\"] + overrides[\"max_completion_length\"]\n",
+ " )\n",
+ " return GRPOConfig(**{k: v for k, v in config_kwargs.items() if k in supported})\n",
+ "\n",
+ "\n",
+ "print(\"CUDA:\", torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"\")\n",
+ "Path(\"artifacts\").mkdir(exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 4. Config + collect prompt states from the remote Space\n",
+ "SPACE_REPO_ID = \"Ev3Dev/hackathon\"\n",
+ "SPACE_BASE_URL = hf_space_repo_to_base_url(SPACE_REPO_ID)\n",
+ "# If your Space has a custom domain, replace SPACE_BASE_URL manually.\n",
+ "\n",
+ "MODEL_ID = \"unsloth/Llama-3.2-3B-Instruct-bnb-4bit\"\n",
+ "OUTPUT_DIR = \"artifacts/grpo-unsloth-llama32-3b-remote-space\"\n",
+ "\n",
+ "DATASET_EPISODES = 8\n",
+ "ROLLOUT_STEPS = 6\n",
+ "NUM_GENERATIONS = 2\n",
+ "# Keep this modest for Unsloth GRPO stability on Colab.\n",
+ "MAX_PROMPT_LENGTH = 768\n",
+ "MAX_SEQ_LENGTH = 2048\n",
+ "PER_DEVICE_TRAIN_BATCH_SIZE = 1\n",
+ "GRADIENT_ACCUMULATION_STEPS = 4\n",
+ "LEARNING_RATE = 5e-6\n",
+ "NUM_TRAIN_EPOCHS = 1.0\n",
+ "LOGGING_STEPS = 1\n",
+ "SAVE_STEPS = 25\n",
+ "SEED = 42\n",
+ "LORA_R = 16\n",
+ "LORA_ALPHA = 16\n",
+ "LORA_DROPOUT = 0.0\n",
+ "\n",
+ "examples = build_remote_prompt_examples(\n",
+ " base_url=SPACE_BASE_URL,\n",
+ " dataset_episodes=DATASET_EPISODES,\n",
+ " rollout_steps=ROLLOUT_STEPS,\n",
+ " seed=SEED,\n",
+ ")\n",
+ "\n",
+ "reward_fn = OpenEnvReward(\n",
+ " reward_backend=\"remote\",\n",
+ " base_url=SPACE_BASE_URL,\n",
+ " invalid_action_penalty=INVALID_ACTION_PENALTY,\n",
+ " environment_error_penalty=ENVIRONMENT_ERROR_PENALTY,\n",
+ ")\n",
+ "\n",
+ "print(\"Remote env:\", SPACE_BASE_URL)\n",
+ "print(\"Prompt states:\", len(examples))\n",
+ "print(\"Sample prompt preview:\\n\")\n",
+ "print(examples[0][\"prompt\"][:2000])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 5. Local GRPO training in Colab, remote env for rewards\n",
+ "from datasets import Dataset\n",
+ "from trl import GRPOTrainer\n",
+ "\n",
+ "PatchFastRL(\"GRPO\", FastLanguageModel)\n",
+ "train_dataset = Dataset.from_list(examples)\n",
+ "\n",
+ "bf16 = bool(getattr(torch.cuda, \"is_bf16_supported\", lambda: False)()) if torch.cuda.is_available() else False\n",
+ "runtime_dtype = torch.bfloat16 if bf16 else (torch.float16 if torch.cuda.is_available() else torch.float32)\n",
+ "\n",
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+ " model_name=MODEL_ID,\n",
+ " max_seq_length=MAX_SEQ_LENGTH,\n",
+ " dtype=runtime_dtype,\n",
+ " load_in_4bit=True,\n",
+ ")\n",
+ "if tokenizer.pad_token is None and tokenizer.eos_token is not None:\n",
+ " tokenizer.pad_token = tokenizer.eos_token\n",
+ "\n",
+ "model = FastLanguageModel.get_peft_model(\n",
+ " model,\n",
+ " r=LORA_R,\n",
+ " target_modules=LORA_TARGET_MODULES,\n",
+ " lora_alpha=LORA_ALPHA,\n",
+ " lora_dropout=LORA_DROPOUT,\n",
+ " bias=\"none\",\n",
+ " use_gradient_checkpointing=True,\n",
+ " random_state=SEED,\n",
+ ")\n",
+ "\n",
+ "training_args = build_grpo_config(\n",
+ " output_dir=OUTPUT_DIR,\n",
+ " learning_rate=LEARNING_RATE,\n",
+ " per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,\n",
+ " gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n",
+ " num_generations=NUM_GENERATIONS,\n",
+ " max_completion_length=MAX_COMPLETION_TOKENS,\n",
+ " max_prompt_length=MAX_PROMPT_LENGTH,\n",
+ " num_train_epochs=NUM_TRAIN_EPOCHS,\n",
+ " logging_steps=LOGGING_STEPS,\n",
+ " save_steps=SAVE_STEPS,\n",
+ " bf16=bf16,\n",
+ " fp16=torch.cuda.is_available() and not bf16,\n",
+ ")\n",
+ "\n",
+ "trainer = GRPOTrainer(\n",
+ " model=model,\n",
+ " reward_funcs=[reward_fn],\n",
+ " args=training_args,\n",
+ " train_dataset=train_dataset,\n",
+ " processing_class=tokenizer,\n",
+ ")\n",
+ "\n",
+ "for attr in (\"image_token_id\", \"vision_start_token_id\", \"vision_end_token_id\"):\n",
+ " if not hasattr(trainer, attr):\n",
+ " setattr(trainer, attr, None)\n",
+ "\n",
+ "trainer.train()\n",
+ "trainer.save_model(OUTPUT_DIR)\n",
+ "tokenizer.save_pretrained(OUTPUT_DIR)\n",
+ "plot_paths = save_training_plots(trainer.state.log_history, OUTPUT_DIR)\n",
+ "\n",
+ "result = {\n",
+ " \"trainer\": trainer,\n",
+ " \"plot_paths\": plot_paths,\n",
+ " \"output_dir\": OUTPUT_DIR,\n",
+ "}\n",
+ "print(\"Saved to:\", OUTPUT_DIR)\n",
+ "print(\"Plots:\", plot_paths)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 6. (Optional) Show curves and sanity-check the repo reward wrapper\n",
+ "from IPython.display import Image, display\n",
+ "\n",
+ "sample_reward = reward_fn(\n",
+ " completions=[[{\"role\": \"assistant\", \"content\": examples[0][\"reference_action\"]}]],\n",
+ " history_actions=[examples[0][\"history_actions\"]],\n",
+ ")[0]\n",
+ "print(\"Sample reward for reference action:\", sample_reward)\n",
+ "\n",
+ "for name, path in (result.get(\"plot_paths\") or {}).items():\n",
+ " print(name, path)\n",
+ " display(Image(filename=path))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }
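The replay-based reward the notebook describes reduces to: reset the Space-hosted environment, re-apply the episode history, then step the candidate action and read its reward. A minimal standalone sketch of that loop, assuming the BioExperimentEnv and ExperimentAction interfaces shown in the diffs above:

# Sketch of the notebook's remote-reward pattern (assumed interfaces).
from typing import List

from client import BioExperimentEnv
from models import ExperimentAction


def score_candidate(base_url: str, history: List[ExperimentAction],
                    candidate: ExperimentAction) -> float:
    with BioExperimentEnv(base_url=base_url) as env:
        result = env.reset()
        for previous in history:  # replay the episode so far
            result = env.step(previous)
            if result.done:
                return float(result.reward or 0.0)
        result = env.step(candidate)  # score only the new action
        if result.reward is not None:
            return float(result.reward)
        return float(result.observation.reward)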
pyproject.toml CHANGED
@@ -40,8 +40,6 @@ train = [
  "torch>=2.10.0",
  "torchvision>=0.20.0", # required by transformers for Qwen3.5 (image_utils)
  "transformers>=5.3.0", # 5.3+ required for Qwen3.5 (qwen3_5 model type)
- "llm-blender>=0.0.2", # required by trl GRPOTrainer judges
- "mergekit>=0.1.0", # required by trl GRPOTrainer/callbacks
  "trl>=0.29.0", # GRPOTrainer; 0.29+ compatible with transformers 5.3
  ]
tests/test_colab_train_llama32_remote.py ADDED
@@ -0,0 +1,17 @@
+ from colab_train_llama32_remote import (
+     hf_space_repo_to_base_url,
+     make_training_args,
+ )
+
+
+ def test_hf_space_repo_to_base_url_formats_standard_hf_space_domain():
+     assert (
+         hf_space_repo_to_base_url("Ev3Dev/hackathon")
+         == "https://ev3dev-hackathon.hf.space"
+     )
+
+
+ def test_make_training_args_derives_base_url_when_missing():
+     args = make_training_args(space_repo_id="Ev3Dev/hackathon", base_url="")
+     assert args.base_url == "https://ev3dev-hackathon.hf.space"
+     assert args.model_id == "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
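The new tests are ordinary pytest functions. One way to run just this module, assuming pytest is installed and the repo root is the working directory:

# In-process equivalent of `pytest -q tests/test_colab_train_llama32_remote.py`.
import pytest

exit_code = pytest.main(["-q", "tests/test_colab_train_llama32_remote.py"])
print("pytest exit code:", exit_code)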
unsloth_2048.ipynb ADDED
The diff for this file is too large to render.
 
unsloth_compiled_cache/.locks/.lock.UnslothDPOTrainer.py ADDED
File without changes
unsloth_compiled_cache/.locks/.lock.UnslothGRPOTrainer.py ADDED
File without changes
unsloth_compiled_cache/.locks/.lock.UnslothNashMDTrainer.py ADDED
File without changes
unsloth_compiled_cache/.locks/.lock.UnslothOnlineDPOTrainer.py ADDED
File without changes
unsloth_compiled_cache/.locks/.lock.UnslothRLOOTrainer.py ADDED
File without changes
unsloth_compiled_cache/.locks/.lock.UnslothXPOTrainer.py ADDED
File without changes
unsloth_compiled_cache/UnslothCPOTrainer.py CHANGED
@@ -28,7 +28,7 @@ import torch.nn as nn
  from torch.nn import functional as F
  from unsloth_zoo.temporary_patches.common import torch_compile
  from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
- from trl.trainer.cpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, BaseTrainer, CPOConfig, CPOTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, warnings, AutoModelForCausalLM, BaseImageProcessor, CPOConfig, CPOTrainer, Callable, DPODataCollatorWithPadding, DataCollator, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Optional, PartialState, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_wandb_available, logger, maybe_apply_chat_template, maybe_extract_prompt, nn, np, os, peft_module_casting_to_bf16, prepare_model_for_kbit_training, torch, warnings, F, PeftModel, PreTrainedModel, is_peft_available, logger, os, torch)
+ from trl.trainer.cpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, BaseTrainer, CPOConfig, CPOTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, wandb, warnings, AutoModelForCausalLM, BaseImageProcessor, CPOConfig, CPOTrainer, Callable, DPODataCollatorWithPadding, DataCollator, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Optional, PartialState, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_wandb_available, logger, maybe_apply_chat_template, maybe_extract_prompt, nn, np, os, peft_module_casting_to_bf16, prepare_model_for_kbit_training, torch, wandb, warnings, F, PeftModel, PreTrainedModel, is_peft_available, logger, os, torch)


  import os
unsloth_compiled_cache/UnslothDPOTrainer.py ADDED
The diff for this file is too large to render.
 
unsloth_compiled_cache/UnslothGRPOTrainer.py ADDED
The diff for this file is too large to render.
 
unsloth_compiled_cache/UnslothKTOTrainer.py CHANGED
@@ -28,7 +28,7 @@ import torch.nn as nn
  from torch.nn import functional as F
  from unsloth_zoo.temporary_patches.common import torch_compile
  from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
- from trl.trainer.kto_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, BaseTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, KTOConfig, KTOTrainer, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SequentialSampler, TrainerCallback, TrainingArguments, Union, _get_kl_dataset, _process_tokens, _tokenize, autocast, concatenate_datasets, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, has_length, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, itemgetter, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, tqdm, warnings, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, KTOConfig, KTOTrainer, Optional, PartialState, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, TrainingArguments, Union, autocast, concatenate_datasets, create_reference_model, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, logger, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, os, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, torch, warnings, F, PeftModel, PreTrainedModel, is_peft_available, logger, os, torch, F, nn, np, os, selective_log_softmax, torch)
+ from trl.trainer.kto_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, BaseTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, KTOConfig, KTOTrainer, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SequentialSampler, TrainerCallback, TrainingArguments, Union, _get_kl_dataset, _process_tokens, _tokenize, autocast, concatenate_datasets, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, has_length, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, itemgetter, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, tqdm, wandb, warnings, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, KTOConfig, KTOTrainer, Optional, PartialState, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, TrainingArguments, Union, autocast, concatenate_datasets, create_reference_model, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, logger, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, os, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, torch, wandb, warnings, F, PeftModel, PreTrainedModel, is_peft_available, logger, os, torch, F, nn, np, os, selective_log_softmax, torch)


  import os
unsloth_compiled_cache/UnslothNashMDTrainer.py ADDED
@@ -0,0 +1,1340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 2026.3.2
3
+ 2026.3.4
4
+ 5.3.0
5
+ 0.24.0
6
+ __UNSLOTH_VERSIONING__
7
+ """
8
+
9
+ # Unsloth auto generated code
10
+ # Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
11
+ #
12
+ # This program is free software: you can redistribute it and/or modify
13
+ # it under the terms of the GNU Lesser General Public License as published by
14
+ # the Free Software Foundation, either version 3 of the License, or
15
+ # (at your option) any later version.
16
+ #
17
+ # This program is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
+ # GNU General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public License
23
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
24
+
25
+ from torch import Tensor
26
+ import torch
27
+ import torch.nn as nn
28
+ from torch.nn import functional as F
29
+ from unsloth_zoo.temporary_patches.common import torch_compile
30
+ from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
31
+ from trl.trainer.nash_md_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, Dataset, EvalPrediction, F, FeatureExtractionMixin, GeometricMixtureWrapper, IterableDataset, NashMDConfig, NashMDTrainer, OnlineDPOTrainer, OptimizerNames, Optional, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, TrainerCallback, Union, empty_cache, get_reward, is_conversational, is_peft_available, jinja2, maybe_apply_chat_template, nn, selective_log_softmax, textwrap, torch, truncate_right, unwrap_model_for_generation)
32
+
33
+
34
+ import os
35
+ import math
36
+ import logging
37
+ from typing import *
38
+ from dataclasses import dataclass, field
39
+ from packaging.version import Version
40
+ import torch
41
+ import numpy as np
42
+ from contextlib import nullcontext
43
+ from torch.nn import functional as F
44
+ import inspect
45
+ from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling
46
+ from transformers.training_args import ParallelMode
47
+ from unsloth_zoo.device_type import DEVICE_TYPE, device_synchronize
48
+
49
+ # Wrap the trainer so padding is on the right and the model is in training mode
50
+ # Also patches W&B, since consecutive runs must call wandb.finish()
51
+ import functools
52
+ from types import MethodType
53
+ try:
54
+ from unsloth_zoo.gradient_checkpointing import reset_unsloth_gradient_checkpointing_buffers
55
+ except:
56
+ def reset_unsloth_gradient_checkpointing_buffers(): pass
57
+ def prepare_for_training_mode(f):
58
+ @functools.wraps(f)
59
+ def wrapper(self, *args, **kwargs):
60
+ # Enable training mode
61
+ _was_training = None
62
+ # Get gradient checkpointing setting from training arguments
63
+ use_gc = getattr(self.args, 'gradient_checkpointing', True)
64
+ if hasattr(self, 'model') and hasattr(self.model, "training"):
65
+ _was_training = self.model.training
66
+ if hasattr(self, 'model') and hasattr(self.model, "for_training"):
67
+ self.model.for_training(use_gradient_checkpointing=use_gc)
68
+ output = f(self, *args, **kwargs)
69
+ # Restore previous mode when possible
70
+ if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
71
+ if _was_training is False:
72
+ self.model.for_inference()
73
+ elif _was_training is True and hasattr(self.model, "for_training"):
74
+ self.model.for_training(use_gradient_checkpointing=use_gc)
75
+ # Reset gradient checkpointing buffers to free memory while staying ready for next run
76
+ try:
77
+ reset_unsloth_gradient_checkpointing_buffers()
78
+ except:
79
+ pass
80
+ # Patch W&B to enable logging on future runs, otherwise it'll overwrite the first run
81
+ try:
82
+ import wandb
83
+ wandb.finish()
84
+ except:
85
+ pass
86
+ return output
87
+ return wrapper
88
+ pass
89
+
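# Note: this decorator is applied further down via MethodType, i.e.
#     self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)
# so each trainer.train() call enables training mode, restores the previous mode
# afterwards, frees gradient-checkpointing buffers, and closes the W&B run.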
90
+ torch_compile_options = {
91
+ "epilogue_fusion" : True,
92
+ "max_autotune" : False,
93
+ "shape_padding" : True,
94
+ "trace.enabled" : False,
95
+ "triton.cudagraphs" : False,
96
+ }
97
+
98
+ @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
99
+ def chunked_hidden_states_selective_log_softmax(
100
+ hidden_states: torch.Tensor,
101
+ lm_head: torch.Tensor,
102
+ index: torch.Tensor,
103
+ chunks: int = 4,
104
+ logit_scale_multiply: float = 0.0,
105
+ logit_scale_divide: float = 0.0,
106
+ logit_softcapping: float = 0.0,
107
+ temperature: float = 1.0,
108
+ ) -> torch.Tensor:
109
+ # All Unsloth Zoo code licensed under AGPL3
110
+ flat_hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
111
+ flat_index = index.reshape(-1)
112
+
113
+ chunked_hidden_states = torch.chunk(flat_hidden_states, chunks=chunks, dim=0)
114
+ chunked_index = torch.chunk(flat_index, chunks=chunks, dim=0)
115
+
116
+ all_per_token_logps = []
117
+
118
+ for chunk_hidden_states, chunk_index in zip(chunked_hidden_states, chunked_index):
119
+ chunk_logits = chunk_hidden_states.to(lm_head.dtype) @ lm_head.t()
120
+
121
+ if logit_scale_multiply != 0.0:
122
+ chunk_logits = chunk_logits * logit_scale_multiply
123
+ if logit_scale_divide != 0.0:
124
+ chunk_logits = chunk_logits / logit_scale_divide
125
+ if logit_softcapping != 0.0:
126
+ chunk_logits = chunk_logits * torch.tanh(chunk_logits / logit_softcapping)
127
+
128
+ chunk_logits = chunk_logits.to(torch.float32)
129
+
130
+ if temperature != 1.0:
131
+ chunk_logits = chunk_logits / temperature
132
+
133
+ selected_logits = torch.gather(chunk_logits, dim=-1, index=chunk_index.unsqueeze(-1)).squeeze(-1)
134
+ logsumexp_values = torch.logsumexp(chunk_logits, dim=-1)
135
+ per_token_logps = selected_logits - logsumexp_values
136
+ all_per_token_logps.append(per_token_logps)
137
+
138
+ all_per_token_logps = torch.concat(all_per_token_logps)
139
+
140
+ all_per_token_logps = all_per_token_logps.reshape((hidden_states.shape[0], hidden_states.shape[1]))
141
+ return all_per_token_logps
142
+
143
+ @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
144
+ def chunked_selective_log_softmax(logits, index):
145
+ # Split into 4 chunks only
146
+ chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0)
147
+ chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0)
148
+ all_per_token_logps = []
149
+ # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index)
150
+ for chunk_logits, chunk_index in zip(chunked_logits, chunked_index):
151
+ chunk_logits = chunk_logits.to(torch.float32)
152
+ selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1)
153
+ logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
154
+ per_token_logps = selected_logits - logsumexp_values
155
+ all_per_token_logps.append(per_token_logps)
156
+ pass
157
+ all_per_token_logps = torch.concat(all_per_token_logps)
158
+ all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
159
+ return all_per_token_logps
160
+
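# A minimal numerical check of the chunked helpers above (illustration only;
# shapes are made up). Chunking over the flattened token dimension must match
# plain log_softmax + gather, since each row's softmax is independent; the
# point of chunking is to avoid materializing the full float32 logits at once.
#
#     import torch
#     logits = torch.randn(2, 8, 16)                 # [batch, seq, vocab]
#     index  = torch.randint(0, 16, (2, 8))          # target token ids
#     ref = torch.log_softmax(logits.float(), dim=-1)
#     ref = ref.gather(-1, index.unsqueeze(-1)).squeeze(-1)
#     assert torch.allclose(ref, chunked_selective_log_softmax(logits, index), atol=1e-5)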
161
+ def calculate_pad_tokens_in_prompt(
162
+ input_ids: torch.Tensor,
163
+ logits_to_keep: int,
164
+ pad_token_id: int
165
+ ) -> torch.Tensor:
166
+ """
167
+ Given a prompt tensor, returns the number of left-padding tokens in each sequence, e.g. [pad, pad, pad, cat] -> 3 tokens.
168
+ """
169
+ if logits_to_keep >= input_ids.shape[1]:
170
+ raise ValueError("logits_to_keep must be smaller than the sequence length.")
171
+
172
+ prompt_section = input_ids[:, :-logits_to_keep]
173
+
174
+ padding_mask = (prompt_section == pad_token_id)
175
+
176
+ pad_token_counts = padding_mask.sum(dim=1)
177
+
178
+ return pad_token_counts
179
+
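# Example (hypothetical ids, pad_token_id = 0): with logits_to_keep = 1 the
# prompt section is everything but the last column, so the left pads are counted.
#
#     import torch
#     ids = torch.tensor([[0, 0, 0, 7, 9],
#                         [0, 7, 8, 9, 9]])
#     calculate_pad_tokens_in_prompt(ids, logits_to_keep=1, pad_token_id=0)
#     # -> tensor([3, 1])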
180
+ def create_completion_attention_mask(
181
+ completion_input_ids: torch.Tensor,
182
+ left_pad_tokens_per_prompt: torch.Tensor,
183
+ max_left_pad: int,
184
+ pad_token_id: int
185
+ ) -> torch.Tensor:
186
+ """
187
+ Given a sequence [p, p, p, c, c, c, pad, pad, pad],
188
+
189
+ where p are extra prompt tokens picked up when slicing the tensor, c are completion
190
+ tokens, and pad are padding tokens, this function builds a completion mask that zeros
191
+ out the p and pad tokens: in this example, [0, 0, 0, 1, 1, 1, 0, 0, 0].
192
+ """
193
+ batch_size, completion_len = completion_input_ids.shape
194
+ device = completion_input_ids.device
195
+
196
+ num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
197
+
198
+ indices = torch.arange(completion_len, device=device).unsqueeze(0)
199
+ shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
200
+
201
+ non_padding_mask = (completion_input_ids != pad_token_id)
202
+
203
+ final_mask = shift_mask & non_padding_mask
204
+
205
+ return final_mask
206
+
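# Example (hypothetical tensors, pad_token_id = 0): this row picked up two extra
# prompt tokens (max_left_pad = 2, no left pads of its own), then has three
# completion tokens and two pads.
#
#     import torch
#     comp = torch.tensor([[5, 6, 7, 8, 9, 0, 0]])
#     create_completion_attention_mask(comp, torch.tensor([0]), max_left_pad=2, pad_token_id=0)
#     # -> tensor([[False, False,  True,  True,  True, False, False]])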
207
+ def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
208
+ """
209
+ Moves all padding tokens in each sequence of a batch to the right.
210
+ """
211
+ mask = (tensor != pad_id)
212
+ # Must use stable=True since a binary mask gives no ordering among equal values
213
+ sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
214
+ packed_tensor = torch.gather(tensor, 1, sorted_indices)
215
+ return packed_tensor
216
+
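# Example (hypothetical tensor, pad_id = 0): non-pad tokens keep their relative
# order (hence stable=True) and all pads end up on the right.
#
#     import torch
#     x = torch.tensor([[0, 0, 4, 5],
#                       [0, 7, 0, 8]])
#     left_pack_padding(x, pad_id=0)
#     # -> tensor([[4, 5, 0, 0],
#     #            [7, 8, 0, 0]])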
217
+ def align_logprobs_with_mask(
218
+ logprob_tensor: torch.Tensor,
219
+ attention_mask: torch.Tensor,
220
+ pad_value: float = 0.0
221
+ ) -> torch.Tensor:
222
+ """
223
+ Aligns a log probability tensor with a given attention mask.
224
+ """
225
+
226
+ device = logprob_tensor.device
227
+ batch_size, logprob_seq_len = logprob_tensor.shape
228
+ mask_seq_len = attention_mask.shape[1]
229
+
230
+ padded_logprobs = torch.full(
231
+ attention_mask.shape,
232
+ fill_value=pad_value,
233
+ dtype=logprob_tensor.dtype,
234
+ device=device
235
+ )
236
+
237
+ left_pad_counts = torch.argmax(attention_mask, dim=1)
238
+
239
+ cols = torch.arange(logprob_seq_len, device=device)
240
+ dest_indices = left_pad_counts.unsqueeze(1) + cols
241
+
242
+ # Create destination row indices
243
+ # Shape: [batch_size, logprob_seq_len]
244
+ row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
245
+
246
+ # --- 4. Filter out-of-bounds indices and perform assignment ---
247
+ # Create a mask to identify only the indices that are within the bounds
248
+ # of the target tensor's sequence length.
249
+ valid_mask = dest_indices < mask_seq_len
250
+
251
+ # Use this mask to select only the valid row indices, column indices,
252
+ # and the corresponding values from the logprob tensor.
253
+ # This flattens the selected elements into 1D tensors.
254
+ valid_rows = row_indices[valid_mask]
255
+ valid_cols = dest_indices[valid_mask]
256
+ valid_vals = logprob_tensor[valid_mask]
257
+
258
+ # Place the valid values into their correct positions in the padded tensor
259
+ # using a single, efficient advanced indexing operation.
260
+ padded_logprobs[valid_rows, valid_cols] = valid_vals
261
+
262
+ return padded_logprobs
263
+
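# Example (hypothetical tensors): the mask starts with two left pads, so both
# logprobs shift right by two positions and pad slots keep pad_value = 0.0.
#
#     import torch
#     lp   = torch.tensor([[-1.0, -2.0]])
#     mask = torch.tensor([[0, 0, 1, 1]])
#     align_logprobs_with_mask(lp, mask)
#     # -> tensor([[ 0.,  0., -1., -2.]])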
264
+ def autotune_batch_and_chunks(
265
+ total_input_rows,
266
+ seq_len,
267
+ hidden_size,
268
+ vocab_size,
269
+ dtype_bytes=16,
270
+ multiplier=None
271
+ ):
272
+ if multiplier is None:
273
+ final_m = max(4, seq_len // 4096)
274
+ else:
275
+ final_m = multiplier
276
+
277
+ if torch.cuda.is_available():
278
+ free_bytes, _ = torch.cuda.mem_get_info()
279
+ limit_gb = (free_bytes / (1024**3))*.80
280
+ elif hasattr(torch, "xpu") and torch.xpu.is_available():
281
+ # For XPU: estimate free memory from total - reserved
282
+ total_mem = torch.xpu.get_device_properties(0).total_memory
283
+ reserved_mem = torch.xpu.memory_reserved()
284
+ free_bytes = total_mem - reserved_mem
285
+ limit_gb = (free_bytes / (1024**3)) * 0.80
286
+ else:
287
+ # Fallback: assume 8GB available
288
+ limit_gb = 8.0
289
+
290
+ bytes_to_gb = 1024**3
291
+
292
+ b_vals = torch.arange(total_input_rows, 0, -1, device='cpu', dtype=torch.float32)
293
+
294
+ hidden_gb = (b_vals * seq_len * hidden_size * dtype_bytes) / bytes_to_gb
295
+
296
+ base_logits = ((b_vals/total_input_rows) * b_vals * seq_len * vocab_size * dtype_bytes) / bytes_to_gb
297
+ logits_gb = base_logits / final_m
298
+
299
+ total_mem_gb = hidden_gb + logits_gb
300
+
301
+ valid_mask = total_mem_gb <= limit_gb
302
+ valid_indices = torch.nonzero(valid_mask, as_tuple=False)
303
+
304
+ if valid_indices.shape[0] == 0:
305
+ # No batch size fits in the memory limit, so the GPU will likely OOM; fall back to a small batch
306
+ return 4, final_m
307
+
308
+ best_idx = valid_indices[0].item()
309
+ final_b = int(b_vals[best_idx].item())
310
+
311
+ return final_b, final_m
312
+
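# Sketch of the heuristic above (numbers are hypothetical): it scans batch sizes
# b from total_input_rows down to 1 and keeps the largest b whose estimated
# footprint (hidden states plus chunk-scaled logits) fits in roughly 80% of the
# free accelerator memory; for seq_len = 4096 the multiplier m defaults to 4.
#
#     b, m = autotune_batch_and_chunks(total_input_rows=64, seq_len=4096,
#                                      hidden_size=4096, vocab_size=128_000)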
313
+ def sanitize_logprob(logprob):
314
+ """Local port of trl.scripts.vllm_serve.sanitize_logprob.
315
+ Filters NaN logprobs from vLLM outputs."""
316
+ value = logprob.logprob
317
+ if math.isnan(value):
318
+ logging.getLogger(__name__).warning(
319
+ f"Generated NaN logprob, token logprob '{logprob}' will be ignored"
320
+ )
321
+ return None
322
+ return value
323
+ @dataclass
324
+ class UnslothNashMDConfig(NashMDConfig):
325
+ """
326
+
327
+ Configuration class for the [`NashMDTrainer`].
328
+
329
+ Subclass of [`OnlineDPOConfig`]; we can use all its arguments and add the following:
330
+
331
+ Parameters:
332
+ mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`):
333
+ Logit mixture coefficient for the model and reference model. If a list of floats is provided then the
334
+ mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the
335
+ epochs.
336
+
337
+ """
338
+ vllm_sampling_params: Optional[Any] = field(
339
+ default = None,
340
+ metadata = {'help': 'vLLM SamplingParams'},
341
+ )
342
+ unsloth_num_chunks : Optional[int] = field(
343
+ default = -1,
344
+ metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'},
345
+ )
346
+ unsloth_logit_chunk_multiplier : Optional[int] = field(
347
+ default = None,
348
+ metadata = {'help': 'Multiplier for chunked logit computations.'},
349
+ )
350
+ unsloth_grpo_mini_batch : Optional[int] = field(
351
+ default = None,
352
+ metadata = {'help': 'Mini batch size for GRPO hidden state accumulation. Default is None unless user defines it.'},
353
+ )
354
+ max_seq_length : Optional[int] = field(
355
+ default = None,
356
+ metadata = {'help': 'Maximum sequence length to truncate to.'},
357
+ )
358
+ def __init__(
359
+ self,
360
+ output_dir = None,
361
+ per_device_train_batch_size = 4,
362
+ num_train_epochs = 3.0,
363
+ max_steps = -1,
364
+ learning_rate = 5e-05,
365
+ lr_scheduler_type = 'linear',
366
+ lr_scheduler_kwargs = None,
367
+ warmup_steps = 0.1,
368
+ optim = 'adamw_8bit',
369
+ optim_args = None,
370
+ weight_decay = 0.01,
371
+ adam_beta1 = 0.9,
372
+ adam_beta2 = 0.999,
373
+ adam_epsilon = 1e-08,
374
+ optim_target_modules = None,
375
+ gradient_accumulation_steps = 2,
376
+ average_tokens_across_devices = True,
377
+ max_grad_norm = 1.0,
378
+ label_smoothing_factor = 0.0,
379
+ bf16 = False,
380
+ fp16 = False,
381
+ bf16_full_eval = False,
382
+ fp16_full_eval = False,
383
+ tf32 = None,
384
+ gradient_checkpointing = True,
385
+ gradient_checkpointing_kwargs = None,
386
+ torch_compile = False,
387
+ torch_compile_backend = None,
388
+ torch_compile_mode = None,
389
+ use_liger_kernel = False,
390
+ liger_kernel_config = None,
391
+ use_cache = False,
392
+ neftune_noise_alpha = None,
393
+ torch_empty_cache_steps = 250,
394
+ auto_find_batch_size = False,
395
+ logging_strategy = 'steps',
396
+ logging_steps = 1,
397
+ logging_first_step = False,
398
+ log_on_each_node = True,
399
+ logging_nan_inf_filter = False,
400
+ include_num_input_tokens_seen = False,
401
+ log_level = 'passive',
402
+ log_level_replica = 'warning',
403
+ disable_tqdm = None,
404
+ report_to = 'none',
405
+ run_name = None,
406
+ project = 'huggingface',
407
+ trackio_space_id = 'trackio',
408
+ eval_strategy = 'no',
409
+ eval_steps = None,
410
+ eval_delay = 0,
411
+ per_device_eval_batch_size = 4,
412
+ prediction_loss_only = False,
413
+ eval_on_start = False,
414
+ eval_do_concat_batches = True,
415
+ eval_use_gather_object = False,
416
+ eval_accumulation_steps = 2,
417
+ batch_eval_metrics = False,
418
+ save_only_model = False,
419
+ save_strategy = 'steps',
420
+ save_steps = 500,
421
+ save_on_each_node = False,
422
+ save_total_limit = None,
423
+ enable_jit_checkpoint = False,
424
+ push_to_hub = False,
425
+ hub_token = None,
426
+ hub_private_repo = None,
427
+ hub_model_id = None,
428
+ hub_strategy = 'every_save',
429
+ hub_always_push = False,
430
+ hub_revision = None,
431
+ load_best_model_at_end = False,
432
+ metric_for_best_model = None,
433
+ greater_is_better = None,
434
+ ignore_data_skip = False,
435
+ restore_callback_states_from_checkpoint = False,
436
+ full_determinism = False,
437
+ seed = 3407,
438
+ data_seed = 3407,
439
+ use_cpu = False,
440
+ accelerator_config = None,
441
+ parallelism_config = None,
442
+ dataloader_drop_last = False,
443
+ dataloader_num_workers = 0,
444
+ dataloader_pin_memory = True,
445
+ dataloader_persistent_workers = False,
446
+ dataloader_prefetch_factor = None,
447
+ remove_unused_columns = True,
448
+ label_names = None,
449
+ train_sampling_strategy = 'random',
450
+ length_column_name = 'length',
451
+ ddp_find_unused_parameters = None,
452
+ ddp_bucket_cap_mb = None,
453
+ ddp_broadcast_buffers = None,
454
+ ddp_backend = None,
455
+ ddp_timeout = 1800,
456
+ fsdp = None,
457
+ fsdp_config = None,
458
+ deepspeed = None,
459
+ debug = '',
460
+ skip_memory_metrics = True,
461
+ do_train = False,
462
+ do_eval = False,
463
+ do_predict = False,
464
+ resume_from_checkpoint = None,
465
+ warmup_ratio = None,
466
+ logging_dir = None,
467
+ local_rank = -1,
468
+ reward_model_path = None,
469
+ judge = None,
470
+ max_new_tokens = 64,
471
+ max_length = 512,
472
+ temperature = 0.9,
473
+ top_p = 1.0,
474
+ top_k = None,
475
+ min_p = None,
476
+ repetition_penalty = 1.0,
477
+ generation_kwargs = {},
478
+ use_transformers_paged = False,
479
+ cache_implementation = None,
480
+ missing_eos_penalty = None,
481
+ loss_type = 'sigmoid',
482
+ disable_dropout = True,
483
+ use_vllm = False,
484
+ vllm_model_impl = 'vllm',
485
+ vllm_guided_decoding_regex = None,
486
+ vllm_gpu_memory_utilization = 0.55,
487
+ vllm_mode = 'colocate',
488
+ vllm_server_base_url = None,
489
+ vllm_server_host = '0.0.0.0',
490
+ vllm_server_port = 8000,
491
+ vllm_server_timeout = 240.0,
492
+ vllm_tensor_parallel_size = 1,
493
+ ds3_gather_for_generation = True,
494
+ model_init_kwargs = None,
495
+ reward_weights = None,
496
+ dataset_num_proc = None,
497
+ gpu_memory_utilization = None,
498
+ vllm_sampling_params = None,
499
+ unsloth_num_chunks = -1,
500
+ unsloth_logit_chunk_multiplier = None,
501
+ unsloth_grpo_mini_batch = None,
502
+ max_seq_length = None,
503
+ **kwargs,
504
+ ):
505
+ if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (< 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!')
506
+ if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1 or below, otherwise gradient updates will explode!')
507
+ if num_train_epochs is None:
508
+ num_train_epochs = 3.0 # Default to 3 epochs if None, max_steps will override
509
+ if output_dir is None and save_strategy == 'steps' and save_steps == 500:
510
+ output_dir = 'unsloth_training_checkpoints'
511
+ save_strategy = 'no'
512
+ import multiprocessing as _mp
513
+ if _mp.get_start_method() != 'fork':
514
+ dataset_num_proc = None
515
+ elif dataset_num_proc is None:
516
+ import psutil
517
+ dataset_num_proc = min(max((psutil.cpu_count() or 1)+4, 2), 64)
518
+ memory_gb_left = psutil.virtual_memory().available / (1024**3)
519
+ if memory_gb_left <= 2: dataset_num_proc = 1
520
+ else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left))
521
+ if temperature <= 0:
522
+ raise ValueError('Unsloth: Please set a positive non-zero temperature, otherwise your results will be wrong.')
523
+ elif temperature >= 10:
524
+ raise ValueError('Unsloth: Please set a positive non-zero temperature below 10, since sampling becomes quite erratic above that.')
525
+
526
+
527
+ super().__init__(
528
+ output_dir = output_dir,
529
+ per_device_train_batch_size = per_device_train_batch_size,
530
+ num_train_epochs = num_train_epochs,
531
+ max_steps = max_steps,
532
+ learning_rate = learning_rate,
533
+ lr_scheduler_type = lr_scheduler_type,
534
+ lr_scheduler_kwargs = lr_scheduler_kwargs,
535
+ warmup_steps = warmup_steps,
536
+ optim = optim,
537
+ optim_args = optim_args,
538
+ weight_decay = weight_decay,
539
+ adam_beta1 = adam_beta1,
540
+ adam_beta2 = adam_beta2,
541
+ adam_epsilon = adam_epsilon,
542
+ optim_target_modules = optim_target_modules,
543
+ gradient_accumulation_steps = gradient_accumulation_steps,
544
+ average_tokens_across_devices = average_tokens_across_devices,
545
+ max_grad_norm = max_grad_norm,
546
+ label_smoothing_factor = label_smoothing_factor,
547
+ bf16 = bf16,
548
+ fp16 = fp16,
549
+ bf16_full_eval = bf16_full_eval,
550
+ fp16_full_eval = fp16_full_eval,
551
+ tf32 = tf32,
552
+ gradient_checkpointing = gradient_checkpointing,
553
+ gradient_checkpointing_kwargs = gradient_checkpointing_kwargs,
554
+ torch_compile = torch_compile,
555
+ torch_compile_backend = torch_compile_backend,
556
+ torch_compile_mode = torch_compile_mode,
557
+ use_liger_kernel = use_liger_kernel,
558
+ liger_kernel_config = liger_kernel_config,
559
+ use_cache = use_cache,
560
+ neftune_noise_alpha = neftune_noise_alpha,
561
+ torch_empty_cache_steps = torch_empty_cache_steps,
562
+ auto_find_batch_size = auto_find_batch_size,
563
+ logging_strategy = logging_strategy,
564
+ logging_steps = logging_steps,
565
+ logging_first_step = logging_first_step,
566
+ log_on_each_node = log_on_each_node,
567
+ logging_nan_inf_filter = logging_nan_inf_filter,
568
+ include_num_input_tokens_seen = include_num_input_tokens_seen,
569
+ log_level = log_level,
570
+ log_level_replica = log_level_replica,
571
+ disable_tqdm = disable_tqdm,
572
+ report_to = report_to,
573
+ run_name = run_name,
574
+ project = project,
575
+ trackio_space_id = trackio_space_id,
576
+ eval_strategy = eval_strategy,
577
+ eval_steps = eval_steps,
578
+ eval_delay = eval_delay,
579
+ per_device_eval_batch_size = per_device_eval_batch_size,
580
+ prediction_loss_only = prediction_loss_only,
581
+ eval_on_start = eval_on_start,
582
+ eval_do_concat_batches = eval_do_concat_batches,
583
+ eval_use_gather_object = eval_use_gather_object,
584
+ eval_accumulation_steps = eval_accumulation_steps,
585
+ batch_eval_metrics = batch_eval_metrics,
586
+ save_only_model = save_only_model,
587
+ save_strategy = save_strategy,
588
+ save_steps = save_steps,
589
+ save_on_each_node = save_on_each_node,
590
+ save_total_limit = save_total_limit,
591
+ enable_jit_checkpoint = enable_jit_checkpoint,
592
+ push_to_hub = push_to_hub,
593
+ hub_token = hub_token,
594
+ hub_private_repo = hub_private_repo,
595
+ hub_model_id = hub_model_id,
596
+ hub_strategy = hub_strategy,
597
+ hub_always_push = hub_always_push,
598
+ hub_revision = hub_revision,
599
+ load_best_model_at_end = load_best_model_at_end,
600
+ metric_for_best_model = metric_for_best_model,
601
+ greater_is_better = greater_is_better,
602
+ ignore_data_skip = ignore_data_skip,
603
+ restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
604
+ full_determinism = full_determinism,
605
+ seed = seed,
606
+ data_seed = data_seed,
607
+ use_cpu = use_cpu,
608
+ accelerator_config = accelerator_config,
609
+ parallelism_config = parallelism_config,
610
+ dataloader_drop_last = dataloader_drop_last,
611
+ dataloader_num_workers = dataloader_num_workers,
612
+ dataloader_pin_memory = dataloader_pin_memory,
613
+ dataloader_persistent_workers = dataloader_persistent_workers,
614
+ dataloader_prefetch_factor = dataloader_prefetch_factor,
615
+ remove_unused_columns = remove_unused_columns,
616
+ label_names = label_names,
617
+ train_sampling_strategy = train_sampling_strategy,
618
+ length_column_name = length_column_name,
619
+ ddp_find_unused_parameters = ddp_find_unused_parameters,
620
+ ddp_bucket_cap_mb = ddp_bucket_cap_mb,
621
+ ddp_broadcast_buffers = ddp_broadcast_buffers,
622
+ ddp_backend = ddp_backend,
623
+ ddp_timeout = ddp_timeout,
624
+ fsdp = fsdp,
625
+ fsdp_config = fsdp_config,
626
+ deepspeed = deepspeed,
627
+ debug = debug,
628
+ skip_memory_metrics = skip_memory_metrics,
629
+ do_train = do_train,
630
+ do_eval = do_eval,
631
+ do_predict = do_predict,
632
+ resume_from_checkpoint = resume_from_checkpoint,
633
+ warmup_ratio = warmup_ratio,
634
+ logging_dir = logging_dir,
635
+ local_rank = local_rank,
636
+ reward_model_path = reward_model_path,
637
+ judge = judge,
638
+ max_new_tokens = max_new_tokens,
639
+ max_length = max_length,
640
+ temperature = temperature,
641
+ top_p = top_p,
642
+ top_k = top_k,
643
+ min_p = min_p,
644
+ repetition_penalty = repetition_penalty,
645
+ generation_kwargs = generation_kwargs,
646
+ use_transformers_paged = use_transformers_paged,
647
+ cache_implementation = cache_implementation,
648
+ missing_eos_penalty = missing_eos_penalty,
649
+ loss_type = loss_type,
650
+ disable_dropout = disable_dropout,
651
+ use_vllm = use_vllm,
652
+ vllm_model_impl = vllm_model_impl,
653
+ vllm_guided_decoding_regex = vllm_guided_decoding_regex,
654
+ vllm_gpu_memory_utilization = vllm_gpu_memory_utilization,
655
+ vllm_mode = vllm_mode,
656
+ vllm_server_base_url = vllm_server_base_url,
657
+ vllm_server_host = vllm_server_host,
658
+ vllm_server_port = vllm_server_port,
659
+ vllm_server_timeout = vllm_server_timeout,
660
+ vllm_tensor_parallel_size = vllm_tensor_parallel_size,
661
+ ds3_gather_for_generation = ds3_gather_for_generation,
662
+ model_init_kwargs = model_init_kwargs,
663
+ reward_weights = reward_weights,
664
+ dataset_num_proc = dataset_num_proc,
665
+ gpu_memory_utilization = gpu_memory_utilization,**kwargs)
666
+ self.vllm_sampling_params = vllm_sampling_params
667
+ self.unsloth_num_chunks = unsloth_num_chunks
668
+ if unsloth_grpo_mini_batch is not None:
669
+ if self.generation_batch_size >= unsloth_grpo_mini_batch:
670
+ self.unsloth_grpo_mini_batch = unsloth_grpo_mini_batch
671
+ else:
672
+ raise ValueError(
673
+ f"Unsloth GRPO mini batch size needs to be less than or equal to the effective generation batch size, "
674
+ f"which is self.per_device_train_batch_size * gradient_accumulation_steps."
675
+ )
676
+ self.unsloth_logit_chunk_multiplier = unsloth_logit_chunk_multiplier
677
+ self.max_seq_length = max_seq_length
678
+
679
+ pass
680
+
681
+ class _UnslothNashMDTrainer(OnlineDPOTrainer):
682
+ """"""
683
+
684
+ _tag_names = ["trl", "nash-md"]
685
+ _name = "Nash-MD"
686
+ _paper = {
687
+ "title": "Nash Learning from Human Feedback",
688
+ "id": "2312.00886",
689
+ # docstyle-ignore
690
+ "citation": textwrap.dedent("""\
691
+ @inproceedings{munos2024nash,
692
+ title = {{Nash Learning from Human Feedback}},
693
+ author = {R{\'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot},
694
+ year = 2024,
695
+ booktitle = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024},
696
+ publisher = {OpenReview.net},
697
+ url = {https://openreview.net/forum?id=Y5AmNYiyCQ}
698
+ }"""),
699
+ }
700
+
701
+ def __init__(
702
+ self,
703
+ model: Union[PreTrainedModel, nn.Module] = None,
704
+ ref_model: Union[PreTrainedModel, nn.Module] = None,
705
+ reward_funcs: Union[PreTrainedModel, nn.Module, None] = None,
706
+ judge: Optional[BasePairwiseJudge] = None,
707
+ args: Optional[NashMDConfig] = None,
708
+ data_collator: Optional[Callable] = None,
709
+ train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
710
+ eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
711
+ processing_class: Optional[
712
+ Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
713
+ ] = None,
714
+ peft_config: Optional[dict] = None,
715
+ compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
716
+ callbacks: Optional[list[TrainerCallback]] = None,
717
+ optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
718
+ preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
719
+ # Deprecated parameters
720
+ reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
721
+ ) -> None:
722
+ super().__init__(
723
+ model=model,
724
+ ref_model=ref_model,
725
+ reward_funcs=reward_funcs,
726
+ judge=judge,
727
+ args=args,
728
+ data_collator=data_collator,
729
+ train_dataset=train_dataset,
730
+ eval_dataset=eval_dataset,
731
+ processing_class=processing_class,
732
+ reward_processing_classes=processing_class,
733
+ peft_config=peft_config,
734
+ compute_metrics=compute_metrics,
735
+ callbacks=callbacks,
736
+ optimizers=optimizers,
737
+ preprocess_logits_for_metrics=preprocess_logits_for_metrics,
738
+ reward_model=reward_model,
739
+ )
740
+
741
+ self._mixture_coef = self.args.mixture_coef
742
+
743
+ # Overwrite the stats dictionary to include NashMD specific statistics
744
+ self.stats = {
745
+ # Remove "non_score_reward", "rlhf_reward", "scores_margin"
746
+ # Add "mixture_coef"
747
+ "loss/kl": [],
748
+ "objective/entropy": [],
749
+ "loss/score": [],
750
+ "rewards/probabilities": [],
751
+ "rewards/accuracies": [],
752
+ "rewards/margins": [],
753
+ "logps/chosen": [],
754
+ "logps/rejected": [],
755
+ "val/model_contain_eos_token": [],
756
+ "val/ref_contain_eos_token": [],
757
+ "beta": [],
758
+ "mixture_coef": [],
759
+ }
760
+ if self.reward_funcs is not None:
761
+ if len(self.reward_funcs) != 1:
762
+ raise ValueError("NashMDTrainer only supports one reward function/model.")
763
+ self.reward_funcs = self.reward_funcs[0]
764
+ self.stats["rewards/chosen"] = []
765
+ self.stats["rewards/rejected"] = []
766
+
767
+ @property
768
+ def mixture_coef(self):
769
+ if isinstance(self._mixture_coef, list):
770
+ epoch = self.state.epoch
771
+ return self._mixture_coef[epoch] if epoch < len(self._mixture_coef) else self._mixture_coef[-1]
772
+ else:
773
+ return self._mixture_coef
774
+
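# Schedule note: with mixture_coef = [0.5, 0.4, 0.3], epoch 0 uses 0.5, epoch 1
# uses 0.4, and every later epoch reuses the last entry, 0.3; a scalar value is
# used unchanged for all epochs.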
775
+ def _generate_completions(self, model, prompts):
776
+ # Generate completions from the policy model.
777
+ with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_for_gen_ctx:
778
+ model_output = unwrapped_policy_for_gen_ctx.generate(
779
+ input_ids=prompts["input_ids"],
780
+ attention_mask=prompts["attention_mask"],
781
+ generation_config=self.generation_config,
782
+ )
783
+
784
+ # Get the DDP/FSDP unwrapped version of the main model.
785
+ # This will be the policy model for GeometricMixtureWrapper (PEFT adapters active if PEFT is used).
786
+ policy_model_for_gmw = self.accelerator.unwrap_model(model)
787
+
788
+ # Determine the correct reference model for GeometricMixtureWrapper.
789
+ # This also needs to be DDP/FSDP unwrapped.
790
+ ref_model_for_gmw: torch.nn.Module
791
+ if self.ref_model is None:
792
+ # No explicit ref_model is provided.
793
+ # Use the base of the main `model` if it's a PEFT model.
794
+ # policy_model_for_gmw is already DDP-unwrapped.
795
+ if is_peft_available() and isinstance(policy_model_for_gmw, PeftModel):
796
+ ref_model_for_gmw = policy_model_for_gmw.get_base_model()
797
+ else:
798
+ # Not a PEFT model (or PEFT not available), or already a base model.
799
+ # Use the DDP-unwrapped policy model itself as the reference.
800
+ ref_model_for_gmw = policy_model_for_gmw
801
+ else:
802
+ # An explicit ref_model is provided. Unwrap it for DDP/FSDP.
803
+ ref_model_for_gmw = self.accelerator.unwrap_model(self.ref_model)
804
+
805
+ # Both models given to GeometricMixtureWrapper (policy_model_for_gmw and ref_model_for_gmw) are DDP-unwrapped.
806
+ with torch.no_grad(): # Ensure no_grad context for mixture model generation
807
+ mixture_model = GeometricMixtureWrapper(
808
+ model=policy_model_for_gmw,
809
+ ref_model=ref_model_for_gmw,
810
+ generation_config=self.generation_config,
811
+ mixture_coef=self.mixture_coef,
812
+ device=self.accelerator.device,
813
+ )
814
+
815
+ mixture_output = mixture_model.generate(
816
+ input_ids=prompts["input_ids"],
817
+ attention_mask=prompts["attention_mask"],
818
+ generation_config=self.generation_config,
819
+ )
820
+
821
+ return model_output, mixture_output
822
+
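# Mixture note: GeometricMixtureWrapper generates from a geometric mixture of
# the policy and the reference model, which (as I read TRL's implementation)
# amounts to mixing the two models' logits linearly,
#     logits_mix = (1 - mixture_coef) * logits_policy + mixture_coef * logits_ref
# so that, up to normalization, pi_mix is proportional to pi_theta^(1-c) * pi_ref^c.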
823
+ def _process_completions(self, model_output, mixture_output, prompts):
824
+ context_length = prompts["input_ids"].shape[1]
825
+
826
+ # Process model completions
827
+ model_completion_ids = model_output[:, context_length:]
828
+ model_completion_ids, model_completion_mask = truncate_right(
829
+ model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
830
+ )
831
+ model_data = {
832
+ "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
833
+ "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1),
834
+ "raw": prompts["raw"],
835
+ }
836
+
837
+ # Process mixture model completions
838
+ mixture_completion_ids = mixture_output[:, context_length:]
839
+ mixture_completion_ids, mixture_completion_mask = truncate_right(
840
+ mixture_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
841
+ )
842
+ mixture_data = {
843
+ "input_ids": torch.cat((prompts["input_ids"], mixture_completion_ids), dim=1),
844
+ "attention_mask": torch.cat((prompts["attention_mask"], mixture_completion_mask), dim=1),
845
+ "raw": prompts["raw"],
846
+ }
847
+
848
+ return model_data, mixture_data
849
+
850
+ def _compute_rewards(self, model_data, mixture_data, context_length):
851
+ with torch.no_grad():
852
+ _, model_scores, _ = get_reward(
853
+ self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length
854
+ )
855
+ _, mixture_scores, _ = get_reward(
856
+ self.reward_funcs, mixture_data["input_ids"], self.processing_class.pad_token_id, context_length
857
+ )
858
+
859
+ # Apply EOS penalty if needed
860
+ if self.args.missing_eos_penalty is not None:
861
+ model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
862
+ mixture_contain_eos = torch.any(mixture_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
863
+ model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
864
+ mixture_scores[~mixture_contain_eos] -= self.args.missing_eos_penalty
865
+
866
+ return model_scores, mixture_scores
867
+
868
+ def _compute_judge(self, model_data, mixture_data, context_length):
869
+ prompts = model_data["raw"]
870
+ model_data_completions = self.processing_class.batch_decode(
871
+ model_data["input_ids"][:, context_length:], skip_special_tokens=True
872
+ )
873
+ model_data_completions = [completion.strip() for completion in model_data_completions]
874
+
875
+ mixture_data_completions = self.processing_class.batch_decode(
876
+ mixture_data["input_ids"][:, context_length:], skip_special_tokens=True
877
+ )
878
+ mixture_data_completions = [completion.strip() for completion in mixture_data_completions]
879
+ if is_conversational({"prompt": prompts[0]}):
880
+ model_data_completions = [
881
+ [{"role": "assistant", "content": completion}] for completion in model_data_completions
882
+ ]
883
+ environment = jinja2.Environment()
884
+ template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
885
+ prompts = [template.render(messages=message) for message in prompts]
886
+ model_data_completions = [template.render(messages=completion) for completion in model_data_completions]
887
+
888
+ mixture_data_completions = [
889
+ [{"role": "assistant", "content": completion}] for completion in mixture_data_completions
890
+ ]
891
+ mixture_data_completions = [
892
+ template.render(messages=completion) for completion in mixture_data_completions
893
+ ]
894
+
895
+ probability = self.judge.judge(
896
+ prompts,
897
+ list(zip(model_data_completions, mixture_data_completions)),
898
+ return_scores=True,
899
+ )
900
+ return torch.tensor(probability, device=model_data["input_ids"].device)
901
+
902
+ def _compute_logprobs(self, model, model_data, context_length):
903
+ def compute_logprobs_for_data(m, data):
904
+ output = m(data["input_ids"], attention_mask=data["attention_mask"])
905
+ logits = output.logits[:, context_length - 1 : -1]
906
+ token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:])
907
+ return token_logprobs
908
+
909
+ # Compute logprobs for model completions under the model
910
+ model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
911
+
912
+ # Compute logprobs of model completions under the reference model
913
+ with torch.no_grad():
914
+ if self.ref_model is None:
915
+ with model.disable_adapter():
916
+ ref_logprobs_model_data = compute_logprobs_for_data(model, model_data)
917
+ else:
918
+ ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data)
919
+
920
+ # Mask padding tokens
921
+ model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
922
+ model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
923
+ ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
924
+
925
+ return (model_logprobs_model_data, ref_logprobs_model_data)
926
+
927
+ def _compute_losses(
928
+ self,
929
+ model_logprobs_model_data,
930
+ ref_logprobs_model_data,
931
+ probability,
932
+ ):
933
+ # REINFORCE score, using 0.5 as a control variate (baseline)
934
+ score = (probability - 0.5) * model_logprobs_model_data.sum(1)
935
+
936
+ # KL divergence estimated via the REINFORCE trick
937
+ with torch.no_grad():
938
+ log_ratio = model_logprobs_model_data - ref_logprobs_model_data
939
+ kl_div_log = log_ratio.sum(1)
940
+ kl_div_loss = (log_ratio * model_logprobs_model_data).sum(1)
941
+
942
+ # final loss
943
+ loss = self.beta * kl_div_loss - score
944
+
945
+ return loss.mean(), score, kl_div_log
946
+
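# Objective note, per sequence with completion tokens t (as implemented above):
#     score = (p - 0.5) * sum_t log pi_theta(y_t)            # REINFORCE, 0.5 baseline
#     kl    = sum_t stopgrad(log pi_theta - log pi_ref)_t * log pi_theta(y_t)
#     loss  = beta * kl - score
# where p is the probability that the policy sample beats the mixture sample;
# the log-ratio is taken under no_grad, giving a REINFORCE estimator of the
# gradient of the KL term.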
947
+ def _log_statistics(
948
+ self,
949
+ model_data,
950
+ mixture_data,
951
+ model_logprobs_model_data,
952
+ ref_logprobs_model_data,
953
+ probability,
954
+ score,
955
+ kl_div,
956
+ context_length,
957
+ model_scores=None,
958
+ mixture_scores=None,
959
+ ):
960
+ # Helper function to gather and compute mean
961
+ def gather_mean(tensor):
962
+ return self.accelerator.gather_for_metrics(tensor).mean().item()
963
+
964
+ # Log score
965
+ self.stats["loss/score"].append(gather_mean(score))
966
+ # Log KL divergence
967
+ self.stats["loss/kl"].append(gather_mean(kl_div))
968
+
969
+ # Log logprobs
970
+ model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
971
+ ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
972
+
973
+ self.stats["logps/chosen"].append(gather_mean(model_logprobs_model_data_sum))
974
+ self.stats["logps/rejected"].append(gather_mean(ref_logprobs_model_data_sum))
975
+
976
+ # Log rewards
977
+ if self.reward_funcs is not None:
978
+ self.stats["rewards/chosen"].append(gather_mean(model_scores))
979
+ self.stats["rewards/rejected"].append(gather_mean(mixture_scores))
980
+
981
+ # Log probabilities
982
+ self.stats["rewards/probabilities"].append(gather_mean(probability))
983
+
984
+ # Calculate entropy for model data
985
+ entropy_model_data = -model_logprobs_model_data.sum(1)
986
+ self.stats["objective/entropy"].append(gather_mean(entropy_model_data))
987
+
988
+ # Calculate margins
989
+ margin = model_logprobs_model_data_sum - ref_logprobs_model_data_sum
990
+ self.stats["rewards/margins"].append(gather_mean(margin))
991
+
992
+ # Calculate accuracy
993
+ accuracy = (margin > 0).float()
994
+ self.stats["rewards/accuracies"].append(gather_mean(accuracy))
995
+
996
+ # Log EOS token statistics
997
+ model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
998
+ mixture_eos = (mixture_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
999
+ self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
1000
+ self.stats["val/ref_contain_eos_token"].append(gather_mean(mixture_eos.float()))
1001
+
1002
+ # Log beta and mixture coef
1003
+ self.stats["beta"].append(self.beta)
1004
+ self.stats["mixture_coef"].append(self.mixture_coef)
1005
+
1006
+ def training_step(
1007
+ self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
1008
+ ) -> torch.Tensor:
1009
+ model.train()
1010
+
1011
+ # Apply chat template and tokenize the input
1012
+ batch_size = len(next(iter(inputs.values())))
1013
+ prompts = inputs["prompt"]
1014
+ inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
1015
+ inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
1016
+ inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs]
1017
+ inputs = self.data_collator(inputs)
1018
+
1019
+ # Only the prompt_* tensors are needed from here on
1020
+ inputs = self._prepare_inputs(inputs)
1021
+ context_length = inputs["prompt_input_ids"].shape[1]
1022
+ prompts = {
1023
+ "input_ids": inputs["prompt_input_ids"],
1024
+ "attention_mask": inputs["prompt_attention_mask"],
1025
+ "raw": prompts,
1026
+ }
1027
+ del inputs
1028
+
1029
+ # Sample completions from both the model and the reference model
1030
+ model_output, mixture_output = self._generate_completions(model, prompts)
1031
+
1032
+ # Process model completions
1033
+ model_data, mixture_data = self._process_completions(model_output, mixture_output, prompts)
1034
+
1035
+ # Compute rewards
1036
+ if self.reward_funcs is not None:
1037
+ model_scores, mixture_scores = self._compute_rewards(model_data, mixture_data, context_length)
1038
+ # probability of the model data vs the mixture data
1039
+ probability = F.sigmoid(model_scores - mixture_scores)
1040
+ else:
1041
+ model_scores, mixture_scores = None, None
1042
+ probability = self._compute_judge(model_data, mixture_data, context_length)
1043
+
1044
+ # Compute logprobs
1045
+ model_logprobs_model_data, ref_logprobs_model_data = self._compute_logprobs(model, model_data, context_length)
1046
+
1047
+ # Compute loss
1048
+ loss, score, kl_div = self._compute_losses(model_logprobs_model_data, ref_logprobs_model_data, probability)
1049
+
1050
+ # Log everything
1051
+ self._log_statistics(
1052
+ model_data,
1053
+ mixture_data,
1054
+ model_logprobs_model_data.detach(),
1055
+ ref_logprobs_model_data,
1056
+ probability,
1057
+ score.detach(),
1058
+ kl_div.detach(),
1059
+ context_length,
1060
+ model_scores,
1061
+ mixture_scores,
1062
+ )
1063
+
1064
+ if (
1065
+ self.args.torch_empty_cache_steps is not None
1066
+ and self.state.global_step % self.args.torch_empty_cache_steps == 0
1067
+ ):
1068
+ empty_cache()
1069
+
1070
+ kwargs = {}
1071
+ # For LOMO optimizers you need to explicitly use the learning rate
1072
+ if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
1073
+ kwargs["learning_rate"] = self._get_learning_rate()
1074
+
1075
+ if self.args.n_gpu > 1:
1076
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
1077
+
1078
+ self.accelerator.backward(loss, **kwargs)
1079
+
1080
+ return loss.detach() / self.args.gradient_accumulation_steps
1081
+ class UnslothNashMDTrainer(_UnslothNashMDTrainer):
1082
+ """
1083
+
1084
+ Trainer for the Nash-MD method.
1085
+
1086
+ It is implemented as a subclass of [`OnlineDPOTrainer`].
1087
+
1088
+ Args:
1089
+ model ([`~transformers.PreTrainedModel`]):
1090
+ The model to train, preferably an `AutoModelForCausalLM`.
1091
+ ref_model ([`PreTrainedModelWrapper`]):
1092
+ Hugging Face transformer model with a causal language modelling head. Used for implicit reward computation
1093
+ and loss. If no reference model is provided, the trainer will create a reference model with the same
1094
+ architecture as the model to be optimized.
1095
+ reward_funcs ([`~transformers.PreTrainedModel`]):
1096
+ The reward model to score completions with, preferably an
1097
+ [`~transformers.AutoModelForSequenceClassification`].
1098
+ judge ([`BasePairwiseJudge`]):
1099
+ The judge to use for pairwise comparison of model completions.
1100
+ args ([`NashMDConfig`]):
1101
+ The NashMD config arguments to use for training.
1102
+ data_collator ([`~transformers.DataCollator`]):
1103
+ The data collator to use for training. If None is specified, the default data collator
1104
+ ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
1105
+ sequences in the batch, given a dataset of paired sequences.
1106
+ train_dataset ([`~datasets.Dataset`]):
1107
+ The dataset to use for training.
1108
+ eval_dataset ([`~datasets.Dataset`]):
1109
+ The dataset to use for evaluation.
1110
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
1111
+ Processing class used to process the data. If provided, will be used to automatically process the inputs
1112
+ for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
1113
+ reuse the fine-tuned model.
1114
+ peft_config (`dict`):
1115
+ The peft config to use for training.
1116
+ compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
1117
+ The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
1118
+ metric names to metric values.
1119
+ callbacks (`list[transformers.TrainerCallback]`):
1120
+ The callbacks to use for training.
1121
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
1122
+ The optimizer and scheduler to use for training.
1123
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
1124
+ The function to use to preprocess the logits before computing the metrics.
1125
+
1126
+ reward_model:
1127
+
1128
+ <Deprecated version="0.22.0">
1129
+
1130
+ This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead.
1131
+
1132
+ </Deprecated>
1133
+
1134
+ """
1135
+ def __init__(
1136
+ self,
1137
+ model = None,
1138
+ ref_model = None,
1139
+ reward_funcs = None,
1140
+ judge = None,
1141
+ args = None,
1142
+ data_collator = None,
1143
+ train_dataset = None,
1144
+ eval_dataset = None,
1145
+ processing_class = None,
1146
+ peft_config = None,
1147
+ compute_metrics = None,
1148
+ callbacks = None,
1149
+ preprocess_logits_for_metrics = None,
1150
+ reward_model = None,
1151
+ **kwargs
1152
+ ):
1153
+ if args is None: args = UnslothNashMDConfig()
1154
+ use_bf16 = getattr(args, 'bf16', False)
1155
+ if type(use_bf16) is not bool: use_bf16 = False
1156
+ use_fp16 = getattr(args, 'fp16', False)
1157
+ if type(use_fp16) is not bool: use_fp16 = False
1158
+ force_float32 = False
1159
+ full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
1160
+ if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
1161
+ print('Unsloth: Switching to float32 training since model cannot work with float16')
1162
+ force_float32 = True
1163
+ mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
1164
+ dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
1165
+ if dtype is None: dtype = model.get_input_embeddings().weight.dtype
1166
+ from unsloth_zoo.utils import _get_dtype
1167
+ dtype = _get_dtype(dtype)
1168
+ float16 = dtype == torch.float16
1169
+ if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
1170
+ if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
1171
+ if force_float32:
1172
+ # Forced float32 training
1173
+ args.fp16 = False
1174
+ args.bf16 = False
1175
+ os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'
1176
+ if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no'
1177
+ # args.mixed_precision is a new argument which needs to be set now
1178
+ elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32':
1179
+ # Mixed precision training
1180
+ args.fp16 = float16
1181
+ args.bf16 = not float16
1182
+ os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16'
1183
+ if hasattr(args, 'mixed_precision'): args.mixed_precision = 'fp16' if float16 else 'bf16'
1184
+ # args.mixed_precision is a new argument which needs to be set now
1185
+ elif mixed_precision_dtype == 'bfloat16':
1186
+ # Both False since bfloat16 full finetuning doesn't do any autocasting.
1187
+ args.fp16 = False
1188
+ args.bf16 = False
1189
+ os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'
1190
+ if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no'
1191
+ # args.mixed_precision is a new argument which needs to be set now
1192
+
1193
+ if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no':
1194
+ args.eval_strategy = 'steps'
1195
+ if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1
1196
+ ga_steps = getattr(args, 'gradient_accumulation_steps', None)
1197
+ if ga_steps is not None and ga_steps > 1:
1198
+ from transformers import __version__ as transformers_version
1199
+ if Version(transformers_version) <= Version('4.45.2'):
1200
+ print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n'
1201
+ '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`')
1202
+ if getattr(args, 'eval_strategy', 'no') != 'no':
1203
+ eval_bsz = getattr(args, 'per_device_eval_batch_size', 8)
1204
+ if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size
1205
+ if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps
1206
+ fp16_full_eval = getattr(args, 'fp16_full_eval', False)
1207
+ if type(fp16_full_eval) is not bool: fp16_full_eval = False
1208
+ bf16_full_eval = getattr(args, 'bf16_full_eval', False)
1209
+ if type(bf16_full_eval) is not bool: bf16_full_eval = False
1210
+ if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True
1211
+ if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False
1212
+ if force_float32:
1213
+ args.bf16_full_eval = False
1214
+ args.fp16_full_eval = False
1215
+ elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16':
1216
+ args.bf16_full_eval = True
1217
+ args.fp16_full_eval = False
1218
+ elif not bf16_full_eval and not fp16_full_eval:
1219
+ args.bf16_full_eval = args.bf16
1220
+ args.fp16_full_eval = args.fp16
1221
+ _output_logits = False
1222
+ if locals().get('compute_metrics', None) is not None: _output_logits = True
1223
+ if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True
1224
+ if _output_logits:
1225
+ os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
1226
+ if model is not None:
1227
+ _warnings_issued = getattr(model, 'warnings_issued', None)
1228
+ if _warnings_issued is None:
1229
+ model.warnings_issued = {}
1230
+ elif not isinstance(_warnings_issued, dict):
1231
+ try:
1232
+ model.warnings_issued = dict(_warnings_issued)
1233
+ except Exception:
1234
+ model.warnings_issued = {}
1235
+ if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'):
1236
+ pass
1237
+ else:
1238
+ model_max_seq_length = getattr(model, 'max_seq_length', None)
1239
+ args_max_seq_length = getattr(args, 'max_seq_length', None)
1240
+ if args_max_seq_length is None and model_max_seq_length is not None:
1241
+ max_seq_length = model.max_seq_length
1242
+ if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length
1243
+ elif args_max_seq_length is not None and model_max_seq_length is not None:
1244
+ if args_max_seq_length > model_max_seq_length:
1245
+ print('Unsloth: You set `max_seq_length` as ' + str(args_max_seq_length) + ' but '
1246
+ 'the maximum the model supports is ' + str(model_max_seq_length) + '. We shall reduce it.')
1247
+ args.max_seq_length = model_max_seq_length
1248
+ if model is not None and hasattr(model, 'for_training'):
1249
+ model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
1250
+ if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right'
1251
+ if 'processing_class' in locals():
1252
+ if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right'
1253
+ if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right'
1254
+        __tokenizer = processing_class if 'processing_class' in locals() else tokenizer
+        from unsloth_zoo.vision_utils import UnslothVisionDataCollator
+        if not isinstance(data_collator, UnslothVisionDataCollator):
+            if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names:
+                data_collator = TransformersDataCollatorForLanguageModeling(
+                    __tokenizer,
+                    mlm = False,
+                    mlm_probability = 0.0,
+                    pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                )
+            elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names:
+                data_collator = DataCollatorForSeq2Seq(
+                    __tokenizer,
+                    pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                )
+        else:
+            if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False
+            if hasattr(args, 'dataset_text_field'): args.dataset_text_field = ''
+            if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True}
+        if not isinstance(data_collator, UnslothVisionDataCollator):
+            if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'):
+                if isinstance(data_collator, DataCollatorForSeq2Seq):
+                    data_collator = DataCollatorForSeq2Seq(
+                        __tokenizer.tokenizer,
+                        pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                    )
+                else:
+                    data_collator = TransformersDataCollatorForLanguageModeling(
+                        __tokenizer.tokenizer,
+                        mlm = False,
+                        mlm_probability = 0.0,
+                        pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                    )
+        other_metrics = []
+
+        from unsloth_zoo.logging_utils import PatchRLStatistics
+        PatchRLStatistics('nash_md_trainer', other_metrics)
+
+        # [TODO] Fix up DataParallel multiplying batch sizes
+        # [TODO] DDP works, but DP seems to not work? [TODO]
+        if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1:
+            if getattr(args, "_n_gpu", 1) != 1:
+                args._n_gpu = 1
+        if "model" in locals() and hasattr(model, "for_training"):
+            model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
+        super().__init__(
+            model = model,
+            ref_model = ref_model,
+            reward_funcs = reward_funcs,
+            judge = judge,
+            args = args,
+            data_collator = data_collator,
+            train_dataset = train_dataset,
+            eval_dataset = eval_dataset,
+            processing_class = processing_class,
+            peft_config = peft_config,
+            compute_metrics = compute_metrics,
+            callbacks = callbacks,
+            preprocess_logits_for_metrics = preprocess_logits_for_metrics,
+            reward_model = reward_model, **kwargs)
+        if "model" in locals() and hasattr(model, "for_inference"):
+            model.for_inference()
+        if hasattr(self, 'neftune_hook_handle'):
+            self.neftune_hook_handle.remove()
+            if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle
+        if getattr(args, 'neftune_noise_alpha', None) is not None:
+            model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha
+        pass
+        if hasattr(self, 'accelerator'):
+            scaler = self.accelerator.scaler
+            current_model = model
+            while hasattr(current_model, 'model'):
+                current_model.accelerator_scaler = scaler
+                current_model = current_model.model
+            current_model.accelerator_scaler = scaler
+        pass
+        if hasattr(self, 'train'):
+            self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)
+        pass
+        if hasattr(self, 'llm') and self.llm is not None and hasattr(self.llm, 'get_tokenizer'):
+            _vllm_tok = self.llm.get_tokenizer()
+            _pc = getattr(self, 'processing_class', None) or getattr(self, 'tokenizer', None)
+            if _vllm_tok is not None and _pc is not None and getattr(_pc, 'chat_template', None) is not None and getattr(_vllm_tok, 'chat_template', None) is None:
+                _vllm_tok.chat_template = _pc.chat_template
+        pass
+
+        pass
unsloth_compiled_cache/UnslothORPOTrainer.py CHANGED
@@ -28,7 +28,7 @@ import torch.nn as nn
 from torch.nn import functional as F
 from unsloth_zoo.temporary_patches.common import torch_compile
 from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
-from trl.trainer.orpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, BaseTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, ORPOConfig, ORPOTrainer, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_torch_xla_available, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, warnings, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, ORPOConfig, ORPOTrainer, Optional, PartialState, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_wandb_available, logger, maybe_apply_chat_template, maybe_extract_prompt, nn, np, os, peft_module_casting_to_bf16, prepare_model_for_kbit_training, torch, warnings, F, PeftModel, PreTrainedModel, is_peft_available, logger, os, torch)
+from trl.trainer.orpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, BaseTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, ORPOConfig, ORPOTrainer, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_torch_xla_available, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, selective_log_softmax, textwrap, torch, wandb, warnings, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, ORPOConfig, ORPOTrainer, Optional, PartialState, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, TrainerCallback, Union, autocast, defaultdict, disable_dropout_in_model, inspect, is_comet_available, is_peft_available, is_wandb_available, logger, maybe_apply_chat_template, maybe_extract_prompt, nn, np, os, peft_module_casting_to_bf16, prepare_model_for_kbit_training, torch, wandb, warnings, F, PeftModel, PreTrainedModel, is_peft_available, logger, os, torch)
 
 
 import os
unsloth_compiled_cache/UnslothOnlineDPOTrainer.py ADDED
The diff for this file is too large to render. See raw diff
 
unsloth_compiled_cache/UnslothRLOOTrainer.py ADDED
The diff for this file is too large to render. See raw diff
 
unsloth_compiled_cache/UnslothXPOTrainer.py ADDED
@@ -0,0 +1,1385 @@
+ """
2
+ 2026.3.2
3
+ 2026.3.4
4
+ 5.3.0
5
+ 0.24.0
6
+ __UNSLOTH_VERSIONING__
7
+ """
8
+
9
+ # Unsloth auto generated code
10
+ # Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
11
+ #
12
+ # This program is free software: you can redistribute it and/or modify
13
+ # it under the terms of the GNU Lesser General Public License as published by
14
+ # the Free Software Foundation, either version 3 of the License, or
15
+ # (at your option) any later version.
16
+ #
17
+ # This program is distributed in the hope that it will be useful,
18
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
+ # GNU General Public License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Lesser General Public License
23
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
24
+
25
+ from torch import Tensor
26
+ import torch
27
+ import torch.nn as nn
28
+ from torch.nn import functional as F
29
+ from unsloth_zoo.temporary_patches.common import torch_compile
30
+ from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
31
+ from trl.trainer.xpo_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, Dataset, EvalPrediction, F, FeatureExtractionMixin, IterableDataset, OnlineDPOTrainer, OptimizerNames, Optional, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, TrainerCallback, Union, XPOConfig, XPOTrainer, empty_cache, get_reward, is_conversational, is_peft_available, jinja2, maybe_apply_chat_template, nn, selective_log_softmax, textwrap, torch, truncate_right, unwrap_model_for_generation)
32
+
33
+
34
+ import os
35
+ import math
36
+ import logging
37
+ from typing import *
38
+ from dataclasses import dataclass, field
39
+ from packaging.version import Version
40
+ import torch
41
+ import numpy as np
42
+ from contextlib import nullcontext
43
+ from torch.nn import functional as F
44
+ import inspect
45
+ from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling
46
+ from transformers.training_args import ParallelMode
47
+ from unsloth_zoo.device_type import DEVICE_TYPE, device_synchronize
48
+
49
+ # Wrap trainer with padding to right and enable training mode
50
+ # Also patches W&B since multiple runs must use wandb.finish()
51
+ import functools
52
+ from types import MethodType
53
+ try:
54
+ from unsloth_zoo.gradient_checkpointing import reset_unsloth_gradient_checkpointing_buffers
55
+ except:
56
+ def reset_unsloth_gradient_checkpointing_buffers(): pass
57
+ def prepare_for_training_mode(f):
58
+ @functools.wraps(f)
59
+ def wrapper(self, *args, **kwargs):
60
+ # Enable training mode
61
+ _was_training = None
62
+ # Get gradient checkpointing setting from training arguments
63
+ use_gc = getattr(self.args, 'gradient_checkpointing', True)
64
+ if hasattr(self, 'model') and hasattr(self.model, "training"):
65
+ _was_training = self.model.training
66
+ if hasattr(self, 'model') and hasattr(self.model, "for_training"):
67
+ self.model.for_training(use_gradient_checkpointing=use_gc)
68
+ output = f(self, *args, **kwargs)
69
+ # Restore previous mode when possible
70
+ if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
71
+ if _was_training is False:
72
+ self.model.for_inference()
73
+ elif _was_training is True and hasattr(self.model, "for_training"):
74
+ self.model.for_training(use_gradient_checkpointing=use_gc)
75
+ # Reset gradient checkpointing buffers to free memory while staying ready for next run
76
+ try:
77
+ reset_unsloth_gradient_checkpointing_buffers()
78
+ except:
79
+ pass
80
+ # Patch W&B to enable logging on future runs, otherwise it'll overwrite the first run
81
+ try:
82
+ import wandb
83
+ wandb.finish()
84
+ except:
85
+ pass
86
+ return output
87
+ return wrapper
88
+ pass
89
+
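Editor's note: a minimal sketch of how `prepare_for_training_mode` is meant to behave, using stand-in `ToyModel`/`ToyTrainer`/`Args` classes (illustrative names, not part of the generated file; assumes the wrapper definition above is in scope):

    from types import MethodType

    class ToyModel:
        def __init__(self): self.training = False
        def for_training(self, use_gradient_checkpointing = True): self.training = True
        def for_inference(self): self.training = False

    class Args: gradient_checkpointing = True

    class ToyTrainer:
        def __init__(self): self.model, self.args = ToyModel(), Args()
        def train(self): return f"ran with training={self.model.training}"

    trainer = ToyTrainer()
    # Same patching pattern the generated __init__ applies to self.train:
    trainer.train = MethodType(prepare_for_training_mode(ToyTrainer.train), trainer)
    print(trainer.train())         # "ran with training=True"
    print(trainer.model.training)  # False again: the pre-call mode is restored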
+ torch_compile_options = {
+     "epilogue_fusion"   : True,
+     "max_autotune"      : False,
+     "shape_padding"     : True,
+     "trace.enabled"     : False,
+     "triton.cudagraphs" : False,
+ }
+
+ @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
+ def chunked_hidden_states_selective_log_softmax(
+     hidden_states: torch.Tensor,
+     lm_head: torch.Tensor,
+     index: torch.Tensor,
+     chunks: int = 4,
+     logit_scale_multiply: float = 0.0,
+     logit_scale_divide: float = 0.0,
+     logit_softcapping: float = 0.0,
+     temperature: float = 1.0,
+ ) -> torch.Tensor:
+     # All Unsloth Zoo code licensed under AGPL3
+     flat_hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
+     flat_index = index.reshape(-1)
+
+     chunked_hidden_states = torch.chunk(flat_hidden_states, chunks=chunks, dim=0)
+     chunked_index = torch.chunk(flat_index, chunks=chunks, dim=0)
+
+     all_per_token_logps = []
+
+     for chunk_hidden_states, chunk_index in zip(chunked_hidden_states, chunked_index):
+         chunk_logits = chunk_hidden_states.to(lm_head.dtype) @ lm_head.t()
+
+         if logit_scale_multiply != 0.0:
+             chunk_logits = chunk_logits * logit_scale_multiply
+         if logit_scale_divide != 0.0:
+             chunk_logits = chunk_logits / logit_scale_divide
+         if logit_softcapping != 0.0:
+             chunk_logits = chunk_logits * torch.tanh(chunk_logits / logit_softcapping)
+
+         chunk_logits = chunk_logits.to(torch.float32)
+
+         if temperature != 1.0:
+             chunk_logits = chunk_logits / temperature
+
+         selected_logits = torch.gather(chunk_logits, dim=-1, index=chunk_index.unsqueeze(-1)).squeeze(-1)
+         logsumexp_values = torch.logsumexp(chunk_logits, dim=-1)
+         per_token_logps = selected_logits - logsumexp_values
+         all_per_token_logps.append(per_token_logps)
+
+     all_per_token_logps = torch.concat(all_per_token_logps)
+
+     all_per_token_logps = all_per_token_logps.reshape((hidden_states.shape[0], hidden_states.shape[1]))
+     return all_per_token_logps
+
+ @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
+ def chunked_selective_log_softmax(logits, index):
+     # Split into 4 chunks only
+     chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0)
+     chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0)
+     all_per_token_logps = []
+     # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index)
+     for chunk_logits, chunk_index in zip(chunked_logits, chunked_index):
+         chunk_logits = chunk_logits.to(torch.float32)
+         selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1)
+         logsumexp_values = torch.logsumexp(chunk_logits, dim = -1)
+         per_token_logps = selected_logits - logsumexp_values
+         all_per_token_logps.append(per_token_logps)
+     pass
+     all_per_token_logps = torch.concat(all_per_token_logps)
+     all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1]))
+     return all_per_token_logps
+
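Editor's note: both chunked functions compute per-token log-probabilities as logit[token] - logsumexp(logits) over vocab chunks, so the full softmax is never materialized. A quick equivalence check against the naive log_softmax-then-gather path (small shapes; the first call pays a torch.compile cost):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(2, 8, 32)          # (batch, seq, vocab)
    index  = torch.randint(0, 32, (2, 8))   # target token ids per position

    naive = torch.gather(F.log_softmax(logits.float(), dim=-1), -1, index.unsqueeze(-1)).squeeze(-1)
    chunked = chunked_selective_log_softmax(logits, index)
    assert torch.allclose(naive, chunked, atol=1e-5)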
+ def calculate_pad_tokens_in_prompt(
+     input_ids: torch.Tensor,
+     logits_to_keep: int,
+     pad_token_id: int
+ ) -> torch.Tensor:
+     """
+     Given a prompt tensor, returns the number of left-padding tokens in each sequence,
+     so [pad, pad, pad, cat] = 3 tokens.
+     """
+     if logits_to_keep >= input_ids.shape[1]:
+         raise ValueError("logits_to_keep must be smaller than the sequence length.")
+
+     prompt_section = input_ids[:, :-logits_to_keep]
+
+     padding_mask = (prompt_section == pad_token_id)
+
+     pad_token_counts = padding_mask.sum(dim=1)
+
+     return pad_token_counts
+
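Editor's note: a small usage sketch (assumes the definition above is in scope). The last `logits_to_keep` positions are sliced off as completion tokens before counting:

    import torch

    pad = 0
    # Two prompts of length 4 (left-padded), followed by 2 completion positions.
    input_ids = torch.tensor([
        [pad, pad, 5, 6, 7, 8],
        [pad, 3,   4, 6, 7, 8],
    ])
    counts = calculate_pad_tokens_in_prompt(input_ids, logits_to_keep=2, pad_token_id=pad)
    print(counts)  # tensor([2, 1])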
+ def create_completion_attention_mask(
+     completion_input_ids: torch.Tensor,
+     left_pad_tokens_per_prompt: torch.Tensor,
+     max_left_pad: int,
+     pad_token_id: int
+ ) -> torch.Tensor:
+     """
+     Given a sequence [p, p, p, c, c, c, pad, pad, pad],
+
+     where p are extra prompt tokens picked up when slicing the tensor, c are completion
+     tokens, and pad are padding tokens, build a completion mask that zeroes out the pad
+     and p tokens: in this example [0, 0, 0, 1, 1, 1, 0, 0, 0].
+     """
+     batch_size, completion_len = completion_input_ids.shape
+     device = completion_input_ids.device
+
+     num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt
+
+     indices = torch.arange(completion_len, device=device).unsqueeze(0)
+     shift_mask = indices >= num_tokens_to_mask.unsqueeze(1)
+
+     non_padding_mask = (completion_input_ids != pad_token_id)
+
+     final_mask = shift_mask & non_padding_mask
+
+     return final_mask
+
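Editor's note: a worked example (assumes the definition above is in scope). Rows whose prompt had fewer left-pads than the batch maximum carry leftover prompt tokens at the front of the completion slice, and those get masked out:

    import torch

    pad = 0
    completion_ids = torch.tensor([
        [9, 9, 5, 6, pad],    # 2 leftover prompt tokens (9, 9), then completion, then pad
        [5, 6, 7, pad, pad],  # no leftover prompt tokens
    ])
    left_pads = torch.tensor([1, 3])  # per-prompt left-pad counts; batch max is 3
    mask = create_completion_attention_mask(completion_ids, left_pads, max_left_pad=3, pad_token_id=pad)
    print(mask.int())
    # tensor([[0, 0, 1, 1, 0],
    #         [1, 1, 1, 0, 0]])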
+ def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
+     """
+     Moves all padding tokens in each sequence of a batch to the right.
+     """
+     mask = (tensor != pad_id)
+     # Must use stable=True since a binary mask gives no ordering within equal keys
+     sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True)
+     packed_tensor = torch.gather(tensor, 1, sorted_indices)
+     return packed_tensor
+
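Editor's note: the stable argsort keeps the relative order of the non-pad tokens while pushing pads to the right. A quick sketch (assumes the definition above is in scope):

    import torch

    pad = 0
    x = torch.tensor([
        [pad, 7, pad, 8],
        [1, pad, 2, 3],
    ])
    print(left_pack_padding(x, pad))
    # tensor([[7, 8, 0, 0],
    #         [1, 2, 3, 0]])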
+ def align_logprobs_with_mask(
+     logprob_tensor: torch.Tensor,
+     attention_mask: torch.Tensor,
+     pad_value: float = 0.0
+ ) -> torch.Tensor:
+     """
+     Aligns a log probability tensor with a given attention mask.
+     """
+
+     device = logprob_tensor.device
+     batch_size, logprob_seq_len = logprob_tensor.shape
+     mask_seq_len = attention_mask.shape[1]
+
+     padded_logprobs = torch.full(
+         attention_mask.shape,
+         fill_value=pad_value,
+         dtype=logprob_tensor.dtype,
+         device=device
+     )
+
+     left_pad_counts = torch.argmax(attention_mask, dim=1)
+
+     cols = torch.arange(logprob_seq_len, device=device)
+     dest_indices = left_pad_counts.unsqueeze(1) + cols
+
+     # Create destination row indices
+     # Shape: [batch_size, logprob_seq_len]
+     row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices)
+
+     # --- 4. Filter out-of-bounds indices and perform assignment ---
+     # Create a mask to identify only the indices that are within the bounds
+     # of the target tensor's sequence length.
+     valid_mask = dest_indices < mask_seq_len
+
+     # Use this mask to select only the valid row indices, column indices,
+     # and the corresponding values from the logprob tensor.
+     # This flattens the selected elements into 1D tensors.
+     valid_rows = row_indices[valid_mask]
+     valid_cols = dest_indices[valid_mask]
+     valid_vals = logprob_tensor[valid_mask]
+
+     # Place the valid values into their correct positions in the padded tensor
+     # using a single, efficient advanced indexing operation.
+     padded_logprobs[valid_rows, valid_cols] = valid_vals
+
+     return padded_logprobs
+
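Editor's note: each row of log-probs is shifted right by that row's left-pad count (the argmax finds the first 1 in the attention mask). A tiny sketch (assumes the definition above is in scope):

    import torch

    logps = torch.tensor([[0.1, 0.2, 0.3]])
    attn  = torch.tensor([[0, 0, 1, 1, 1]])  # two left-pads
    print(align_logprobs_with_mask(logps, attn))
    # tensor([[0.0000, 0.0000, 0.1000, 0.2000, 0.3000]])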
+ def autotune_batch_and_chunks(
+     total_input_rows,
+     seq_len,
+     hidden_size,
+     vocab_size,
+     dtype_bytes=16,
+     multiplier=None
+ ):
+     if multiplier is None:
+         final_m = max(4, seq_len // 4096)
+     else:
+         final_m = multiplier
+
+     if torch.cuda.is_available():
+         free_bytes, _ = torch.cuda.mem_get_info()
+         limit_gb = (free_bytes / (1024**3)) * 0.80
+     elif hasattr(torch, "xpu") and torch.xpu.is_available():
+         # For XPU: estimate free memory from total - reserved
+         total_mem = torch.xpu.get_device_properties(0).total_memory
+         reserved_mem = torch.xpu.memory_reserved()
+         free_bytes = total_mem - reserved_mem
+         limit_gb = (free_bytes / (1024**3)) * 0.80
+     else:
+         # Fallback: assume 8GB available
+         limit_gb = 8.0
+
+     bytes_to_gb = 1024**3
+
+     b_vals = torch.arange(total_input_rows, 0, -1, device='cpu', dtype=torch.float32)
+
+     hidden_gb = (b_vals * seq_len * hidden_size * dtype_bytes) / bytes_to_gb
+
+     base_logits = ((b_vals / total_input_rows) * b_vals * seq_len * vocab_size * dtype_bytes) / bytes_to_gb
+     logits_gb = base_logits / final_m
+
+     total_mem_gb = hidden_gb + logits_gb
+
+     valid_mask = total_mem_gb <= limit_gb
+     valid_indices = torch.nonzero(valid_mask, as_tuple=False)
+
+     if valid_indices.shape[0] == 0:
+         # This means your GPU will OOM
+         return 4, final_m
+
+     best_idx = valid_indices[0].item()
+     final_b = int(b_vals[best_idx].item())
+
+     return final_b, final_m
+
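Editor's note: as I read it, the search tries batch sizes from `total_input_rows` down to 1 and keeps the largest one whose estimated hidden-state plus chunked-logit footprint fits in roughly 80% of free device memory. A usage sketch (assumes the definition above is in scope; the printed values depend on the machine):

    # Rough cost model being searched, in GB per candidate batch size b:
    #   hidden  = b * seq_len * hidden_size * dtype_bytes
    #   logits ~= (b / total_rows) * b * seq_len * vocab_size * dtype_bytes / multiplier
    b, m = autotune_batch_and_chunks(
        total_input_rows = 64,
        seq_len          = 2048,
        hidden_size      = 4096,
        vocab_size       = 128_256,
    )
    print(b, m)  # chosen micro-batch rows and the logit-chunk multiplier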
+ def sanitize_logprob(logprob):
+     """Local port of trl.scripts.vllm_serve.sanitize_logprob.
+     Filters NaN logprobs from vLLM outputs."""
+     value = logprob.logprob
+     if math.isnan(value):
+         logging.getLogger(__name__).warning(
+             f"Generated NaN logprob, token logprob '{logprob}' will be ignored"
+         )
+         return None
+     return value
+
+ @dataclass
+ class UnslothXPOConfig(XPOConfig):
+     """
+
+     Configuration class for the [`XPOTrainer`].
+
+     Subclass of [`OnlineDPOConfig`]; we can use all its arguments and add the following:
+
+     Parameters:
+         alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`):
+             Weight of the XPO loss term. If a list of floats is provided, the alpha is selected for each new epoch
+             and the last alpha is used for the rest of the epochs.
+
+     """
+     vllm_sampling_params: Optional[Any] = field(
+         default = None,
+         metadata = {'help': 'vLLM SamplingParams'},
+     )
+     unsloth_num_chunks : Optional[int] = field(
+         default = -1,
+         metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'},
+     )
+     unsloth_logit_chunk_multiplier : Optional[int] = field(
+         default = None,
+         metadata = {'help': 'Multiplier for chunked logit computations.'},
+     )
+     unsloth_grpo_mini_batch : Optional[int] = field(
+         default = None,
+         metadata = {'help': 'Mini batch size for GRPO hidden state accumulation. Default is None unless user defines it.'},
+     )
+     max_seq_length : Optional[int] = field(
+         default = None,
+         metadata = {'help': 'Maximum sequence length to truncate to.'},
+     )
+     def __init__(
+         self,
+         output_dir = None,
+         per_device_train_batch_size = 4,
+         num_train_epochs = 3.0,
+         max_steps = -1,
+         learning_rate = 5e-05,
+         lr_scheduler_type = 'linear',
+         lr_scheduler_kwargs = None,
+         warmup_steps = 0.1,
+         optim = 'adamw_8bit',
+         optim_args = None,
+         weight_decay = 0.01,
+         adam_beta1 = 0.9,
+         adam_beta2 = 0.999,
+         adam_epsilon = 1e-08,
+         optim_target_modules = None,
+         gradient_accumulation_steps = 2,
+         average_tokens_across_devices = True,
+         max_grad_norm = 1.0,
+         label_smoothing_factor = 0.0,
+         bf16 = False,
+         fp16 = False,
+         bf16_full_eval = False,
+         fp16_full_eval = False,
+         tf32 = None,
+         gradient_checkpointing = True,
+         gradient_checkpointing_kwargs = None,
+         torch_compile = False,
+         torch_compile_backend = None,
+         torch_compile_mode = None,
+         use_liger_kernel = False,
+         liger_kernel_config = None,
+         use_cache = False,
+         neftune_noise_alpha = None,
+         torch_empty_cache_steps = 250,
+         auto_find_batch_size = False,
+         logging_strategy = 'steps',
+         logging_steps = 1,
+         logging_first_step = False,
+         log_on_each_node = True,
+         logging_nan_inf_filter = False,
+         include_num_input_tokens_seen = False,
+         log_level = 'passive',
+         log_level_replica = 'warning',
+         disable_tqdm = None,
+         report_to = 'none',
+         run_name = None,
+         project = 'huggingface',
+         trackio_space_id = 'trackio',
+         eval_strategy = 'no',
+         eval_steps = None,
+         eval_delay = 0,
+         per_device_eval_batch_size = 4,
+         prediction_loss_only = False,
+         eval_on_start = False,
+         eval_do_concat_batches = True,
+         eval_use_gather_object = False,
+         eval_accumulation_steps = 2,
+         batch_eval_metrics = False,
+         save_only_model = False,
+         save_strategy = 'steps',
+         save_steps = 500,
+         save_on_each_node = False,
+         save_total_limit = None,
+         enable_jit_checkpoint = False,
+         push_to_hub = False,
+         hub_token = None,
+         hub_private_repo = None,
+         hub_model_id = None,
+         hub_strategy = 'every_save',
+         hub_always_push = False,
+         hub_revision = None,
+         load_best_model_at_end = False,
+         metric_for_best_model = None,
+         greater_is_better = None,
+         ignore_data_skip = False,
+         restore_callback_states_from_checkpoint = False,
+         full_determinism = False,
+         seed = 3407,
+         data_seed = 3407,
+         use_cpu = False,
+         accelerator_config = None,
+         parallelism_config = None,
+         dataloader_drop_last = False,
+         dataloader_num_workers = 0,
+         dataloader_pin_memory = True,
+         dataloader_persistent_workers = False,
+         dataloader_prefetch_factor = None,
+         remove_unused_columns = True,
+         label_names = None,
+         train_sampling_strategy = 'random',
+         length_column_name = 'length',
+         ddp_find_unused_parameters = None,
+         ddp_bucket_cap_mb = None,
+         ddp_broadcast_buffers = None,
+         ddp_backend = None,
+         ddp_timeout = 1800,
+         fsdp = None,
+         fsdp_config = None,
+         deepspeed = None,
+         debug = '',
+         skip_memory_metrics = True,
+         do_train = False,
+         do_eval = False,
+         do_predict = False,
+         resume_from_checkpoint = None,
+         warmup_ratio = None,
+         logging_dir = None,
+         local_rank = -1,
+         reward_model_path = None,
+         judge = None,
+         max_new_tokens = 64,
+         max_length = 512,
+         temperature = 0.9,
+         top_p = 1.0,
+         top_k = None,
+         min_p = None,
+         repetition_penalty = 1.0,
+         generation_kwargs = {},
+         use_transformers_paged = False,
+         cache_implementation = None,
+         missing_eos_penalty = None,
+         loss_type = 'sigmoid',
+         disable_dropout = True,
+         use_vllm = False,
+         vllm_model_impl = 'vllm',
+         vllm_guided_decoding_regex = None,
+         vllm_gpu_memory_utilization = 0.55,
+         vllm_mode = 'colocate',
+         vllm_server_base_url = None,
+         vllm_server_host = '0.0.0.0',
+         vllm_server_port = 8000,
+         vllm_server_timeout = 240.0,
+         vllm_tensor_parallel_size = 1,
+         ds3_gather_for_generation = True,
+         model_init_kwargs = None,
+         reward_weights = None,
+         dataset_num_proc = None,
+         gpu_memory_utilization = None,
+         vllm_sampling_params = None,
+         unsloth_num_chunks = -1,
+         unsloth_logit_chunk_multiplier = None,
+         unsloth_grpo_mini_batch = None,
+         max_seq_length = None,
+         **kwargs,
+     ):
+         if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!')
+         if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!')
+         if num_train_epochs is None:
+             num_train_epochs = 3.0  # Default to 3 epochs if None, max_steps will override
+         if output_dir is None and save_strategy == 'steps' and save_steps == 500:
+             output_dir = 'unsloth_training_checkpoints'
+             save_strategy = 'no'
+         import multiprocessing as _mp
+         if _mp.get_start_method() != 'fork':
+             dataset_num_proc = None
+         elif dataset_num_proc is None:
+             import psutil
+             dataset_num_proc = min(max((psutil.cpu_count() or 1)+4, 2), 64)
+             memory_gb_left = psutil.virtual_memory().available / (1024**3)
+             if memory_gb_left <= 2: dataset_num_proc = 1
+             else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left))
+         if temperature <= 0:
+             raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
+         elif temperature >= 10:
+             raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
+
+
+         super().__init__(
+             output_dir = output_dir,
+             per_device_train_batch_size = per_device_train_batch_size,
+             num_train_epochs = num_train_epochs,
+             max_steps = max_steps,
+             learning_rate = learning_rate,
+             lr_scheduler_type = lr_scheduler_type,
+             lr_scheduler_kwargs = lr_scheduler_kwargs,
+             warmup_steps = warmup_steps,
+             optim = optim,
+             optim_args = optim_args,
+             weight_decay = weight_decay,
+             adam_beta1 = adam_beta1,
+             adam_beta2 = adam_beta2,
+             adam_epsilon = adam_epsilon,
+             optim_target_modules = optim_target_modules,
+             gradient_accumulation_steps = gradient_accumulation_steps,
+             average_tokens_across_devices = average_tokens_across_devices,
+             max_grad_norm = max_grad_norm,
+             label_smoothing_factor = label_smoothing_factor,
+             bf16 = bf16,
+             fp16 = fp16,
+             bf16_full_eval = bf16_full_eval,
+             fp16_full_eval = fp16_full_eval,
+             tf32 = tf32,
+             gradient_checkpointing = gradient_checkpointing,
+             gradient_checkpointing_kwargs = gradient_checkpointing_kwargs,
+             torch_compile = torch_compile,
+             torch_compile_backend = torch_compile_backend,
+             torch_compile_mode = torch_compile_mode,
+             use_liger_kernel = use_liger_kernel,
+             liger_kernel_config = liger_kernel_config,
+             use_cache = use_cache,
+             neftune_noise_alpha = neftune_noise_alpha,
+             torch_empty_cache_steps = torch_empty_cache_steps,
+             auto_find_batch_size = auto_find_batch_size,
+             logging_strategy = logging_strategy,
+             logging_steps = logging_steps,
+             logging_first_step = logging_first_step,
+             log_on_each_node = log_on_each_node,
+             logging_nan_inf_filter = logging_nan_inf_filter,
+             include_num_input_tokens_seen = include_num_input_tokens_seen,
+             log_level = log_level,
+             log_level_replica = log_level_replica,
+             disable_tqdm = disable_tqdm,
+             report_to = report_to,
+             run_name = run_name,
+             project = project,
+             trackio_space_id = trackio_space_id,
+             eval_strategy = eval_strategy,
+             eval_steps = eval_steps,
+             eval_delay = eval_delay,
+             per_device_eval_batch_size = per_device_eval_batch_size,
+             prediction_loss_only = prediction_loss_only,
+             eval_on_start = eval_on_start,
+             eval_do_concat_batches = eval_do_concat_batches,
+             eval_use_gather_object = eval_use_gather_object,
+             eval_accumulation_steps = eval_accumulation_steps,
+             batch_eval_metrics = batch_eval_metrics,
+             save_only_model = save_only_model,
+             save_strategy = save_strategy,
+             save_steps = save_steps,
+             save_on_each_node = save_on_each_node,
+             save_total_limit = save_total_limit,
+             enable_jit_checkpoint = enable_jit_checkpoint,
+             push_to_hub = push_to_hub,
+             hub_token = hub_token,
+             hub_private_repo = hub_private_repo,
+             hub_model_id = hub_model_id,
+             hub_strategy = hub_strategy,
+             hub_always_push = hub_always_push,
+             hub_revision = hub_revision,
+             load_best_model_at_end = load_best_model_at_end,
+             metric_for_best_model = metric_for_best_model,
+             greater_is_better = greater_is_better,
+             ignore_data_skip = ignore_data_skip,
+             restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint,
+             full_determinism = full_determinism,
+             seed = seed,
+             data_seed = data_seed,
+             use_cpu = use_cpu,
+             accelerator_config = accelerator_config,
+             parallelism_config = parallelism_config,
+             dataloader_drop_last = dataloader_drop_last,
+             dataloader_num_workers = dataloader_num_workers,
+             dataloader_pin_memory = dataloader_pin_memory,
+             dataloader_persistent_workers = dataloader_persistent_workers,
+             dataloader_prefetch_factor = dataloader_prefetch_factor,
+             remove_unused_columns = remove_unused_columns,
+             label_names = label_names,
+             train_sampling_strategy = train_sampling_strategy,
+             length_column_name = length_column_name,
+             ddp_find_unused_parameters = ddp_find_unused_parameters,
+             ddp_bucket_cap_mb = ddp_bucket_cap_mb,
+             ddp_broadcast_buffers = ddp_broadcast_buffers,
+             ddp_backend = ddp_backend,
+             ddp_timeout = ddp_timeout,
+             fsdp = fsdp,
+             fsdp_config = fsdp_config,
+             deepspeed = deepspeed,
+             debug = debug,
+             skip_memory_metrics = skip_memory_metrics,
+             do_train = do_train,
+             do_eval = do_eval,
+             do_predict = do_predict,
+             resume_from_checkpoint = resume_from_checkpoint,
+             warmup_ratio = warmup_ratio,
+             logging_dir = logging_dir,
+             local_rank = local_rank,
+             reward_model_path = reward_model_path,
+             judge = judge,
+             max_new_tokens = max_new_tokens,
+             max_length = max_length,
+             temperature = temperature,
+             top_p = top_p,
+             top_k = top_k,
+             min_p = min_p,
+             repetition_penalty = repetition_penalty,
+             generation_kwargs = generation_kwargs,
+             use_transformers_paged = use_transformers_paged,
+             cache_implementation = cache_implementation,
+             missing_eos_penalty = missing_eos_penalty,
+             loss_type = loss_type,
+             disable_dropout = disable_dropout,
+             use_vllm = use_vllm,
+             vllm_model_impl = vllm_model_impl,
+             vllm_guided_decoding_regex = vllm_guided_decoding_regex,
+             vllm_gpu_memory_utilization = vllm_gpu_memory_utilization,
+             vllm_mode = vllm_mode,
+             vllm_server_base_url = vllm_server_base_url,
+             vllm_server_host = vllm_server_host,
+             vllm_server_port = vllm_server_port,
+             vllm_server_timeout = vllm_server_timeout,
+             vllm_tensor_parallel_size = vllm_tensor_parallel_size,
+             ds3_gather_for_generation = ds3_gather_for_generation,
+             model_init_kwargs = model_init_kwargs,
+             reward_weights = reward_weights,
+             dataset_num_proc = dataset_num_proc,
+             gpu_memory_utilization = gpu_memory_utilization, **kwargs)
+         self.vllm_sampling_params = vllm_sampling_params
+         self.unsloth_num_chunks = unsloth_num_chunks
+         if unsloth_grpo_mini_batch is not None:
+             if self.generation_batch_size >= unsloth_grpo_mini_batch:
+                 self.unsloth_grpo_mini_batch = unsloth_grpo_mini_batch
+             else:
+                 raise ValueError(
+                     f"Unsloth GRPO mini batch size needs to be less than or equal to the effective generation batch size, "
+                     f"which is per_device_train_batch_size * gradient_accumulation_steps."
+                 )
+         self.unsloth_logit_chunk_multiplier = unsloth_logit_chunk_multiplier
+         self.max_seq_length = max_seq_length
+
+ pass
+
+ class _UnslothXPOTrainer(OnlineDPOTrainer):
+     """"""
+
+     _tag_names = ["trl", "xpo"]
+     _name = "XPO"
+     _paper = {
+         "title": "Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF",
+         "id": "2405.21046",
+         # docstyle-ignore
+         "citation": textwrap.dedent("""\
+             @article{jung2024binary,
+                 title = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}},
+                 author = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin},
+                 year = 2024,
+                 eprint = {arXiv:2405.21046}
+             }"""),
+     }
+
+     def __init__(
+         self,
+         model: Union[PreTrainedModel, nn.Module] = None,
+         ref_model: Union[PreTrainedModel, nn.Module] = None,
+         reward_funcs: Optional[nn.Module] = None,
+         judge: Optional[BasePairwiseJudge] = None,
+         args: Optional[XPOConfig] = None,
+         data_collator: Optional[Callable] = None,
+         train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
+         eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
+         processing_class: Optional[
+             Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
+         ] = None,
+         reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
+         peft_config: Optional[dict] = None,
+         compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
+         callbacks: Optional[list[TrainerCallback]] = None,
+         optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
+         preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
+         # Deprecated parameters
+         reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
+     ) -> None:
+         super().__init__(
+             model=model,
+             ref_model=ref_model,
+             judge=judge,
+             reward_funcs=reward_funcs,
+             reward_model=reward_model,
+             args=args,
+             data_collator=data_collator,
+             train_dataset=train_dataset,
+             eval_dataset=eval_dataset,
+             processing_class=processing_class,
+             reward_processing_classes=reward_processing_classes,
+             peft_config=peft_config,
+             compute_metrics=compute_metrics,
+             callbacks=callbacks,
+             optimizers=optimizers,
+             preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+         )
+
+         self._alpha = self.args.alpha
+
+         # Overwrite the stats dictionary to include XPO specific statistics
+         self.stats = {
+             # Remove "non_score_reward", "rlhf_reward", "scores"
+             # Add "loss/dpo", "loss/xpo"
+             "loss/dpo": [],
+             "loss/xpo": [],
+             "objective/kl": [],
+             "objective/entropy": [],
+             "rewards/chosen": [],
+             "rewards/rejected": [],
+             "rewards/accuracies": [],
+             "rewards/margins": [],
+             "logps/chosen": [],
+             "logps/rejected": [],
+             # Replace "contain_eos_token" by "model_contain_eos_token" and "ref_contain_eos_token"
+             "val/model_contain_eos_token": [],
+             "val/ref_contain_eos_token": [],
+             "alpha": [],
+             "beta": [],
+         }
+         if self.reward_funcs is not None:
+             if len(self.reward_funcs) != 1:
+                 raise ValueError("XPOTrainer only supports one reward function/model.")
+             self.reward_funcs = self.reward_funcs[0]
+             self.stats["objective/model_scores"] = []
+             self.stats["objective/ref_scores"] = []
+             self.stats["objective/scores_margin"] = []
+
+     @property
+     def alpha(self):
+         if isinstance(self._alpha, list):
+             epoch = self.state.epoch
+             return self._alpha[epoch] if epoch < len(self._alpha) else self._alpha[-1]
+         else:
+             return self._alpha
+
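Editor's note: the `alpha` property implements a per-epoch schedule: with a list, epoch i uses alpha[i] and later epochs reuse the last entry. One caveat worth flagging: transformers reports `self.state.epoch` as a float, so a strict reading of the list branch would need `int(epoch)` before indexing; the standalone sketch below uses an integer epoch:

    def alpha_for_epoch(alpha, epoch):
        # Same selection rule as the property above, factored out for illustration.
        if isinstance(alpha, list):
            return alpha[epoch] if epoch < len(alpha) else alpha[-1]
        return alpha

    print(alpha_for_epoch([1e-5, 5e-6, 1e-6], 1))  # 5e-06
    print(alpha_for_epoch([1e-5, 5e-6, 1e-6], 7))  # 1e-06 (clamped to the last entry)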
+     def _generate_completions(self, prompts, model):
+         with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_model_for_gen:
+             model_output = unwrapped_policy_model_for_gen.generate(
+                 input_ids=prompts["input_ids"],
+                 attention_mask=prompts["attention_mask"],
+                 generation_config=self.generation_config,
+             )
+
+         actual_model_for_ref_generation: torch.nn.Module
+         if self.ref_model is None:
+             unwrapped_main_model_for_ref_logic = self.accelerator.unwrap_model(model)
+
+             if is_peft_available() and isinstance(unwrapped_main_model_for_ref_logic, PeftModel):
+                 actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic.get_base_model()
+             else:
+                 actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic
+         else:
+             actual_model_for_ref_generation = self.accelerator.unwrap_model(self.ref_model)
+
+         with unwrap_model_for_generation(actual_model_for_ref_generation, self.accelerator) as final_ref_model_for_gen:
+             ref_output = final_ref_model_for_gen.generate(
+                 input_ids=prompts["input_ids"],
+                 attention_mask=prompts["attention_mask"],
+                 generation_config=self.generation_config,
+             )
+
+         return model_output, ref_output
+
+     def _process_completions(self, model_output, ref_output, prompts):
+         context_length = prompts["input_ids"].shape[1]
+
+         # Process model completions
+         model_completion_ids = model_output[:, context_length:]
+         model_completion_ids, model_completion_mask = truncate_right(
+             model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+         )
+         model_data = {
+             "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1),
+             "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1),
+             "raw": prompts["raw"],
+         }
+
+         # Process reference model completions
+         ref_completion_ids = ref_output[:, context_length:]
+         ref_completion_ids, ref_completion_mask = truncate_right(
+             ref_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id
+         )
+         ref_data = {
+             "input_ids": torch.cat((prompts["input_ids"], ref_completion_ids), dim=1),
+             "attention_mask": torch.cat((prompts["attention_mask"], ref_completion_mask), dim=1),
+             "raw": prompts["raw"],
+         }
+
+         return model_data, ref_data
+
+     def _compute_rewards(self, model_data, ref_data, context_length):
+         with torch.no_grad():
+             _, model_scores, _ = get_reward(
+                 self.reward_funcs, model_data["input_ids"], self.processing_class.pad_token_id, context_length
+             )
+             _, ref_scores, _ = get_reward(
+                 self.reward_funcs, ref_data["input_ids"], self.processing_class.pad_token_id, context_length
+             )
+
+         # Apply EOS penalty if needed
+         if self.args.missing_eos_penalty is not None:
+             model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
+             ref_contain_eos = torch.any(ref_data["input_ids"] == self.processing_class.eos_token_id, dim=-1)
+             model_scores[~model_contain_eos] -= self.args.missing_eos_penalty
+             ref_scores[~ref_contain_eos] -= self.args.missing_eos_penalty
+
+         return model_scores, ref_scores
+
+     def _compute_judge(self, model_data, ref_data, context_length):
+         prompts = model_data["raw"]
+         model_data_completions = self.processing_class.batch_decode(
+             model_data["input_ids"][:, context_length:], skip_special_tokens=True
+         )
+         model_data_completions = [completion.strip() for completion in model_data_completions]
+
+         ref_data_completions = self.processing_class.batch_decode(
+             ref_data["input_ids"][:, context_length:], skip_special_tokens=True
+         )
+         ref_data_completions = [completion.strip() for completion in ref_data_completions]
+
+         if is_conversational({"prompt": prompts[0]}):
+             model_data_completions = [
+                 [{"role": "assistant", "content": completion}] for completion in model_data_completions
+             ]
+             environment = jinja2.Environment()
+             template = environment.from_string(SIMPLE_CHAT_TEMPLATE)
+             prompts = [template.render(messages=message) for message in prompts]
+             model_data_completions = [template.render(messages=completion) for completion in model_data_completions]
+
+             ref_data_completions = [
+                 [{"role": "assistant", "content": completion}] for completion in ref_data_completions
+             ]
+             ref_data_completions = [template.render(messages=completion) for completion in ref_data_completions]
+
+         ranks_of_first_completion = self.judge.judge(
+             prompts,
+             list(zip(model_data_completions, ref_data_completions)),
+         )
+         # convert ranks to a True/False mask:
+         # when rank == 0, it means the first completion is the best
+         # when rank == 1, it means the second completion is the best
+         return torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=model_data["input_ids"].device)
+
+     def _compute_logprobs(self, model, model_data, ref_data, context_length):
+         def compute_logprobs_for_data(m, data):
+             output = m(data["input_ids"], attention_mask=data["attention_mask"])
+             logits = output.logits[:, context_length - 1 : -1]
+             token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:])
+             return token_logprobs
+
+         # Compute logprobs for model completions
+         model_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+         # Compute logprobs for model on reference completions (for XPO loss)
+         model_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
+
+         # Compute logprobs for reference model completions
+         with torch.no_grad():
+             if self.ref_model is None:
+                 with model.disable_adapter():
+                     ref_logprobs_model_data = compute_logprobs_for_data(model, model_data)
+                     ref_logprobs_ref_data = compute_logprobs_for_data(model, ref_data)
+             else:
+                 ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data)
+                 ref_logprobs_ref_data = compute_logprobs_for_data(self.ref_model, ref_data)
+
+         # Mask padding tokens
+         model_padding_mask = model_data["attention_mask"][:, context_length:] == 0
+         ref_padding_mask = ref_data["attention_mask"][:, context_length:] == 0
+         model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
+         model_logprobs_ref_data = model_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
+         ref_logprobs_ref_data = ref_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0)
+         ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0)
+
+         return model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data
+
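Editor's note: the slice `logits[:, context_length - 1 : -1]` in `_compute_logprobs` works because the logits at position t score token t+1, so the completion tokens `input_ids[:, context_length:]` are predicted by the logits one step earlier. A toy shape check:

    import torch

    B, P, C, V = 2, 4, 3, 11           # batch, prompt len, completion len, vocab
    logits = torch.randn(B, P + C, V)  # model output for the full sequence
    completion = torch.randint(0, V, (B, C))
    sliced = logits[:, P - 1 : -1]     # shape (B, C, V), aligned with `completion`
    assert sliced.shape == (B, C, V)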
+     def _compute_losses(
+         self,
+         model_logprobs_model_data,
+         model_logprobs_ref_data,
+         ref_logprobs_ref_data,
+         ref_logprobs_model_data,
+         chosen_mask,
+     ):
+         # Compute log probs
+         model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
+         model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
+         ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
+         ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
+
+         chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
+         chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
+         chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
+
+         rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
+         rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
+         rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
+
+         # Compute logits as the difference between chosen and rejected log ratios
+         logits = chosen_log_ratios - rejected_log_ratios
+
+         if self.args.loss_type == "sigmoid":
+             dpo_losses = -F.logsigmoid(self.beta * logits)
+         elif self.args.loss_type == "ipo":
+             dpo_losses = (logits - 1 / (2 * self.beta)) ** 2
+         else:
+             raise NotImplementedError(f"invalid loss type {self.args.loss_type}")
+
+         # Compute XPO specific loss
+         xpo_losses = self.alpha * model_logprobs_ref_data_sum
+
+         # Total loss
+         loss = (dpo_losses + xpo_losses).mean()
+
+         return loss, dpo_losses, xpo_losses
+
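Editor's note: a hand-computed sketch of the loss combination in `_compute_losses`, using made-up per-sequence log-prob sums. The XPO term adds `alpha * logp_policy(ref completion)` to the loss, so minimizing it pushes the policy's probability on reference-model samples down, which is how the exploration pressure enters:

    import torch
    import torch.nn.functional as F

    beta, alpha = 0.1, 1e-5
    chosen_log_ratios   = torch.tensor([0.7])    # policy-vs-ref log ratio, preferred completion
    rejected_log_ratios = torch.tensor([-0.2])
    model_logprobs_ref_data_sum = torch.tensor([-35.0])  # policy logp of the ref completion

    dpo_losses = -F.logsigmoid(beta * (chosen_log_ratios - rejected_log_ratios))  # ~0.6491
    xpo_losses = alpha * model_logprobs_ref_data_sum                              # -0.00035
    loss = (dpo_losses + xpo_losses).mean()
    print(loss)  # ~0.6488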
957
+ def _log_statistics(
958
+ self,
959
+ model_data,
960
+ ref_data,
961
+ model_logprobs_model_data,
962
+ model_logprobs_ref_data,
963
+ ref_logprobs_ref_data,
964
+ ref_logprobs_model_data,
965
+ chosen_mask,
966
+ dpo_losses,
967
+ xpo_losses,
968
+ context_length,
969
+ model_scores=None,
970
+ ref_scores=None,
971
+ ):
972
+ # Helper function to gather and compute mean
973
+ def gather_mean(tensor):
974
+ return self.accelerator.gather_for_metrics(tensor).mean().item()
975
+
976
+ # Log losses
977
+ self.stats["loss/dpo"].append(gather_mean(dpo_losses))
978
+ self.stats["loss/xpo"].append(gather_mean(xpo_losses))
979
+
980
+ # Log scores
981
+ if self.reward_funcs is not None:
982
+ self.stats["objective/model_scores"].append(gather_mean(model_scores))
983
+ self.stats["objective/ref_scores"].append(gather_mean(ref_scores))
984
+ self.stats["objective/scores_margin"].append(gather_mean(model_scores - ref_scores))
985
+
986
+ # Log logprobs
987
+ model_logprobs_model_data_sum = model_logprobs_model_data.sum(1)
988
+ model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1)
989
+ ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1)
990
+ ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1)
991
+
992
+ chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
993
+ chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
994
+ chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs
995
+
996
+ rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum)
997
+ rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum)
998
+ rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs
999
+
1000
+ self.stats["logps/chosen"].append(gather_mean(chosen_model_logprobs.mean() + chosen_ref_logprobs.mean()))
1001
+ self.stats["logps/rejected"].append(gather_mean(rejected_model_logprobs.mean() + rejected_ref_logprobs.mean()))
1002
+
1003
+ # Log rewards
1004
+ # Compute various statistics
1005
+ chosen_rewards = chosen_log_ratios * self.beta
1006
+ rejected_rewards = rejected_log_ratios * self.beta
1007
+ self.stats["rewards/chosen"].append(gather_mean(chosen_rewards.mean()))
1008
+ self.stats["rewards/rejected"].append(gather_mean(rejected_rewards.mean()))
1009
+
1010
+ # Calculate KL divergence for model and ref data
1011
+ kl_model_data = model_logprobs_model_data - ref_logprobs_model_data
1012
+ kl_ref_data = model_logprobs_ref_data - ref_logprobs_ref_data
1013
+ mean_kl = (kl_model_data.sum(1) + kl_ref_data.sum(1)).mean() / 2
1014
+ self.stats["objective/kl"].append(gather_mean(mean_kl))
1015
+
1016
+ # Calculate entropy for model and ref data
1017
+ entropy_model_data = -model_logprobs_model_data.sum(1)
1018
+ entropy_ref_data = -model_logprobs_ref_data.sum(1)
1019
+ mean_entropy = (entropy_model_data.mean() + entropy_ref_data.mean()) / 2
1020
+ self.stats["objective/entropy"].append(gather_mean(mean_entropy))
1021
+
1022
+ # Calculate margins
1023
+ margin = chosen_rewards - rejected_rewards
1024
+ self.stats["rewards/margins"].append(gather_mean(margin.mean()))
1025
+
1026
+ # Calculate accuracy
1027
+ accuracy = (margin > 0).float()
1028
+ self.stats["rewards/accuracies"].append(gather_mean(accuracy.mean()))
1029
+
1030
+ # Log EOS token statistics
1031
+ model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
1032
+ ref_eos = (ref_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1)
1033
+ self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float()))
1034
+ self.stats["val/ref_contain_eos_token"].append(gather_mean(ref_eos.float()))
1035
+
1036
+ # Log alpha and beta
1037
+ self.stats["alpha"].append(self.alpha)
1038
+ self.stats["beta"].append(self.beta)
1039
+
1040
+ def training_step(
1041
+ self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
1042
+ ) -> torch.Tensor:
1043
+ model.train()
1044
+
1045
+ # Apply chat template and tokenize the input
1046
+ batch_size = len(next(iter(inputs.values())))
1047
+ prompts = inputs["prompt"]
1048
+ inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)]
1049
+ inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs]
1050
+ inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs]
1051
+ inputs = self.data_collator(inputs)
1052
+
1053
+ # need the prompt_ only
1054
+ inputs = self._prepare_inputs(inputs)
1055
+ context_length = inputs["prompt_input_ids"].shape[1]
1056
+ prompts = {
1057
+ "input_ids": inputs["prompt_input_ids"],
1058
+ "attention_mask": inputs["prompt_attention_mask"],
1059
+ "raw": prompts,
1060
+ }
1061
+ del inputs
1062
+
1063
+ # Sample completions from both the model and the reference model
1064
+ model_output, ref_output = self._generate_completions(prompts, model)
1065
+
1066
+ # Process model completions
1067
+ model_data, ref_data = self._process_completions(model_output, ref_output, prompts)
1068
+
1069
+ # Compute rewards
1070
+ if self.reward_funcs is not None:
1071
+ model_scores, ref_scores = self._compute_rewards(model_data, ref_data, context_length)
1072
+ chosen_mask = model_scores >= ref_scores
1073
+ else:
1074
+ model_scores, ref_scores = None, None
1075
+ chosen_mask = self._compute_judge(model_data, ref_data, context_length)
1076
+
1077
+ # Compute logprobs
1078
+ model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data = (
1079
+ self._compute_logprobs(model, model_data, ref_data, context_length)
1080
+ )
1081
+
1082
+ # Compute loss
1083
+ loss, dpo_losses, xpo_losses = self._compute_losses(
1084
+ model_logprobs_model_data,
1085
+ model_logprobs_ref_data,
1086
+ ref_logprobs_ref_data,
1087
+ ref_logprobs_model_data,
1088
+ chosen_mask,
1089
+ )
1090
+
1091
+ # Log everything
1092
+ self._log_statistics(
1093
+ model_data,
1094
+ ref_data,
1095
+ model_logprobs_model_data.detach(),
1096
+ model_logprobs_ref_data.detach(),
1097
+ ref_logprobs_ref_data,
1098
+ ref_logprobs_model_data,
1099
+ chosen_mask,
1100
+ dpo_losses.detach(),
1101
+ xpo_losses.detach(),
1102
+ context_length,
1103
+ model_scores,
1104
+ ref_scores,
1105
+ )
1106
+
1107
+ if (
1108
+ self.args.torch_empty_cache_steps is not None
1109
+ and self.state.global_step % self.args.torch_empty_cache_steps == 0
1110
+ ):
1111
+ empty_cache()
1112
+
1113
+ kwargs = {}
1114
+ # For LOMO optimizers you need to explicitly use the learning rate
1115
+ if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
1116
+ kwargs["learning_rate"] = self._get_learning_rate()
1117
+
1118
+ if self.args.n_gpu > 1:
1119
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
1120
+
1121
+ self.accelerator.backward(loss, **kwargs)
1122
+
1123
+ return loss.detach() / self.args.gradient_accumulation_steps
1124
+ class UnslothXPOTrainer(_UnslothXPOTrainer):
1125
+ """
1126
+
1127
+ Trainer for Exploratory Preference Optimization (XPO).
1128
+
1129
+ It is implemented as a subclass of [`OnlineDPOTrainer`].
1130
+
1131
+ Args:
1132
+ model ([`~transformers.PreTrainedModel`]):
1133
+ The model to train, preferably an `AutoModelForCausalLM`.
1134
+ ref_model ([`PreTrainedModelWrapper`]):
1135
+ Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation
1136
+ and loss. If no reference model is provided, the trainer will create a reference model with the same
1137
+ architecture as the model to be optimized.
1138
+ reward_funcs ([`~transformers.PreTrainedModel`]):
1139
+ The reward model to score completions with, preferably an
1140
+ [`~transformers.AutoModelForSequenceClassification`].
1141
+ judge ([`BasePairwiseJudge`]):
1142
+ The judge to use for pairwise comparison of model completions.
1143
+ args ([`XPOConfig`]):
1144
+ The XPO config arguments to use for training.
1145
+ data_collator ([`~transformers.DataCollator`]):
1146
+ The data collator to use for training. If None is specified, the default data collator
1147
+ ([`DPODataCollatorWithPadding`]) will be used which will pad the sequences to the maximum length of the
1148
+ sequences in the batch, given a dataset of paired sequences.
1149
+ train_dataset ([`~datasets.Dataset`]):
1150
+ The dataset to use for training.
1151
+ eval_dataset ([`~datasets.Dataset`]):
1152
+ The dataset to use for evaluation.
1153
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*):
1154
+ Processing class used to process the data. If provided, will be used to automatically process the inputs
1155
+ for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
1156
+ reuse the fine-tuned model.
1157
+ peft_config (`dict`):
1158
+ The peft config to use for training.
1159
+ compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
1160
+ The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
1161
+ metric values.
1162
+ callbacks (`list[transformers.TrainerCallback]`):
1163
+ The callbacks to use for training.
1164
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
1165
+ The optimizer and scheduler to use for training.
1166
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
1167
+ The function to use to preprocess the logits before computing the metrics.
1168
+
1169
+ reward_model:
1170
+
1171
+ <Deprecated version="0.22.0">
1172
+
1173
+ This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead.
1174
+
1175
+ </Deprecated>
1176
+
1177
+ """
1178
+     def __init__(
+         self,
+         model = None,
+         ref_model = None,
+         reward_funcs = None,
+         judge = None,
+         args = None,
+         data_collator = None,
+         train_dataset = None,
+         eval_dataset = None,
+         processing_class = None,
+         reward_processing_classes = None,
+         peft_config = None,
+         compute_metrics = None,
+         callbacks = None,
+         preprocess_logits_for_metrics = None,
+         reward_model = None,
+         **kwargs
+     ):
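+         # The precision logic below matches the training flags to the model's actual
+         # dtype (float16 vs bfloat16), honoring the UNSLOTH_FORCE_FLOAT32 and
+         # UNSLOTH_MIXED_PRECISION environment variables, and mirrors the choice into
+         # ACCELERATE_MIXED_PRECISION.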
+         if args is None: args = UnslothXPOConfig()
+         use_bf16 = getattr(args, 'bf16', False)
+         if type(use_bf16) is not bool: use_bf16 = False
+         use_fp16 = getattr(args, 'fp16', False)
+         if type(use_fp16) is not bool: use_fp16 = False
+         force_float32 = False
+         full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
+         if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'):
+             print('Unsloth: Switching to float32 training since model cannot work with float16')
+             force_float32 = True
+         mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
+         dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
+         if dtype is None: dtype = model.get_input_embeddings().weight.dtype
+         from unsloth_zoo.utils import _get_dtype
+         dtype = _get_dtype(dtype)
+         float16 = dtype == torch.float16
+         if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
+         if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')
+         if force_float32:
+             # Forced float32 training
+             args.fp16 = False
+             args.bf16 = False
+             os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'
+             if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no'
+             # args.mixed_precision is a new argument which needs to be set now
+         elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32':
+             # Mixed precision training
+             args.fp16 = float16
+             args.bf16 = not float16
+             os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16'
+             if hasattr(args, 'mixed_precision'): args.mixed_precision = 'fp16' if float16 else 'bf16'
+             # args.mixed_precision is a new argument which needs to be set now
+         elif mixed_precision_dtype == 'bfloat16':
+             # Both False since bfloat16 full finetuning doesn't do any autocasting.
+             args.fp16 = False
+             args.bf16 = False
+             os.environ['ACCELERATE_MIXED_PRECISION'] = 'no'
+             if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no'
+             # args.mixed_precision is a new argument which needs to be set now
+ 
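+         # Evaluation defaults: turn on step-based evaluation when an eval dataset is
+         # supplied, warn on transformers <= 4.45.2 (which needs Unsloth's
+         # gradient-accumulation fix), and align eval batch size and accumulation
+         # with the training settings.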
+         if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no':
+             args.eval_strategy = 'steps'
+             if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1
+         ga_steps = getattr(args, 'gradient_accumulation_steps', None)
+         if ga_steps is not None and ga_steps > 1:
+             from transformers import __version__ as transformers_version
+             if Version(transformers_version) <= Version('4.45.2'):
+                 print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n'
+                       '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`')
+         if getattr(args, 'eval_strategy', 'no') != 'no':
+             eval_bsz = getattr(args, 'per_device_eval_batch_size', 8)
+             if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size
+             if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps
+         fp16_full_eval = getattr(args, 'fp16_full_eval', False)
+         if type(fp16_full_eval) is not bool: fp16_full_eval = False
+         bf16_full_eval = getattr(args, 'bf16_full_eval', False)
+         if type(bf16_full_eval) is not bool: bf16_full_eval = False
+         if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True
+         if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False
+         if force_float32:
+             args.bf16_full_eval = False
+             args.fp16_full_eval = False
+         elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16':
+             args.bf16_full_eval = True
+             args.fp16_full_eval = False
+         elif not bf16_full_eval and not fp16_full_eval:
+             args.bf16_full_eval = args.bf16
+             args.fp16_full_eval = args.fp16
+         _output_logits = False
+         if locals().get('compute_metrics', None) is not None: _output_logits = True
+         if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True
+         if _output_logits:
+             os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
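+         # transformers' Trainer expects `model.warnings_issued` to behave like a
+         # dict; coerce or reset it if a patched model carries something else.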
+         if model is not None:
+             _warnings_issued = getattr(model, 'warnings_issued', None)
+             if _warnings_issued is None:
+                 model.warnings_issued = {}
+             elif not isinstance(_warnings_issued, dict):
+                 try:
+                     model.warnings_issued = dict(_warnings_issued)
+                 except Exception:
+                     model.warnings_issued = {}
+         if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'):
+             pass
+         else:
+             model_max_seq_length = getattr(model, 'max_seq_length', None)
+             args_max_seq_length = getattr(args, 'max_seq_length', None)
+             if args_max_seq_length is None and model_max_seq_length is not None:
+                 max_seq_length = model.max_seq_length
+                 if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length
+             elif args_max_seq_length is not None and model_max_seq_length is not None:
+                 if args_max_seq_length > model_max_seq_length:
+                     print('Unsloth: You set `max_seq_length` as ' + str(args_max_seq_length) + ' but '
+                           'the maximum the model supports is ' + str(model_max_seq_length) + '. We shall reduce it.')
+                     args.max_seq_length = model_max_seq_length
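+         # Put the model into training mode (with gradient checkpointing) and force
+         # right-padding on the tokenizer / processing class.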
+         if model is not None and hasattr(model, 'for_training'):
+             model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
+         if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right'
+         if 'processing_class' in locals():
+             if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right'
+             if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right'
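+         # Collator fix-ups: use a language-modeling collator when the dataset has no
+         # 'labels' column, a seq2seq collator when it does, and fall back to the
+         # inner `.tokenizer` whenever the processing class itself cannot `pad`.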
+         __tokenizer = processing_class if 'processing_class' in locals() else tokenizer
+         from unsloth_zoo.vision_utils import UnslothVisionDataCollator
+         if not isinstance(data_collator, UnslothVisionDataCollator):
+             if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names:
+                 data_collator = TransformersDataCollatorForLanguageModeling(
+                     __tokenizer,
+                     mlm = False,
+                     mlm_probability = 0.0,
+                     pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                 )
+             elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names:
+                 data_collator = DataCollatorForSeq2Seq(
+                     __tokenizer,
+                     pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                 )
+         else:
+             if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False
+             if hasattr(args, 'dataset_text_field'): args.dataset_text_field = ''
+             if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True}
+         if not isinstance(data_collator, UnslothVisionDataCollator):
+             if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'):
+                 if isinstance(data_collator, DataCollatorForSeq2Seq):
+                     data_collator = DataCollatorForSeq2Seq(
+                         __tokenizer.tokenizer,
+                         pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                     )
+                 else:
+                     data_collator = TransformersDataCollatorForLanguageModeling(
+                         __tokenizer.tokenizer,
+                         mlm = False,
+                         mlm_probability = 0.0,
+                         pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None),
+                     )
+         other_metrics = []
+ 
+         from unsloth_zoo.logging_utils import PatchRLStatistics
+         PatchRLStatistics('xpo_trainer', other_metrics)
+ 
+         # [TODO] Fix up DataParallel multiplying batch sizes
+         # [TODO] DDP works, but DP seems to not work? [TODO]
+         if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1:
+             if getattr(args, "_n_gpu", 1) != 1:
+                 args._n_gpu = 1
+         if "model" in locals() and hasattr(model, "for_training"):
+             model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
+         super().__init__(
+             model = model,
+             ref_model = ref_model,
+             reward_funcs = reward_funcs,
+             judge = judge,
+             args = args,
+             data_collator = data_collator,
+             train_dataset = train_dataset,
+             eval_dataset = eval_dataset,
+             processing_class = processing_class,
+             reward_processing_classes = reward_processing_classes,
+             peft_config = peft_config,
+             compute_metrics = compute_metrics,
+             callbacks = callbacks,
+             preprocess_logits_for_metrics = preprocess_logits_for_metrics,
+             reward_model = reward_model,
+             **kwargs,
+         )
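+         # Post-init fix-ups: return the model to inference mode, remove any stale
+         # NEFTune hook, propagate the accelerator's grad scaler down the wrapped
+         # model chain, wrap `train` via prepare_for_training_mode, and copy the
+         # chat template onto the vLLM tokenizer if it lacks one.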
+ if "model" in locals() and hasattr(model, "for_inference"):
1360
+ model.for_inference()
1361
+ if hasattr(self, 'neftune_hook_handle'):
1362
+ self.neftune_hook_handle.remove()
1363
+ if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle
1364
+ if getattr(args, 'neftune_noise_alpha', None) is not None:
1365
+ model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha
1366
+ pass
1367
+ if hasattr(self, 'accelerator'):
1368
+ scaler = self.accelerator.scaler
1369
+ current_model = model
1370
+ while hasattr(current_model, 'model'):
1371
+ current_model.accelerator_scaler = scaler
1372
+ current_model = current_model.model
1373
+ current_model.accelerator_scaler = scaler
1374
+ pass
1375
+ if hasattr(self, 'train'):
1376
+ self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)
1377
+ pass
1378
+ if hasattr(self, 'llm') and self.llm is not None and hasattr(self.llm, 'get_tokenizer'):
1379
+ _vllm_tok = self.llm.get_tokenizer()
1380
+ _pc = getattr(self, 'processing_class', None) or getattr(self, 'tokenizer', None)
1381
+ if _vllm_tok is not None and _pc is not None and getattr(_pc, 'chat_template', None) is not None and getattr(_vllm_tok, 'chat_template', None) is None:
1382
+ _vllm_tok.chat_template = _pc.chat_template
1383
+ pass
1384
+
1385
+ pass