Keshav051
/

AntiAtropos

Model card Files Files and versions

div18 committed on Apr 25

Commit

863bb8c

·

1 Parent(s): 1b9be85

OOM

Files changed (2) hide show

training/config.yaml +1 -1
training/launch_train.py +1 -0

training/config.yaml CHANGED Viewed

@@ -46,7 +46,7 @@ loss_type: "reinforce_baseline"       # reinforce | reinforce_baseline | grpo
 num_episodes_per_iteration: 4         # Safe now: max_seq_len=512 + loss_batch_size=8 + CPU offload
 num_iterations: 500                   # Total training iterations
 parallel_episodes: true               # Batch generation across episodes (10x faster)
-loss_batch_size: 4                    # Qwen3.5 logits = 4×512×151936×4 = 1.24 GB per batch
 learning_rate: 2.0e-4
 per_device_train_batch_size: 2        # A10G can handle 2 with seq_len=1024
 gradient_accumulation_steps: 4        # Effective batch = 2*4 = 8 transitions

 num_episodes_per_iteration: 4         # Safe now: max_seq_len=512 + loss_batch_size=8 + CPU offload
 num_iterations: 500                   # Total training iterations
 parallel_episodes: true               # Batch generation across episodes (10x faster)
+loss_batch_size: 2                    # Qwen3.5 logits = 2×512×151936×4 = 0.62 GB per batch
 learning_rate: 2.0e-4
 per_device_train_batch_size: 2        # A10G can handle 2 with seq_len=1024
 gradient_accumulation_steps: 4        # Effective batch = 2*4 = 8 transitions

training/launch_train.py CHANGED Viewed

@@ -121,6 +121,7 @@ def build_job_command() -> str:
         "done\n"
         "\n"
         "echo '[bootstrap] Launching training (local server, Hub persistence)...'\n"
         "ANTIATROPOS_HUB_MODEL_REPO=$HUB_MODEL_REPO "
         "ANTIATROPOS_HUB_METRICS_DATASET=$HUB_METRICS_DATASET "
         "ANTIATROPOS_ENV_URL=http://localhost:8000 "

         "done\n"
         "\n"
         "echo '[bootstrap] Launching training (local server, Hub persistence)...'\n"
+        "export PYTORCH_ALLOC_CONF='expandable_segments:True'  # required by Qwen3.5 to avoid OOM fragmentation\n"
         "ANTIATROPOS_HUB_MODEL_REPO=$HUB_MODEL_REPO "
         "ANTIATROPOS_HUB_METRICS_DATASET=$HUB_METRICS_DATASET "
         "ANTIATROPOS_ENV_URL=http://localhost:8000 "