diff --git a/run.log b/run.log new file mode 100644 index 0000000000000000000000000000000000000000..c2a6dfc78cf15d93fdef568f6a48cf90bbe8d85e --- /dev/null +++ b/run.log @@ -0,0 +1,28969 @@ +[2026-03-25 15:34:03,236][mllm.models.large_language_model_local][INFO] - Initializing adapter 'agent_adapter': no initial weights provided or found; starting from scratch. +[2026-03-25 15:34:05,743][mllm.models.adapter_training_wrapper][INFO] - Adapter 'agent_adapter': initialized with fresh weights (no initial weights found). +[2026-03-25 15:34:05,750][mllm.models.large_language_model_local][INFO] - Initializing adapter 'critic_adapter': no initial weights provided or found; starting from scratch. +[2026-03-25 15:34:08,192][mllm.models.adapter_training_wrapper][INFO] - Adapter 'critic_adapter': initialized with fresh weights (no initial weights found). +[2026-03-25 15:37:18,192][__main__][INFO] - Starting iteration 0. +[2026-03-25 15:37:18,199][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:37:18,200][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:37:24,414][__main__][INFO] - Number of regex retries in iteration 0: 0 +[2026-03-25 15:37:24,415][__main__][INFO] - agents played in iteration 0 are Bob, Alice +[2026-03-25 15:37:25,480][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:37:26,045][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:37:28,179][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:37:29,088][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:37:29,409][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:37:29,731][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:37:30,054][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:37:30,375][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:37:30,697][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:37:31,020][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:37:31,341][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:37:31,663][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:37:31,984][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:37:32,306][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:37:32,626][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:37:32,947][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:37:33,268][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:37:33,589][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:37:33,910][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:37:34,232][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:37:34,552][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:37:34,874][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:37:35,194][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:37:35,518][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:37:35,840][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:37:36,162][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:37:36,482][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:37:36,802][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:37:37,124][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:37:37,446][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:37:37,768][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:37:38,088][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:37:38,408][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:37:38,730][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:37:39,052][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:37:39,374][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:37:39,695][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:37:40,017][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:37:40,337][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:37:40,659][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:37:40,981][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:37:41,302][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:37:41,624][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:37:41,945][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:37:42,266][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:37:42,586][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:37:42,907][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:37:43,526][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:37:43,847][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:37:44,167][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:37:44,488][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:37:44,810][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:37:45,131][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:37:45,454][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:37:45,776][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:37:46,097][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:37:46,419][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:37:46,741][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:37:47,062][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:37:47,382][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:37:47,704][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:37:48,025][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:37:48,347][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:37:48,669][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:37:48,992][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:37:49,312][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:37:50,863][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 8.33%, Current % of VRAM taken: 45.78%, Block Peak % of device VRAM: 25.84%, ΔTime: 00:00:24 +[2026-03-25 15:37:51,490][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:37:51,492][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:37:51,494][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:37:52,149][__main__][INFO] - Iteration 1 took 33s (18.31% Gen, 79.76% Train). Generation: 6s, Training: 27s. Estimated remaining time: 9h 21m 26s. Estimated total time: 9h 25m 51s. Time estimates for 10 more iterations: 5m 39s, 100 more iterations: 56m 35s, 500 more iterations: 4h 42m 55s. +[2026-03-25 15:37:52,151][__main__][INFO] - Starting iteration 1. +[2026-03-25 15:37:52,155][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:37:52,155][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:37:56,680][__main__][INFO] - Number of regex retries in iteration 1: 0 +[2026-03-25 15:37:56,681][__main__][INFO] - agents played in iteration 1 are Bob, Alice +[2026-03-25 15:37:57,944][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:37:58,985][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:38:00,512][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:38:00,817][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:38:01,139][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:38:01,461][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:38:01,782][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:38:02,104][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:38:02,427][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:38:02,750][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:38:03,072][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:38:03,392][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:38:03,713][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:38:04,036][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:38:04,359][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:38:04,681][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:38:05,002][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:38:05,324][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:38:05,645][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:38:05,966][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:38:06,287][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:38:06,610][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:38:06,932][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:38:07,255][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:38:07,578][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:38:07,901][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:38:08,223][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:38:08,545][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:38:08,866][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:38:09,187][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:38:09,508][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:38:09,829][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:38:10,152][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:38:10,472][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:38:10,794][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:38:11,116][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:38:11,440][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:38:11,763][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:38:12,084][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:38:12,406][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:38:12,728][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:38:13,051][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:38:13,371][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:38:13,692][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:38:14,015][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:38:14,338][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:38:14,661][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:38:14,982][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:38:15,304][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:38:15,626][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:38:15,948][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:38:16,270][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:38:16,592][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:38:17,229][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:38:17,550][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:38:17,871][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:38:18,192][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:38:18,513][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:38:18,835][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:38:19,157][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:38:19,480][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:38:19,801][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:38:20,122][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:38:20,444][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:38:20,766][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:38:21,088][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:38:22,448][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:38:23,152][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:38:23,155][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:38:23,156][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:38:23,801][__main__][INFO] - Iteration 2 took 31s (14.30% Gen, 83.66% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 42m 31s. Estimated total time: 8h 47m 27s. Time estimates for 10 more iterations: 5m 16s, 100 more iterations: 52m 44s, 500 more iterations: 4h 23m 43s. +[2026-03-25 15:38:23,803][__main__][INFO] - Starting iteration 2. 
+[2026-03-25 15:38:23,807][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:38:23,808][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:38:28,251][__main__][INFO] - Number of regex retries in iteration 2: 0 +[2026-03-25 15:38:28,253][__main__][INFO] - agents played in iteration 2 are Bob, Alice +[2026-03-25 15:38:29,517][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:38:30,563][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:38:32,087][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:38:32,394][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:38:32,717][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:38:33,040][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:38:33,364][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:38:33,685][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:38:34,007][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:38:34,329][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:38:34,652][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:38:34,975][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:38:35,295][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:38:35,617][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:38:35,938][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:38:36,260][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:38:36,582][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:38:36,905][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:38:37,226][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:38:37,547][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:38:37,869][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:38:38,191][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:38:38,513][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:38:38,835][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:38:39,156][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:38:39,476][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:38:39,797][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:38:40,119][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:38:40,442][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:38:40,763][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:38:41,084][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:38:41,406][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:38:41,726][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:38:42,047][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:38:42,368][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:38:42,689][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:38:43,009][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:38:43,330][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:38:43,651][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:38:43,973][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:38:44,295][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:38:44,616][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:38:44,937][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:38:45,260][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:38:45,581][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:38:45,902][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:38:46,222][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:38:46,545][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:38:46,866][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:38:47,186][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:38:47,507][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:38:47,829][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:38:48,150][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:38:48,759][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:38:49,082][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:38:49,404][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:38:49,726][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:38:50,047][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:38:50,371][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:38:50,694][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:38:51,016][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:38:51,340][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:38:51,663][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:38:51,984][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:38:52,306][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:38:52,627][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:38:54,024][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:38:54,724][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:38:54,726][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:38:54,728][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:38:55,362][__main__][INFO] - Iteration 3 took 31s (14.08% Gen, 83.90% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 40m 28s. Estimated total time: 8h 45m 55s. 
Time estimates for 10 more iterations: 5m 15s, 100 more iterations: 52m 35s, 500 more iterations: 4h 22m 57s. +[2026-03-25 15:38:55,364][__main__][INFO] - Starting iteration 3. +[2026-03-25 15:38:55,368][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:38:55,368][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:38:59,921][__main__][INFO] - Number of regex retries in iteration 3: 0 +[2026-03-25 15:38:59,922][__main__][INFO] - agents played in iteration 3 are Bob, Alice +[2026-03-25 15:39:01,081][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:39:02,120][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:39:03,654][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:39:03,964][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:39:04,286][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:39:04,606][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:39:04,927][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:39:05,251][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:39:05,573][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:39:05,896][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:39:06,217][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:39:06,539][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:39:06,860][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:39:07,181][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:39:07,502][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:39:07,825][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:39:08,147][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:39:08,468][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:39:08,789][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:39:09,109][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:39:09,432][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:39:09,755][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:39:10,077][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:39:10,399][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:39:10,720][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:39:11,042][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:39:11,362][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:39:11,683][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:39:12,005][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:39:12,326][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:39:12,648][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:39:12,971][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:39:13,292][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:39:13,612][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:39:13,936][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:39:14,256][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:39:14,579][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:39:14,901][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:39:15,224][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:39:15,544][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:39:15,868][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:39:16,189][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:39:16,510][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:39:16,831][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:39:17,151][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:39:17,474][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:39:17,794][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:39:18,116][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:39:18,439][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:39:18,761][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:39:19,083][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:39:19,405][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:39:19,726][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:39:20,357][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:39:20,678][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:39:21,000][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:39:21,321][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:39:21,642][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:39:21,963][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:39:22,284][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:39:22,606][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:39:22,928][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:39:23,248][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:39:23,571][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:39:23,892][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:39:24,214][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:39:25,618][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:39:26,316][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:39:26,318][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:39:26,320][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:39:26,958][__main__][INFO] - Iteration 4 took 31s (14.42% Gen, 83.56% Train). 
Generation: 4s, Training: 26s. Estimated remaining time: 8h 40m 32s. Estimated total time: 8h 46m 31s. Time estimates for 10 more iterations: 5m 15s, 100 more iterations: 52m 39s, 500 more iterations: 4h 23m 15s. +[2026-03-25 15:39:26,960][__main__][INFO] - Starting iteration 4. +[2026-03-25 15:39:26,964][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:39:26,964][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:39:31,412][__main__][INFO] - Number of regex retries in iteration 4: 0 +[2026-03-25 15:39:31,413][__main__][INFO] - agents played in iteration 4 are Bob, Alice +[2026-03-25 15:39:32,671][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:39:33,710][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:39:35,240][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:39:35,547][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:39:35,868][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:39:36,191][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:39:36,512][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:39:36,833][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:39:37,156][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:39:37,479][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:39:37,802][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:39:38,126][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:39:38,449][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 
15:39:38,772][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:39:39,094][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:39:39,418][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:39:39,741][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:39:40,062][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:39:40,384][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:39:40,707][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:39:41,028][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:39:41,351][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:39:41,673][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:39:41,996][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:39:42,317][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:39:42,640][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:39:42,963][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:39:43,284][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:39:43,606][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:39:43,929][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:39:44,252][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:39:44,574][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:39:44,897][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:39:45,219][mllm.training.trainer_common][INFO] - 
Processing mini-batch 128 of 256 +[2026-03-25 15:39:45,542][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:39:45,865][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:39:46,187][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:39:46,510][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:39:46,833][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:39:47,157][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:39:47,479][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:39:47,802][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:39:48,124][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:39:48,446][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:39:48,768][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:39:49,091][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:39:49,413][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:39:49,734][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:39:50,057][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:39:50,379][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:39:50,701][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:39:51,024][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:39:51,345][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:39:51,966][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 
+[2026-03-25 15:39:52,287][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:39:52,608][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:39:52,929][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:39:53,250][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:39:53,571][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:39:53,891][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:39:54,212][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:39:54,533][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:39:54,855][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:39:55,176][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:39:55,497][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:39:55,818][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:39:57,155][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:39:57,856][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:39:57,859][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:39:57,860][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:39:58,492][__main__][INFO] - Iteration 5 took 31s (14.11% Gen, 83.88% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 38m 58s. Estimated total time: 8h 45m 29s. Time estimates for 10 more iterations: 5m 15s, 100 more iterations: 52m 32s, 500 more iterations: 4h 22m 44s. +[2026-03-25 15:39:58,494][__main__][INFO] - Starting iteration 5. +[2026-03-25 15:39:58,497][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:39:58,498][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:40:02,793][__main__][INFO] - Number of regex retries in iteration 5: 0 +[2026-03-25 15:40:02,794][__main__][INFO] - agents played in iteration 5 are Bob, Alice +[2026-03-25 15:40:03,980][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:40:04,999][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:40:06,454][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:40:06,760][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:40:07,082][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:40:07,405][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:40:07,729][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:40:08,051][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:40:08,374][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:40:08,695][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:40:09,017][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:40:09,340][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:40:09,661][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:40:09,983][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:40:10,305][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:40:10,627][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:40:10,949][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:40:11,273][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:40:11,594][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:40:11,916][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:40:12,237][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:40:12,561][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:40:12,883][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:40:13,205][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:40:13,526][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:40:13,849][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:40:14,171][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:40:14,492][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:40:14,814][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:40:15,134][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:40:15,458][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:40:15,780][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:40:16,101][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:40:16,424][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:40:16,747][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:40:17,069][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:40:17,390][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:40:17,711][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:40:18,033][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:40:18,355][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:40:18,676][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:40:19,001][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:40:19,322][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:40:19,644][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:40:19,967][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:40:20,288][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:40:20,611][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:40:20,932][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:40:21,255][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:40:21,577][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:40:21,901][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:40:22,221][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:40:22,545][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:40:23,168][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:40:23,490][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:40:23,811][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:40:24,134][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:40:24,456][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:40:24,778][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:40:25,104][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:40:25,430][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:40:25,753][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:40:26,077][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:40:26,400][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:40:26,725][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:40:27,050][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:40:28,523][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:40:29,224][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:40:29,226][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:40:29,228][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:40:29,860][__main__][INFO] - Iteration 6 took 31s (13.70% Gen, 84.28% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 35m 41s. Estimated total time: 8h 42m 43s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 16s, 500 more iterations: 4h 21m 21s. +[2026-03-25 15:40:29,862][__main__][INFO] - Starting iteration 6. +[2026-03-25 15:40:29,866][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:40:29,866][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:40:34,252][__main__][INFO] - Number of regex retries in iteration 6: 0 +[2026-03-25 15:40:34,253][__main__][INFO] - agents played in iteration 6 are Bob, Alice +[2026-03-25 15:40:35,347][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:40:36,368][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:40:37,817][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:40:38,123][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:40:38,445][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:40:38,770][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:40:39,094][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:40:39,419][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:40:39,744][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:40:40,070][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:40:40,393][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:40:40,718][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:40:41,043][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:40:41,367][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:40:41,689][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:40:42,011][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:40:42,332][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:40:42,654][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:40:42,975][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:40:43,296][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:40:43,617][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:40:43,940][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:40:44,261][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:40:44,581][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:40:44,902][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:40:45,224][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:40:45,545][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:40:45,868][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:40:46,189][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:40:46,510][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:40:46,831][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:40:47,154][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:40:47,477][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:40:47,801][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:40:48,124][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:40:48,445][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:40:48,766][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:40:49,087][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:40:49,408][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:40:49,731][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:40:50,053][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:40:50,375][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:40:50,697][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:40:51,018][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:40:51,340][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:40:51,663][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:40:51,984][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:40:52,305][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:40:52,628][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:40:52,948][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:40:53,271][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:40:53,592][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:40:53,912][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:40:54,542][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:40:54,864][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:40:55,185][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:40:55,508][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:40:55,830][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:40:56,151][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:40:56,473][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:40:56,794][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:40:57,116][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:40:57,439][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:40:57,761][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:40:58,085][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:40:58,406][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:40:59,428][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:41:00,130][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:41:00,132][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:41:00,134][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:41:00,767][__main__][INFO] - Iteration 7 took 30s (14.19% Gen, 83.75% Train). Generation: 4s, Training: 25s. Estimated remaining time: 8h 27m 29s. Estimated total time: 8h 35m 2s. Time estimates for 10 more iterations: 5m 9s, 100 more iterations: 51m 30s, 500 more iterations: 4h 17m 31s. +[2026-03-25 15:41:00,769][__main__][INFO] - Starting iteration 7. 
+[2026-03-25 15:41:00,772][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:41:00,773][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:41:05,182][__main__][INFO] - Number of regex retries in iteration 7: 0 +[2026-03-25 15:41:05,183][__main__][INFO] - agents played in iteration 7 are Bob, Alice +[2026-03-25 15:41:06,506][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:41:07,550][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:41:09,081][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:41:09,388][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:41:09,710][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:41:10,033][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:41:10,356][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:41:10,679][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:41:11,003][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:41:11,326][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:41:11,648][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:41:11,969][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:41:12,291][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:41:12,613][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:41:12,935][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:41:13,256][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:41:13,578][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:41:13,900][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:41:14,222][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:41:14,545][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:41:14,866][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:41:15,189][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:41:15,510][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:41:15,832][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:41:16,154][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:41:16,476][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:41:16,800][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:41:17,121][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:41:17,443][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:41:17,764][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:41:18,084][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:41:18,406][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:41:18,727][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:41:19,048][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:41:19,368][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:41:19,690][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:41:20,012][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:41:20,335][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:41:20,656][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:41:20,978][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:41:21,301][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:41:21,623][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:41:21,945][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:41:22,268][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:41:22,589][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:41:22,911][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:41:23,232][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:41:23,552][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:41:23,874][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:41:24,195][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:41:24,518][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:41:24,840][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:41:25,161][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:41:25,781][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:41:26,102][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:41:26,425][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:41:26,748][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:41:27,070][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:41:27,391][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:41:27,711][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:41:28,033][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:41:28,355][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:41:28,676][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:41:28,998][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:41:29,320][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:41:29,643][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:41:30,999][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:41:31,700][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:41:31,702][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:41:31,704][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:41:32,321][__main__][INFO] - Iteration 8 took 31s (13.98% Gen, 84.06% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 37m 45s. Estimated total time: 8h 45m 49s. 
Time estimates for 10 more iterations: 5m 15s, 100 more iterations: 52m 34s, 500 more iterations: 4h 22m 54s. +[2026-03-25 15:41:32,323][__main__][INFO] - Starting iteration 8. +[2026-03-25 15:41:32,327][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:41:32,327][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:41:36,766][__main__][INFO] - Number of regex retries in iteration 8: 0 +[2026-03-25 15:41:36,767][__main__][INFO] - agents played in iteration 8 are Bob, Alice +[2026-03-25 15:41:38,081][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:41:39,122][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:41:40,656][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:41:40,967][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:41:41,289][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:41:41,610][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:41:41,932][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:41:42,253][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:41:42,574][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:41:42,895][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:41:43,218][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:41:43,541][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:41:43,863][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:41:44,183][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:41:44,505][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:41:44,826][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:41:45,149][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:41:45,470][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:41:45,791][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:41:46,112][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:41:46,434][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:41:46,755][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:41:47,078][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:41:47,400][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:41:47,722][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:41:48,045][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:41:48,366][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:41:48,687][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:41:49,009][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:41:49,330][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:41:49,652][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:41:49,972][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:41:50,296][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:41:50,620][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:41:50,942][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:41:51,264][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:41:51,585][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:41:51,906][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:41:52,227][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:41:52,549][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:41:52,869][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:41:53,190][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:41:53,510][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:41:53,831][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:41:54,153][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:41:54,476][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:41:54,798][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:41:55,119][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:41:55,442][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:41:55,763][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:41:56,085][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:41:56,408][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:41:56,730][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:41:57,339][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:41:57,662][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:41:57,984][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:41:58,305][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:41:58,626][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:41:58,948][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:41:59,271][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:41:59,594][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:41:59,916][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:42:00,240][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:42:00,561][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:42:00,885][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:42:01,206][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:42:02,555][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:42:03,259][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:42:03,261][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:42:03,263][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:42:03,888][__main__][INFO] - Iteration 9 took 31s (14.07% Gen, 83.95% Train). 
Generation: 4s, Training: 26s. Estimated remaining time: 8h 37m 26s. Estimated total time: 8h 46m 2s. Time estimates for 10 more iterations: 5m 15s, 100 more iterations: 52m 36s, 500 more iterations: 4h 23m 1s. +[2026-03-25 15:42:03,891][__main__][INFO] - Starting iteration 9. +[2026-03-25 15:42:03,894][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:42:03,895][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:42:08,338][__main__][INFO] - Number of regex retries in iteration 9: 0 +[2026-03-25 15:42:08,339][__main__][INFO] - agents played in iteration 9 are Bob, Alice +[2026-03-25 15:42:09,639][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:42:10,683][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:42:11,419][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:42:11,764][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:42:12,085][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:42:12,407][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:42:12,730][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:42:13,051][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:42:13,373][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:42:13,697][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:42:14,019][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:42:14,340][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:42:14,662][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 
15:42:14,983][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:42:15,305][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:42:15,627][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:42:15,950][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:42:16,273][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:42:16,594][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:42:16,916][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:42:17,239][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:42:17,562][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:42:17,883][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:42:18,205][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:42:18,527][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:42:18,850][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:42:19,174][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:42:19,495][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:42:19,818][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:42:20,141][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:42:20,462][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:42:20,785][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:42:21,106][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:42:21,427][mllm.training.trainer_common][INFO] - 
Processing mini-batch 128 of 256 +[2026-03-25 15:42:21,749][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:42:22,070][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:42:22,392][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:42:22,713][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:42:23,035][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:42:23,358][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:42:23,680][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:42:24,003][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:42:24,326][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:42:24,649][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:42:24,970][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:42:25,293][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:42:25,614][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:42:25,935][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:42:26,259][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:42:26,581][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:42:26,902][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:42:27,223][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:42:27,545][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:42:28,154][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 
+[2026-03-25 15:42:28,478][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:42:28,800][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:42:29,122][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:42:29,444][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:42:29,765][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:42:30,088][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:42:30,411][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:42:30,733][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:42:31,054][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:42:31,376][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:42:31,698][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:42:32,019][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:42:33,277][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 15:42:33,981][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:42:33,983][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:42:33,985][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:42:34,600][__main__][INFO] - Iteration 10 took 30s (14.47% Gen, 83.52% Train). Generation: 4s, Training: 25s. Estimated remaining time: 8h 22m 40s. Estimated total time: 8h 31m 46s. Time estimates for 10 more iterations: 5m 7s, 100 more iterations: 51m 10s, 500 more iterations: 4h 15m 53s. +[2026-03-25 15:42:34,602][__main__][INFO] - Starting iteration 10. +[2026-03-25 15:42:34,606][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:42:34,606][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:42:39,058][__main__][INFO] - Number of regex retries in iteration 10: 0 +[2026-03-25 15:42:39,059][__main__][INFO] - agents played in iteration 10 are Bob, Alice +[2026-03-25 15:42:40,350][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:42:41,391][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:42:42,917][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:42:43,222][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:42:43,545][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:42:43,868][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:42:44,190][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:42:44,511][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:42:44,833][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:42:45,156][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:42:45,479][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:42:45,803][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:42:46,126][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:42:46,448][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:42:46,770][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:42:47,092][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:42:47,413][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:42:47,738][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:42:48,062][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:42:48,387][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:42:48,710][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:42:49,032][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:42:49,355][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:42:49,678][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:42:50,001][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:42:50,323][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:42:50,647][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:42:50,970][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:42:51,293][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:42:51,617][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:42:51,941][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:42:52,264][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:42:52,586][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:42:52,909][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:42:53,232][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:42:53,554][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:42:53,878][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:42:54,200][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:42:54,524][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:42:54,846][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:42:55,169][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:42:55,492][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:42:55,816][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:42:56,140][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:42:56,463][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:42:56,785][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:42:57,108][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:42:57,430][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:42:57,753][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:42:58,077][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:42:58,402][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:42:58,726][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:42:59,049][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:42:59,657][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:42:59,980][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:43:00,301][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:43:00,622][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:43:00,945][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:43:01,267][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:43:01,591][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:43:01,912][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:43:02,234][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:43:02,557][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:43:02,879][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:43:03,202][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:43:03,524][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:43:05,138][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:43:05,860][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:43:05,863][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:43:05,864][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:43:06,485][__main__][INFO] - Iteration 11 took 31s (13.97% Gen, 84.08% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 41m 42s. Estimated total time: 8h 51m 20s. Time estimates for 10 more iterations: 5m 18s, 100 more iterations: 53m 8s, 500 more iterations: 4h 25m 40s. +[2026-03-25 15:43:06,487][__main__][INFO] - Starting iteration 11. +[2026-03-25 15:43:06,491][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:43:06,491][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:43:10,971][__main__][INFO] - Number of regex retries in iteration 11: 0 +[2026-03-25 15:43:10,972][__main__][INFO] - agents played in iteration 11 are Bob, Alice +[2026-03-25 15:43:12,138][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:43:13,152][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:43:14,607][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:43:14,910][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:43:15,235][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:43:15,561][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:43:15,886][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:43:16,208][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:43:16,533][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:43:16,855][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:43:17,176][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:43:17,499][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:43:17,820][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:43:18,143][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:43:18,466][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:43:18,787][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:43:19,109][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:43:19,432][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:43:19,754][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:43:20,077][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:43:20,399][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:43:20,720][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:43:21,043][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:43:21,365][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:43:21,686][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:43:22,008][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:43:22,329][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:43:22,653][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:43:22,976][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:43:23,300][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:43:23,624][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:43:23,947][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:43:24,269][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:43:24,591][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:43:24,913][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:43:25,235][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:43:25,558][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:43:25,881][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:43:26,202][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:43:26,525][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:43:26,848][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:43:27,169][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:43:27,491][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:43:27,813][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:43:28,134][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:43:28,456][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:43:28,779][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:43:29,101][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:43:29,423][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:43:29,744][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:43:30,065][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:43:30,386][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:43:30,709][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:43:31,317][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:43:31,639][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:43:31,963][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:43:32,286][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:43:32,608][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:43:32,930][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:43:33,254][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:43:33,575][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:43:33,897][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:43:34,220][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:43:34,542][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:43:34,865][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:43:35,187][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:43:36,672][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:43:37,399][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:43:37,401][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:43:37,403][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:43:38,040][__main__][INFO] - Iteration 12 took 31s (14.20% Gen, 83.77% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 35m 39s. Estimated total time: 8h 45m 50s. Time estimates for 10 more iterations: 5m 15s, 100 more iterations: 52m 35s, 500 more iterations: 4h 22m 55s. +[2026-03-25 15:43:38,042][__main__][INFO] - Starting iteration 12. 
+[2026-03-25 15:43:38,046][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:43:38,046][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:43:42,047][__main__][INFO] - Number of regex retries in iteration 12: 0 +[2026-03-25 15:43:42,048][__main__][INFO] - agents played in iteration 12 are Bob, Alice +[2026-03-25 15:43:43,509][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:43:44,524][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:43:45,976][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:43:46,279][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:43:46,603][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:43:46,927][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:43:47,252][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:43:47,578][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:43:47,902][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:43:48,227][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:43:48,552][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:43:48,878][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:43:49,202][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:43:49,526][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:43:49,851][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:43:50,175][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:43:50,498][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:43:50,822][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:43:51,145][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:43:51,468][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:43:51,791][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:43:52,114][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:43:52,436][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:43:52,760][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:43:53,082][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:43:53,405][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:43:53,728][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:43:54,051][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:43:54,373][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:43:54,696][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:43:55,019][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:43:55,343][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:43:55,666][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:43:55,988][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:43:56,309][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:43:56,632][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:43:56,954][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:43:57,277][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:43:57,598][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:43:57,922][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:43:58,245][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:43:58,566][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:43:58,888][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:43:59,211][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:43:59,534][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:43:59,858][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:44:00,181][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:44:00,504][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:44:00,826][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:44:01,150][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:44:01,473][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:44:01,796][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:44:02,117][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:44:02,725][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:44:03,048][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:44:03,370][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:44:03,691][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:44:04,013][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:44:04,335][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:44:04,657][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:44:04,980][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:44:05,302][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:44:05,624][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:44:05,945][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:44:06,268][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:44:06,590][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:44:08,063][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:44:08,792][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:44:08,794][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:44:08,796][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:44:09,430][__main__][INFO] - Iteration 13 took 31s (12.75% Gen, 85.22% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 32m 23s. Estimated total time: 8h 43m 5s. 
Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 18s, 500 more iterations: 4h 21m 32s. +[2026-03-25 15:44:09,432][__main__][INFO] - Starting iteration 13. +[2026-03-25 15:44:09,436][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:44:09,436][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:44:13,705][__main__][INFO] - Number of regex retries in iteration 13: 0 +[2026-03-25 15:44:13,706][__main__][INFO] - agents played in iteration 13 are Bob, Alice +[2026-03-25 15:44:14,904][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:44:15,913][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:44:17,373][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:44:17,677][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:44:18,001][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:44:18,322][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:44:18,645][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:44:18,966][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:44:19,289][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:44:19,610][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:44:19,931][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:44:20,255][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:44:20,576][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:44:20,898][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:44:21,219][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:44:21,541][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:44:21,862][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:44:22,183][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:44:22,504][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:44:22,825][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:44:23,146][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:44:23,467][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:44:23,788][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:44:24,110][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:44:24,432][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:44:24,755][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:44:25,077][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:44:25,400][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:44:25,722][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:44:26,042][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:44:26,364][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:44:26,686][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:44:27,007][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:44:27,329][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:44:27,649][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:44:27,970][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:44:28,292][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:44:28,613][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:44:28,935][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:44:29,257][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:44:29,581][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:44:29,903][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:44:30,225][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:44:30,546][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:44:30,868][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:44:31,190][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:44:31,513][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:44:31,834][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:44:32,155][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:44:32,476][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:44:32,797][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:44:33,120][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:44:33,442][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:44:34,050][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:44:34,371][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:44:34,694][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:44:35,015][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:44:35,337][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:44:35,660][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:44:35,983][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:44:36,305][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:44:36,627][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:44:36,949][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:44:37,270][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:44:37,592][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:44:37,913][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:44:39,439][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:44:40,168][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:44:40,171][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:44:40,172][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:44:40,807][__main__][INFO] - Iteration 14 took 31s (13.61% Gen, 84.36% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 31m 39s. Estimated total time: 8h 42m 53s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 26s. +[2026-03-25 15:44:40,810][__main__][INFO] - Starting iteration 14. +[2026-03-25 15:44:40,813][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:44:40,814][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:44:45,057][__main__][INFO] - Number of regex retries in iteration 14: 0 +[2026-03-25 15:44:45,058][__main__][INFO] - agents played in iteration 14 are Bob, Alice +[2026-03-25 15:44:46,272][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:44:47,281][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:44:48,741][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:44:49,043][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:44:49,365][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:44:49,688][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:44:50,010][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:44:50,333][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:44:50,655][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:44:50,977][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:44:51,301][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:44:51,624][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:44:51,947][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:44:52,271][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:44:52,595][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:44:52,919][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:44:53,240][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:44:53,561][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:44:53,883][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:44:54,204][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:44:54,526][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:44:54,847][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:44:55,169][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:44:55,491][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:44:55,813][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:44:56,133][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:44:56,455][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:44:56,776][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:44:57,097][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:44:57,418][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:44:57,741][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:44:58,063][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:44:58,384][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:44:58,707][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:44:59,028][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:44:59,351][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:44:59,674][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:44:59,995][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:45:00,316][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:45:00,638][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:45:00,960][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:45:01,282][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:45:01,603][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:45:01,924][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:45:02,245][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:45:02,566][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:45:02,887][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:45:03,209][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:45:03,530][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:45:03,852][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:45:04,175][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:45:04,496][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:45:04,818][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:45:05,428][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:45:05,750][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:45:06,072][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:45:06,393][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:45:06,716][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:45:07,038][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:45:07,360][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:45:07,681][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:45:08,003][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:45:08,325][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:45:08,646][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:45:08,967][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:45:09,290][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:45:10,814][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:45:11,545][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:45:11,548][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:45:11,550][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:45:12,191][__main__][INFO] - Iteration 15 took 31s (13.53% Gen, 84.43% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 31m 14s. Estimated total time: 8h 42m 58s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 29s. +[2026-03-25 15:45:12,193][__main__][INFO] - Starting iteration 15. +[2026-03-25 15:45:12,197][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:45:12,197][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:45:16,453][__main__][INFO] - Number of regex retries in iteration 15: 0 +[2026-03-25 15:45:16,454][__main__][INFO] - agents played in iteration 15 are Bob, Alice +[2026-03-25 15:45:17,644][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:45:18,659][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:45:20,116][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:45:20,419][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:45:20,741][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:45:21,065][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:45:21,387][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:45:21,708][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:45:22,030][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:45:22,352][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:45:22,675][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:45:22,997][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:45:23,320][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:45:23,642][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:45:23,964][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:45:24,287][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:45:24,608][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:45:24,929][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:45:25,252][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:45:25,575][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:45:25,897][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:45:26,218][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:45:26,541][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:45:26,863][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:45:27,185][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:45:27,506][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:45:27,827][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:45:28,150][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:45:28,473][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:45:28,794][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:45:29,115][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:45:29,438][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:45:29,760][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:45:30,081][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:45:30,404][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:45:30,726][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:45:31,049][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:45:31,371][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:45:31,693][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:45:32,014][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:45:32,335][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:45:32,658][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:45:32,981][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:45:33,303][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:45:33,623][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:45:33,945][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:45:34,268][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:45:34,589][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:45:34,910][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:45:35,233][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:45:35,554][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:45:35,876][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:45:36,199][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:45:36,813][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:45:37,135][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:45:37,458][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:45:37,780][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:45:38,103][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:45:38,424][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:45:38,745][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:45:39,068][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:45:39,389][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:45:39,710][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:45:40,033][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:45:40,354][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:45:40,676][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:45:42,177][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:45:42,914][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:45:42,916][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:45:42,918][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:45:43,558][__main__][INFO] - Iteration 16 took 31s (13.57% Gen, 84.38% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 30m 26s. Estimated total time: 8h 42m 42s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 16s, 500 more iterations: 4h 21m 21s. +[2026-03-25 15:45:43,561][__main__][INFO] - Starting iteration 16. 
+[2026-03-25 15:45:43,564][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:45:43,565][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:45:47,841][__main__][INFO] - Number of regex retries in iteration 16: 0 +[2026-03-25 15:45:47,842][__main__][INFO] - agents played in iteration 16 are Bob, Alice +[2026-03-25 15:45:49,002][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:45:50,024][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:45:51,476][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:45:51,780][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:45:52,103][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:45:52,426][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:45:52,748][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:45:53,071][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:45:53,393][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:45:53,715][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:45:54,040][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:45:54,364][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:45:54,686][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:45:55,010][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:45:55,333][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:45:55,658][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:45:55,981][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:45:56,303][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:45:56,626][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:45:56,948][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:45:57,271][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:45:57,593][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:45:57,915][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:45:58,239][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:45:58,561][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:45:58,883][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:45:59,206][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:45:59,530][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:45:59,853][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:46:00,176][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:46:00,499][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:46:00,822][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:46:01,146][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:46:01,467][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:46:01,790][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:46:02,113][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:46:02,435][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:46:02,757][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:46:03,080][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:46:03,403][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:46:03,726][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:46:04,050][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:46:04,374][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:46:04,695][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:46:05,018][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:46:05,342][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:46:05,665][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:46:05,987][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:46:06,308][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:46:06,631][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:46:06,954][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:46:07,277][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:46:07,600][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:46:08,216][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:46:08,540][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:46:08,864][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:46:09,186][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:46:09,507][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:46:09,830][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:46:10,153][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:46:10,475][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:46:10,799][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:46:11,121][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:46:11,444][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:46:11,766][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:46:12,090][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:46:13,543][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:46:14,275][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:46:14,277][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:46:14,279][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:46:14,919][__main__][INFO] - Iteration 17 took 31s (13.64% Gen, 84.31% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 29m 49s. Estimated total time: 8h 42m 36s. 
Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 15s, 500 more iterations: 4h 21m 18s. +[2026-03-25 15:46:14,922][__main__][INFO] - Starting iteration 17. +[2026-03-25 15:46:14,925][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:46:14,926][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:46:19,195][__main__][INFO] - Number of regex retries in iteration 17: 0 +[2026-03-25 15:46:19,196][__main__][INFO] - agents played in iteration 17 are Bob, Alice +[2026-03-25 15:46:20,377][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:46:21,399][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:46:22,849][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:46:23,152][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:46:23,475][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:46:23,798][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:46:24,121][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:46:24,444][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:46:24,767][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:46:25,090][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:46:25,413][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:46:25,736][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:46:26,059][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:46:26,381][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:46:26,703][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:46:27,025][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:46:27,348][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:46:27,670][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:46:27,993][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:46:28,317][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:46:28,640][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:46:28,963][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:46:29,286][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:46:29,609][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:46:29,932][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:46:30,255][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:46:30,576][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:46:30,899][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:46:31,221][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:46:31,544][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:46:31,867][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:46:32,190][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:46:32,512][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:46:32,833][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:46:33,157][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:46:33,480][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:46:33,802][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:46:34,123][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:46:34,446][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:46:34,769][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:46:35,091][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:46:35,412][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:46:35,733][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:46:36,056][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:46:36,379][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:46:36,701][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:46:37,022][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:46:37,343][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:46:37,666][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:46:37,988][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:46:38,310][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:46:38,632][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:46:38,953][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:46:39,570][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:46:39,892][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:46:40,213][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:46:40,534][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:46:40,858][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:46:41,180][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:46:41,502][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:46:41,824][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:46:42,147][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:46:42,469][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:46:42,792][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:46:43,115][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:46:43,439][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:46:44,099][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 15:46:44,833][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:46:44,835][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:46:44,837][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:46:45,483][__main__][INFO] - Iteration 18 took 30s (13.97% Gen, 83.91% Train). Generation: 4s, Training: 25s. Estimated remaining time: 8h 16m 1s. Estimated total time: 8h 29m 19s. Time estimates for 10 more iterations: 5m 5s, 100 more iterations: 50m 55s, 500 more iterations: 4h 14m 39s. +[2026-03-25 15:46:45,485][__main__][INFO] - Starting iteration 18. +[2026-03-25 15:46:45,489][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:46:45,489][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:46:48,571][mllm.models.large_language_model_local][WARNING] - Response +usercontent +It seems like you want to continue playing with a pattern or strategy. Given the history and the simplicity of the game, you might be trying to establish a cooperative or competitive strategy. If you continue playing B based on the previous rounds, you are signaling a certain level of predictability that the other player might exploit or mirror. Would you like to change your strategy or continue with B? 
did not match regex: (|), retry 1/1 +[2026-03-25 15:46:50,048][__main__][INFO] - Number of regex retries in iteration 18: 1 +[2026-03-25 15:46:50,049][__main__][INFO] - agents played in iteration 18 are Bob, Alice +[2026-03-25 15:46:50,586][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:46:51,245][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:46:51,535][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:46:51,857][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:46:52,177][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:46:52,495][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:46:52,814][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:46:53,134][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:46:53,453][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:46:53,773][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:46:54,091][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:46:54,410][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:46:54,729][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:46:55,050][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:46:55,371][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:46:55,691][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:46:56,012][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:46:56,332][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 
15:46:56,652][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:46:56,973][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:46:57,292][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:46:57,613][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:46:57,932][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:46:58,251][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:46:58,569][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:46:58,888][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:46:59,208][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:46:59,527][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:46:59,847][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:47:00,166][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:47:00,484][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:47:00,804][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:47:01,124][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:47:01,443][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:47:01,763][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:47:02,082][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:47:02,402][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:47:02,723][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:47:03,044][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:47:03,362][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:47:03,681][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:47:04,002][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:47:04,321][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:47:04,642][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:47:04,963][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:47:05,282][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:47:05,603][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:47:05,922][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:47:06,243][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:47:06,563][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:47:06,884][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:47:07,204][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:47:07,523][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:47:08,140][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:47:08,461][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:47:08,781][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:47:09,103][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:47:09,423][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:47:09,744][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:47:10,065][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:47:10,386][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:47:10,706][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:47:11,026][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:47:11,347][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:47:11,667][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:47:11,987][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:47:12,663][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 15:47:13,392][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:47:13,394][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:47:13,396][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:47:14,041][__main__][INFO] - Iteration 19 took 28s (15.97% Gen, 81.76% Train). Generation: 4s, Training: 23s. Estimated remaining time: 7h 42m 6s. Estimated total time: 7h 55m 53s. Time estimates for 10 more iterations: 4m 45s, 100 more iterations: 47m 35s, 500 more iterations: 3h 57m 56s. +[2026-03-25 15:47:14,043][__main__][INFO] - Starting iteration 19. 
+[2026-03-25 15:47:14,046][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:47:14,047][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:47:17,455][__main__][INFO] - Number of regex retries in iteration 19: 0 +[2026-03-25 15:47:17,456][__main__][INFO] - agents played in iteration 19 are Bob, Alice +[2026-03-25 15:47:18,032][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:47:18,679][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:47:18,969][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:47:19,290][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:47:19,609][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:47:19,928][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:47:20,250][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:47:20,570][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:47:20,889][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:47:21,210][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:47:21,528][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:47:21,848][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:47:22,169][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:47:22,489][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:47:22,809][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:47:23,128][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:47:23,448][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:47:23,766][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:47:24,086][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:47:24,405][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:47:24,724][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:47:25,044][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:47:25,363][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:47:25,682][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:47:26,003][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:47:26,323][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:47:26,643][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:47:26,964][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:47:27,285][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:47:27,605][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:47:27,926][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:47:28,246][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:47:28,565][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:47:28,886][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:47:29,205][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:47:29,524][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:47:29,845][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:47:30,165][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:47:30,486][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:47:30,806][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:47:31,126][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:47:31,446][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:47:31,765][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:47:32,085][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:47:32,405][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:47:32,724][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:47:33,045][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:47:33,365][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:47:33,685][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:47:34,005][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:47:34,326][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:47:34,646][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:47:34,966][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:47:35,575][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:47:35,896][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:47:36,215][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:47:36,536][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:47:36,854][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:47:37,174][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:47:37,494][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:47:37,814][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:47:38,132][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:47:38,453][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:47:38,771][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:47:39,090][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:47:39,409][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:47:40,690][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 15:47:41,409][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:47:41,411][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:47:41,413][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:47:42,048][__main__][INFO] - Iteration 20 took 28s (12.17% Gen, 85.55% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 32m 28s. Estimated total time: 7h 46m 43s. 
Time estimates for 10 more iterations: 4m 40s, 100 more iterations: 46m 40s, 500 more iterations: 3h 53m 21s. +[2026-03-25 15:47:42,051][__main__][INFO] - Starting iteration 20. +[2026-03-25 15:47:42,053][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:47:42,054][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:47:46,245][__main__][INFO] - Number of regex retries in iteration 20: 0 +[2026-03-25 15:47:46,245][__main__][INFO] - agents played in iteration 20 are Bob, Alice +[2026-03-25 15:47:47,488][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:47:48,529][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:47:49,992][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:47:50,289][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:47:50,611][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:47:50,931][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:47:51,251][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:47:51,569][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:47:51,889][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:47:52,208][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:47:52,528][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:47:52,847][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:47:53,166][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:47:53,486][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:47:53,807][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:47:54,127][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:47:54,445][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:47:54,766][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:47:55,086][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:47:55,407][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:47:55,727][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:47:56,048][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:47:56,368][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:47:56,687][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:47:57,007][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:47:57,327][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:47:57,647][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:47:57,966][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:47:58,286][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:47:58,606][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:47:58,926][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:47:59,245][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:47:59,564][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:47:59,885][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:48:00,205][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:48:00,523][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:48:00,844][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:48:01,164][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:48:01,484][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:48:01,803][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:48:02,123][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:48:02,443][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:48:02,764][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:48:03,084][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:48:03,405][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:48:03,726][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:48:04,044][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:48:04,364][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:48:04,684][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:48:05,004][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:48:05,324][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:48:05,645][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:48:05,965][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:48:06,596][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:48:06,915][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:48:07,234][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:48:07,554][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:48:07,874][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:48:08,194][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:48:08,514][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:48:08,833][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:48:09,152][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:48:09,471][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:48:09,792][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:48:10,113][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:48:10,433][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:48:12,060][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:48:12,781][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:48:12,783][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:48:12,785][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:48:13,420][__main__][INFO] - Iteration 21 took 31s (13.36% Gen, 84.61% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 28m 2s. Estimated total time: 8h 42m 48s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 16s, 500 more iterations: 4h 21m 24s. +[2026-03-25 15:48:13,423][__main__][INFO] - Starting iteration 21. +[2026-03-25 15:48:13,426][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:48:13,426][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:48:17,654][__main__][INFO] - Number of regex retries in iteration 21: 0 +[2026-03-25 15:48:17,655][__main__][INFO] - agents played in iteration 21 are Bob, Alice +[2026-03-25 15:48:18,861][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:48:19,900][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:48:21,365][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:48:21,663][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:48:21,982][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:48:22,302][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:48:22,622][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:48:22,941][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:48:23,261][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:48:23,581][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:48:23,901][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:48:24,221][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:48:24,541][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:48:24,862][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:48:25,180][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:48:25,501][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:48:25,822][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:48:26,142][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:48:26,463][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:48:26,783][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:48:27,103][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:48:27,423][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:48:27,744][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:48:28,064][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:48:28,385][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:48:28,703][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:48:29,022][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:48:29,343][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:48:29,662][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:48:29,982][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:48:30,303][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:48:30,624][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:48:30,944][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:48:31,263][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:48:31,582][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:48:31,902][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:48:32,221][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:48:32,541][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:48:32,861][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:48:33,182][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:48:33,502][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:48:33,823][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:48:34,144][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:48:34,463][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:48:34,783][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:48:35,104][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:48:35,422][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:48:35,742][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:48:36,062][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:48:36,382][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:48:36,703][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:48:37,023][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:48:37,341][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:48:37,951][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:48:38,271][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:48:38,591][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:48:38,910][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:48:39,230][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:48:39,549][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:48:39,869][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:48:40,189][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:48:40,509][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:48:40,827][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:48:41,148][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:48:41,467][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:48:41,786][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:48:43,440][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:48:44,160][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:48:44,162][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:48:44,164][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:48:44,796][__main__][INFO] - Iteration 22 took 31s (13.48% Gen, 84.50% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 27m 34s. Estimated total time: 8h 42m 51s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 25s. +[2026-03-25 15:48:44,798][__main__][INFO] - Starting iteration 22. +[2026-03-25 15:48:44,801][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:48:44,802][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:48:49,032][__main__][INFO] - Number of regex retries in iteration 22: 0 +[2026-03-25 15:48:49,033][__main__][INFO] - agents played in iteration 22 are Bob, Alice +[2026-03-25 15:48:50,239][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:48:51,282][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:48:52,745][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:48:53,044][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:48:53,363][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:48:53,684][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:48:54,003][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:48:54,323][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:48:54,644][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:48:54,965][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:48:55,286][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:48:55,605][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:48:55,925][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:48:56,244][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:48:56,565][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:48:56,884][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:48:57,204][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:48:57,524][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:48:57,844][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:48:58,165][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:48:58,485][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:48:58,806][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:48:59,127][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:48:59,446][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:48:59,767][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:49:00,088][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:49:00,408][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:49:00,728][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:49:01,048][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:49:01,368][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:49:01,688][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:49:02,008][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:49:02,329][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:49:02,651][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:49:02,972][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:49:03,293][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:49:03,612][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:49:03,932][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:49:04,252][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:49:04,573][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:49:04,893][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:49:05,213][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:49:05,533][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:49:05,853][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:49:06,175][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:49:06,494][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:49:06,813][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:49:07,134][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:49:07,454][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:49:07,773][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:49:08,094][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:49:08,414][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:49:08,734][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:49:09,359][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:49:09,680][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:49:10,001][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:49:10,322][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:49:10,644][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:49:10,965][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:49:11,284][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:49:11,603][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:49:11,924][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:49:12,244][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:49:12,565][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:49:12,884][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:49:13,205][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:49:14,825][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:49:15,552][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:49:15,554][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:49:15,556][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:49:16,186][__main__][INFO] - Iteration 23 took 31s (13.48% Gen, 84.51% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 27m 17s. Estimated total time: 8h 43m 5s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 18s, 500 more iterations: 4h 21m 32s. +[2026-03-25 15:49:16,188][__main__][INFO] - Starting iteration 23. 
+[2026-03-25 15:49:16,191][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:49:16,192][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:49:20,425][__main__][INFO] - Number of regex retries in iteration 23: 0 +[2026-03-25 15:49:20,425][__main__][INFO] - agents played in iteration 23 are Bob, Alice +[2026-03-25 15:49:21,622][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:49:22,665][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:49:24,125][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:49:24,424][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:49:24,744][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:49:25,063][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:49:25,385][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:49:25,706][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:49:26,026][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:49:26,346][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:49:26,667][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:49:26,986][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:49:27,306][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:49:27,627][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:49:27,947][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:49:28,267][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:49:28,587][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:49:28,906][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:49:29,226][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:49:29,547][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:49:29,867][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:49:30,187][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:49:30,506][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:49:30,825][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:49:31,145][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:49:31,466][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:49:31,786][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:49:32,105][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:49:32,424][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:49:32,745][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:49:33,066][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:49:33,385][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:49:33,704][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:49:34,025][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:49:34,344][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:49:34,665][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:49:34,986][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:49:35,305][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:49:35,624][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:49:35,943][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:49:36,262][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:49:36,582][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:49:36,903][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:49:37,223][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:49:37,544][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:49:37,865][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:49:38,184][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:49:38,505][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:49:38,825][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:49:39,145][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:49:39,466][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:49:39,785][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:49:40,104][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:49:40,717][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:49:41,039][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:49:41,360][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:49:41,680][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:49:42,002][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:49:42,323][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:49:42,643][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:49:42,962][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:49:43,282][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:49:43,601][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:49:43,922][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:49:44,243][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:49:44,563][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:49:46,202][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:49:46,932][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:49:46,934][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:49:46,936][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:49:47,566][__main__][INFO] - Iteration 24 took 31s (13.49% Gen, 84.49% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 26m 36s. Estimated total time: 8h 42m 56s. 
Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 28s. +[2026-03-25 15:49:47,569][__main__][INFO] - Starting iteration 24. +[2026-03-25 15:49:47,572][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:49:47,572][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:49:51,783][__main__][INFO] - Number of regex retries in iteration 24: 0 +[2026-03-25 15:49:51,784][__main__][INFO] - agents played in iteration 24 are Bob, Alice +[2026-03-25 15:49:52,990][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:49:54,043][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:49:55,499][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:49:55,799][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:49:56,120][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:49:56,441][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:49:56,762][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:49:57,083][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:49:57,404][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:49:57,724][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:49:58,045][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:49:58,366][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:49:58,686][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:49:59,007][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:49:59,327][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:49:59,647][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:49:59,968][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:50:00,288][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:50:00,608][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:50:00,928][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:50:01,249][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:50:01,568][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:50:01,888][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:50:02,208][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:50:02,528][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:50:02,849][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:50:03,169][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:50:03,489][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:50:03,808][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:50:04,128][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:50:04,448][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:50:04,768][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:50:05,088][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:50:05,408][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:50:05,728][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:50:06,047][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:50:06,367][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:50:06,687][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:50:07,008][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:50:07,328][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:50:07,648][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:50:07,969][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:50:08,288][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:50:08,608][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:50:08,927][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:50:09,246][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:50:09,567][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:50:09,890][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:50:10,210][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:50:10,530][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:50:10,851][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:50:11,172][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:50:11,492][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:50:12,105][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:50:12,427][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:50:12,748][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:50:13,068][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:50:13,387][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:50:13,708][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:50:14,028][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:50:14,348][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:50:14,668][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:50:14,990][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:50:15,311][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:50:15,632][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:50:15,952][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:50:17,567][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:50:18,295][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:50:18,298][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:50:18,300][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:50:18,934][__main__][INFO] - Iteration 25 took 31s (13.43% Gen, 84.54% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 25m 52s. Estimated total time: 8h 42m 43s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 16s, 500 more iterations: 4h 21m 21s. +[2026-03-25 15:50:18,936][__main__][INFO] - Starting iteration 25. +[2026-03-25 15:50:18,939][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:50:18,940][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:50:23,158][__main__][INFO] - Number of regex retries in iteration 25: 0 +[2026-03-25 15:50:23,159][__main__][INFO] - agents played in iteration 25 are Bob, Alice +[2026-03-25 15:50:24,369][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:50:25,413][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:50:26,876][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:50:27,176][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:50:27,496][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:50:27,817][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:50:28,137][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:50:28,456][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:50:28,775][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:50:29,095][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:50:29,416][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:50:29,738][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:50:30,060][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:50:30,380][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:50:30,699][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:50:31,019][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:50:31,338][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:50:31,659][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:50:31,980][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:50:32,301][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:50:32,622][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:50:32,943][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:50:33,263][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:50:33,584][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:50:33,905][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:50:34,225][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:50:34,546][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:50:34,867][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:50:35,188][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:50:35,509][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:50:35,829][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:50:36,149][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:50:36,468][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:50:36,788][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:50:37,109][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:50:37,429][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:50:37,747][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:50:38,068][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:50:38,387][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:50:38,708][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:50:39,028][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:50:39,348][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:50:39,668][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:50:39,988][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:50:40,308][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:50:40,629][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:50:40,950][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:50:41,271][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:50:41,591][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:50:41,912][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:50:42,231][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:50:42,551][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:50:42,870][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:50:43,485][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:50:43,806][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:50:44,126][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:50:44,448][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:50:44,768][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:50:45,090][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:50:45,411][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:50:45,731][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:50:46,052][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:50:46,373][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:50:46,693][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:50:47,014][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:50:47,335][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:50:48,950][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:50:49,676][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:50:49,678][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:50:49,680][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:50:50,312][__main__][INFO] - Iteration 26 took 31s (13.45% Gen, 84.53% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 25m 31s. Estimated total time: 8h 42m 54s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 27s. +[2026-03-25 15:50:50,315][__main__][INFO] - Starting iteration 26. +[2026-03-25 15:50:50,318][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:50:50,318][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:50:54,530][__main__][INFO] - Number of regex retries in iteration 26: 0 +[2026-03-25 15:50:54,531][__main__][INFO] - agents played in iteration 26 are Bob, Alice +[2026-03-25 15:50:55,753][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:50:56,800][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:50:58,261][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:50:58,560][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:50:58,880][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:50:59,199][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:50:59,520][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:50:59,842][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:51:00,162][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:51:00,482][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:51:00,804][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:51:01,123][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:51:01,444][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:51:01,763][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:51:02,083][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:51:02,403][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:51:02,724][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:51:03,044][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:51:03,363][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:51:03,684][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:51:04,004][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:51:04,325][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:51:04,647][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:51:04,967][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:51:05,288][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:51:05,608][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:51:05,927][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:51:06,247][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:51:06,568][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:51:06,887][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:51:07,208][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:51:07,529][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:51:07,851][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:51:08,172][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:51:08,493][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:51:08,813][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:51:09,133][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:51:09,453][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:51:09,772][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:51:10,092][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:51:10,413][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:51:10,733][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:51:11,054][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:51:11,374][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:51:11,694][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:51:12,013][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:51:12,334][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:51:12,654][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:51:12,975][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:51:13,294][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:51:13,614][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:51:13,934][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:51:14,253][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:51:14,867][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:51:15,188][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:51:15,509][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:51:15,828][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:51:16,148][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:51:16,469][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:51:16,789][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:51:17,109][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:51:17,429][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:51:17,749][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:51:18,070][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:51:18,389][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:51:18,708][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:51:20,336][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:51:21,064][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:51:21,066][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:51:21,068][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:51:21,699][__main__][INFO] - Iteration 27 took 31s (13.42% Gen, 84.56% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 25m 8s. Estimated total time: 8h 43m 2s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 18s, 500 more iterations: 4h 21m 31s. +[2026-03-25 15:51:21,702][__main__][INFO] - Starting iteration 27. 
+[2026-03-25 15:51:21,704][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:51:21,705][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:51:25,925][__main__][INFO] - Number of regex retries in iteration 27: 0 +[2026-03-25 15:51:25,926][__main__][INFO] - agents played in iteration 27 are Bob, Alice +[2026-03-25 15:51:27,136][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:51:28,181][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:51:29,640][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:51:29,941][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:51:30,260][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:51:30,581][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:51:30,901][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:51:31,220][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:51:31,541][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:51:31,861][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:51:32,181][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:51:32,501][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:51:32,821][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:51:33,142][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:51:33,463][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:51:33,783][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:51:34,104][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:51:34,425][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:51:34,746][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:51:35,067][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:51:35,388][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:51:35,708][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:51:36,029][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:51:36,349][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:51:36,669][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:51:36,988][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:51:37,309][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:51:37,628][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:51:37,949][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:51:38,271][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:51:38,591][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:51:38,912][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:51:39,232][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:51:39,552][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:51:39,873][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:51:40,194][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:51:40,515][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:51:40,834][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:51:41,155][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:51:41,474][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:51:41,795][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:51:42,115][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:51:42,436][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:51:42,755][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:51:43,075][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:51:43,396][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:51:43,716][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:51:44,035][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:51:44,354][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:51:44,673][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:51:44,993][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:51:45,314][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:51:45,633][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:51:46,243][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:51:46,564][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:51:46,885][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:51:47,205][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:51:47,525][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:51:47,845][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:51:48,164][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:51:48,485][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:51:48,804][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:51:49,125][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:51:49,445][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:51:49,764][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:51:50,084][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:51:51,712][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:51:52,435][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:51:52,438][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:51:52,440][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:51:53,064][__main__][INFO] - Iteration 28 took 31s (13.46% Gen, 84.54% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 24m 14s. Estimated total time: 8h 42m 40s. 
Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 16s, 500 more iterations: 4h 21m 20s. +[2026-03-25 15:51:53,066][__main__][INFO] - Starting iteration 28. +[2026-03-25 15:51:53,069][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:51:53,069][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:51:57,106][__main__][INFO] - Number of regex retries in iteration 28: 0 +[2026-03-25 15:51:57,107][__main__][INFO] - agents played in iteration 28 are Bob, Alice +[2026-03-25 15:51:58,515][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:51:59,560][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:52:01,022][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:52:01,322][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:52:01,642][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:52:01,963][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:52:02,284][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:52:02,605][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:52:02,925][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:52:03,246][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:52:03,567][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:52:03,889][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:52:04,210][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:52:04,531][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:52:04,851][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:52:05,170][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:52:05,491][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:52:05,809][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:52:06,130][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:52:06,450][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:52:06,771][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:52:07,092][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:52:07,412][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:52:07,732][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:52:08,053][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:52:08,374][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:52:08,694][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:52:09,015][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:52:09,335][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:52:09,655][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:52:09,977][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:52:10,297][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:52:10,617][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:52:10,937][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:52:11,257][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:52:11,577][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:52:11,898][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:52:12,219][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:52:12,542][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:52:12,862][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:52:13,182][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:52:13,504][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:52:13,825][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:52:14,145][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:52:14,466][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:52:14,787][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:52:15,107][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:52:15,429][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:52:15,752][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:52:16,073][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:52:16,394][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:52:16,713][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:52:17,034][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:52:17,642][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:52:17,963][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:52:18,284][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:52:18,603][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:52:18,925][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:52:19,246][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:52:19,566][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:52:19,886][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:52:20,208][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:52:20,529][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:52:20,850][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:52:21,170][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:52:21,489][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:52:23,087][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:52:23,810][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:52:23,812][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:52:23,814][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:52:24,441][__main__][INFO] - Iteration 29 took 31s (12.87% Gen, 85.13% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 23m 56s. Estimated total time: 8h 42m 53s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 26s. +[2026-03-25 15:52:24,443][__main__][INFO] - Starting iteration 29. +[2026-03-25 15:52:24,446][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:52:24,447][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:52:28,670][__main__][INFO] - Number of regex retries in iteration 29: 0 +[2026-03-25 15:52:28,671][__main__][INFO] - agents played in iteration 29 are Bob, Alice +[2026-03-25 15:52:29,917][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:52:30,959][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:52:32,420][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:52:32,721][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:52:33,042][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:52:33,362][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:52:33,682][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:52:34,002][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:52:34,323][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:52:34,643][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:52:34,963][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:52:35,283][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:52:35,603][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:52:35,924][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:52:36,246][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:52:36,567][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:52:36,888][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:52:37,208][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:52:37,528][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:52:37,847][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:52:38,168][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:52:38,487][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:52:38,808][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:52:39,128][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:52:39,448][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:52:39,768][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:52:40,088][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:52:40,408][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:52:40,728][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:52:41,048][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:52:41,367][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:52:41,688][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:52:42,008][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:52:42,328][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:52:42,649][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:52:42,968][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:52:43,289][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:52:43,610][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:52:43,929][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:52:44,249][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:52:44,570][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:52:44,890][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:52:45,211][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:52:45,530][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:52:45,851][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:52:46,171][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:52:46,492][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:52:46,812][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:52:47,134][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:52:47,454][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:52:47,774][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:52:48,094][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:52:48,415][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:52:49,022][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:52:49,344][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:52:49,664][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:52:49,985][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:52:50,307][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:52:50,628][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:52:50,948][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:52:51,269][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:52:51,589][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:52:51,908][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:52:52,228][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:52:52,549][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:52:52,870][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:52:54,492][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:52:55,216][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:52:55,218][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:52:55,220][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:52:55,846][__main__][INFO] - Iteration 30 took 31s (13.45% Gen, 84.55% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 23m 52s. Estimated total time: 8h 43m 20s. Time estimates for 10 more iterations: 5m 14s, 100 more iterations: 52m 20s, 500 more iterations: 4h 21m 40s. +[2026-03-25 15:52:55,848][__main__][INFO] - Starting iteration 30. +[2026-03-25 15:52:55,851][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:52:55,851][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:53:00,030][__main__][INFO] - Number of regex retries in iteration 30: 0 +[2026-03-25 15:53:00,031][__main__][INFO] - agents played in iteration 30 are Bob, Alice +[2026-03-25 15:53:01,295][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:53:02,340][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:53:03,797][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:53:04,098][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:53:04,417][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:53:04,739][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:53:05,059][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:53:05,381][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:53:05,703][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:53:06,023][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:53:06,344][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:53:06,665][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:53:06,986][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:53:07,305][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:53:07,627][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:53:07,947][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:53:08,267][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:53:08,587][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:53:08,909][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:53:09,230][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:53:09,550][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:53:09,872][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:53:10,193][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:53:10,514][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:53:10,834][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:53:11,155][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:53:11,479][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:53:11,803][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:53:12,124][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:53:12,445][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:53:12,766][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:53:13,088][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:53:13,409][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:53:13,729][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:53:14,051][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:53:14,370][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:53:14,692][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:53:15,013][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:53:15,334][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:53:15,654][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:53:15,975][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:53:16,294][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:53:16,615][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:53:16,936][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:53:17,256][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:53:17,575][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:53:17,895][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:53:18,216][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:53:18,534][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:53:18,855][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:53:19,176][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:53:19,496][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:53:19,816][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:53:20,425][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:53:20,746][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:53:21,067][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:53:21,388][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:53:21,708][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:53:22,028][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:53:22,348][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:53:22,668][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:53:22,987][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:53:23,308][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:53:23,627][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:53:23,947][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:53:24,268][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:53:25,872][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:53:26,595][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:53:26,597][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:53:26,599][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:53:27,226][__main__][INFO] - Iteration 31 took 31s (13.32% Gen, 84.68% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 22m 56s. Estimated total time: 8h 42m 56s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 17s, 500 more iterations: 4h 21m 28s. +[2026-03-25 15:53:27,228][__main__][INFO] - Starting iteration 31. 
+[2026-03-25 15:53:27,231][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:53:27,232][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:53:30,541][__main__][INFO] - Number of regex retries in iteration 31: 0 +[2026-03-25 15:53:30,541][__main__][INFO] - agents played in iteration 31 are Bob, Alice +[2026-03-25 15:53:31,805][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:53:32,847][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:53:34,308][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:53:34,606][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:53:34,927][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:53:35,248][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:53:35,568][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:53:35,889][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:53:36,209][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:53:36,529][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:53:36,848][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:53:37,168][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:53:37,488][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:53:37,809][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:53:38,129][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:53:38,449][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:53:38,769][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:53:39,088][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:53:39,409][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:53:39,729][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:53:40,050][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:53:40,369][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:53:40,690][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:53:41,010][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:53:41,330][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:53:41,649][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:53:41,969][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:53:42,290][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:53:42,611][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:53:42,930][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:53:43,250][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:53:43,569][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:53:43,889][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:53:44,210][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:53:44,530][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:53:44,851][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:53:45,171][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:53:45,490][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:53:45,811][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:53:46,130][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:53:46,451][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:53:46,772][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:53:47,093][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:53:47,413][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:53:47,734][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:53:48,055][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:53:48,376][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:53:48,696][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:53:49,016][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:53:49,336][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:53:49,657][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:53:49,976][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:53:50,296][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:53:50,906][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:53:51,226][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:53:51,547][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:53:51,869][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:53:52,190][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:53:52,510][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:53:52,831][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:53:53,150][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:53:53,470][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:53:53,790][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:53:54,111][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:53:54,430][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:53:54,750][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:53:56,391][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:53:57,121][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:53:57,123][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:53:57,125][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:53:57,750][__main__][INFO] - Iteration 32 took 30s (10.84% Gen, 87.10% Train). Generation: 3s, Training: 26s. Estimated remaining time: 8h 8m 9s. Estimated total time: 8h 28m 39s. 
Time estimates for 10 more iterations: 5m 5s, 100 more iterations: 50m 51s, 500 more iterations: 4h 14m 19s. +[2026-03-25 15:53:57,752][__main__][INFO] - Starting iteration 32. +[2026-03-25 15:53:57,755][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:53:57,756][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:54:01,763][__main__][INFO] - Number of regex retries in iteration 32: 0 +[2026-03-25 15:54:01,764][__main__][INFO] - agents played in iteration 32 are Bob, Alice +[2026-03-25 15:54:03,193][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:54:04,237][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:54:05,700][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:54:06,000][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:54:06,320][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:54:06,642][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:54:06,962][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:54:07,284][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:54:07,603][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:54:07,925][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:54:08,246][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:54:08,567][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:54:08,888][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:54:09,210][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:54:09,530][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:54:09,852][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:54:10,172][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:54:10,493][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:54:10,813][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:54:11,133][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:54:11,452][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:54:11,773][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:54:12,094][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:54:12,415][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:54:12,735][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:54:13,055][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:54:13,376][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:54:13,696][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:54:14,016][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:54:14,336][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:54:14,655][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:54:14,976][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:54:15,296][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:54:15,616][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:54:15,938][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:54:16,258][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:54:16,578][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:54:16,899][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:54:17,221][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:54:17,542][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:54:17,863][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:54:18,183][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:54:18,503][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:54:18,823][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:54:19,143][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:54:19,464][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:54:19,784][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:54:20,104][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:54:20,424][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:54:20,746][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:54:21,067][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:54:21,387][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:54:21,708][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:54:22,318][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:54:22,638][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:54:22,958][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:54:23,278][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:54:23,598][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:54:23,919][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:54:24,239][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:54:24,561][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:54:24,884][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:54:25,204][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:54:25,523][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:54:25,845][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:54:26,167][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:54:27,768][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:54:28,493][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:54:28,496][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:54:28,497][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:54:29,122][__main__][INFO] - Iteration 33 took 31s (12.78% Gen, 85.23% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 21m 45s. Estimated total time: 8h 42m 47s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 16s, 500 more iterations: 4h 21m 23s. +[2026-03-25 15:54:29,124][__main__][INFO] - Starting iteration 33. +[2026-03-25 15:54:29,127][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:54:29,128][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:54:32,458][__main__][INFO] - Number of regex retries in iteration 33: 0 +[2026-03-25 15:54:32,458][__main__][INFO] - agents played in iteration 33 are Bob, Alice +[2026-03-25 15:54:33,696][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:54:34,738][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:54:36,201][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:54:36,503][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:54:36,822][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:54:37,144][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:54:37,464][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:54:37,784][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:54:38,104][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:54:38,425][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:54:38,745][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:54:39,066][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:54:39,387][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:54:39,707][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:54:40,028][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:54:40,350][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:54:40,671][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:54:40,991][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:54:41,311][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:54:41,632][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:54:41,951][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:54:42,273][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:54:42,593][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:54:42,915][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:54:43,235][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:54:43,556][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:54:43,878][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:54:44,198][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:54:44,519][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:54:44,839][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:54:45,160][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:54:45,479][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:54:45,800][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:54:46,121][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:54:46,444][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:54:46,764][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:54:47,084][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:54:47,405][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:54:47,726][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:54:48,046][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:54:48,367][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:54:48,688][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:54:49,009][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:54:49,329][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:54:49,650][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:54:49,970][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:54:50,291][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:54:50,611][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:54:50,930][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:54:51,251][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:54:51,570][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:54:51,889][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:54:52,211][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:54:52,825][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:54:53,145][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:54:53,465][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:54:53,786][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:54:54,105][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:54:54,426][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:54:54,747][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:54:55,069][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:54:55,390][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:54:55,712][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:54:56,033][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:54:56,355][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:54:56,675][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:54:58,273][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:54:58,995][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:54:58,997][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:54:58,999][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:54:59,625][__main__][INFO] - Iteration 34 took 30s (10.92% Gen, 87.02% Train). Generation: 3s, Training: 26s. Estimated remaining time: 8h 6m 47s. Estimated total time: 8h 28m 19s. Time estimates for 10 more iterations: 5m 4s, 100 more iterations: 50m 49s, 500 more iterations: 4h 14m 9s. +[2026-03-25 15:54:59,627][__main__][INFO] - Starting iteration 34. +[2026-03-25 15:54:59,630][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:54:59,631][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:55:03,830][__main__][INFO] - Number of regex retries in iteration 34: 0 +[2026-03-25 15:55:03,830][__main__][INFO] - agents played in iteration 34 are Bob, Alice +[2026-03-25 15:55:05,073][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:55:06,121][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:55:07,577][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:55:07,877][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:55:08,197][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:55:08,517][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:55:08,839][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:55:09,160][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:55:09,482][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:55:09,801][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:55:10,123][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:55:10,443][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:55:10,764][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:55:11,084][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:55:11,405][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:55:11,724][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:55:12,045][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:55:12,366][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:55:12,685][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:55:13,007][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:55:13,327][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:55:13,647][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:55:13,969][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:55:14,289][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:55:14,609][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:55:14,929][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:55:15,249][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:55:15,569][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:55:15,888][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:55:16,209][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:55:16,530][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:55:16,851][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:55:17,170][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:55:17,491][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:55:17,812][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:55:18,133][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:55:18,453][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:55:18,774][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:55:19,094][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:55:19,414][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:55:19,736][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:55:20,057][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:55:20,378][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:55:20,699][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:55:21,021][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:55:21,342][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:55:21,663][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:55:21,984][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:55:22,304][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:55:22,626][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:55:22,947][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:55:23,267][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:55:23,587][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:55:24,202][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:55:24,522][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:55:24,843][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:55:25,164][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:55:25,484][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:55:25,804][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:55:26,124][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:55:26,445][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:55:26,766][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:55:27,087][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:55:27,408][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:55:27,729][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:55:28,050][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:55:29,660][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:55:30,383][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:55:30,386][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:55:30,387][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:55:31,012][__main__][INFO] - Iteration 35 took 31s (13.38% Gen, 84.62% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 20m 59s. Estimated total time: 8h 43m 2s. Time estimates for 10 more iterations: 5m 13s, 100 more iterations: 52m 18s, 500 more iterations: 4h 21m 31s. +[2026-03-25 15:55:31,014][__main__][INFO] - Starting iteration 35. 
+[2026-03-25 15:55:31,017][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:55:31,018][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:55:34,341][__main__][INFO] - Number of regex retries in iteration 35: 0 +[2026-03-25 15:55:34,342][__main__][INFO] - agents played in iteration 35 are Bob, Alice +[2026-03-25 15:55:35,516][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:55:36,306][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:55:37,816][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:55:38,108][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:55:38,430][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:55:38,749][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:55:39,069][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:55:39,389][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:55:39,709][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:55:40,029][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:55:40,348][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:55:40,668][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:55:40,989][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:55:41,309][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:55:41,629][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:55:41,949][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:55:42,268][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:55:42,589][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:55:42,909][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:55:43,228][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:55:43,550][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:55:43,869][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:55:44,189][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:55:44,509][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:55:44,830][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:55:45,150][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:55:45,470][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:55:45,791][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:55:46,112][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:55:46,433][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:55:46,755][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:55:47,075][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:55:47,397][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:55:47,716][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:55:48,037][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:55:48,358][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:55:48,679][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:55:49,000][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:55:49,321][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:55:49,643][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:55:49,963][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:55:50,284][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:55:50,603][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:55:50,925][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:55:51,247][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:55:51,569][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:55:51,890][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:55:52,212][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:55:52,532][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:55:52,853][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:55:53,174][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:55:53,494][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:55:53,815][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:55:54,425][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:55:54,746][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:55:55,067][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:55:55,387][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:55:55,708][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:55:56,028][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:55:56,350][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:55:56,672][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:55:56,992][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:55:57,313][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:55:57,634][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:55:57,956][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:55:58,277][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:56:00,208][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:56:00,931][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:56:00,933][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:56:00,934][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:56:01,559][__main__][INFO] - Iteration 36 took 30s (10.88% Gen, 87.07% Train). Generation: 3s, Training: 26s. Estimated remaining time: 8h 6m 29s. Estimated total time: 8h 29m 3s. 
Time estimates for 10 more iterations: 5m 5s, 100 more iterations: 50m 54s, 500 more iterations: 4h 14m 31s. +[2026-03-25 15:56:01,561][__main__][INFO] - Starting iteration 36. +[2026-03-25 15:56:01,565][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:56:01,565][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:56:05,774][__main__][INFO] - Number of regex retries in iteration 36: 0 +[2026-03-25 15:56:05,775][__main__][INFO] - agents played in iteration 36 are Bob, Alice +[2026-03-25 15:56:06,988][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:56:08,030][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:56:09,488][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:56:09,787][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:56:10,107][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:56:10,428][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:56:10,749][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:56:11,068][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:56:11,388][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:56:11,708][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:56:12,027][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:56:12,348][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:56:12,669][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:56:12,990][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:56:13,311][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:56:13,632][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:56:13,951][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:56:14,270][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:56:14,590][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:56:14,910][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:56:15,231][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:56:15,551][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:56:15,872][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:56:16,191][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:56:16,511][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:56:16,830][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:56:17,150][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:56:17,469][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:56:17,790][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:56:18,110][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:56:18,429][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:56:18,750][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:56:19,071][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:56:19,392][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:56:19,713][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:56:20,034][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:56:20,355][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:56:20,676][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:56:20,996][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:56:21,316][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:56:21,636][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:56:21,955][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:56:22,275][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:56:22,595][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:56:22,915][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:56:23,235][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:56:23,557][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:56:23,877][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:56:24,198][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:56:24,517][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:56:24,837][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:56:25,158][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:56:25,477][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:56:26,087][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:56:26,408][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:56:26,728][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:56:27,047][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:56:27,368][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:56:27,689][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:56:28,010][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:56:28,331][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:56:28,652][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:56:28,971][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:56:29,291][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:56:29,611][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:56:29,933][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:56:31,558][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:56:32,285][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:56:32,287][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:56:32,289][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:56:33,280][__main__][INFO] - Iteration 37 took 31s (13.27% Gen, 83.60% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 25m 30s. Estimated total time: 8h 48m 36s. Time estimates for 10 more iterations: 5m 17s, 100 more iterations: 52m 51s, 500 more iterations: 4h 24m 18s. +[2026-03-25 15:56:33,282][__main__][INFO] - Starting iteration 37. +[2026-03-25 15:56:33,285][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:56:33,286][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:56:37,113][__main__][INFO] - Number of regex retries in iteration 37: 0 +[2026-03-25 15:56:37,113][__main__][INFO] - agents played in iteration 37 are Bob, Alice +[2026-03-25 15:56:38,358][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:56:39,399][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:56:40,863][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:56:41,163][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:56:41,482][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:56:41,802][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:56:42,124][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:56:42,442][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:56:42,764][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:56:43,084][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:56:43,404][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:56:43,724][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:56:44,043][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:56:44,364][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:56:44,684][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:56:45,005][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:56:45,326][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:56:45,648][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:56:45,969][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:56:46,290][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:56:46,611][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:56:46,930][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:56:47,252][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:56:47,571][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:56:47,891][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:56:48,210][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:56:48,531][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:56:48,852][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:56:49,171][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:56:49,491][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:56:49,812][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:56:50,133][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:56:50,454][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:56:50,775][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:56:51,095][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:56:51,415][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:56:51,736][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:56:52,056][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:56:52,378][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:56:52,700][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:56:53,020][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:56:53,341][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:56:53,662][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:56:53,984][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:56:54,305][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:56:54,626][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:56:54,947][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:56:55,268][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:56:55,587][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:56:55,908][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:56:56,228][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:56:56,548][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:56:56,869][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:56:57,480][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:56:57,801][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:56:58,122][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:56:58,445][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:56:58,766][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:56:59,086][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:56:59,407][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:56:59,728][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:57:00,049][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:57:00,370][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:57:00,689][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:57:01,010][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:57:01,330][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:57:02,941][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:57:03,668][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:57:03,670][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:57:03,672][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:57:04,297][__main__][INFO] - Iteration 38 took 31s (12.34% Gen, 85.64% Train). Generation: 3s, Training: 26s. Estimated remaining time: 8h 13m 16s. Estimated total time: 8h 36m 53s. Time estimates for 10 more iterations: 5m 10s, 100 more iterations: 51m 41s, 500 more iterations: 4h 18m 26s. +[2026-03-25 15:57:04,299][__main__][INFO] - Starting iteration 38. +[2026-03-25 15:57:04,303][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:57:04,303][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:57:09,025][__main__][INFO] - Number of regex retries in iteration 38: 0 +[2026-03-25 15:57:09,025][__main__][INFO] - agents played in iteration 38 are Bob, Alice +[2026-03-25 15:57:10,266][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:57:11,309][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:57:12,767][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:57:13,066][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:57:13,384][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:57:13,703][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:57:14,023][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:57:14,343][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:57:14,663][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:57:14,984][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:57:15,305][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:57:15,626][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:57:15,946][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:57:16,267][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:57:16,588][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:57:16,909][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:57:17,229][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:57:17,548][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:57:17,868][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:57:18,189][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:57:18,510][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:57:18,830][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:57:19,151][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:57:19,472][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:57:19,791][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:57:20,111][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:57:20,432][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:57:20,752][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:57:21,071][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:57:21,392][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:57:21,713][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:57:22,034][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:57:22,354][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:57:22,674][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:57:22,993][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:57:23,313][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:57:23,634][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:57:23,953][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:57:24,275][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:57:24,595][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:57:24,914][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:57:25,235][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:57:25,557][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:57:25,879][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:57:26,200][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:57:26,520][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:57:26,842][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:57:27,163][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:57:27,484][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:57:27,804][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:57:28,124][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:57:28,445][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:57:28,766][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:57:29,375][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:57:29,696][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:57:30,015][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:57:30,336][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:57:30,657][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:57:30,978][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:57:31,298][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:57:31,618][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:57:31,938][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:57:32,258][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:57:32,578][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:57:32,897][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:57:33,216][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:57:34,838][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:57:35,566][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:57:35,569][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:57:35,570][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:57:36,188][__main__][INFO] - Iteration 39 took 31s (14.81% Gen, 83.25% Train). Generation: 4s, Training: 26s. Estimated remaining time: 8h 27m 18s. Estimated total time: 8h 51m 26s. Time estimates for 10 more iterations: 5m 18s, 100 more iterations: 53m 8s, 500 more iterations: 4h 25m 43s. +[2026-03-25 15:57:36,191][__main__][INFO] - Starting iteration 39. 
+[2026-03-25 15:57:36,194][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:57:36,194][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:57:39,524][__main__][INFO] - Number of regex retries in iteration 39: 0 +[2026-03-25 15:57:39,525][__main__][INFO] - agents played in iteration 39 are Bob, Alice +[2026-03-25 15:57:40,764][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:57:41,808][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:57:43,267][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:57:43,567][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:57:43,887][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:57:44,207][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:57:44,527][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:57:44,848][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:57:45,169][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:57:45,489][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:57:45,810][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:57:46,131][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:57:46,452][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:57:46,773][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:57:47,094][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:57:47,415][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:57:47,736][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:57:48,057][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:57:48,377][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:57:48,697][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:57:49,018][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:57:49,337][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:57:49,658][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:57:49,979][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:57:50,300][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:57:50,622][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:57:50,944][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:57:51,265][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:57:51,586][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:57:51,905][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:57:52,226][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:57:52,548][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:57:52,869][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:57:53,189][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:57:53,510][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:57:53,831][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:57:54,152][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:57:54,471][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:57:54,791][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:57:55,112][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:57:55,431][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:57:55,751][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:57:56,070][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:57:56,392][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:57:56,712][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:57:57,032][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:57:57,353][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:57:57,673][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:57:57,993][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:57:58,314][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:57:58,636][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:57:58,957][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:57:59,277][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:57:59,888][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:58:00,209][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:58:00,529][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:58:00,850][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:58:01,172][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:58:01,494][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:58:01,814][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:58:02,135][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:58:02,455][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:58:02,776][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:58:03,096][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:58:03,417][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:58:03,738][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:58:05,348][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:23 +[2026-03-25 15:58:06,302][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:58:06,304][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:58:06,306][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:58:06,932][__main__][INFO] - Iteration 40 took 30s (10.83% Gen, 87.12% Train). Generation: 3s, Training: 26s. Estimated remaining time: 8h 7m 40s. Estimated total time: 8h 32m 19s. 
Time estimates for 10 more iterations: 5m 7s, 100 more iterations: 51m 13s, 500 more iterations: 4h 16m 9s. +[2026-03-25 15:58:06,935][__main__][INFO] - Starting iteration 40. +[2026-03-25 15:58:06,938][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:58:06,938][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:58:10,924][__main__][INFO] - Number of regex retries in iteration 40: 0 +[2026-03-25 15:58:10,925][__main__][INFO] - agents played in iteration 40 are Bob, Alice +[2026-03-25 15:58:12,146][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:58:13,187][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:58:14,648][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:58:14,945][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:58:15,265][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:58:15,584][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:58:15,904][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:58:16,225][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:58:16,545][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:58:16,866][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:58:17,188][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:58:17,509][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:58:17,829][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:58:18,148][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
15:58:18,469][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:58:18,788][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:58:19,109][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:58:19,429][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:58:19,749][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:58:20,070][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:58:20,391][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:58:20,712][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:58:21,033][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:58:21,354][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:58:21,675][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:58:21,994][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:58:22,314][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:58:22,635][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:58:22,954][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:58:23,276][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:58:23,595][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:58:23,915][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:58:24,235][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:58:24,556][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:58:24,876][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 15:58:25,196][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:58:25,516][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:58:25,836][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:58:26,157][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:58:26,478][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:58:26,798][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:58:27,118][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:58:27,439][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:58:27,760][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:58:28,080][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:58:28,401][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:58:28,721][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:58:29,043][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:58:29,364][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:58:29,685][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:58:30,005][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:58:30,325][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:58:30,645][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:58:31,254][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:58:31,576][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 15:58:31,895][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:58:32,216][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:58:32,537][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:58:32,856][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:58:33,176][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:58:33,495][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:58:33,815][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:58:34,137][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:58:34,457][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:58:34,777][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:58:35,097][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 15:58:35,813][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 15:58:36,540][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:58:36,542][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:58:36,544][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:58:37,169][__main__][INFO] - Iteration 41 took 30s (13.19% Gen, 84.74% Train). Generation: 3s, Training: 25s. Estimated remaining time: 7h 58m 42s. Estimated total time: 8h 23m 52s. Time estimates for 10 more iterations: 5m 2s, 100 more iterations: 50m 23s, 500 more iterations: 4h 11m 56s. +[2026-03-25 15:58:37,171][__main__][INFO] - Starting iteration 41. +[2026-03-25 15:58:37,174][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:58:37,175][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:58:40,418][__main__][INFO] - Number of regex retries in iteration 41: 0 +[2026-03-25 15:58:40,419][__main__][INFO] - agents played in iteration 41 are Bob, Alice +[2026-03-25 15:58:40,968][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 15:58:41,613][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:58:41,904][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:58:42,225][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:58:42,546][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:58:42,868][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:58:43,190][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:58:43,510][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:58:43,831][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:58:44,153][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:58:44,475][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:58:44,797][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:58:45,117][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:58:45,437][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:58:45,759][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:58:46,081][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:58:46,403][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:58:46,723][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:58:47,046][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:58:47,367][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:58:47,688][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:58:48,010][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 15:58:48,330][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:58:48,651][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:58:48,971][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:58:49,290][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:58:49,611][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:58:49,932][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:58:50,253][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:58:50,574][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:58:50,895][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:58:51,215][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:58:51,536][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:58:51,857][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:58:52,178][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:58:52,499][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:58:52,820][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:58:53,142][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:58:53,463][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:58:53,783][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:58:54,105][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:58:54,425][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 15:58:54,745][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:58:55,065][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:58:55,387][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:58:55,708][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:58:56,029][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:58:56,349][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:58:56,671][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:58:56,991][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:58:57,310][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:58:57,630][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:58:57,949][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:58:58,560][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:58:58,882][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:58:59,204][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:58:59,524][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:58:59,843][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:59:00,163][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:59:00,483][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:59:00,804][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:59:01,125][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
15:59:01,446][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:59:01,765][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:59:02,086][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:59:02,408][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:59:03,063][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 15:59:03,804][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:59:03,806][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:59:03,807][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:59:04,426][__main__][INFO] - Iteration 42 took 27s (11.90% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 8m 35s. Estimated total time: 7h 34m 12s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 15:59:04,428][__main__][INFO] - Starting iteration 42. +[2026-03-25 15:59:04,431][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 15:59:04,431][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:59:09,043][__main__][INFO] - Number of regex retries in iteration 42: 0 +[2026-03-25 15:59:09,044][__main__][INFO] - agents played in iteration 42 are Bob, Alice +[2026-03-25 15:59:09,605][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:59:10,257][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:59:10,548][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:59:10,867][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:59:11,188][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:59:11,509][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:59:11,829][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:59:12,149][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:59:12,470][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:59:12,790][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:59:13,110][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:59:13,431][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:59:13,752][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:59:14,073][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:59:14,393][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:59:14,714][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 15:59:15,033][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:59:15,354][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 15:59:15,675][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:59:15,995][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:59:16,317][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:59:16,637][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:59:16,957][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:59:17,277][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:59:17,599][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:59:17,920][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:59:18,242][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:59:18,564][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:59:18,884][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:59:19,205][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:59:19,526][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:59:19,847][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:59:20,168][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:59:20,491][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:59:20,811][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:59:21,132][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:59:21,453][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 15:59:21,774][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
15:59:22,095][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:59:22,415][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:59:22,736][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:59:23,056][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:59:23,376][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:59:23,696][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:59:24,016][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:59:24,336][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:59:24,656][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:59:24,976][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:59:25,296][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:59:25,617][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:59:25,938][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:59:26,261][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:59:26,582][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:59:27,195][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:59:27,516][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:59:27,837][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:59:28,160][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 15:59:28,482][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
15:59:28,804][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:59:29,123][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:59:29,445][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:59:29,767][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:59:30,088][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:59:30,409][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:59:30,730][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:59:31,050][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:59:31,703][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 15:59:32,448][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 15:59:32,450][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 15:59:32,452][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 15:59:33,081][__main__][INFO] - Iteration 43 took 28s (16.10% Gen, 81.70% Train). Generation: 4s, Training: 23s. Estimated remaining time: 7h 31m 25s. Estimated total time: 7h 57m 30s. Time estimates for 10 more iterations: 4m 46s, 100 more iterations: 47m 45s, 500 more iterations: 3h 58m 45s. +[2026-03-25 15:59:33,083][__main__][INFO] - Starting iteration 43. 
+[2026-03-25 15:59:33,086][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 15:59:33,087][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 15:59:36,406][__main__][INFO] - Number of regex retries in iteration 43: 0 +[2026-03-25 15:59:36,406][__main__][INFO] - agents played in iteration 43 are Bob, Alice +[2026-03-25 15:59:36,959][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 15:59:37,611][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 15:59:38,486][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 15:59:38,801][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 15:59:39,120][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 15:59:39,442][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 15:59:39,763][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 15:59:40,085][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 15:59:40,406][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 15:59:40,727][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 15:59:41,049][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 15:59:41,369][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 15:59:41,690][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 15:59:42,011][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 15:59:42,330][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 15:59:42,650][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
15:59:42,971][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 15:59:43,293][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 15:59:43,613][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 15:59:43,934][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 15:59:44,254][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 15:59:44,575][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 15:59:44,896][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 15:59:45,216][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 15:59:45,536][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 15:59:45,856][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 15:59:46,177][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 15:59:46,496][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 15:59:46,816][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 15:59:47,135][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 15:59:47,456][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 15:59:47,775][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 15:59:48,095][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 15:59:48,417][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 15:59:48,738][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 15:59:49,059][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 15:59:49,379][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 15:59:49,700][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 15:59:50,021][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 15:59:50,343][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 15:59:50,664][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 15:59:50,984][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 15:59:51,305][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 15:59:51,625][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 15:59:51,946][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 15:59:52,268][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 15:59:52,590][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 15:59:52,910][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 15:59:53,230][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 15:59:53,552][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 15:59:53,871][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 15:59:54,190][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 15:59:54,511][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 15:59:55,123][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 15:59:55,445][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 15:59:55,765][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 15:59:56,085][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 15:59:56,406][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 15:59:56,728][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 15:59:57,049][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 15:59:57,371][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 15:59:57,692][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 15:59:58,011][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 15:59:58,331][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 15:59:58,652][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 15:59:58,973][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 15:59:59,629][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 16:00:00,354][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:00:00,356][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:00:00,357][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:00:00,991][__main__][INFO] - Iteration 44 took 27s (11.90% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 18m 32s. Estimated total time: 7h 45m 5s. 
Time estimates for 10 more iterations: 4m 39s, 100 more iterations: 46m 30s, 500 more iterations: 3h 52m 32s. +[2026-03-25 16:00:00,993][__main__][INFO] - Starting iteration 44. +[2026-03-25 16:00:00,996][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 16:00:00,997][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:00:04,263][__main__][INFO] - Number of regex retries in iteration 44: 0 +[2026-03-25 16:00:04,264][__main__][INFO] - agents played in iteration 44 are Bob, Alice +[2026-03-25 16:00:04,809][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:00:05,460][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:00:06,454][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:00:06,767][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:00:07,088][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:00:07,408][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:00:07,729][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:00:08,050][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:00:08,371][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:00:08,690][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:00:09,011][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:00:09,334][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:00:09,655][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:00:09,975][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:00:10,296][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:00:10,616][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:00:10,936][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:00:11,257][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:00:11,576][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:00:11,895][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:00:12,215][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:00:12,535][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:00:12,856][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:00:13,178][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:00:13,499][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:00:13,821][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:00:14,143][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:00:14,464][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:00:14,784][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:00:15,104][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:00:15,423][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:00:15,745][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:00:16,065][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:00:16,387][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:00:16,707][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:00:17,027][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:00:17,349][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:00:17,669][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:00:17,990][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:00:18,312][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:00:18,632][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:00:18,954][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:00:19,274][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:00:19,594][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:00:19,913][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:00:20,234][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:00:20,555][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:00:20,875][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:00:21,194][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:00:21,515][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:00:21,834][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:00:22,154][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:00:22,475][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:00:23,087][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:00:23,409][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:00:23,729][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:00:24,049][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:00:24,373][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:00:24,694][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:00:25,015][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:00:25,336][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:00:25,657][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:00:25,976][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:00:26,296][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:00:26,617][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:00:26,938][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:00:27,714][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 16:00:29,441][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:00:29,443][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:00:29,445][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:00:30,104][__main__][INFO] - Iteration 45 took 29s (11.22% Gen, 86.51% Train). Generation: 3s, Training: 25s. Estimated remaining time: 7h 38m 5s. Estimated total time: 8h 5m 8s. Time estimates for 10 more iterations: 4m 51s, 100 more iterations: 48m 30s, 500 more iterations: 4h 2m 34s. +[2026-03-25 16:00:30,108][__main__][INFO] - Starting iteration 45. +[2026-03-25 16:00:30,111][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 16:00:30,112][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:00:33,387][__main__][INFO] - Number of regex retries in iteration 45: 0 +[2026-03-25 16:00:33,388][__main__][INFO] - agents played in iteration 45 are Bob, Alice +[2026-03-25 16:00:33,937][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:00:34,589][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:00:34,879][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:00:35,199][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:00:35,520][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:00:35,842][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:00:36,164][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:00:36,486][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:00:36,806][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:00:37,127][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:00:37,447][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:00:37,767][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:00:38,089][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:00:38,411][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:00:38,732][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:00:39,053][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:00:39,374][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:00:39,695][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:00:40,016][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:00:40,335][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:00:40,655][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:00:40,976][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:00:41,296][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:00:41,617][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:00:41,937][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:00:42,256][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:00:42,577][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:00:42,896][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:00:43,217][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:00:43,536][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:00:43,857][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:00:44,177][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:00:44,497][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:00:44,817][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:00:45,139][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:00:45,460][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:00:45,780][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:00:46,102][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:00:46,422][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:00:46,741][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:00:47,063][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:00:47,383][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:00:47,704][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:00:48,025][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:00:48,347][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:00:48,668][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:00:48,989][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:00:49,309][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:00:49,629][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:00:49,951][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:00:50,271][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:00:50,591][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:00:50,912][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:00:51,525][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:00:51,846][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:00:52,167][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:00:52,487][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:00:52,807][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:00:53,128][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:00:53,448][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:00:53,769][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:00:54,088][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:00:54,409][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:00:54,729][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:00:55,050][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:00:55,369][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:00:56,025][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:00:56,752][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:00:56,754][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:00:56,756][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:00:57,386][__main__][INFO] - Iteration 46 took 27s (12.01% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 7m 5s. Estimated total time: 7h 34m 35s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 17s. +[2026-03-25 16:00:57,388][__main__][INFO] - Starting iteration 46. +[2026-03-25 16:00:57,391][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 16:00:57,391][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:01:00,640][__main__][INFO] - Number of regex retries in iteration 46: 0 +[2026-03-25 16:01:00,640][__main__][INFO] - agents played in iteration 46 are Bob, Alice +[2026-03-25 16:01:01,169][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:01:01,821][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:01:02,111][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:01:02,434][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:01:02,754][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:01:03,075][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:01:03,395][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:01:03,715][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:01:04,035][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:01:04,355][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:01:04,674][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:01:04,994][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:01:05,315][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:01:05,636][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:01:05,957][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:01:06,278][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:01:06,599][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:01:06,919][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:01:07,239][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:01:07,560][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:01:07,880][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:01:08,199][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:01:08,519][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:01:08,839][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:01:09,159][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:01:09,479][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:01:09,798][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:01:10,118][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:01:10,437][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:01:10,758][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:01:11,077][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:01:11,399][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:01:11,719][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:01:12,041][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:01:12,362][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:01:12,684][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:01:13,005][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:01:13,325][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:01:13,646][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:01:13,966][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:01:14,287][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:01:14,609][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:01:14,930][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:01:15,251][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:01:15,572][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:01:15,892][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:01:16,214][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:01:16,534][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:01:16,855][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:01:17,176][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:01:17,495][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:01:17,815][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:01:18,136][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:01:18,751][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:01:19,073][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:01:19,392][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:01:19,713][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:01:20,035][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:01:20,356][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:01:20,677][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:01:20,998][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:01:21,320][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:01:21,640][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:01:21,959][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:01:22,279][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:01:22,599][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:01:23,255][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:01:24,025][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:01:24,028][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:01:24,029][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:01:24,683][__main__][INFO] - Iteration 47 took 27s (11.90% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 6m 56s. Estimated total time: 7h 34m 53s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 26s. +[2026-03-25 16:01:24,685][__main__][INFO] - Starting iteration 47. 
+[2026-03-25 16:01:24,688][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 16:01:24,689][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:01:27,937][__main__][INFO] - Number of regex retries in iteration 47: 0 +[2026-03-25 16:01:27,938][__main__][INFO] - agents played in iteration 47 are Bob, Alice +[2026-03-25 16:01:28,482][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:01:29,131][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:01:29,455][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:01:29,748][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:01:30,069][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:01:30,390][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:01:30,709][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:01:31,030][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:01:31,350][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:01:31,670][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:01:31,989][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:01:32,310][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:01:32,630][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:01:32,950][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:01:33,272][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:01:33,593][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:01:33,914][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:01:34,233][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:01:34,554][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:01:34,875][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:01:35,193][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:01:35,513][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:01:35,832][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:01:36,152][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:01:36,473][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:01:36,793][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:01:37,112][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:01:37,432][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:01:37,752][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:01:38,072][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:01:38,393][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:01:38,714][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:01:39,034][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:01:39,356][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:01:39,676][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:01:39,996][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:01:40,317][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:01:40,638][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:01:40,959][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:01:41,281][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:01:41,602][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:01:41,924][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:01:42,245][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:01:42,565][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:01:42,885][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:01:43,207][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:01:43,527][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:01:43,847][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:01:44,167][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:01:44,486][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:01:44,807][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:01:45,126][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:01:45,448][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:01:46,065][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:01:46,387][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:01:46,706][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:01:47,027][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:01:47,348][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:01:47,668][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:01:47,987][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:01:48,308][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:01:48,628][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:01:48,949][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:01:49,269][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:01:49,590][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:01:49,910][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:01:50,567][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:01:51,285][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:01:51,287][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:01:51,288][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:01:51,913][__main__][INFO] - Iteration 48 took 27s (11.93% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 5m 21s. Estimated total time: 7h 33m 45s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 52s. +[2026-03-25 16:01:51,915][__main__][INFO] - Starting iteration 48. +[2026-03-25 16:01:51,918][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 16:01:51,919][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:01:55,205][__main__][INFO] - Number of regex retries in iteration 48: 0 +[2026-03-25 16:01:55,206][__main__][INFO] - agents played in iteration 48 are Bob, Alice +[2026-03-25 16:01:55,754][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:01:56,406][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:01:56,697][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:01:57,019][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:01:57,338][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:01:57,659][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:01:57,979][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:01:58,298][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:01:58,618][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:01:58,938][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:01:59,259][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:01:59,580][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:01:59,900][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:02:00,222][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:02:00,543][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:02:00,865][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:02:01,184][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:02:01,504][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:02:01,825][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:02:02,146][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:02:02,466][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:02:02,788][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:02:03,107][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:02:03,426][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:02:03,746][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:02:04,065][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:02:04,387][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:02:04,709][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:02:05,029][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:02:05,350][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:02:05,671][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:02:05,992][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:02:06,313][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:02:06,633][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:02:06,954][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:02:07,275][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:02:07,595][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:02:07,917][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:02:08,238][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:02:08,559][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:02:08,881][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:02:09,202][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:02:09,523][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:02:09,845][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:02:10,166][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:02:10,486][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:02:10,808][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:02:11,128][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:02:11,449][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:02:11,769][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:02:12,089][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:02:12,409][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:02:12,730][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:02:13,344][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:02:13,666][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:02:13,985][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:02:14,306][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:02:14,627][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:02:14,949][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:02:15,269][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:02:15,589][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:02:15,909][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:02:16,230][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:02:16,551][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:02:16,870][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:02:17,191][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:02:17,848][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:02:18,564][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:02:18,567][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:02:18,568][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:02:19,190][__main__][INFO] - Iteration 49 took 27s (12.05% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 5m 42s. Estimated total time: 7h 34m 33s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 16s. +[2026-03-25 16:02:19,193][__main__][INFO] - Starting iteration 49. +[2026-03-25 16:02:19,196][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. +[2026-03-25 16:02:19,196][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:02:22,469][__main__][INFO] - Number of regex retries in iteration 49: 0 +[2026-03-25 16:02:22,469][__main__][INFO] - agents played in iteration 49 are Bob, Alice +[2026-03-25 16:02:22,997][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:02:23,651][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:02:23,940][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:02:24,261][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:02:24,582][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:02:24,902][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:02:25,223][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:02:25,544][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:02:25,865][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:02:26,185][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:02:26,506][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:02:26,825][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:02:27,146][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:02:27,468][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:02:27,788][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:02:28,109][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:02:28,431][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:02:28,752][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:02:29,071][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:02:29,391][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:02:29,712][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:02:30,032][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:02:30,352][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:02:30,671][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:02:30,992][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:02:31,312][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:02:31,631][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:02:31,952][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:02:32,273][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:02:32,595][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:02:32,916][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:02:33,237][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:02:33,558][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:02:33,877][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:02:34,197][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:02:34,516][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:02:34,837][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:02:35,157][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:02:35,476][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:02:35,799][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:02:36,119][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:02:36,438][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:02:36,760][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:02:37,080][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:02:37,399][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:02:37,718][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:02:38,039][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:02:38,360][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:02:38,680][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:02:39,000][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:02:39,319][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:02:39,639][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:02:39,959][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:02:40,573][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:02:40,895][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:02:41,214][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:02:41,534][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:02:41,854][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:02:42,175][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:02:42,495][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:02:42,816][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:02:43,135][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:02:43,456][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:02:43,775][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:02:44,096][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:02:44,416][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:02:45,072][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:02:45,792][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:02:45,794][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:02:45,796][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:02:46,420][__main__][INFO] - Iteration 50 took 27s (12.02% Gen, 85.68% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 4m 26s. Estimated total time: 7h 33m 45s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 52s. +[2026-03-25 16:02:46,422][__main__][INFO] - Starting iteration 50. +[2026-03-25 16:02:46,425][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 0 and human policies 1. 
+[2026-03-25 16:02:46,426][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:02:49,698][__main__][INFO] - Number of regex retries in iteration 50: 0 +[2026-03-25 16:02:49,699][__main__][INFO] - agents played in iteration 50 are Bob, Alice +[2026-03-25 16:02:50,240][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:02:50,893][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:02:51,184][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:02:51,504][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:02:51,824][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:02:52,145][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:02:52,465][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:02:52,784][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:02:53,104][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:02:53,425][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:02:53,745][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:02:54,066][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:02:54,385][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:02:54,705][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:02:55,027][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:02:55,349][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:02:55,670][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:02:55,991][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:02:56,312][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:02:56,632][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:02:56,952][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:02:57,273][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:02:57,592][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:02:57,914][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:02:58,235][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:02:58,556][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:02:58,876][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:02:59,197][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:02:59,519][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:02:59,840][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:03:00,162][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:03:00,485][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:03:00,805][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:03:01,125][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:03:01,447][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:03:01,769][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:03:02,090][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:03:02,411][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:03:02,732][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:03:03,053][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:03:03,375][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:03:03,696][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:03:04,018][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:03:04,339][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:03:04,659][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:03:04,978][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:03:05,299][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:03:05,619][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:03:05,939][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:03:06,262][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:03:06,583][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:03:06,902][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:03:07,224][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:03:07,840][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:03:08,160][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:03:08,483][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:03:08,805][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:03:09,127][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:03:09,448][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:03:09,769][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:03:10,090][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:03:10,411][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:03:10,733][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:03:11,054][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:03:11,374][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:03:11,694][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:03:12,354][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:03:13,086][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:03:13,088][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:03:13,090][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:03:14,245][__main__][INFO] - Iteration 51 took 27s (11.77% Gen, 84.08% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 13m 54s. Estimated total time: 7h 43m 40s. Time estimates for 10 more iterations: 4m 38s, 100 more iterations: 46m 22s, 500 more iterations: 3h 51m 50s. +[2026-03-25 16:03:14,247][__main__][INFO] - Starting iteration 51. 
+[2026-03-25 16:03:14,250][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:03:14,251][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:03:17,705][__main__][INFO] - Number of regex retries in iteration 51: 0 +[2026-03-25 16:03:17,706][__main__][INFO] - agents played in iteration 51 are Bob, Alice +[2026-03-25 16:03:18,242][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:03:18,899][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:03:19,189][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:03:19,511][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:03:19,830][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:03:20,151][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:03:20,472][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:03:20,792][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:03:21,112][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:03:21,432][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:03:21,753][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:03:22,074][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:03:22,395][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:03:22,716][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:03:23,037][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:03:23,359][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:03:23,680][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:03:24,001][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:03:24,324][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:03:24,646][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:03:24,966][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:03:25,287][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:03:25,607][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:03:25,928][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:03:26,251][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:03:26,573][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:03:26,894][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:03:27,215][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:03:27,536][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:03:27,856][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:03:28,177][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:03:28,497][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:03:28,818][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:03:29,138][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:03:29,458][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:03:29,778][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:03:30,098][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:03:30,419][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:03:30,740][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:03:31,062][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:03:31,384][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:03:31,705][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:03:32,027][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:03:32,347][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:03:32,668][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:03:32,989][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:03:33,311][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:03:33,632][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:03:33,954][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:03:34,275][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:03:34,597][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:03:34,917][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:03:35,239][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:03:35,854][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:03:36,174][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:03:36,494][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:03:36,815][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:03:37,135][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:03:37,455][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:03:37,776][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:03:38,096][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:03:38,416][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:03:38,737][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:03:39,058][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:03:39,378][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:03:39,697][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:03:40,358][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:03:41,078][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:03:41,080][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:03:41,082][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:03:41,700][__main__][INFO] - Iteration 52 took 27s (12.59% Gen, 85.15% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 7m 16s. Estimated total time: 7h 37m 30s. 
Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 45s, 500 more iterations: 3h 48m 45s. +[2026-03-25 16:03:41,702][__main__][INFO] - Starting iteration 52. +[2026-03-25 16:03:41,705][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:03:41,706][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:03:45,042][__main__][INFO] - Number of regex retries in iteration 52: 0 +[2026-03-25 16:03:45,042][__main__][INFO] - agents played in iteration 52 are Bob, Alice +[2026-03-25 16:03:45,665][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:03:46,320][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:03:46,610][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:03:46,931][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:03:47,253][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:03:47,574][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:03:47,895][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:03:48,215][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:03:48,534][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:03:48,854][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:03:49,174][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:03:49,495][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:03:49,815][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:03:50,137][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:03:50,459][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:03:50,781][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:03:51,103][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:03:51,424][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:03:51,745][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:03:52,065][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:03:52,385][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:03:52,706][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:03:53,027][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:03:53,348][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:03:53,668][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:03:53,989][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:03:54,309][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:03:54,630][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:03:54,950][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:03:55,270][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:03:55,591][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:03:55,912][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:03:56,235][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:03:56,555][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:03:56,876][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:03:57,197][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:03:57,517][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:03:57,837][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:03:58,157][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:03:58,478][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:03:58,798][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:03:59,118][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:03:59,439][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:03:59,760][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:04:00,080][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:04:00,401][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:04:00,722][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:04:01,043][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:04:01,365][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:04:01,687][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:04:02,008][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:04:02,329][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:04:02,650][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:04:03,266][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:04:03,587][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:04:03,909][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:04:04,230][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:04:04,550][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:04:04,870][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:04:05,191][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:04:05,510][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:04:05,830][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:04:06,150][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:04:06,471][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:04:06,791][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:04:07,112][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:04:07,770][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:04:08,499][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:04:08,502][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:04:08,503][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:04:09,120][__main__][INFO] - Iteration 53 took 27s (12.17% Gen, 85.58% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 6m 14s. Estimated total time: 7h 36m 55s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 41s, 500 more iterations: 3h 48m 27s. +[2026-03-25 16:04:09,122][__main__][INFO] - Starting iteration 53. +[2026-03-25 16:04:09,125][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:04:09,126][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:04:12,385][__main__][INFO] - Number of regex retries in iteration 53: 0 +[2026-03-25 16:04:12,386][__main__][INFO] - agents played in iteration 53 are Bob, Alice +[2026-03-25 16:04:13,559][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:04:14,211][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:04:14,501][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:04:14,821][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:04:15,143][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:04:15,467][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:04:15,786][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:04:16,106][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:04:16,427][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:04:16,747][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:04:17,068][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:04:17,389][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:04:17,709][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:04:18,030][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:04:18,351][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:04:18,671][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:04:18,991][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:04:19,311][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:04:19,631][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:04:19,951][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:04:20,273][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:04:20,594][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:04:20,914][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:04:21,234][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:04:21,554][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:04:21,875][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:04:22,195][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:04:22,514][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:04:22,836][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:04:23,157][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:04:23,478][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:04:23,799][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:04:24,119][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:04:24,440][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:04:24,762][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:04:25,084][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:04:25,404][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:04:25,724][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:04:26,047][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:04:26,369][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:04:26,690][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:04:27,011][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:04:27,331][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:04:27,651][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:04:27,971][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:04:28,293][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:04:28,614][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:04:28,935][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:04:29,257][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:04:29,577][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:04:29,897][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:04:30,219][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:04:30,539][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:04:31,153][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:04:31,474][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:04:31,796][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:04:32,116][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:04:32,436][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:04:32,757][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:04:33,077][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:04:33,399][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:04:33,720][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:04:34,042][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:04:34,365][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:04:34,685][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:04:35,007][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:04:35,685][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:04:36,406][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:04:36,409][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:04:36,410][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:04:37,026][__main__][INFO] - Iteration 54 took 27s (11.68% Gen, 86.10% Train). Generation: 3s, Training: 24s. Estimated remaining time: 7h 13m 52s. Estimated total time: 7h 45m 2s. Time estimates for 10 more iterations: 4m 39s, 100 more iterations: 46m 30s, 500 more iterations: 3h 52m 31s. +[2026-03-25 16:04:37,028][__main__][INFO] - Starting iteration 54. +[2026-03-25 16:04:37,031][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:04:37,032][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:04:37,793][mllm.models.large_language_model_local][WARNING] - Response + +Please note my observation of your previous move. 
did not match regex: (|), retry 1/1 +[2026-03-25 16:04:43,210][__main__][INFO] - Number of regex retries in iteration 54: 1 +[2026-03-25 16:04:43,211][__main__][INFO] - agents played in iteration 54 are Bob, Alice +[2026-03-25 16:04:43,749][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:04:44,401][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:04:44,690][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:04:45,011][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:04:45,332][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:04:45,652][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:04:45,974][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:04:46,295][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:04:46,614][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:04:46,935][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:04:47,256][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:04:47,576][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:04:47,896][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:04:48,217][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:04:48,537][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:04:48,857][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:04:49,176][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:04:49,495][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 
16:04:49,816][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:04:50,136][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:04:50,458][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:04:50,779][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:04:51,099][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:04:51,421][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:04:51,742][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:04:52,063][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:04:52,386][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:04:52,707][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:04:53,027][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:04:53,347][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:04:53,668][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:04:53,991][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:04:54,312][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:04:54,633][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:04:54,953][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:04:55,272][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:04:55,591][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:04:55,910][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:04:56,229][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:04:56,550][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:04:56,870][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:04:57,189][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:04:57,510][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:04:57,830][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:04:58,151][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:04:58,470][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:04:58,790][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:04:59,110][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:04:59,430][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:04:59,750][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:05:00,070][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:05:00,390][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:05:00,711][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:05:01,319][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:05:01,640][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:05:01,962][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:05:02,283][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:05:02,604][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:05:02,926][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:05:03,247][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:05:03,566][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:05:03,885][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:05:04,205][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:05:04,526][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:05:04,847][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:05:05,168][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:05:05,818][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:05:06,545][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:05:06,547][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:05:06,549][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:05:07,167][__main__][INFO] - Iteration 55 took 30s (20.50% Gen, 77.44% Train). Generation: 6s, Training: 23s. Estimated remaining time: 7h 50m 37s. Estimated total time: 8h 22m 16s. Time estimates for 10 more iterations: 5m 1s, 100 more iterations: 50m 13s, 500 more iterations: 4h 11m 8s. +[2026-03-25 16:05:07,169][__main__][INFO] - Starting iteration 55. 
+[2026-03-25 16:05:07,173][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:05:07,173][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:05:10,479][__main__][INFO] - Number of regex retries in iteration 55: 0 +[2026-03-25 16:05:10,480][__main__][INFO] - agents played in iteration 55 are Bob, Alice +[2026-03-25 16:05:11,700][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:05:12,345][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:05:12,637][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:05:12,959][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:05:13,281][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:05:13,602][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:05:13,924][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:05:14,245][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:05:14,564][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:05:14,886][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:05:15,207][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:05:15,528][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:05:15,847][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:05:16,168][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:05:16,489][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:05:16,809][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:05:17,128][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:05:17,449][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:05:17,768][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:05:18,089][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:05:18,408][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:05:18,729][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:05:19,050][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:05:19,370][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:05:19,690][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:05:20,010][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:05:20,330][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:05:20,649][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:05:20,970][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:05:21,291][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:05:21,611][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:05:21,939][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:05:22,255][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:05:22,576][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:05:22,895][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:05:23,217][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:05:23,536][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:05:23,857][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:05:24,177][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:05:24,498][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:05:24,819][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:05:25,139][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:05:25,459][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:05:25,779][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:05:26,099][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:05:26,418][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:05:26,739][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:05:27,059][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:05:27,379][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:05:27,700][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:05:28,019][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:05:28,340][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:05:28,660][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:05:29,270][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:05:29,590][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:05:29,909][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:05:30,230][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:05:30,550][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:05:30,870][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:05:31,189][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:05:31,509][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:05:31,829][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:05:32,149][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:05:32,470][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:05:32,791][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:05:33,111][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:05:33,760][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:05:34,486][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:05:34,489][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:05:34,490][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:05:35,110][__main__][INFO] - Iteration 56 took 27s (11.83% Gen, 85.94% Train). Generation: 3s, Training: 24s. Estimated remaining time: 7h 13m 30s. Estimated total time: 7h 45m 38s. 
Time estimates for 10 more iterations: 4m 39s, 100 more iterations: 46m 33s, 500 more iterations: 3h 52m 49s. +[2026-03-25 16:05:35,112][__main__][INFO] - Starting iteration 56. +[2026-03-25 16:05:35,115][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:05:35,115][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:05:38,393][__main__][INFO] - Number of regex retries in iteration 56: 0 +[2026-03-25 16:05:38,394][__main__][INFO] - agents played in iteration 56 are Bob, Alice +[2026-03-25 16:05:38,935][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:05:39,580][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:05:39,871][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:05:40,191][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:05:40,511][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:05:40,831][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:05:41,151][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:05:41,470][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:05:41,791][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:05:42,111][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:05:42,431][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:05:42,751][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:05:43,071][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:05:43,391][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:05:43,710][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:05:44,029][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:05:44,350][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:05:44,669][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:05:44,988][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:05:45,308][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:05:45,629][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:05:45,948][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:05:46,268][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:05:46,587][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:05:46,908][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:05:47,229][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:05:47,550][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:05:47,869][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:05:48,189][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:05:48,508][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:05:48,828][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:05:49,148][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:05:49,469][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:05:49,789][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:05:50,109][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:05:50,430][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:05:50,751][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:05:51,070][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:05:51,390][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:05:51,711][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:05:52,031][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:05:52,352][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:05:52,673][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:05:52,992][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:05:53,312][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:05:53,631][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:05:53,951][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:05:54,271][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:05:54,590][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:05:54,910][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:05:55,231][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:05:55,552][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:05:55,871][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:05:56,481][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:05:56,803][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:05:57,123][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:05:57,444][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:05:57,766][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:05:58,085][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:05:58,406][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:05:58,726][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:05:59,046][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:05:59,367][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:05:59,689][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:06:00,009][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:06:00,329][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:06:00,979][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:06:01,703][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:06:01,706][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:06:01,707][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:06:02,318][__main__][INFO] - Iteration 57 took 27s (12.05% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 0m 49s. Estimated total time: 7h 33m 23s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 41s. +[2026-03-25 16:06:02,320][__main__][INFO] - Starting iteration 57. +[2026-03-25 16:06:02,323][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:06:02,324][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:06:05,554][__main__][INFO] - Number of regex retries in iteration 57: 0 +[2026-03-25 16:06:05,555][__main__][INFO] - agents played in iteration 57 are Bob, Alice +[2026-03-25 16:06:06,123][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:06:06,769][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:06:07,059][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:06:07,380][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:06:07,700][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:06:08,021][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:06:08,342][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:06:08,662][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:06:08,983][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:06:09,303][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:06:09,625][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:06:09,946][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:06:10,267][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:06:10,586][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:06:10,907][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:06:11,228][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:06:11,549][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:06:11,870][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:06:12,190][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:06:12,511][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:06:12,832][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:06:13,154][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:06:13,475][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:06:13,797][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:06:14,119][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:06:14,439][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:06:14,759][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:06:15,079][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:06:15,400][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:06:15,720][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:06:16,043][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:06:16,364][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:06:16,685][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:06:17,005][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:06:17,326][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:06:17,646][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:06:17,966][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:06:18,285][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:06:18,606][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:06:18,926][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:06:19,247][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:06:19,569][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:06:19,888][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:06:20,209][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:06:20,529][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:06:20,850][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:06:21,171][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:06:21,490][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:06:21,810][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:06:22,131][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:06:22,451][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:06:22,771][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:06:23,093][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:06:23,704][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:06:24,025][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:06:24,346][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:06:24,667][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:06:24,988][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:06:25,308][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:06:25,628][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:06:25,948][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:06:26,267][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:06:26,587][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:06:26,906][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:06:27,227][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:06:27,547][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:06:28,197][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:06:28,923][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:06:28,925][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:06:28,927][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:06:29,546][__main__][INFO] - Iteration 58 took 27s (11.87% Gen, 85.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 0m 42s. Estimated total time: 7h 33m 44s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 52s. +[2026-03-25 16:06:29,549][__main__][INFO] - Starting iteration 58. +[2026-03-25 16:06:29,552][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:06:29,552][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:06:32,811][__main__][INFO] - Number of regex retries in iteration 58: 0 +[2026-03-25 16:06:32,812][__main__][INFO] - agents played in iteration 58 are Bob, Alice +[2026-03-25 16:06:33,339][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:06:33,986][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:06:34,278][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:06:34,598][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:06:34,919][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:06:35,241][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:06:35,561][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:06:35,883][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:06:36,205][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:06:36,525][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:06:36,845][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:06:37,167][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:06:37,486][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:06:37,805][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:06:38,125][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:06:38,445][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:06:38,765][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:06:39,086][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:06:39,406][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:06:39,726][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:06:40,046][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:06:40,367][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:06:40,688][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:06:41,009][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:06:41,331][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:06:41,651][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:06:41,971][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:06:42,291][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:06:42,611][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:06:42,931][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:06:43,252][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:06:43,573][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:06:43,892][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:06:44,213][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:06:44,533][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:06:44,854][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:06:45,176][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:06:45,496][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:06:45,816][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:06:46,136][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:06:46,457][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:06:46,778][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:06:47,099][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:06:47,420][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:06:47,739][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:06:48,061][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:06:48,382][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:06:48,704][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:06:49,025][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:06:49,346][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:06:49,666][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:06:49,986][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:06:50,307][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:06:50,922][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:06:51,242][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:06:51,564][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:06:51,883][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:06:52,205][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:06:52,526][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:06:52,846][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:06:53,166][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:06:53,487][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:06:53,808][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:06:54,128][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:06:54,449][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:06:54,769][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:06:55,419][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:06:56,143][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:06:56,145][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:06:56,147][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:06:56,764][__main__][INFO] - Iteration 59 took 27s (11.98% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 0m 4s. Estimated total time: 7h 33m 33s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 46s. +[2026-03-25 16:06:56,766][__main__][INFO] - Starting iteration 59. 
+[2026-03-25 16:06:56,769][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:06:56,770][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:07:00,047][__main__][INFO] - Number of regex retries in iteration 59: 0 +[2026-03-25 16:07:00,048][__main__][INFO] - agents played in iteration 59 are Bob, Alice +[2026-03-25 16:07:00,592][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:07:01,242][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:07:01,533][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:07:01,856][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:07:02,176][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:07:02,496][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:07:02,815][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:07:03,136][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:07:03,457][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:07:03,777][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:07:04,098][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:07:04,418][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:07:04,739][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:07:05,060][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:07:05,380][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:07:05,699][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:07:06,019][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:07:06,340][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:07:06,661][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:07:06,981][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:07:07,303][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:07:07,625][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:07:07,946][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:07:08,265][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:07:08,586][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:07:08,907][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:07:09,227][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:07:09,547][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:07:09,866][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:07:10,187][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:07:10,507][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:07:10,828][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:07:11,149][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:07:11,470][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:07:11,789][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:07:12,108][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:07:12,428][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:07:12,749][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:07:13,070][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:07:13,391][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:07:13,710][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:07:14,031][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:07:14,351][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:07:14,673][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:07:14,995][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:07:15,316][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:07:15,637][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:07:15,958][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:07:16,278][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:07:16,597][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:07:16,918][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:07:17,239][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:07:17,559][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:07:18,171][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:07:18,490][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:07:18,811][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:07:19,131][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:07:19,454][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:07:19,775][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:07:20,095][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:07:20,415][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:07:20,736][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:07:21,057][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:07:21,377][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:07:21,697][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:07:22,017][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:07:22,668][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:07:23,391][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:07:23,393][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:07:23,395][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:07:24,011][__main__][INFO] - Iteration 60 took 27s (12.03% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 0m 6s. Estimated total time: 7h 34m 2s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 1s. +[2026-03-25 16:07:24,013][__main__][INFO] - Starting iteration 60. +[2026-03-25 16:07:24,016][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:07:24,016][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:07:27,203][__main__][INFO] - Number of regex retries in iteration 60: 0 +[2026-03-25 16:07:27,204][__main__][INFO] - agents played in iteration 60 are Bob, Alice +[2026-03-25 16:07:27,731][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:07:28,378][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:07:28,669][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:07:28,990][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:07:29,311][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:07:29,631][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:07:29,951][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:07:30,271][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:07:30,592][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:07:30,912][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:07:31,232][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:07:31,554][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:07:31,875][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:07:32,196][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:07:32,516][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:07:32,837][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:07:33,159][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:07:33,478][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:07:33,798][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:07:34,120][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:07:34,440][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:07:34,761][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:07:35,079][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:07:35,399][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:07:35,719][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:07:36,038][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:07:36,360][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:07:36,679][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:07:37,000][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:07:37,321][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:07:37,641][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:07:37,963][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:07:38,283][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:07:38,604][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:07:38,923][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:07:39,246][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:07:39,566][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:07:39,885][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:07:40,206][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:07:40,525][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:07:40,846][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:07:41,166][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:07:41,485][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:07:41,805][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:07:42,126][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:07:42,445][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:07:42,765][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:07:43,085][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:07:43,406][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:07:43,728][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:07:44,049][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:07:44,369][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:07:44,690][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:07:45,302][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:07:45,622][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:07:45,944][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:07:46,266][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:07:46,585][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:07:46,905][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:07:47,226][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:07:47,546][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:07:47,866][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:07:48,186][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:07:48,507][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:07:48,828][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:07:49,149][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:07:49,799][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:07:50,519][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:07:50,522][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:07:50,523][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:07:51,140][__main__][INFO] - Iteration 61 took 27s (11.75% Gen, 85.97% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 57m 41s. Estimated total time: 7h 32m 4s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 12s, 500 more iterations: 3h 46m 2s. +[2026-03-25 16:07:51,142][__main__][INFO] - Starting iteration 61. +[2026-03-25 16:07:51,145][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:07:51,145][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:07:54,363][__main__][INFO] - Number of regex retries in iteration 61: 0 +[2026-03-25 16:07:54,363][__main__][INFO] - agents played in iteration 61 are Bob, Alice +[2026-03-25 16:07:54,945][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:07:55,594][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:07:55,884][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:07:56,205][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:07:56,525][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:07:56,846][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:07:57,165][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:07:57,486][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:07:57,806][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:07:58,126][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:07:58,446][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:07:58,767][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:07:59,087][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:07:59,409][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:07:59,730][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:08:00,050][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:08:00,372][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:08:00,691][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:08:01,012][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:08:01,333][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:08:01,654][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:08:01,975][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:08:02,295][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:08:02,614][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:08:02,935][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:08:03,255][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:08:03,576][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:08:03,896][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:08:04,216][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:08:04,537][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:08:04,858][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:08:05,179][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:08:05,497][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:08:05,818][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:08:06,138][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:08:06,457][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:08:06,778][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:08:07,098][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:08:07,419][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:08:07,741][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:08:08,060][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:08:08,379][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:08:08,700][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:08:09,019][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:08:09,339][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:08:09,661][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:08:09,981][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:08:10,302][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:08:10,621][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:08:10,941][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:08:11,262][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:08:11,582][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:08:11,903][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:08:12,513][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:08:12,833][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:08:13,154][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:08:13,474][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:08:13,794][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:08:14,115][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:08:14,436][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:08:14,756][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:08:15,076][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:08:15,396][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:08:15,717][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:08:16,039][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:08:16,361][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:08:17,013][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:08:17,741][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:08:17,743][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:08:17,744][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:08:18,364][__main__][INFO] - Iteration 62 took 27s (11.82% Gen, 85.90% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 58m 49s. Estimated total time: 7h 33m 39s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 49s. +[2026-03-25 16:08:18,366][__main__][INFO] - Starting iteration 62. +[2026-03-25 16:08:18,369][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:08:18,370][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:08:21,585][__main__][INFO] - Number of regex retries in iteration 62: 0 +[2026-03-25 16:08:21,586][__main__][INFO] - agents played in iteration 62 are Bob, Alice +[2026-03-25 16:08:22,123][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:08:22,771][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:08:23,062][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:08:23,382][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:08:23,704][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:08:24,025][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:08:24,345][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:08:24,665][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:08:24,987][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:08:25,308][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:08:25,627][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:08:25,948][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:08:26,267][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:08:26,588][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:08:26,909][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:08:27,230][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:08:27,550][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:08:27,870][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:08:28,191][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:08:28,511][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:08:28,832][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:08:29,153][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:08:29,474][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:08:29,794][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:08:30,113][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:08:30,434][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:08:30,753][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:08:31,074][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:08:31,393][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:08:31,713][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:08:32,033][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:08:32,353][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:08:32,673][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:08:32,994][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:08:33,316][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:08:33,637][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:08:33,957][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:08:34,276][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:08:34,597][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:08:34,919][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:08:35,240][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:08:35,561][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:08:35,880][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:08:36,200][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:08:36,520][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:08:36,840][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:08:37,161][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:08:37,481][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:08:37,803][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:08:38,123][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:08:38,442][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:08:38,763][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:08:39,086][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:08:39,701][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:08:40,022][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:08:40,342][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:08:40,662][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:08:40,985][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:08:41,305][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:08:41,624][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:08:41,945][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:08:42,266][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:08:42,587][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:08:42,908][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:08:43,228][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:08:43,549][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:08:44,201][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:08:44,926][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:08:44,928][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:08:44,930][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:08:45,546][__main__][INFO] - Iteration 63 took 27s (11.83% Gen, 85.89% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 57m 39s. Estimated total time: 7h 32m 57s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 17s, 500 more iterations: 3h 46m 28s. +[2026-03-25 16:08:45,548][__main__][INFO] - Starting iteration 63. 
+[2026-03-25 16:08:45,551][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:08:45,551][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:08:48,767][__main__][INFO] - Number of regex retries in iteration 63: 0 +[2026-03-25 16:08:48,768][__main__][INFO] - agents played in iteration 63 are Bob, Alice +[2026-03-25 16:08:49,310][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:08:49,960][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:08:50,252][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:08:50,574][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:08:50,894][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:08:51,216][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:08:51,536][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:08:51,857][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:08:52,177][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:08:52,498][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:08:52,818][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:08:53,137][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:08:53,458][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:08:53,778][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:08:54,098][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:08:54,417][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:08:54,737][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:08:55,058][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:08:55,377][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:08:55,699][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:08:56,019][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:08:56,340][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:08:56,661][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:08:56,982][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:08:57,302][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:08:57,623][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:08:57,945][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:08:58,263][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:08:58,584][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:08:58,905][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:08:59,226][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:08:59,547][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:08:59,868][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:09:00,189][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:09:00,510][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:09:00,830][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:09:01,151][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:09:01,472][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:09:01,794][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:09:02,115][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:09:02,436][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:09:02,756][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:09:03,077][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:09:03,397][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:09:03,719][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:09:04,039][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:09:04,360][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:09:04,681][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:09:05,001][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:09:05,321][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:09:05,642][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:09:05,963][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:09:06,285][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:09:06,895][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:09:07,215][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:09:07,536][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:09:07,856][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:09:08,177][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:09:08,496][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:09:08,817][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:09:09,137][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:09:09,457][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:09:09,777][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:09:10,097][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:09:10,419][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:09:10,740][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:09:11,391][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:09:12,115][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:09:12,118][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:09:12,119][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:09:12,737][__main__][INFO] - Iteration 64 took 27s (11.83% Gen, 85.89% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 57m 21s. Estimated total time: 7h 33m 6s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 33s. +[2026-03-25 16:09:12,739][__main__][INFO] - Starting iteration 64. +[2026-03-25 16:09:12,742][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:09:12,742][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:09:15,983][__main__][INFO] - Number of regex retries in iteration 64: 0 +[2026-03-25 16:09:15,983][__main__][INFO] - agents played in iteration 64 are Bob, Alice +[2026-03-25 16:09:16,549][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:09:17,199][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:09:17,488][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:09:17,809][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:09:18,129][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:09:18,449][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:09:18,771][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:09:19,091][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:09:19,411][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:09:19,731][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:09:20,052][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:09:20,374][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:09:20,695][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:09:21,017][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:09:21,337][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:09:21,657][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:09:21,977][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:09:22,298][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:09:22,617][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:09:22,937][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:09:23,257][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:09:23,577][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:09:23,897][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:09:24,217][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:09:24,538][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:09:24,859][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:09:25,179][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:09:25,498][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:09:25,819][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:09:26,140][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:09:26,461][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:09:26,780][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:09:27,099][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:09:27,420][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:09:27,742][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:09:28,062][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:09:28,383][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:09:28,704][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:09:29,025][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:09:29,347][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:09:29,666][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:09:29,986][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:09:30,307][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:09:30,628][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:09:30,948][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:09:31,270][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:09:31,592][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:09:31,912][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:09:32,232][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:09:32,551][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:09:32,873][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:09:33,194][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:09:33,515][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:09:34,127][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:09:34,446][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:09:34,768][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:09:35,089][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:09:35,410][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:09:35,730][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:09:36,052][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:09:36,372][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:09:36,693][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:09:37,015][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:09:37,335][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:09:37,655][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:09:37,977][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:09:38,628][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:09:39,351][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:09:39,353][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:09:39,354][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:09:39,972][__main__][INFO] - Iteration 65 took 27s (11.90% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 57m 39s. Estimated total time: 7h 33m 51s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 16:09:39,975][__main__][INFO] - Starting iteration 65. +[2026-03-25 16:09:39,978][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:09:39,978][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:09:43,206][__main__][INFO] - Number of regex retries in iteration 65: 0 +[2026-03-25 16:09:43,207][__main__][INFO] - agents played in iteration 65 are Bob, Alice +[2026-03-25 16:09:43,774][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:09:44,421][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:09:44,711][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:09:45,032][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:09:45,355][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:09:45,676][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:09:45,997][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:09:46,316][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:09:46,636][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:09:46,956][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:09:47,277][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:09:47,598][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:09:47,920][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:09:48,242][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:09:48,564][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:09:48,884][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:09:49,204][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:09:49,525][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:09:49,845][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:09:50,167][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:09:50,488][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:09:50,809][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:09:51,129][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:09:51,450][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:09:51,769][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:09:52,090][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:09:52,410][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:09:52,729][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:09:53,049][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:09:53,369][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:09:53,689][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:09:54,011][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:09:54,331][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:09:54,653][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:09:54,972][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:09:55,292][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:09:55,612][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:09:55,932][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:09:56,251][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:09:56,572][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:09:56,893][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:09:57,213][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:09:57,536][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:09:57,858][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:09:58,179][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:09:58,500][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:09:58,821][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:09:59,143][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:09:59,464][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:09:59,785][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:10:00,107][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:10:00,426][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:10:00,748][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:10:01,362][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:10:01,681][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:10:02,002][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:10:02,324][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:10:02,646][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:10:02,966][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:10:03,287][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:10:03,608][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:10:03,930][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:10:04,250][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:10:04,570][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:10:04,890][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:10:05,210][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:10:05,863][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:10:06,590][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:10:06,592][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:10:06,594][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:10:07,211][__main__][INFO] - Iteration 66 took 27s (11.86% Gen, 85.87% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 57m 15s. Estimated total time: 7h 33m 54s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 57s. +[2026-03-25 16:10:07,214][__main__][INFO] - Starting iteration 66. +[2026-03-25 16:10:07,216][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:10:07,217][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:10:10,422][__main__][INFO] - Number of regex retries in iteration 66: 0 +[2026-03-25 16:10:10,422][__main__][INFO] - agents played in iteration 66 are Bob, Alice +[2026-03-25 16:10:10,973][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:10:11,620][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:10:11,909][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:10:12,231][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:10:12,551][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:10:12,872][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:10:13,191][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:10:13,511][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:10:13,832][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:10:14,153][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:10:14,473][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:10:14,792][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:10:15,112][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:10:15,433][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:10:15,754][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:10:16,076][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:10:16,397][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:10:16,717][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:10:17,037][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:10:17,358][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:10:17,678][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:10:17,998][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:10:18,320][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:10:18,639][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:10:18,959][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:10:19,279][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:10:19,598][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:10:19,919][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:10:20,238][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:10:20,560][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:10:20,880][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:10:21,199][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:10:21,519][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:10:21,842][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:10:22,163][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:10:22,484][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:10:22,803][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:10:23,125][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:10:23,446][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:10:23,768][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:10:24,089][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:10:24,409][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:10:24,730][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:10:25,051][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:10:25,371][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:10:25,691][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:10:26,011][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:10:26,331][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:10:26,653][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:10:26,974][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:10:27,295][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:10:27,615][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:10:27,936][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:10:28,548][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:10:28,868][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:10:29,189][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:10:29,510][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:10:29,832][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:10:30,155][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:10:30,475][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:10:30,795][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:10:31,115][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:10:31,436][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:10:31,758][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:10:32,077][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:10:32,398][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:10:33,050][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:10:33,774][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:10:33,776][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:10:33,778][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:10:34,398][__main__][INFO] - Iteration 67 took 27s (11.79% Gen, 85.92% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 55m 56s. Estimated total time: 7h 33m 2s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 31s. +[2026-03-25 16:10:34,401][__main__][INFO] - Starting iteration 67. 
+[2026-03-25 16:10:34,404][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:10:34,404][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:10:37,627][__main__][INFO] - Number of regex retries in iteration 67: 0 +[2026-03-25 16:10:37,628][__main__][INFO] - agents played in iteration 67 are Bob, Alice +[2026-03-25 16:10:38,173][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:10:39,503][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:10:39,793][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:10:40,114][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:10:40,435][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:10:40,756][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:10:41,077][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:10:41,399][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:10:41,718][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:10:42,039][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:10:42,360][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:10:42,679][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:10:42,998][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:10:43,320][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:10:43,640][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:10:43,962][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:10:44,284][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:10:44,606][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:10:44,926][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:10:45,246][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:10:45,567][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:10:45,888][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:10:46,209][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:10:46,530][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:10:46,850][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:10:47,170][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:10:47,491][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:10:47,811][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:10:48,131][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:10:48,451][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:10:48,770][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:10:49,089][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:10:49,409][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:10:49,730][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:10:50,052][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:10:50,374][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:10:50,693][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:10:51,012][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:10:51,332][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:10:51,653][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:10:51,973][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:10:52,292][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:10:52,613][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:10:52,932][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:10:53,253][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:10:53,573][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:10:53,892][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:10:54,214][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:10:54,535][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:10:54,857][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:10:55,177][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:10:55,497][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:10:55,816][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:10:56,426][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:10:56,747][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:10:57,068][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:10:57,388][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:10:57,710][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:10:58,031][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:10:58,351][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:10:58,672][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:10:58,991][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:10:59,311][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:10:59,632][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:10:59,953][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:11:00,273][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:11:00,924][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:11:01,650][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:11:01,652][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:11:01,654][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:11:02,273][__main__][INFO] - Iteration 68 took 27s (11.57% Gen, 86.21% Train). Generation: 3s, Training: 24s. Estimated remaining time: 7h 6m 56s. Estimated total time: 7h 44m 30s. 
Time estimates for 10 more iterations: 4m 38s, 100 more iterations: 46m 27s, 500 more iterations: 3h 52m 15s. +[2026-03-25 16:11:02,276][__main__][INFO] - Starting iteration 68. +[2026-03-25 16:11:02,279][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:11:02,279][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:11:05,524][__main__][INFO] - Number of regex retries in iteration 68: 0 +[2026-03-25 16:11:05,525][__main__][INFO] - agents played in iteration 68 are Bob, Alice +[2026-03-25 16:11:06,073][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:11:06,721][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:11:07,012][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:11:07,334][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:11:07,654][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:11:07,976][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:11:08,298][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:11:08,618][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:11:08,939][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:11:09,259][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:11:09,578][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:11:09,898][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:11:10,217][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:11:10,537][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:11:10,856][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:11:11,177][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:11:11,497][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:11:11,818][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:11:12,140][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:11:12,461][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:11:12,780][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:11:13,099][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:11:13,418][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:11:13,739][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:11:14,058][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:11:14,378][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:11:14,699][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:11:15,019][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:11:15,339][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:11:15,661][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:11:15,982][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:11:16,302][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:11:16,623][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:11:16,943][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:11:17,263][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:11:17,583][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:11:17,904][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:11:18,226][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:11:18,547][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:11:18,867][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:11:19,187][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:11:19,508][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:11:19,830][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:11:20,150][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:11:20,470][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:11:20,792][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:11:21,113][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:11:21,432][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:11:21,755][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:11:22,076][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:11:22,397][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:11:22,717][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:11:23,038][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:11:23,653][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:11:23,975][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:11:24,296][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:11:24,615][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:11:24,936][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:11:25,256][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:11:25,575][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:11:25,897][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:11:26,217][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:11:26,536][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:11:26,856][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:11:27,176][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:11:27,498][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:11:28,148][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:11:28,872][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:11:28,874][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:11:28,876][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:11:29,509][__main__][INFO] - Iteration 69 took 27s (11.92% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 55m 49s. Estimated total time: 7h 33m 51s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 16:11:29,512][__main__][INFO] - Starting iteration 69. +[2026-03-25 16:11:29,514][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:11:29,515][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:11:32,755][__main__][INFO] - Number of regex retries in iteration 69: 0 +[2026-03-25 16:11:32,756][__main__][INFO] - agents played in iteration 69 are Bob, Alice +[2026-03-25 16:11:33,334][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:11:33,982][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:11:34,272][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:11:34,594][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:11:34,914][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:11:35,232][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:11:35,551][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:11:35,870][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:11:36,191][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:11:36,510][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:11:36,829][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:11:37,150][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:11:37,470][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:11:37,791][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:11:38,110][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:11:38,430][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:11:38,751][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:11:39,071][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:11:39,391][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:11:39,711][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:11:40,032][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:11:40,353][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:11:40,672][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:11:40,992][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:11:41,312][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:11:41,631][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:11:41,951][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:11:42,270][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:11:42,590][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:11:42,909][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:11:43,228][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:11:43,549][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:11:43,869][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:11:44,189][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:11:44,508][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:11:44,829][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:11:45,148][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:11:45,469][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:11:45,789][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:11:46,109][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:11:46,430][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:11:46,750][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:11:47,069][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:11:47,389][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:11:47,710][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:11:48,029][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:11:48,350][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:11:48,670][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:11:48,989][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:11:49,310][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:11:49,630][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:11:49,950][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:11:50,271][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:11:50,883][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:11:51,203][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:11:51,523][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:11:51,845][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:11:52,165][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:11:52,484][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:11:52,803][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:11:53,125][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:11:53,445][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:11:53,764][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:11:54,084][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:11:54,405][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:11:54,724][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:11:55,375][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:11:56,097][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:11:56,099][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:11:56,101][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:11:56,719][__main__][INFO] - Iteration 70 took 27s (11.91% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 54m 56s. Estimated total time: 7h 33m 25s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 42s. +[2026-03-25 16:11:56,721][__main__][INFO] - Starting iteration 70. +[2026-03-25 16:11:56,724][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:11:56,725][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:11:59,962][__main__][INFO] - Number of regex retries in iteration 70: 0 +[2026-03-25 16:11:59,963][__main__][INFO] - agents played in iteration 70 are Bob, Alice +[2026-03-25 16:12:00,545][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:12:01,191][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:12:01,482][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:12:01,804][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:12:02,124][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:12:02,445][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:12:02,767][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:12:03,089][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:12:03,409][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:12:03,728][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:12:04,049][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:12:04,370][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:12:04,689][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:12:05,010][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:12:05,331][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:12:05,651][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:12:05,972][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:12:06,292][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:12:06,612][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:12:06,933][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:12:07,253][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:12:07,573][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:12:07,892][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:12:08,212][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:12:08,530][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:12:08,851][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:12:09,171][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:12:09,491][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:12:09,812][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:12:10,132][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:12:10,454][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:12:10,775][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:12:11,096][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:12:11,416][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:12:11,735][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:12:12,055][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:12:12,374][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:12:12,693][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:12:13,013][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:12:13,332][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:12:13,653][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:12:13,972][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:12:14,292][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:12:14,615][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:12:14,935][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:12:15,256][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:12:15,577][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:12:15,897][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:12:16,218][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:12:16,538][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:12:16,858][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:12:17,176][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:12:17,496][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:12:18,113][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:12:18,435][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:12:18,756][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:12:19,077][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:12:19,397][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:12:19,716][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:12:20,035][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:12:20,357][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:12:20,676][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:12:20,996][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:12:21,317][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:12:21,637][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:12:21,957][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:12:22,608][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:12:23,331][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:12:23,333][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:12:23,334][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:12:23,953][__main__][INFO] - Iteration 71 took 27s (11.89% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 54m 53s. Estimated total time: 7h 33m 49s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 54s. +[2026-03-25 16:12:23,955][__main__][INFO] - Starting iteration 71. 
+[2026-03-25 16:12:23,958][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:12:23,959][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:12:27,198][__main__][INFO] - Number of regex retries in iteration 71: 0 +[2026-03-25 16:12:27,199][__main__][INFO] - agents played in iteration 71 are Bob, Alice +[2026-03-25 16:12:27,762][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:12:28,411][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:12:28,701][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:12:29,022][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:12:29,344][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:12:29,666][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:12:29,984][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:12:30,304][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:12:30,624][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:12:30,945][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:12:31,266][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:12:31,586][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:12:31,907][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:12:32,226][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:12:32,546][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:12:32,866][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:12:33,185][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:12:33,504][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:12:33,824][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:12:34,144][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:12:34,464][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:12:34,784][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:12:35,104][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:12:35,424][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:12:35,745][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:12:36,065][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:12:36,385][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:12:36,706][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:12:37,026][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:12:37,346][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:12:37,667][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:12:37,987][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:12:38,307][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:12:38,625][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:12:38,947][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:12:39,268][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:12:39,590][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:12:39,909][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:12:40,228][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:12:40,549][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:12:40,870][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:12:41,190][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:12:41,509][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:12:41,830][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:12:42,150][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:12:42,470][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:12:42,791][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:12:43,110][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:12:43,430][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:12:43,751][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:12:44,070][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:12:44,391][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:12:44,710][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:12:45,319][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:12:45,640][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:12:45,961][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:12:46,281][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:12:46,601][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:12:46,921][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:12:47,241][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:12:47,562][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:12:47,884][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:12:48,204][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:12:48,525][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:12:48,846][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:12:49,166][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:12:49,818][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:12:50,541][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:12:50,543][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:12:50,545][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:12:51,169][__main__][INFO] - Iteration 72 took 27s (11.91% Gen, 85.79% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 54m 8s. Estimated total time: 7h 33m 32s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 46s. +[2026-03-25 16:12:51,172][__main__][INFO] - Starting iteration 72. +[2026-03-25 16:12:51,175][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:12:51,176][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:12:54,456][__main__][INFO] - Number of regex retries in iteration 72: 0 +[2026-03-25 16:12:54,457][__main__][INFO] - agents played in iteration 72 are Bob, Alice +[2026-03-25 16:12:55,024][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:12:55,675][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:12:55,965][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:12:56,288][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:12:56,608][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:12:56,930][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:12:57,251][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:12:57,571][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:12:57,890][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:12:58,210][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:12:58,531][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:12:58,850][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:12:59,170][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:12:59,489][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:12:59,812][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:13:00,133][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:13:00,452][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:13:00,774][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:13:01,094][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:13:01,413][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:13:01,734][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:13:02,055][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:13:02,376][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:13:02,697][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:13:03,016][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:13:03,337][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:13:03,659][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:13:03,981][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:13:04,302][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:13:04,623][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:13:04,945][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:13:05,268][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:13:05,588][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:13:05,909][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:13:06,230][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:13:06,551][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:13:06,873][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:13:07,193][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:13:07,514][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:13:07,834][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:13:08,155][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:13:08,473][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:13:08,792][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:13:09,114][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:13:09,434][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:13:09,753][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:13:10,072][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:13:10,393][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:13:10,712][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:13:11,032][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:13:11,353][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:13:11,673][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:13:11,992][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:13:12,603][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:13:12,923][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:13:13,242][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:13:13,562][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:13:13,884][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:13:14,205][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:13:14,525][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:13:14,846][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:13:15,166][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:13:15,485][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:13:15,805][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:13:16,125][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:13:16,446][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:13:17,099][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:13:17,819][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:13:17,821][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:13:17,823][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:13:18,441][__main__][INFO] - Iteration 73 took 27s (12.03% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 54m 36s. Estimated total time: 7h 34m 27s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 13s. +[2026-03-25 16:13:18,444][__main__][INFO] - Starting iteration 73. +[2026-03-25 16:13:18,446][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:13:18,447][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:13:21,681][__main__][INFO] - Number of regex retries in iteration 73: 0 +[2026-03-25 16:13:21,681][__main__][INFO] - agents played in iteration 73 are Bob, Alice +[2026-03-25 16:13:22,218][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:13:22,864][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:13:23,154][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:13:23,474][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:13:23,796][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:13:24,116][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:13:24,435][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:13:24,756][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:13:25,075][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:13:25,397][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:13:25,718][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:13:26,038][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:13:26,360][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:13:26,679][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:13:27,000][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:13:27,321][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:13:27,642][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:13:27,963][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:13:28,284][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:13:28,605][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:13:28,927][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:13:29,247][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:13:29,568][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:13:29,889][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:13:30,208][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:13:30,529][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:13:30,849][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:13:31,170][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:13:31,492][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:13:31,811][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:13:32,131][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:13:32,453][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:13:32,772][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:13:33,093][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:13:33,413][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:13:33,734][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:13:34,056][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:13:34,377][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:13:34,697][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:13:35,018][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:13:35,339][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:13:35,659][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:13:35,979][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:13:36,299][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:13:36,619][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:13:36,938][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:13:37,260][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:13:37,579][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:13:37,899][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:13:38,219][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:13:38,539][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:13:38,859][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:13:39,178][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:13:39,787][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:13:40,109][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:13:40,430][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:13:40,751][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:13:41,071][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:13:41,391][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:13:41,712][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:13:42,032][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:13:42,354][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:13:42,676][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:13:42,998][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:13:43,319][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:13:43,641][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:13:44,293][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:13:45,016][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:13:45,019][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:13:45,021][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:13:45,641][__main__][INFO] - Iteration 74 took 27s (11.89% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 52m 57s. Estimated total time: 7h 33m 15s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 16:13:45,643][__main__][INFO] - Starting iteration 74. +[2026-03-25 16:13:45,646][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:13:45,647][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:13:48,878][__main__][INFO] - Number of regex retries in iteration 74: 0 +[2026-03-25 16:13:48,879][__main__][INFO] - agents played in iteration 74 are Bob, Alice +[2026-03-25 16:13:49,416][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:13:50,069][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:13:50,360][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:13:50,680][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:13:51,002][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:13:51,324][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:13:51,646][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:13:51,968][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:13:52,288][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:13:52,609][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:13:52,930][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:13:53,251][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:13:53,572][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:13:53,892][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:13:54,213][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:13:54,535][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:13:54,856][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:13:55,178][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:13:55,498][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:13:55,819][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:13:56,138][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:13:56,458][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:13:56,780][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:13:57,101][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:13:57,423][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:13:57,745][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:13:58,066][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:13:58,386][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:13:58,707][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:13:59,027][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:13:59,347][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:13:59,669][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:13:59,989][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:14:00,308][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:14:00,629][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:14:00,949][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:14:01,270][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:14:01,591][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:14:01,912][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:14:02,232][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:14:02,552][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:14:02,872][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:14:03,194][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:14:03,515][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:14:03,836][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:14:04,157][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:14:04,476][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:14:04,797][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:14:05,117][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:14:05,438][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:14:05,758][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:14:06,079][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:14:06,401][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:14:07,011][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:14:07,331][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:14:07,651][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:14:07,971][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:14:08,292][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:14:08,614][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:14:08,934][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:14:09,254][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:14:09,575][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:14:09,896][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:14:10,218][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:14:10,538][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:14:10,858][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:14:11,511][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:14:12,231][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:14:12,233][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:14:12,235][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:14:12,856][__main__][INFO] - Iteration 75 took 27s (11.88% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 52m 45s. Estimated total time: 7h 33m 30s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 45s. +[2026-03-25 16:14:12,858][__main__][INFO] - Starting iteration 75. 
+[2026-03-25 16:14:12,861][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:14:12,862][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:14:16,095][__main__][INFO] - Number of regex retries in iteration 75: 0 +[2026-03-25 16:14:16,096][__main__][INFO] - agents played in iteration 75 are Bob, Alice +[2026-03-25 16:14:16,631][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:14:17,284][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:14:17,578][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:14:17,902][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:14:18,224][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:14:18,545][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:14:18,866][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:14:19,187][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:14:19,507][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:14:19,829][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:14:20,149][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:14:20,470][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:14:20,790][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:14:21,110][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:14:21,432][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:14:21,753][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:14:22,075][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:14:22,397][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:14:22,717][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:14:23,038][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:14:23,357][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:14:23,678][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:14:24,000][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:14:24,319][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:14:24,640][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:14:24,959][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:14:25,279][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:14:25,600][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:14:25,921][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:14:26,242][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:14:26,564][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:14:26,884][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:14:27,206][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:14:27,526][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:14:27,847][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:14:28,168][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:14:28,489][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:14:28,810][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:14:29,131][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:14:29,451][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:14:29,771][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:14:30,091][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:14:30,412][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:14:30,735][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:14:31,055][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:14:31,376][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:14:31,695][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:14:32,017][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:14:32,337][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:14:32,661][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:14:32,981][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:14:33,301][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:14:33,623][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:14:34,239][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:14:34,560][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:14:34,880][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:14:35,202][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:14:35,523][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:14:35,845][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:14:36,166][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:14:36,486][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:14:36,808][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:14:37,128][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:14:37,449][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:14:37,770][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:14:38,091][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:14:38,749][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:14:39,472][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:14:39,474][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:14:39,475][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:14:40,098][__main__][INFO] - Iteration 76 took 27s (11.87% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 52m 45s. Estimated total time: 7h 33m 57s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 58s. +[2026-03-25 16:14:40,100][__main__][INFO] - Starting iteration 76. +[2026-03-25 16:14:40,103][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:14:40,104][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:14:43,339][__main__][INFO] - Number of regex retries in iteration 76: 0 +[2026-03-25 16:14:43,340][__main__][INFO] - agents played in iteration 76 are Bob, Alice +[2026-03-25 16:14:43,877][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:14:44,538][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:14:44,828][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:14:45,150][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:14:45,471][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:14:45,791][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:14:46,113][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:14:46,432][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:14:46,752][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:14:47,071][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:14:47,392][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:14:47,712][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:14:48,034][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:14:48,355][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:14:48,676][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:14:48,996][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:14:49,317][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:14:49,637][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:14:49,959][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:14:50,279][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:14:50,600][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:14:50,921][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:14:51,241][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:14:51,562][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:14:51,882][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:14:52,204][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:14:52,524][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:14:52,845][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:14:53,164][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:14:53,485][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:14:53,806][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:14:54,127][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:14:54,447][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:14:54,768][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:14:55,089][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:14:55,410][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:14:55,731][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:14:56,051][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:14:56,371][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:14:56,691][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:14:57,012][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:14:57,331][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:14:57,653][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:14:57,974][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:14:58,295][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:14:58,617][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:14:58,938][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:14:59,258][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:14:59,579][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:14:59,901][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:15:00,221][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:15:00,541][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:15:00,860][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:15:01,474][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:15:01,795][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:15:02,116][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:15:02,437][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:15:02,757][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:15:03,078][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:15:03,399][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:15:03,719][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:15:04,038][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:15:04,359][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:15:04,679][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:15:04,999][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:15:05,320][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:15:05,978][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:15:06,700][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:15:06,703][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:15:06,704][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:15:07,326][__main__][INFO] - Iteration 77 took 27s (11.89% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 52m 4s. Estimated total time: 7h 33m 43s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 51s. +[2026-03-25 16:15:07,328][__main__][INFO] - Starting iteration 77. +[2026-03-25 16:15:07,331][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:15:07,332][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:15:10,587][__main__][INFO] - Number of regex retries in iteration 77: 0 +[2026-03-25 16:15:10,588][__main__][INFO] - agents played in iteration 77 are Bob, Alice +[2026-03-25 16:15:11,178][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:15:11,833][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:15:12,123][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:15:12,445][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:15:12,766][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:15:13,086][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:15:13,407][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:15:13,728][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:15:14,048][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:15:14,370][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:15:14,691][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:15:15,012][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:15:15,332][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:15:15,654][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:15:15,975][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:15:16,296][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:15:16,617][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:15:16,937][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:15:17,257][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:15:17,577][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:15:17,897][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:15:18,218][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:15:18,538][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:15:18,858][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:15:19,179][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:15:19,499][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:15:19,818][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:15:20,141][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:15:20,463][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:15:20,784][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:15:21,105][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:15:21,427][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:15:21,748][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:15:22,069][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:15:22,390][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:15:22,709][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:15:23,030][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:15:23,352][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:15:23,674][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:15:23,995][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:15:24,316][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:15:24,638][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:15:24,957][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:15:25,279][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:15:25,599][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:15:25,920][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:15:26,239][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:15:26,560][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:15:26,880][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:15:27,200][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:15:27,521][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:15:27,841][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:15:28,162][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:15:28,778][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:15:29,099][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:15:29,421][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:15:29,743][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:15:30,065][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:15:30,386][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:15:30,707][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:15:31,028][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:15:31,348][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:15:31,669][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:15:31,991][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:15:32,311][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:15:32,631][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:15:33,290][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:15:34,013][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:15:34,015][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:15:34,017][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:15:34,637][__main__][INFO] - Iteration 78 took 27s (11.92% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 52m 59s. Estimated total time: 7h 35m 6s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 33s. +[2026-03-25 16:15:34,639][__main__][INFO] - Starting iteration 78. +[2026-03-25 16:15:34,642][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:15:34,643][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:15:37,892][__main__][INFO] - Number of regex retries in iteration 78: 0 +[2026-03-25 16:15:37,893][__main__][INFO] - agents played in iteration 78 are Bob, Alice +[2026-03-25 16:15:38,448][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:15:39,103][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:15:39,391][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:15:39,712][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:15:40,034][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:15:40,353][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:15:40,673][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:15:40,993][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:15:41,313][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:15:41,633][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:15:41,953][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:15:42,275][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:15:42,595][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:15:42,913][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:15:43,234][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:15:43,555][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:15:43,876][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:15:44,197][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:15:44,519][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:15:44,839][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:15:45,159][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:15:45,479][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:15:45,799][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:15:46,120][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:15:46,440][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:15:46,761][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:15:47,083][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:15:47,404][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:15:47,724][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:15:48,045][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:15:48,366][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:15:48,688][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:15:49,008][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:15:49,329][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:15:49,649][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:15:49,970][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:15:50,290][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:15:50,611][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:15:50,931][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:15:51,252][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:15:51,572][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:15:51,894][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:15:52,214][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:15:52,534][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:15:52,855][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:15:53,174][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:15:53,494][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:15:53,815][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:15:54,136][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:15:54,455][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:15:54,777][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:15:55,098][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:15:55,419][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:15:56,032][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:15:56,352][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:15:56,672][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:15:56,992][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:15:57,313][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:15:57,633][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:15:57,952][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:15:58,272][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:15:58,592][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:15:58,912][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:15:59,232][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:15:59,553][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:15:59,872][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:16:00,530][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:16:01,255][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:16:01,257][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:16:01,259][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:16:01,879][__main__][INFO] - Iteration 79 took 27s (11.93% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 51m 23s. Estimated total time: 7h 33m 58s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 59s. +[2026-03-25 16:16:01,882][__main__][INFO] - Starting iteration 79. 
+[2026-03-25 16:16:01,884][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:16:01,885][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:16:05,150][__main__][INFO] - Number of regex retries in iteration 79: 0 +[2026-03-25 16:16:05,151][__main__][INFO] - agents played in iteration 79 are Bob, Alice +[2026-03-25 16:16:05,723][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:16:06,372][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:16:06,663][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:16:06,984][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:16:07,304][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:16:07,625][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:16:07,946][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:16:08,267][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:16:08,588][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:16:08,908][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:16:09,229][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:16:09,550][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:16:09,870][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:16:10,191][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:16:10,511][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:16:10,831][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:16:11,151][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:16:11,472][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:16:11,792][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:16:12,111][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:16:12,431][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:16:12,751][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:16:13,072][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:16:13,393][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:16:13,714][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:16:14,035][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:16:14,355][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:16:14,675][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:16:14,995][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:16:15,316][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:16:15,635][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:16:15,956][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:16:16,277][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:16:16,597][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:16:16,917][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:16:17,237][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:16:17,557][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:16:17,877][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:16:18,198][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:16:18,518][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:16:18,840][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:16:19,159][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:16:19,477][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:16:19,798][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:16:20,118][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:16:20,439][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:16:20,758][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:16:21,077][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:16:21,399][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:16:21,719][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:16:22,040][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:16:22,361][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:16:22,680][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:16:23,292][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:16:23,613][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:16:23,933][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:16:24,253][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:16:24,574][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:16:24,894][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:16:25,215][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:16:25,537][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:16:25,857][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:16:26,176][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:16:26,496][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:16:26,815][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:16:27,135][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:16:27,788][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:16:28,510][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:16:28,512][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:16:28,514][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:16:29,136][__main__][INFO] - Iteration 80 took 27s (11.98% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 51m 10s. Estimated total time: 7h 34m 12s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 16:16:29,138][__main__][INFO] - Starting iteration 80. +[2026-03-25 16:16:29,141][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:16:29,142][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:16:32,394][__main__][INFO] - Number of regex retries in iteration 80: 0 +[2026-03-25 16:16:32,395][__main__][INFO] - agents played in iteration 80 are Bob, Alice +[2026-03-25 16:16:32,973][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:16:33,623][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:16:33,912][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:16:34,232][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:16:34,553][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:16:34,873][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:16:35,193][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:16:35,513][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:16:35,833][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:16:36,154][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:16:36,473][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:16:36,794][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:16:37,114][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:16:37,433][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:16:37,752][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:16:38,073][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:16:38,393][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:16:38,713][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:16:39,032][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:16:39,352][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:16:39,673][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:16:39,992][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:16:40,312][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:16:40,631][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:16:40,952][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:16:41,271][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:16:41,592][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:16:41,914][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:16:42,234][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:16:42,554][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:16:42,873][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:16:43,193][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:16:43,513][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:16:43,834][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:16:44,153][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:16:44,474][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:16:44,794][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:16:45,114][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:16:45,434][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:16:45,756][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:16:46,077][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:16:46,396][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:16:46,718][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:16:47,039][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:16:47,359][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:16:47,680][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:16:48,000][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:16:48,321][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:16:48,642][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:16:48,964][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:16:49,283][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:16:49,604][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:16:49,926][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:16:50,537][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:16:50,857][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:16:51,177][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:16:51,499][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:16:51,819][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:16:52,139][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:16:52,460][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:16:52,781][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:16:53,103][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:16:53,424][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:16:53,746][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:16:54,066][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:16:54,387][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:16:55,041][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:16:55,768][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:16:55,770][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:16:55,772][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:16:56,393][__main__][INFO] - Iteration 81 took 27s (11.94% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 50m 43s. Estimated total time: 7h 34m 12s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 16:16:56,395][__main__][INFO] - Starting iteration 81. +[2026-03-25 16:16:56,398][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:16:56,398][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:16:59,655][__main__][INFO] - Number of regex retries in iteration 81: 0 +[2026-03-25 16:16:59,656][__main__][INFO] - agents played in iteration 81 are Bob, Alice +[2026-03-25 16:17:00,217][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:17:00,866][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:17:01,157][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:17:01,478][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:17:01,798][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:17:02,117][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:17:02,436][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:17:02,757][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:17:03,080][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:17:03,402][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:17:03,722][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:17:04,044][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:17:04,363][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:17:04,684][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:17:05,004][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:17:05,325][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:17:05,646][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:17:05,967][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:17:06,288][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:17:06,608][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:17:06,929][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:17:07,250][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:17:07,570][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:17:07,890][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:17:08,210][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:17:08,531][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:17:08,850][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:17:09,171][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:17:09,490][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:17:09,811][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:17:10,132][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:17:10,452][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:17:10,772][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:17:11,092][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:17:11,411][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:17:11,732][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:17:12,052][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:17:12,373][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:17:12,694][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:17:13,015][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:17:13,336][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:17:13,656][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:17:13,978][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:17:14,298][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:17:14,618][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:17:14,940][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:17:15,261][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:17:15,581][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:17:15,902][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:17:16,222][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:17:16,544][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:17:16,865][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:17:17,186][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:17:17,800][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:17:18,121][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:17:18,441][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:17:18,761][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:17:19,080][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:17:19,401][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:17:19,720][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:17:20,040][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:17:20,362][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:17:20,681][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:17:21,003][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:17:21,324][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:17:21,644][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:17:22,299][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:17:23,026][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:17:23,028][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:17:23,030][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:17:23,647][__main__][INFO] - Iteration 82 took 27s (11.95% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 50m 14s. Estimated total time: 7h 34m 10s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 5s. +[2026-03-25 16:17:23,649][__main__][INFO] - Starting iteration 82. +[2026-03-25 16:17:23,652][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:17:23,653][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:17:26,923][__main__][INFO] - Number of regex retries in iteration 82: 0 +[2026-03-25 16:17:26,924][__main__][INFO] - agents played in iteration 82 are Bob, Alice +[2026-03-25 16:17:27,503][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:17:28,158][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:17:28,448][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:17:28,769][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:17:29,089][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:17:29,410][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:17:29,731][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:17:30,053][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:17:30,372][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:17:30,691][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:17:31,012][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:17:31,334][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:17:31,654][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:17:31,973][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:17:32,292][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:17:32,612][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:17:32,931][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:17:33,252][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:17:33,572][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:17:33,892][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:17:34,213][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:17:34,534][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:17:34,853][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:17:35,175][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:17:35,496][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:17:35,815][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:17:36,137][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:17:36,457][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:17:36,777][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:17:37,097][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:17:37,419][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:17:37,740][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:17:38,062][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:17:38,384][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:17:38,704][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:17:39,024][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:17:39,346][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:17:39,667][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:17:39,986][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:17:40,307][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:17:40,626][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:17:40,947][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:17:41,268][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:17:41,588][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:17:41,907][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:17:42,227][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:17:42,547][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:17:42,870][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:17:43,191][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:17:43,512][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:17:43,833][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:17:44,154][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:17:44,475][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:17:45,090][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:17:45,411][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:17:45,731][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:17:46,052][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:17:46,372][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:17:46,692][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:17:47,012][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:17:47,332][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:17:47,653][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:17:47,973][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:17:48,293][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:17:48,612][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:17:48,932][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:17:49,590][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:17:50,310][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:17:50,313][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:17:50,314][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:17:50,942][__main__][INFO] - Iteration 83 took 27s (11.99% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 50m 28s. Estimated total time: 7h 34m 51s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 25s. +[2026-03-25 16:17:50,945][__main__][INFO] - Starting iteration 83. 
+[2026-03-25 16:17:50,948][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:17:50,949][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:17:54,244][__main__][INFO] - Number of regex retries in iteration 83: 0 +[2026-03-25 16:17:54,245][__main__][INFO] - agents played in iteration 83 are Bob, Alice +[2026-03-25 16:17:54,825][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:17:55,482][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:17:55,772][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:17:56,094][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:17:56,413][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:17:56,734][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:17:57,055][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:17:57,375][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:17:57,695][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:17:58,017][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:17:58,337][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:17:58,657][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:17:58,977][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:17:59,299][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:17:59,619][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:17:59,939][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:18:00,260][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:18:00,582][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:18:00,902][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:18:01,222][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:18:01,544][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:18:01,865][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:18:02,185][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:18:02,506][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:18:02,827][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:18:03,149][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:18:03,471][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:18:03,792][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:18:04,112][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:18:04,432][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:18:04,753][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:18:05,074][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:18:05,394][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:18:05,715][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:18:06,036][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:18:06,356][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:18:06,676][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:18:06,997][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:18:07,318][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:18:07,640][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:18:07,962][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:18:08,284][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:18:08,603][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:18:08,923][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:18:09,245][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:18:09,565][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:18:09,885][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:18:10,206][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:18:10,526][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:18:10,846][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:18:11,167][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:18:11,488][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:18:11,807][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:18:12,422][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:18:12,744][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:18:13,066][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:18:13,386][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:18:13,706][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:18:14,027][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:18:14,348][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:18:14,669][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:18:14,991][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:18:15,311][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:18:15,630][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:18:15,951][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:18:16,271][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:18:16,930][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:18:17,654][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:18:17,656][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:18:17,658][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:18:18,278][__main__][INFO] - Iteration 84 took 27s (12.06% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 50m 40s. Estimated total time: 7h 35m 30s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 45s. +[2026-03-25 16:18:18,281][__main__][INFO] - Starting iteration 84. +[2026-03-25 16:18:18,284][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:18:18,284][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:18:21,549][__main__][INFO] - Number of regex retries in iteration 84: 0 +[2026-03-25 16:18:21,549][__main__][INFO] - agents played in iteration 84 are Bob, Alice +[2026-03-25 16:18:22,108][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:18:22,763][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:18:23,053][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:18:23,375][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:18:23,696][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:18:24,016][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:18:24,337][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:18:24,659][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:18:24,978][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:18:25,299][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:18:25,620][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:18:25,942][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:18:26,261][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:18:26,582][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:18:26,904][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:18:27,225][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:18:27,547][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:18:27,867][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:18:28,187][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:18:28,508][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:18:28,828][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:18:29,150][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:18:29,471][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:18:29,793][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:18:30,113][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:18:30,433][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:18:30,754][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:18:31,074][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:18:31,394][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:18:31,713][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:18:32,033][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:18:32,353][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:18:32,672][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:18:32,992][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:18:33,312][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:18:33,633][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:18:33,952][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:18:34,272][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:18:34,591][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:18:34,912][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:18:35,231][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:18:35,551][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:18:35,870][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:18:36,189][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:18:36,510][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:18:36,831][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:18:37,151][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:18:37,471][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:18:37,791][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:18:38,111][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:18:38,430][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:18:38,751][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:18:39,071][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:18:39,686][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:18:40,006][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:18:40,326][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:18:40,648][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:18:40,968][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:18:41,289][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:18:41,610][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:18:41,931][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:18:42,251][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:18:42,572][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:18:42,892][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:18:43,213][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:18:43,532][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:18:44,191][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:18:44,917][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:18:44,920][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:18:44,921][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:18:45,543][__main__][INFO] - Iteration 85 took 27s (11.98% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 49m 2s. Estimated total time: 7h 34m 20s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 10s. +[2026-03-25 16:18:45,545][__main__][INFO] - Starting iteration 85. +[2026-03-25 16:18:45,548][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:18:45,549][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:18:48,836][__main__][INFO] - Number of regex retries in iteration 85: 0 +[2026-03-25 16:18:48,836][__main__][INFO] - agents played in iteration 85 are Bob, Alice +[2026-03-25 16:18:49,417][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:18:50,075][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:18:50,365][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:18:50,688][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:18:51,009][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:18:51,329][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:18:51,650][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:18:51,970][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:18:52,291][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:18:52,611][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:18:52,933][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:18:53,254][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:18:53,574][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:18:53,895][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:18:54,214][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:18:54,535][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:18:54,855][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:18:55,175][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:18:55,495][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:18:55,815][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:18:56,134][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:18:56,456][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:18:56,776][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:18:57,097][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:18:57,416][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:18:57,738][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:18:58,058][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:18:58,379][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:18:58,699][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:18:59,019][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:18:59,340][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:18:59,660][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:18:59,982][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:19:00,302][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:19:00,622][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:19:00,943][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:19:01,265][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:19:01,584][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:19:01,906][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:19:02,226][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:19:02,546][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:19:02,866][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:19:03,186][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:19:03,506][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:19:03,827][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:19:04,147][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:19:04,467][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:19:04,787][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:19:05,108][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:19:05,429][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:19:05,749][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:19:06,070][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:19:06,390][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:19:07,004][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:19:07,324][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:19:07,645][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:19:07,966][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:19:08,286][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:19:08,605][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:19:08,926][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:19:09,247][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:19:09,567][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:19:09,888][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:19:10,209][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:19:10,529][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:19:10,850][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:19:11,508][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:19:12,228][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:19:12,230][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:19:12,231][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:19:12,854][__main__][INFO] - Iteration 86 took 27s (12.04% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 49m 21s. Estimated total time: 7h 35m 7s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 33s. +[2026-03-25 16:19:12,856][__main__][INFO] - Starting iteration 86. +[2026-03-25 16:19:12,859][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:19:12,860][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:19:16,137][__main__][INFO] - Number of regex retries in iteration 86: 0 +[2026-03-25 16:19:16,138][__main__][INFO] - agents played in iteration 86 are Bob, Alice +[2026-03-25 16:19:16,779][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:19:17,432][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:19:17,722][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:19:18,044][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:19:18,365][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:19:18,687][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:19:19,006][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:19:19,328][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:19:19,649][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:19:19,969][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:19:20,289][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:19:20,609][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:19:20,928][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:19:21,249][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:19:21,570][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:19:21,891][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:19:22,211][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:19:22,531][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:19:22,851][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:19:23,171][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:19:23,492][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:19:23,813][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:19:24,133][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:19:24,453][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:19:24,773][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:19:25,092][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:19:25,412][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:19:25,731][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:19:26,051][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:19:26,372][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:19:26,691][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:19:27,011][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:19:27,332][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:19:27,651][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:19:27,972][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:19:28,291][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:19:28,611][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:19:28,932][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:19:29,253][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:19:29,572][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:19:29,892][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:19:30,214][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:19:30,534][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:19:30,854][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:19:31,175][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:19:31,496][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:19:31,815][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:19:32,135][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:19:32,455][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:19:32,777][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:19:33,097][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:19:33,418][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:19:33,740][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:19:34,354][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:19:34,675][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:19:34,996][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:19:35,317][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:19:35,638][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:19:35,959][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:19:36,279][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:19:36,599][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:19:36,920][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:19:37,240][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:19:37,559][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:19:37,879][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:19:38,199][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:19:38,857][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:19:39,583][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:19:39,585][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:19:39,587][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:19:40,209][__main__][INFO] - Iteration 87 took 27s (11.99% Gen, 85.74% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 49m 37s. Estimated total time: 7h 35m 50s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 55s. +[2026-03-25 16:19:40,211][__main__][INFO] - Starting iteration 87. 
+[2026-03-25 16:19:40,214][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:19:40,215][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:19:43,520][__main__][INFO] - Number of regex retries in iteration 87: 0 +[2026-03-25 16:19:43,520][__main__][INFO] - agents played in iteration 87 are Bob, Alice +[2026-03-25 16:19:44,071][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:19:44,726][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:19:45,017][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:19:45,340][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:19:45,660][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:19:45,981][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:19:46,301][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:19:46,621][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:19:46,942][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:19:47,264][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:19:47,583][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:19:47,905][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:19:48,225][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:19:48,544][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:19:48,865][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:19:49,185][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:19:49,506][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:19:49,827][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:19:50,148][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:19:50,469][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:19:50,790][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:19:51,111][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:19:51,431][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:19:51,752][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:19:52,073][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:19:52,392][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:19:52,712][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:19:53,032][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:19:53,354][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:19:53,674][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:19:53,994][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:19:54,314][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:19:54,634][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:19:54,955][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:19:55,276][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:19:55,596][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:19:55,915][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:19:56,236][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:19:56,555][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:19:56,874][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:19:57,195][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:19:57,514][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:19:57,834][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:19:58,153][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:19:58,473][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:19:58,792][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:19:59,111][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:19:59,431][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:19:59,752][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:20:00,071][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:20:00,390][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:20:00,711][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:20:01,032][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:20:01,647][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:20:01,967][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:20:02,287][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:20:02,607][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:20:02,928][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:20:03,249][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:20:03,570][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:20:03,890][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:20:04,210][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:20:04,531][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:20:04,852][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:20:05,172][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:20:05,492][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:20:06,150][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:20:06,873][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:20:06,875][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:20:06,877][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:20:07,501][__main__][INFO] - Iteration 88 took 27s (12.11% Gen, 85.59% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 48m 8s. Estimated total time: 7h 34m 48s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 24s. +[2026-03-25 16:20:07,504][__main__][INFO] - Starting iteration 88. +[2026-03-25 16:20:07,507][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:20:07,507][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:20:10,788][__main__][INFO] - Number of regex retries in iteration 88: 0 +[2026-03-25 16:20:10,789][__main__][INFO] - agents played in iteration 88 are Bob, Alice +[2026-03-25 16:20:11,352][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:20:12,007][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:20:12,297][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:20:12,619][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:20:12,940][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:20:13,261][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:20:13,581][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:20:13,901][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:20:14,222][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:20:14,542][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:20:14,863][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:20:15,183][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:20:15,503][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:20:15,825][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:20:16,145][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:20:16,466][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:20:16,786][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:20:17,106][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:20:17,427][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:20:17,748][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:20:18,068][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:20:18,388][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:20:18,709][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:20:19,030][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:20:19,351][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:20:19,671][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:20:19,991][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:20:20,311][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:20:20,631][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:20:20,951][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:20:21,272][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:20:21,593][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:20:21,913][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:20:22,233][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:20:22,554][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:20:22,875][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:20:23,196][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:20:23,518][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:20:23,837][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:20:24,157][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:20:24,478][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:20:24,798][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:20:25,118][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:20:25,437][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:20:25,757][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:20:26,078][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:20:26,397][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:20:26,717][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:20:27,038][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:20:27,358][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:20:27,679][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:20:27,999][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:20:28,321][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:20:28,934][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:20:29,254][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:20:29,574][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:20:29,895][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:20:30,215][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:20:30,534][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:20:30,855][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:20:31,174][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:20:31,496][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:20:31,816][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:20:32,136][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:20:32,456][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:20:32,776][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:20:33,439][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:20:34,171][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:20:34,173][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:20:34,175][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:20:34,803][__main__][INFO] - Iteration 89 took 27s (12.02% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 47m 49s. Estimated total time: 7h 34m 56s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 28s. +[2026-03-25 16:20:34,805][__main__][INFO] - Starting iteration 89. +[2026-03-25 16:20:34,808][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:20:34,809][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:20:38,106][__main__][INFO] - Number of regex retries in iteration 89: 0 +[2026-03-25 16:20:38,107][__main__][INFO] - agents played in iteration 89 are Bob, Alice +[2026-03-25 16:20:38,706][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:20:39,364][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:20:39,655][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:20:39,976][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:20:40,295][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:20:40,615][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:20:40,933][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:20:41,253][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:20:41,573][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:20:41,892][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:20:42,213][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:20:42,532][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:20:42,851][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:20:43,172][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:20:43,493][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:20:43,812][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:20:44,133][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:20:44,453][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:20:44,774][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:20:45,093][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:20:45,414][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:20:45,734][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:20:46,053][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:20:46,373][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:20:46,694][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:20:47,014][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:20:47,333][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:20:47,655][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:20:47,975][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:20:48,294][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:20:48,615][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:20:48,935][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:20:49,256][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:20:49,576][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:20:49,897][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:20:50,217][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:20:50,537][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:20:50,858][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:20:51,180][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:20:51,501][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:20:51,820][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:20:52,141][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:20:52,461][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:20:52,780][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:20:53,099][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:20:53,422][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:20:53,743][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:20:54,064][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:20:54,385][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:20:54,706][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:20:55,026][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:20:55,346][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:20:55,666][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:20:56,280][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:20:56,601][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:20:56,920][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:20:57,240][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:20:57,562][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:20:57,882][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:20:58,203][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:20:58,523][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:20:58,844][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:20:59,165][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:20:59,485][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:20:59,805][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:21:00,126][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:21:00,784][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:21:01,505][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:21:01,507][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:21:01,508][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:21:02,133][__main__][INFO] - Iteration 90 took 27s (12.07% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 47m 51s. Estimated total time: 7h 35m 26s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 43s. +[2026-03-25 16:21:02,135][__main__][INFO] - Starting iteration 90. +[2026-03-25 16:21:02,138][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:21:02,139][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:21:05,362][__main__][INFO] - Number of regex retries in iteration 90: 0 +[2026-03-25 16:21:05,362][__main__][INFO] - agents played in iteration 90 are Bob, Alice +[2026-03-25 16:21:05,906][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:21:06,560][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:21:06,851][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:21:07,171][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:21:07,492][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:21:07,812][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:21:08,131][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:21:08,450][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:21:08,770][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:21:09,089][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:21:09,410][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:21:09,729][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:21:10,050][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:21:10,371][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:21:10,690][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:21:11,009][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:21:11,328][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:21:11,649][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:21:11,969][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:21:12,290][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:21:12,610][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:21:12,932][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:21:13,252][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:21:13,572][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:21:13,892][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:21:14,211][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:21:14,530][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:21:14,851][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:21:15,171][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:21:15,492][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:21:15,812][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:21:16,133][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:21:16,454][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:21:16,775][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:21:17,095][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:21:17,417][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:21:17,735][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:21:18,057][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:21:18,378][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:21:18,699][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:21:19,018][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:21:19,338][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:21:19,658][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:21:19,979][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:21:20,300][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:21:20,619][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:21:20,940][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:21:21,261][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:21:21,580][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:21:21,900][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:21:22,219][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:21:22,538][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:21:22,859][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:21:23,473][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:21:23,793][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:21:24,113][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:21:24,433][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:21:24,754][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:21:25,073][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:21:25,393][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:21:25,714][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:21:26,034][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:21:26,353][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:21:26,673][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:21:26,992][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:21:27,312][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:21:27,972][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:21:28,698][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:21:28,700][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:21:28,702][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:21:29,364][__main__][INFO] - Iteration 91 took 27s (11.84% Gen, 85.72% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 45m 45s. Estimated total time: 7h 33m 46s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 53s. +[2026-03-25 16:21:29,366][__main__][INFO] - Starting iteration 91. 
+[2026-03-25 16:21:29,369][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:21:29,370][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:21:32,604][__main__][INFO] - Number of regex retries in iteration 91: 0 +[2026-03-25 16:21:32,605][__main__][INFO] - agents played in iteration 91 are Bob, Alice +[2026-03-25 16:21:33,179][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:21:33,833][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:21:34,123][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:21:34,446][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:21:34,767][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:21:35,086][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:21:35,406][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:21:35,725][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:21:36,046][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:21:36,367][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:21:36,688][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:21:37,009][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:21:37,330][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:21:37,651][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:21:37,971][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:21:38,290][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:21:38,610][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:21:38,931][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:21:39,252][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:21:39,572][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:21:39,891][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:21:40,212][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:21:40,533][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:21:40,853][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:21:41,172][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:21:41,492][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:21:41,813][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:21:42,133][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:21:42,452][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:21:42,771][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:21:43,092][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:21:43,411][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:21:43,730][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:21:44,051][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:21:44,372][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:21:44,691][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:21:45,012][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:21:45,331][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:21:45,650][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:21:45,970][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:21:46,289][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:21:46,609][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:21:46,930][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:21:47,251][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:21:47,571][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:21:47,891][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:21:48,211][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:21:48,530][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:21:48,851][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:21:49,170][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:21:49,489][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:21:49,808][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:21:50,130][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:21:50,745][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:21:51,066][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:21:51,386][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:21:51,706][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:21:52,026][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:21:52,346][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:21:52,667][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:21:52,986][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:21:53,306][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:21:53,627][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:21:53,948][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:21:54,268][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:21:54,587][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:21:55,245][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:21:55,969][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:21:55,971][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:21:55,973][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:21:56,597][__main__][INFO] - Iteration 92 took 27s (11.88% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 45m 19s. Estimated total time: 7h 33m 48s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 54s. +[2026-03-25 16:21:56,599][__main__][INFO] - Starting iteration 92. +[2026-03-25 16:21:56,602][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:21:56,603][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:21:59,783][__main__][INFO] - Number of regex retries in iteration 92: 0 +[2026-03-25 16:21:59,783][__main__][INFO] - agents played in iteration 92 are Bob, Alice +[2026-03-25 16:22:00,351][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:22:00,999][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:22:01,289][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:22:01,611][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:22:01,931][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:22:02,251][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:22:02,573][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:22:02,895][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:22:03,216][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:22:03,537][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:22:03,858][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:22:04,178][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:22:04,498][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:22:04,817][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:22:05,138][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:22:05,456][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:22:05,776][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:22:06,095][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:22:06,416][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:22:06,737][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:22:07,057][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:22:07,378][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:22:07,697][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:22:08,018][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:22:08,337][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:22:08,658][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:22:08,977][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:22:09,297][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:22:09,615][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:22:09,936][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:22:10,256][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:22:10,577][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:22:10,896][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:22:11,215][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:22:11,536][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:22:11,857][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:22:12,177][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:22:12,496][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:22:12,815][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:22:13,135][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:22:13,456][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:22:13,776][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:22:14,095][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:22:14,414][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:22:14,734][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:22:15,055][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:22:15,375][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:22:15,694][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:22:16,013][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:22:16,333][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:22:16,653][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:22:16,973][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:22:17,293][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:22:17,904][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:22:18,224][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:22:18,546][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:22:18,866][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:22:19,185][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:22:19,505][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:22:19,826][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:22:20,148][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:22:20,469][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:22:20,790][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:22:21,110][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:22:21,432][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:22:21,752][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:22:22,404][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:22:23,127][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:22:23,129][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:22:23,131][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:22:23,755][__main__][INFO] - Iteration 93 took 27s (11.71% Gen, 85.98% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 43m 38s. Estimated total time: 7h 32m 34s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 15s, 500 more iterations: 3h 46m 17s. +[2026-03-25 16:22:23,758][__main__][INFO] - Starting iteration 93. +[2026-03-25 16:22:23,761][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:22:23,762][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:22:26,959][__main__][INFO] - Number of regex retries in iteration 93: 0 +[2026-03-25 16:22:26,960][__main__][INFO] - agents played in iteration 93 are Bob, Alice +[2026-03-25 16:22:27,521][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:22:28,170][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:22:28,460][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:22:28,780][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:22:29,102][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:22:29,422][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:22:29,743][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:22:30,065][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:22:30,384][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:22:30,705][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:22:31,025][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:22:31,346][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:22:31,666][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:22:31,986][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:22:32,306][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:22:32,627][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:22:32,948][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:22:33,268][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:22:33,588][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:22:33,909][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:22:34,231][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:22:34,552][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:22:34,872][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:22:35,191][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:22:35,510][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:22:35,829][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:22:36,150][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:22:36,471][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:22:36,790][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:22:37,110][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:22:37,429][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:22:37,750][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:22:38,071][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:22:38,392][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:22:38,712][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:22:39,032][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:22:39,354][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:22:39,673][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:22:39,994][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:22:40,315][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:22:40,634][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:22:40,954][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:22:41,274][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:22:41,595][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:22:41,916][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:22:42,236][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:22:42,557][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:22:42,877][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:22:43,198][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:22:43,519][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:22:43,839][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:22:44,160][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:22:44,479][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:22:45,090][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:22:45,409][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:22:45,730][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:22:46,050][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:22:46,371][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:22:46,691][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:22:47,013][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:22:47,333][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:22:47,652][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:22:47,973][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:22:48,295][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:22:48,615][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:22:48,936][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:22:49,588][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:22:50,309][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:22:50,311][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:22:50,313][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:22:50,946][__main__][INFO] - Iteration 94 took 27s (11.77% Gen, 85.90% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 43m 43s. Estimated total time: 7h 33m 6s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 33s. +[2026-03-25 16:22:50,949][__main__][INFO] - Starting iteration 94. +[2026-03-25 16:22:50,952][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:22:50,952][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:22:54,231][__main__][INFO] - Number of regex retries in iteration 94: 0 +[2026-03-25 16:22:54,231][__main__][INFO] - agents played in iteration 94 are Bob, Alice +[2026-03-25 16:22:54,777][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:22:55,427][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:22:55,717][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:22:56,039][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:22:56,359][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:22:56,679][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:22:56,999][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:22:57,319][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:22:57,641][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:22:57,960][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:22:58,279][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:22:58,600][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:22:58,920][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:22:59,240][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:22:59,561][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:22:59,883][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:23:00,204][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:23:00,525][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:23:00,847][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:23:01,168][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:23:01,489][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:23:01,810][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:23:02,131][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:23:02,452][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:23:02,771][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:23:03,093][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:23:03,412][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:23:03,732][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:23:04,053][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:23:04,372][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:23:04,694][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:23:05,013][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:23:05,333][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:23:05,653][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:23:05,973][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:23:06,295][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:23:06,616][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:23:06,937][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:23:07,258][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:23:07,577][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:23:07,897][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:23:08,217][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:23:08,538][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:23:08,859][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:23:09,180][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:23:09,501][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:23:09,821][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:23:10,141][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:23:10,462][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:23:10,784][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:23:11,104][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:23:11,425][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:23:11,747][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:23:12,359][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:23:12,679][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:23:12,998][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:23:13,319][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:23:13,639][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:23:13,960][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:23:14,281][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:23:14,602][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:23:14,923][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:23:15,244][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:23:15,565][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:23:15,887][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:23:16,207][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:23:16,860][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:23:17,580][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:23:17,582][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:23:17,584][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:23:18,203][__main__][INFO] - Iteration 95 took 27s (12.03% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 44m 22s. Estimated total time: 7h 34m 12s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 16:23:18,206][__main__][INFO] - Starting iteration 95. 
+[2026-03-25 16:23:18,209][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:23:18,209][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:23:21,402][__main__][INFO] - Number of regex retries in iteration 95: 0 +[2026-03-25 16:23:21,403][__main__][INFO] - agents played in iteration 95 are Bob, Alice +[2026-03-25 16:23:21,950][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:23:22,598][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:23:22,888][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:23:23,209][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:23:23,528][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:23:23,849][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:23:24,170][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:23:24,492][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:23:24,812][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:23:25,132][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:23:25,453][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:23:25,774][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:23:26,094][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:23:26,414][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:23:26,734][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:23:27,054][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:23:27,373][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:23:27,692][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:23:28,013][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:23:28,334][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:23:28,655][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:23:28,976][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:23:29,295][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:23:29,614][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:23:29,934][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:23:30,253][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:23:30,573][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:23:30,894][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:23:31,213][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:23:31,534][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:23:31,853][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:23:32,173][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:23:32,492][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:23:32,811][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:23:33,132][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:23:33,452][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:23:33,772][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:23:34,092][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:23:34,412][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:23:34,732][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:23:35,053][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:23:35,374][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:23:35,693][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:23:36,014][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:23:36,335][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:23:36,656][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:23:36,976][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:23:37,296][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:23:37,616][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:23:37,937][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:23:38,257][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:23:38,577][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:23:38,896][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:23:39,507][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:23:39,827][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:23:40,148][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:23:40,469][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:23:40,790][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:23:41,110][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:23:41,431][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:23:41,751][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:23:42,071][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:23:42,390][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:23:42,710][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:23:43,032][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:23:43,354][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:23:44,011][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:23:44,727][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:23:44,730][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:23:44,731][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:23:45,352][__main__][INFO] - Iteration 96 took 27s (11.77% Gen, 85.94% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 42m 6s. Estimated total time: 7h 32m 24s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 14s, 500 more iterations: 3h 46m 12s. +[2026-03-25 16:23:45,354][__main__][INFO] - Starting iteration 96. +[2026-03-25 16:23:45,357][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:23:45,358][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:23:48,557][__main__][INFO] - Number of regex retries in iteration 96: 0 +[2026-03-25 16:23:48,558][__main__][INFO] - agents played in iteration 96 are Bob, Alice +[2026-03-25 16:23:49,099][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:23:49,748][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:23:50,038][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:23:50,359][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:23:50,680][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:23:50,999][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:23:51,321][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:23:51,641][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:23:51,961][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:23:52,280][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:23:52,600][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:23:52,920][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:23:53,241][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:23:53,561][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:23:53,881][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:23:54,202][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:23:54,522][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:23:54,843][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:23:55,165][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:23:55,485][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:23:55,807][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:23:56,128][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:23:56,448][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:23:56,767][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:23:57,088][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:23:57,408][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:23:57,729][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:23:58,050][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:23:58,371][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:23:58,690][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:23:59,011][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:23:59,332][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:23:59,653][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:23:59,975][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:24:00,296][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:24:00,617][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:24:00,938][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:24:01,258][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:24:01,577][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:24:01,897][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:24:02,218][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:24:02,541][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:24:02,860][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:24:03,182][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:24:03,504][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:24:03,826][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:24:04,146][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:24:04,468][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:24:04,787][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:24:05,108][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:24:05,427][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:24:05,748][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:24:06,069][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:24:06,681][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:24:07,001][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:24:07,323][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:24:07,645][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:24:07,966][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:24:08,286][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:24:08,606][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:24:08,928][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:24:09,249][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:24:09,571][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:24:09,892][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:24:10,212][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:24:10,531][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:24:11,184][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:24:11,906][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:24:11,909][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:24:11,910][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:24:12,534][__main__][INFO] - Iteration 97 took 27s (11.77% Gen, 85.93% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 42m 13s. Estimated total time: 7h 32m 57s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 17s, 500 more iterations: 3h 46m 28s. +[2026-03-25 16:24:12,536][__main__][INFO] - Starting iteration 97. +[2026-03-25 16:24:12,539][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:24:12,540][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:24:15,758][__main__][INFO] - Number of regex retries in iteration 97: 0 +[2026-03-25 16:24:15,759][__main__][INFO] - agents played in iteration 97 are Bob, Alice +[2026-03-25 16:24:16,300][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:24:16,955][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:24:17,246][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:24:17,566][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:24:17,888][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:24:18,208][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:24:18,528][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:24:18,850][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:24:19,171][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:24:19,491][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:24:19,813][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:24:20,134][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:24:20,455][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:24:20,777][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:24:21,098][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:24:21,419][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:24:21,740][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:24:22,060][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:24:22,380][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:24:22,700][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:24:23,019][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:24:23,340][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:24:23,661][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:24:23,981][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:24:24,302][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:24:24,621][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:24:24,941][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:24:25,262][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:24:25,584][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:24:25,905][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:24:26,227][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:24:26,549][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:24:26,871][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:24:27,190][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:24:27,512][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:24:27,833][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:24:28,152][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:24:28,473][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:24:28,793][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:24:29,113][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:24:29,434][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:24:29,754][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:24:30,073][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:24:30,394][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:24:30,714][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:24:31,036][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:24:31,357][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:24:31,679][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:24:31,998][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:24:32,317][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:24:32,638][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:24:32,959][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:24:33,279][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:24:33,898][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:24:34,218][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:24:34,538][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:24:34,858][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:24:35,178][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:24:35,498][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:24:35,818][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:24:36,139][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:24:36,460][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:24:36,781][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:24:37,102][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:24:37,422][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:24:37,744][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:24:38,403][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:24:39,128][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:24:39,130][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:24:39,132][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:24:39,758][__main__][INFO] - Iteration 98 took 27s (11.83% Gen, 85.87% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 42m 27s. Estimated total time: 7h 33m 39s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 49s. +[2026-03-25 16:24:39,760][__main__][INFO] - Starting iteration 98. +[2026-03-25 16:24:39,763][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. 
+[2026-03-25 16:24:39,764][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:24:42,998][__main__][INFO] - Number of regex retries in iteration 98: 0 +[2026-03-25 16:24:42,999][__main__][INFO] - agents played in iteration 98 are Bob, Alice +[2026-03-25 16:24:43,546][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:24:44,196][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:24:44,489][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:24:44,810][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:24:45,131][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:24:45,453][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:24:45,774][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:24:46,094][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:24:46,415][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:24:46,736][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:24:47,057][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:24:47,377][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:24:47,697][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:24:48,020][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:24:48,342][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:24:48,663][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:24:48,983][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:24:49,304][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:24:49,626][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:24:49,948][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:24:50,269][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:24:50,590][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:24:50,911][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:24:51,232][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:24:51,553][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:24:51,873][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:24:52,192][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:24:52,512][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:24:52,832][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:24:53,152][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:24:53,471][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:24:53,791][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:24:54,113][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:24:54,432][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:24:54,753][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:24:55,074][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:24:55,394][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:24:55,714][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:24:56,034][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:24:56,353][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:24:56,674][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:24:56,993][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:24:57,313][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:24:57,634][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:24:57,954][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:24:58,276][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:24:58,597][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:24:58,918][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:24:59,238][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:24:59,559][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:24:59,880][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:25:00,199][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:25:00,520][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:25:01,131][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:25:01,451][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:25:01,771][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:25:02,092][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:25:02,412][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:25:02,734][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:25:03,055][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:25:03,376][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:25:03,697][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:25:04,019][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:25:04,339][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:25:04,662][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:25:04,984][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:25:05,636][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:25:06,356][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:25:06,359][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:25:06,360][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:25:06,982][__main__][INFO] - Iteration 99 took 27s (11.89% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 42m 0s. Estimated total time: 7h 33m 39s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 49s. +[2026-03-25 16:25:06,984][__main__][INFO] - Starting iteration 99. 
+[2026-03-25 16:25:06,987][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:25:06,987][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:25:10,274][__main__][INFO] - Number of regex retries in iteration 99: 0 +[2026-03-25 16:25:10,275][__main__][INFO] - agents played in iteration 99 are Bob, Alice +[2026-03-25 16:25:10,916][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:25:11,565][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:25:11,856][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:25:12,179][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:25:12,499][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:25:12,819][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:25:13,140][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:25:13,461][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:25:13,782][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:25:14,104][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:25:14,425][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:25:14,747][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:25:15,067][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:25:15,388][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:25:15,709][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:25:16,030][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:25:16,351][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:25:16,671][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:25:16,992][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:25:17,311][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:25:17,632][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:25:17,952][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:25:18,273][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:25:18,592][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:25:18,912][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:25:19,232][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:25:19,552][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:25:19,873][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:25:20,193][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:25:20,515][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:25:20,835][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:25:21,156][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:25:21,476][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:25:21,797][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:25:22,119][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:25:22,438][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:25:22,759][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:25:23,079][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:25:23,400][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:25:23,722][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:25:24,044][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:25:24,365][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:25:24,686][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:25:25,009][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:25:25,330][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:25:25,650][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:25:25,971][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:25:26,291][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:25:26,612][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:25:26,932][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:25:27,252][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:25:27,574][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:25:27,896][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:25:28,508][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:25:28,828][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:25:29,149][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:25:29,471][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:25:29,791][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:25:30,113][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:25:30,434][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:25:30,755][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:25:31,076][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:25:31,397][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:25:31,717][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:25:32,038][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:25:32,358][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:25:33,011][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:25:33,762][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:25:33,764][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:25:33,766][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:25:34,386][__main__][INFO] - Iteration 100 took 27s (12.00% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 44m 33s. Estimated total time: 7h 36m 40s. 
Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 20s. +[2026-03-25 16:25:34,388][__main__][INFO] - Starting iteration 100. +[2026-03-25 16:25:34,392][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 1 and human policies 1. +[2026-03-25 16:25:34,392][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:25:37,628][__main__][INFO] - Number of regex retries in iteration 100: 0 +[2026-03-25 16:25:37,628][__main__][INFO] - agents played in iteration 100 are Bob, Alice +[2026-03-25 16:25:38,183][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:25:38,830][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:25:39,124][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:25:39,445][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:25:39,766][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:25:40,087][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:25:40,407][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:25:40,727][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:25:41,048][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:25:41,369][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:25:41,690][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:25:42,010][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:25:42,332][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:25:42,653][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:25:42,973][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:25:43,292][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:25:43,613][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:25:43,935][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:25:44,257][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:25:44,577][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:25:44,897][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:25:45,219][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:25:45,539][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:25:45,861][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:25:46,183][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:25:46,505][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:25:46,827][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:25:47,148][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:25:47,469][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:25:47,790][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:25:48,111][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:25:48,432][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:25:48,753][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:25:49,073][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:25:49,393][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:25:49,715][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:25:50,035][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:25:50,357][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:25:50,678][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:25:50,998][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:25:51,318][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:25:51,638][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:25:51,958][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:25:52,278][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:25:52,598][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:25:52,919][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:25:53,240][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:25:53,560][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:25:53,880][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:25:54,199][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:25:54,520][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:25:54,840][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:25:55,162][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:25:55,773][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:25:56,093][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:25:56,413][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:25:56,734][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:25:57,053][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:25:57,373][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:25:57,695][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:25:58,016][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:25:58,337][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:25:58,657][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:25:58,979][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:25:59,299][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:25:59,621][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:26:00,275][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:26:00,997][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:26:00,999][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:26:01,001][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:26:02,205][__main__][INFO] - Iteration 101 took 27s (11.63% Gen, 84.03% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 51m 0s. Estimated total time: 7h 43m 34s. Time estimates for 10 more iterations: 4m 38s, 100 more iterations: 46m 21s, 500 more iterations: 3h 51m 47s. +[2026-03-25 16:26:02,207][__main__][INFO] - Starting iteration 101. +[2026-03-25 16:26:02,210][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:26:02,211][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:26:05,459][__main__][INFO] - Number of regex retries in iteration 101: 0 +[2026-03-25 16:26:05,460][__main__][INFO] - agents played in iteration 101 are Bob, Alice +[2026-03-25 16:26:06,035][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:26:06,684][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:26:06,975][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:26:07,297][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:26:07,618][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:26:07,938][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:26:08,260][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:26:08,580][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:26:08,899][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:26:09,218][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:26:09,539][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:26:09,859][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:26:10,180][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:26:10,502][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:26:10,823][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:26:11,144][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:26:11,465][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:26:11,787][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:26:12,107][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:26:12,429][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:26:12,750][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:26:13,071][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:26:13,391][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:26:13,711][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:26:14,033][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:26:14,354][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:26:14,675][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:26:14,995][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:26:15,316][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:26:15,636][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:26:15,957][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:26:16,278][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:26:16,599][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:26:16,919][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:26:17,239][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:26:17,559][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:26:17,879][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:26:18,200][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:26:18,521][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:26:18,842][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:26:19,163][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:26:19,484][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:26:19,806][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:26:20,129][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:26:20,449][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:26:20,769][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:26:21,089][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:26:21,411][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:26:21,731][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:26:22,051][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:26:22,373][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:26:22,693][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:26:23,014][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:26:23,630][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:26:23,951][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:26:24,272][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:26:24,593][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:26:24,914][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:26:25,235][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:26:25,556][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:26:25,877][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:26:26,198][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:26:26,518][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:26:26,839][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:26:27,160][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:26:27,480][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:26:28,141][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:26:28,863][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:26:28,866][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:26:28,867][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:26:29,490][__main__][INFO] - Iteration 102 took 27s (11.91% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 41m 38s. Estimated total time: 7h 34m 40s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 20s. +[2026-03-25 16:26:29,492][__main__][INFO] - Starting iteration 102. +[2026-03-25 16:26:29,499][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:26:29,499][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:26:32,741][__main__][INFO] - Number of regex retries in iteration 102: 0 +[2026-03-25 16:26:32,742][__main__][INFO] - agents played in iteration 102 are Bob, Alice +[2026-03-25 16:26:33,361][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:26:34,017][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:26:34,306][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:26:34,629][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:26:34,951][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:26:35,272][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:26:35,592][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:26:35,911][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:26:36,232][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:26:36,552][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:26:36,874][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:26:37,194][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:26:37,515][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:26:37,837][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:26:38,157][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:26:38,478][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:26:38,799][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:26:39,120][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:26:39,440][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:26:39,760][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:26:40,080][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:26:40,399][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:26:40,720][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:26:41,041][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:26:41,362][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:26:41,682][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:26:42,001][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:26:42,323][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:26:42,643][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:26:42,964][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:26:43,283][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:26:43,603][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:26:43,924][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:26:44,244][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:26:44,565][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:26:44,886][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:26:45,207][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:26:45,527][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:26:45,849][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:26:46,170][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:26:46,490][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:26:46,812][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:26:47,131][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:26:47,451][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:26:47,772][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:26:48,092][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:26:48,411][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:26:48,733][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:26:49,054][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:26:49,376][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:26:49,695][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:26:50,017][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:26:50,338][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:26:50,954][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:26:51,275][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:26:51,595][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:26:51,915][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:26:52,235][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:26:52,555][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:26:52,875][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:26:53,195][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:26:53,516][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:26:53,837][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:26:54,158][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:26:54,480][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:26:54,799][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:26:55,459][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:26:56,189][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:26:56,191][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:26:56,193][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:26:56,818][__main__][INFO] - Iteration 103 took 27s (11.87% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 41m 55s. Estimated total time: 7h 35m 24s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 42s. +[2026-03-25 16:26:56,821][__main__][INFO] - Starting iteration 103. 
+[2026-03-25 16:26:56,824][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:26:56,824][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:27:00,069][__main__][INFO] - Number of regex retries in iteration 103: 0 +[2026-03-25 16:27:00,070][__main__][INFO] - agents played in iteration 103 are Bob, Alice +[2026-03-25 16:27:00,614][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:27:01,269][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:27:01,558][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:27:01,880][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:27:02,199][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:27:02,519][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:27:02,840][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:27:03,160][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:27:03,481][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:27:03,801][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:27:04,122][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:27:04,443][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:27:04,765][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:27:05,085][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:27:05,405][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:27:05,726][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:27:06,047][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:27:06,369][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:27:06,688][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:27:07,009][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:27:07,330][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:27:07,651][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:27:07,971][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:27:08,292][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:27:08,611][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:27:08,931][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:27:09,252][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:27:09,571][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:27:09,892][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:27:10,213][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:27:10,533][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:27:10,853][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:27:11,174][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:27:11,493][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:27:11,813][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:27:12,133][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:27:12,454][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:27:12,774][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:27:13,094][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:27:13,413][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:27:13,733][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:27:14,054][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:27:14,375][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:27:14,696][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:27:15,017][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:27:15,338][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:27:15,659][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:27:15,978][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:27:16,299][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:27:16,619][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:27:16,939][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:27:17,259][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:27:17,578][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:27:18,194][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:27:18,515][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:27:18,836][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:27:19,157][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:27:19,478][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:27:19,798][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:27:20,118][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:27:20,439][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:27:20,758][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:27:21,078][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:27:21,397][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:27:21,717][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:27:22,037][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:27:22,695][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:27:23,416][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:27:23,419][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:27:23,420][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:27:24,046][__main__][INFO] - Iteration 104 took 27s (11.92% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 39m 47s. Estimated total time: 7h 33m 43s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 51s. +[2026-03-25 16:27:24,049][__main__][INFO] - Starting iteration 104. +[2026-03-25 16:27:24,052][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:27:24,052][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:27:27,308][__main__][INFO] - Number of regex retries in iteration 104: 0 +[2026-03-25 16:27:27,309][__main__][INFO] - agents played in iteration 104 are Bob, Alice +[2026-03-25 16:27:27,884][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:27:28,539][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:27:28,828][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:27:29,150][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:27:29,472][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:27:29,792][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:27:30,113][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:27:30,434][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:27:30,755][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:27:31,076][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:27:31,395][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:27:31,717][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:27:32,037][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:27:32,357][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:27:32,678][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:27:32,999][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:27:33,319][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:27:33,640][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:27:33,960][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:27:34,279][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:27:34,599][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:27:34,920][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:27:35,241][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:27:35,560][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:27:35,879][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:27:36,199][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:27:36,520][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:27:36,840][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:27:37,161][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:27:37,480][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:27:37,800][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:27:38,120][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:27:38,439][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:27:38,759][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:27:39,081][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:27:39,401][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:27:39,722][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:27:40,041][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:27:40,362][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:27:40,682][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:27:41,003][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:27:41,324][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:27:41,646][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:27:41,966][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:27:42,286][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:27:42,607][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:27:42,928][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:27:43,248][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:27:43,568][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:27:43,889][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:27:44,210][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:27:44,531][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:27:44,850][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:27:45,465][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:27:45,786][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:27:46,108][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:27:46,429][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:27:46,749][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:27:47,070][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:27:47,389][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:27:47,710][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:27:48,030][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:27:48,351][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:27:48,672][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:27:48,993][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:27:49,313][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:27:49,973][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:27:50,704][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:27:50,706][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:27:50,707][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:27:51,330][__main__][INFO] - Iteration 105 took 27s (11.94% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 40m 15s. Estimated total time: 7h 34m 39s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 19s. +[2026-03-25 16:27:51,332][__main__][INFO] - Starting iteration 105. +[2026-03-25 16:27:51,335][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:27:51,336][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:27:54,593][__main__][INFO] - Number of regex retries in iteration 105: 0 +[2026-03-25 16:27:54,594][__main__][INFO] - agents played in iteration 105 are Bob, Alice +[2026-03-25 16:27:55,155][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:27:55,824][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:27:56,114][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:27:56,435][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:27:56,757][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:27:57,077][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:27:57,397][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:27:57,717][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:27:58,037][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:27:58,357][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:27:58,678][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:27:58,998][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:27:59,318][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:27:59,639][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:27:59,959][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:28:00,278][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:28:00,597][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:28:00,918][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:28:01,238][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:28:01,558][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:28:01,877][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:28:02,197][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:28:02,517][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:28:02,836][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:28:03,155][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:28:03,476][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:28:03,796][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:28:04,117][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:28:04,438][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:28:04,757][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:28:05,078][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:28:05,398][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:28:05,717][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:28:06,038][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:28:06,357][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:28:06,677][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:28:06,998][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:28:07,317][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:28:07,636][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:28:07,958][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:28:08,278][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:28:08,598][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:28:08,917][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:28:09,238][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:28:09,558][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:28:09,878][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:28:10,198][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:28:10,517][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:28:10,837][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:28:11,158][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:28:11,479][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:28:11,798][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:28:12,118][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:28:12,733][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:28:13,052][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:28:13,372][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:28:13,692][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:28:14,012][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:28:14,332][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:28:14,652][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:28:14,974][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:28:15,293][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:28:15,613][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:28:15,934][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:28:16,255][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:28:16,575][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:28:17,233][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:28:17,958][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:28:17,960][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:28:17,962][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:28:18,585][__main__][INFO] - Iteration 106 took 27s (11.96% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 39m 20s. Estimated total time: 7h 34m 11s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 5s. +[2026-03-25 16:28:18,588][__main__][INFO] - Starting iteration 106. +[2026-03-25 16:28:18,591][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:28:18,591][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:28:21,831][__main__][INFO] - Number of regex retries in iteration 106: 0 +[2026-03-25 16:28:21,832][__main__][INFO] - agents played in iteration 106 are Bob, Alice +[2026-03-25 16:28:22,372][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:28:23,026][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:28:23,316][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:28:23,637][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:28:23,957][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:28:24,277][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:28:24,597][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:28:24,918][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:28:25,240][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:28:25,560][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:28:25,881][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:28:26,202][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:28:26,522][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:28:26,842][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:28:27,161][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:28:27,482][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:28:27,802][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:28:28,122][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:28:28,443][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:28:28,763][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:28:29,083][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:28:29,405][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:28:29,727][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:28:30,047][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:28:30,368][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:28:30,688][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:28:31,010][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:28:31,330][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:28:31,650][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:28:31,970][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:28:32,291][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:28:32,613][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:28:32,933][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:28:33,255][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:28:33,575][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:28:33,894][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:28:34,215][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:28:34,535][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:28:34,856][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:28:35,175][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:28:35,495][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:28:35,815][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:28:36,135][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:28:36,456][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:28:36,775][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:28:37,097][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:28:37,418][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:28:37,738][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:28:38,059][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:28:38,379][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:28:38,699][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:28:39,020][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:28:39,341][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:28:39,955][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:28:40,275][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:28:40,595][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:28:40,914][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:28:41,234][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:28:41,554][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:28:41,874][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:28:42,194][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:28:42,514][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:28:42,835][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:28:43,155][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:28:43,475][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:28:43,795][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:28:44,452][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:28:45,181][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:28:45,183][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:28:45,185][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:28:45,803][__main__][INFO] - Iteration 107 took 27s (11.91% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 38m 15s. Estimated total time: 7h 33m 33s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 46s. +[2026-03-25 16:28:45,805][__main__][INFO] - Starting iteration 107. 
+[2026-03-25 16:28:45,808][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:28:45,809][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:28:49,065][__main__][INFO] - Number of regex retries in iteration 107: 0 +[2026-03-25 16:28:49,066][__main__][INFO] - agents played in iteration 107 are Bob, Alice +[2026-03-25 16:28:49,613][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:28:50,272][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:28:50,562][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:28:50,885][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:28:51,204][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:28:51,523][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:28:51,843][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:28:52,165][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:28:52,486][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:28:52,807][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:28:53,128][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:28:53,449][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:28:53,768][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:28:54,087][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:28:54,407][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:28:54,727][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:28:55,048][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:28:55,369][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:28:55,689][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:28:56,009][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:28:56,329][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:28:56,650][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:28:56,970][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:28:57,289][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:28:57,610][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:28:57,930][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:28:58,251][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:28:58,572][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:28:58,892][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:28:59,213][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:28:59,533][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:28:59,855][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:29:00,176][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:29:00,496][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:29:00,816][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:29:01,137][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:29:01,458][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:29:01,777][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:29:02,098][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:29:02,419][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:29:02,740][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:29:03,061][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:29:03,404][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:29:03,725][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:29:04,048][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:29:04,369][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:29:04,688][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:29:05,008][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:29:05,327][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:29:05,648][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:29:05,968][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:29:06,289][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:29:06,608][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:29:07,222][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:29:07,545][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:29:07,866][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:29:08,186][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:29:08,506][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:29:08,828][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:29:09,148][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:29:09,468][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:29:09,790][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:29:10,110][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:29:10,430][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:29:10,751][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:29:11,072][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:29:11,731][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:29:12,455][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:29:12,457][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:29:12,459][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:29:13,083][__main__][INFO] - Iteration 108 took 27s (11.94% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 38m 50s. Estimated total time: 7h 34m 36s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 18s. +[2026-03-25 16:29:13,086][__main__][INFO] - Starting iteration 108. +[2026-03-25 16:29:13,089][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:29:13,089][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:29:16,325][__main__][INFO] - Number of regex retries in iteration 108: 0 +[2026-03-25 16:29:16,326][__main__][INFO] - agents played in iteration 108 are Bob, Alice +[2026-03-25 16:29:16,872][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:29:17,527][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:29:17,816][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:29:18,136][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:29:18,456][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:29:18,777][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:29:19,097][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:29:19,418][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:29:19,738][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:29:20,058][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:29:20,378][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:29:20,698][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:29:21,017][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:29:21,337][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:29:21,656][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:29:21,977][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:29:22,297][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:29:22,616][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:29:22,937][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:29:23,257][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:29:23,578][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:29:23,898][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:29:24,219][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:29:24,538][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:29:24,859][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:29:25,181][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:29:25,500][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:29:25,820][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:29:26,140][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:29:26,460][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:29:26,782][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:29:27,102][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:29:27,422][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:29:27,742][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:29:28,062][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:29:28,382][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:29:28,702][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:29:29,023][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:29:29,344][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:29:29,666][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:29:29,985][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:29:30,307][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:29:30,629][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:29:30,949][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:29:31,269][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:29:31,591][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:29:31,911][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:29:32,232][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:29:32,552][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:29:32,872][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:29:33,191][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:29:33,512][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:29:33,832][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:29:34,447][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:29:34,767][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:29:35,087][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:29:35,409][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:29:35,729][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:29:36,051][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:29:36,372][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:29:36,692][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:29:37,012][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:29:37,333][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:29:37,654][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:29:37,974][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:29:38,297][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:29:38,952][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:29:39,678][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:29:39,680][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:29:39,682][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:29:40,305][__main__][INFO] - Iteration 109 took 27s (11.89% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 37m 24s. Estimated total time: 7h 33m 37s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 48s. +[2026-03-25 16:29:40,307][__main__][INFO] - Starting iteration 109. +[2026-03-25 16:29:40,310][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:29:40,310][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:29:43,549][__main__][INFO] - Number of regex retries in iteration 109: 0 +[2026-03-25 16:29:43,550][__main__][INFO] - agents played in iteration 109 are Bob, Alice +[2026-03-25 16:29:44,094][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:29:44,746][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:29:45,036][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:29:45,358][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:29:45,678][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:29:45,999][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:29:46,319][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:29:46,639][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:29:46,959][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:29:47,280][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:29:47,600][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:29:47,921][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:29:48,241][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:29:48,561][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:29:48,882][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:29:49,201][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:29:49,522][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:29:49,845][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:29:50,167][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:29:50,488][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:29:50,807][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:29:51,128][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:29:51,449][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:29:51,770][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:29:52,091][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:29:52,412][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:29:52,733][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:29:53,055][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:29:53,376][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:29:53,698][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:29:54,019][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:29:54,339][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:29:54,660][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:29:54,980][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:29:55,301][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:29:55,622][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:29:55,943][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:29:56,263][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:29:56,585][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:29:56,907][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:29:57,228][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:29:57,549][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:29:57,870][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:29:58,191][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:29:58,512][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:29:58,833][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:29:59,154][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:29:59,474][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:29:59,796][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:30:00,117][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:30:00,438][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:30:00,759][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:30:01,080][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:30:01,693][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:30:02,013][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:30:02,333][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:30:02,653][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:30:02,973][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:30:03,295][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:30:03,616][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:30:03,935][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:30:04,256][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:30:04,578][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:30:04,897][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:30:05,218][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:30:05,539][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:30:06,194][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:30:06,917][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:30:06,919][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:30:06,920][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:30:07,545][__main__][INFO] - Iteration 110 took 27s (11.89% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 37m 16s. Estimated total time: 7h 33m 56s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 58s. +[2026-03-25 16:30:07,547][__main__][INFO] - Starting iteration 110. +[2026-03-25 16:30:07,550][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:30:07,551][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:30:10,812][__main__][INFO] - Number of regex retries in iteration 110: 0 +[2026-03-25 16:30:10,812][__main__][INFO] - agents played in iteration 110 are Bob, Alice +[2026-03-25 16:30:11,371][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:30:12,024][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:30:12,314][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:30:12,635][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:30:12,956][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:30:13,277][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:30:13,597][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:30:13,918][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:30:14,239][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:30:14,561][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:30:14,882][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:30:15,203][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:30:15,524][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:30:15,848][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:30:16,169][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:30:16,490][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:30:16,811][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:30:17,131][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:30:17,453][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:30:17,772][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:30:18,093][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:30:18,414][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:30:18,734][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:30:19,055][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:30:19,375][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:30:19,695][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:30:20,017][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:30:20,339][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:30:20,661][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:30:20,984][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:30:21,305][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:30:21,627][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:30:21,947][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:30:22,267][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:30:22,587][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:30:22,909][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:30:23,230][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:30:23,551][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:30:23,872][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:30:24,192][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:30:24,513][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:30:24,834][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:30:25,155][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:30:25,474][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:30:25,794][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:30:26,114][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:30:26,436][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:30:26,758][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:30:27,079][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:30:27,398][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:30:27,718][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:30:28,039][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:30:28,359][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:30:28,972][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:30:29,293][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:30:29,615][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:30:29,935][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:30:30,257][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:30:30,576][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:30:30,898][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:30:31,219][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:30:31,539][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:30:31,859][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:30:32,180][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:30:32,500][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:30:32,820][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:30:33,476][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:30:34,202][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:30:34,204][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:30:34,206][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:30:34,826][__main__][INFO] - Iteration 111 took 27s (11.96% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 37m 29s. Estimated total time: 7h 34m 37s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 18s. +[2026-03-25 16:30:34,829][__main__][INFO] - Starting iteration 111. 
+[2026-03-25 16:30:34,832][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:30:34,832][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:30:38,088][__main__][INFO] - Number of regex retries in iteration 111: 0 +[2026-03-25 16:30:38,089][__main__][INFO] - agents played in iteration 111 are Bob, Alice +[2026-03-25 16:30:38,652][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:30:39,302][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:30:39,592][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:30:39,914][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:30:40,235][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:30:40,555][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:30:40,876][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:30:41,196][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:30:41,516][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:30:41,836][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:30:42,158][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:30:42,478][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:30:42,797][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:30:43,118][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:30:43,439][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:30:43,760][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:30:44,081][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:30:44,402][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:30:44,721][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:30:45,041][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:30:45,362][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:30:45,682][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:30:46,003][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:30:46,325][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:30:46,645][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:30:46,967][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:30:47,288][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:30:47,609][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:30:47,930][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:30:48,251][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:30:48,572][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:30:48,892][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:30:49,212][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:30:49,533][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:30:49,853][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:30:50,174][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:30:50,495][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:30:50,816][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:30:51,137][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:30:51,457][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:30:51,777][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:30:52,099][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:30:52,419][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:30:52,739][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:30:53,059][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:30:53,379][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:30:53,699][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:30:54,019][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:30:54,340][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:30:54,661][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:30:54,981][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:30:55,301][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:30:55,621][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:30:56,234][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:30:56,553][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:30:56,873][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:30:57,193][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:30:57,513][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:30:57,833][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:30:58,154][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:30:58,474][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:30:58,796][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:30:59,115][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:30:59,435][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:30:59,757][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:31:00,078][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:31:00,732][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:31:01,458][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:31:01,460][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:31:01,462][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:31:02,084][__main__][INFO] - Iteration 112 took 27s (11.95% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 36m 38s. Estimated total time: 7h 34m 12s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 16:31:02,086][__main__][INFO] - Starting iteration 112. +[2026-03-25 16:31:02,089][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:31:02,089][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:31:05,345][__main__][INFO] - Number of regex retries in iteration 112: 0 +[2026-03-25 16:31:05,346][__main__][INFO] - agents played in iteration 112 are Bob, Alice +[2026-03-25 16:31:05,912][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:31:06,561][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:31:06,850][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:31:07,171][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:31:07,492][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:31:07,814][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:31:08,134][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:31:08,455][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:31:08,775][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:31:09,095][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:31:09,415][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:31:09,735][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:31:10,055][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:31:10,374][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:31:10,696][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:31:11,016][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:31:11,338][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:31:11,658][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:31:11,979][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:31:12,299][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:31:12,619][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:31:12,940][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:31:13,260][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:31:13,580][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:31:13,901][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:31:14,220][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:31:14,541][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:31:14,862][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:31:15,182][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:31:15,502][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:31:15,823][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:31:16,145][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:31:16,466][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:31:16,786][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:31:17,108][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:31:17,429][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:31:17,750][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:31:18,071][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:31:18,391][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:31:18,711][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:31:19,031][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:31:19,352][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:31:19,674][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:31:19,994][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:31:20,316][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:31:20,637][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:31:20,958][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:31:21,279][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:31:21,600][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:31:21,922][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:31:22,243][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:31:22,564][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:31:22,886][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:31:23,503][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:31:23,825][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:31:24,147][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:31:24,468][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:31:24,789][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:31:25,110][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:31:25,431][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:31:25,753][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:31:26,073][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:31:26,394][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:31:26,713][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:31:27,032][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:31:27,355][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:31:28,017][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:31:28,747][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:31:28,749][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:31:28,751][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:31:29,379][__main__][INFO] - Iteration 113 took 27s (11.93% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 36m 49s. Estimated total time: 7h 34m 51s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 25s. +[2026-03-25 16:31:29,381][__main__][INFO] - Starting iteration 113. +[2026-03-25 16:31:29,385][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:31:29,385][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:31:32,654][__main__][INFO] - Number of regex retries in iteration 113: 0 +[2026-03-25 16:31:32,655][__main__][INFO] - agents played in iteration 113 are Bob, Alice +[2026-03-25 16:31:33,232][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:31:33,879][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:31:34,170][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:31:34,490][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:31:34,810][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:31:35,130][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:31:35,450][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:31:35,772][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:31:36,092][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:31:36,413][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:31:36,733][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:31:37,053][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:31:37,374][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:31:37,693][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:31:38,014][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:31:38,335][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:31:38,655][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:31:38,975][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:31:39,295][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:31:39,616][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:31:39,937][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:31:40,258][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:31:40,579][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:31:40,898][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:31:41,219][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:31:41,539][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:31:41,859][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:31:42,179][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:31:42,498][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:31:42,819][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:31:43,140][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:31:43,460][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:31:43,780][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:31:44,101][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:31:44,421][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:31:44,741][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:31:45,061][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:31:45,383][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:31:45,703][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:31:46,024][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:31:46,345][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:31:46,665][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:31:46,988][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:31:47,308][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:31:47,628][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:31:47,948][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:31:48,268][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:31:48,588][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:31:48,908][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:31:49,229][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:31:49,549][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:31:49,870][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:31:50,192][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:31:50,804][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:31:51,125][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:31:51,445][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:31:51,766][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:31:52,086][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:31:52,406][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:31:52,727][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:31:53,047][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:31:53,366][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:31:53,687][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:31:54,008][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:31:54,328][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:31:54,649][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:31:55,302][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:31:56,026][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:31:56,028][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:31:56,030][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:31:56,655][__main__][INFO] - Iteration 114 took 27s (11.99% Gen, 85.71% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 36m 3s. Estimated total time: 7h 34m 31s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 15s. +[2026-03-25 16:31:56,658][__main__][INFO] - Starting iteration 114. +[2026-03-25 16:31:56,660][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:31:56,661][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:31:59,938][__main__][INFO] - Number of regex retries in iteration 114: 0 +[2026-03-25 16:31:59,939][__main__][INFO] - agents played in iteration 114 are Bob, Alice +[2026-03-25 16:32:00,489][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:32:01,138][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:32:01,429][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:32:01,752][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:32:02,074][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:32:02,394][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:32:02,714][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:32:03,034][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:32:03,355][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:32:03,677][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:32:03,997][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:32:04,318][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:32:04,638][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:32:04,959][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:32:05,281][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:32:05,602][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:32:05,921][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:32:06,242][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:32:06,562][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:32:06,884][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:32:07,206][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:32:07,527][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:32:07,849][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:32:08,170][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:32:08,491][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:32:08,811][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:32:09,133][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:32:09,454][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:32:09,777][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:32:10,099][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:32:10,418][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:32:10,738][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:32:11,059][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:32:11,381][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:32:11,701][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:32:12,021][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:32:12,340][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:32:12,660][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:32:12,979][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:32:13,298][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:32:13,620][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:32:13,940][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:32:14,261][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:32:14,580][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:32:14,900][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:32:15,220][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:32:15,540][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:32:15,862][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:32:16,181][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:32:16,502][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:32:16,822][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:32:17,142][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:32:17,463][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:32:18,075][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:32:18,397][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:32:18,719][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:32:19,040][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:32:19,362][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:32:19,682][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:32:20,004][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:32:20,326][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:32:20,647][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:32:20,969][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:32:21,291][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:32:21,612][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:32:21,932][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:32:22,586][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:32:23,316][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:32:23,318][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:32:23,320][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:32:23,945][__main__][INFO] - Iteration 115 took 27s (12.01% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 35m 48s. Estimated total time: 7h 34m 45s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 22s. +[2026-03-25 16:32:23,947][__main__][INFO] - Starting iteration 115. 
+[2026-03-25 16:32:23,950][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:32:23,951][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:32:27,179][__main__][INFO] - Number of regex retries in iteration 115: 0 +[2026-03-25 16:32:27,180][__main__][INFO] - agents played in iteration 115 are Bob, Alice +[2026-03-25 16:32:27,750][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:32:28,399][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:32:28,690][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:32:29,010][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:32:29,330][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:32:29,650][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:32:29,970][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:32:30,290][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:32:30,610][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:32:30,930][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:32:31,251][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:32:31,570][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:32:31,891][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:32:32,211][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:32:32,530][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:32:32,851][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:32:33,173][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:32:33,492][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:32:33,813][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:32:34,133][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:32:34,452][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:32:34,772][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:32:35,093][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:32:35,412][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:32:35,734][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:32:36,054][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:32:36,374][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:32:36,694][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:32:37,015][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:32:37,335][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:32:37,655][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:32:37,975][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:32:38,294][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:32:38,615][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:32:38,936][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:32:39,259][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:32:39,579][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:32:39,898][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:32:40,219][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:32:40,538][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:32:40,859][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:32:41,182][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:32:41,502][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:32:41,823][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:32:42,143][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:32:42,463][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:32:42,782][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:32:43,103][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:32:43,424][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:32:43,747][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:32:44,066][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:32:44,387][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:32:44,706][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:32:45,318][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:32:45,640][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:32:45,959][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:32:46,279][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:32:46,601][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:32:46,921][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:32:47,242][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:32:47,564][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:32:47,885][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:32:48,207][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:32:48,529][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:32:48,849][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:32:49,170][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:32:49,823][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:32:50,554][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:32:50,556][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:32:50,558][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:32:51,186][__main__][INFO] - Iteration 116 took 27s (11.86% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 34m 34s. Estimated total time: 7h 33m 57s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 58s. +[2026-03-25 16:32:51,189][__main__][INFO] - Starting iteration 116. +[2026-03-25 16:32:51,192][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:32:51,193][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:32:54,472][__main__][INFO] - Number of regex retries in iteration 116: 0 +[2026-03-25 16:32:54,473][__main__][INFO] - agents played in iteration 116 are Bob, Alice +[2026-03-25 16:32:55,042][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:32:55,693][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:32:55,984][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:32:56,304][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:32:56,626][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:32:56,947][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:32:57,270][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:32:57,591][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:32:57,912][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:32:58,232][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:32:58,553][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:32:58,874][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:32:59,197][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:32:59,519][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:32:59,839][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:33:00,160][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:33:00,480][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:33:00,799][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:33:01,120][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:33:01,439][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:33:01,761][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:33:02,080][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:33:02,401][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:33:02,721][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:33:03,040][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:33:03,362][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:33:03,681][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:33:04,001][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:33:04,322][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:33:04,641][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:33:04,961][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:33:05,283][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:33:05,602][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:33:05,924][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:33:06,246][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:33:06,566][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:33:06,885][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:33:07,207][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:33:07,528][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:33:07,848][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:33:08,170][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:33:08,491][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:33:08,812][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:33:09,133][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:33:09,455][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:33:09,775][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:33:10,096][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:33:10,417][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:33:10,737][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:33:11,059][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:33:11,380][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:33:11,702][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:33:12,021][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:33:12,635][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:33:12,956][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:33:13,278][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:33:13,600][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:33:13,921][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:33:14,243][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:33:14,564][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:33:14,885][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:33:15,207][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:33:15,528][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:33:15,850][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:33:16,170][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:33:16,490][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:33:17,144][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:33:17,869][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:33:17,871][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:33:17,873][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:33:18,500][__main__][INFO] - Iteration 117 took 27s (12.01% Gen, 85.68% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 35m 18s. Estimated total time: 7h 35m 9s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 34s. +[2026-03-25 16:33:18,503][__main__][INFO] - Starting iteration 117. +[2026-03-25 16:33:18,505][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:33:18,506][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:33:21,791][__main__][INFO] - Number of regex retries in iteration 117: 0 +[2026-03-25 16:33:21,792][__main__][INFO] - agents played in iteration 117 are Bob, Alice +[2026-03-25 16:33:22,349][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:33:23,001][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:33:23,290][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:33:23,612][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:33:23,934][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:33:24,253][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:33:24,574][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:33:24,893][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:33:25,214][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:33:25,533][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:33:25,853][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:33:26,172][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:33:26,492][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:33:26,812][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:33:27,133][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:33:27,456][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:33:27,776][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:33:28,097][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:33:28,416][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:33:28,737][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:33:29,058][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:33:29,381][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:33:29,702][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:33:30,022][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:33:30,343][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:33:30,664][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:33:30,986][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:33:31,308][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:33:31,628][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:33:31,950][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:33:32,272][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:33:32,593][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:33:32,912][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:33:33,233][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:33:33,553][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:33:33,872][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:33:34,192][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:33:34,513][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:33:34,834][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:33:35,154][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:33:35,475][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:33:35,795][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:33:36,115][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:33:36,435][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:33:36,755][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:33:37,075][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:33:37,395][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:33:37,716][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:33:38,036][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:33:38,356][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:33:38,676][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:33:38,997][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:33:39,319][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:33:39,931][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:33:40,253][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:33:40,574][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:33:40,895][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:33:41,215][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:33:41,535][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:33:41,856][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:33:42,177][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:33:42,497][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:33:42,819][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:33:43,139][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:33:43,461][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:33:43,783][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:33:44,439][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:33:45,167][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:33:45,169][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:33:45,171][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:33:45,798][__main__][INFO] - Iteration 118 took 27s (12.04% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 34m 35s. Estimated total time: 7h 34m 53s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 26s. +[2026-03-25 16:33:45,800][__main__][INFO] - Starting iteration 118. +[2026-03-25 16:33:45,803][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:33:45,804][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:33:49,101][__main__][INFO] - Number of regex retries in iteration 118: 0 +[2026-03-25 16:33:49,102][__main__][INFO] - agents played in iteration 118 are Bob, Alice +[2026-03-25 16:33:49,650][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:33:50,305][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:33:50,594][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:33:50,915][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:33:51,236][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:33:51,556][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:33:51,877][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:33:52,197][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:33:52,518][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:33:52,837][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:33:53,158][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:33:53,479][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:33:53,799][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:33:54,118][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:33:54,438][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:33:54,760][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:33:55,080][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:33:55,400][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:33:55,721][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:33:56,043][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:33:56,362][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:33:56,682][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:33:57,002][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:33:57,323][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:33:57,643][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:33:57,965][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:33:58,287][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:33:58,607][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:33:58,929][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:33:59,250][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:33:59,572][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:33:59,892][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:34:00,212][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:34:00,532][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:34:00,852][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:34:01,173][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:34:01,495][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:34:01,816][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:34:02,137][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:34:02,458][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:34:02,779][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:34:03,100][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:34:03,422][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:34:03,744][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:34:04,066][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:34:04,387][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:34:04,708][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:34:05,028][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:34:05,348][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:34:05,669][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:34:05,989][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:34:06,311][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:34:06,632][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:34:07,245][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:34:07,566][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:34:07,886][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:34:08,207][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:34:08,528][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:34:08,849][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:34:09,168][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:34:09,488][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:34:09,808][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:34:10,129][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:34:10,450][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:34:10,771][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:34:11,093][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:34:11,746][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:34:12,477][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:34:12,480][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:34:12,481][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:34:13,103][__main__][INFO] - Iteration 119 took 27s (12.08% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 34m 15s. Estimated total time: 7h 35m 0s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 30s. +[2026-03-25 16:34:13,105][__main__][INFO] - Starting iteration 119. 
+[2026-03-25 16:34:13,108][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:34:13,109][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:34:16,381][__main__][INFO] - Number of regex retries in iteration 119: 0 +[2026-03-25 16:34:16,382][__main__][INFO] - agents played in iteration 119 are Bob, Alice +[2026-03-25 16:34:16,929][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:34:17,579][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:34:17,870][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:34:18,191][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:34:18,512][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:34:18,832][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:34:19,152][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:34:19,472][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:34:19,792][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:34:20,113][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:34:20,433][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:34:20,753][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:34:21,074][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:34:21,396][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:34:21,718][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:34:22,037][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:34:22,359][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:34:22,680][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:34:22,999][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:34:23,319][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:34:23,641][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:34:23,962][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:34:24,282][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:34:24,602][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:34:24,922][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:34:25,244][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:34:25,564][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:34:25,886][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:34:26,207][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:34:26,526][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:34:26,848][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:34:27,168][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:34:27,488][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:34:27,808][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:34:28,129][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:34:28,450][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:34:28,770][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:34:29,090][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:34:29,411][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:34:29,732][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:34:30,053][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:34:30,375][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:34:30,694][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:34:31,015][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:34:31,336][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:34:31,657][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:34:31,978][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:34:32,300][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:34:32,621][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:34:32,942][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:34:33,263][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:34:33,582][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:34:33,901][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:34:34,514][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:34:34,833][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:34:35,152][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:34:35,474][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:34:35,794][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:34:36,116][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:34:36,435][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:34:36,755][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:34:37,076][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:34:37,396][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:34:37,716][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:34:38,037][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:34:38,358][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:34:39,012][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:34:39,724][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:34:39,726][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:34:39,727][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:34:40,352][__main__][INFO] - Iteration 120 took 27s (12.01% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 32m 52s. Estimated total time: 7h 34m 4s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 2s. +[2026-03-25 16:34:40,354][__main__][INFO] - Starting iteration 120. +[2026-03-25 16:34:40,357][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:34:40,358][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:34:43,555][__main__][INFO] - Number of regex retries in iteration 120: 0 +[2026-03-25 16:34:43,556][__main__][INFO] - agents played in iteration 120 are Bob, Alice +[2026-03-25 16:34:44,097][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:34:44,747][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:34:45,037][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:34:45,358][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:34:45,679][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:34:45,999][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:34:46,320][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:34:46,641][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:34:46,963][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:34:47,285][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:34:47,605][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:34:47,926][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:34:48,247][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:34:48,568][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:34:48,888][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:34:49,207][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:34:49,529][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:34:49,850][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:34:50,171][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:34:50,492][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:34:50,812][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:34:51,133][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:34:51,453][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:34:51,774][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:34:52,095][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:34:52,416][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:34:52,738][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:34:53,059][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:34:53,378][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:34:53,699][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:34:54,022][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:34:54,343][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:34:54,664][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:34:54,985][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:34:55,304][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:34:55,625][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:34:55,946][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:34:56,267][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:34:56,587][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:34:56,909][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:34:57,229][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:34:57,550][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:34:57,870][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:34:58,190][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:34:58,511][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:34:58,832][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:34:59,152][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:34:59,473][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:34:59,794][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:35:00,116][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:35:00,437][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:35:00,758][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:35:01,079][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:35:01,692][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:35:02,012][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:35:02,333][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:35:02,653][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:35:02,973][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:35:03,294][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:35:03,615][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:35:03,936][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:35:04,255][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:35:04,574][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:35:04,894][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:35:05,214][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:35:05,535][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:35:06,189][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:35:06,909][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:35:06,911][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:35:06,913][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:35:07,539][__main__][INFO] - Iteration 121 took 27s (11.76% Gen, 85.92% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 31m 23s. Estimated total time: 7h 33m 3s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 31s. +[2026-03-25 16:35:07,542][__main__][INFO] - Starting iteration 121. +[2026-03-25 16:35:07,545][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:35:07,546][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:35:10,744][__main__][INFO] - Number of regex retries in iteration 121: 0 +[2026-03-25 16:35:10,745][__main__][INFO] - agents played in iteration 121 are Bob, Alice +[2026-03-25 16:35:11,295][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:35:11,945][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:35:12,236][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:35:12,557][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:35:12,876][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:35:13,196][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:35:13,516][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:35:13,836][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:35:14,156][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:35:14,478][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:35:14,798][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:35:15,119][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:35:15,440][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:35:15,760][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:35:16,081][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:35:16,401][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:35:16,723][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:35:17,045][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:35:17,366][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:35:17,688][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:35:18,009][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:35:18,329][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:35:18,649][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:35:18,970][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:35:19,290][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:35:19,610][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:35:19,931][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:35:20,252][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:35:20,574][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:35:20,895][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:35:21,217][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:35:21,537][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:35:21,857][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:35:22,178][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:35:22,499][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:35:22,820][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:35:23,141][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:35:23,462][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:35:23,782][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:35:24,101][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:35:24,421][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:35:24,743][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:35:25,063][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:35:25,383][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:35:25,702][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:35:26,024][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:35:26,345][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:35:26,667][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:35:26,987][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:35:27,309][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:35:27,629][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:35:27,950][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:35:28,270][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:35:28,883][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:35:29,203][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:35:29,523][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:35:29,844][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:35:30,165][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:35:30,485][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:35:30,808][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:35:31,128][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:35:31,448][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:35:31,768][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:35:32,088][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:35:32,409][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:35:32,730][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:35:33,386][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:35:34,107][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:35:34,110][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:35:34,111][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:35:34,735][__main__][INFO] - Iteration 122 took 27s (11.77% Gen, 85.93% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 31m 3s. Estimated total time: 7h 33m 10s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 35s. +[2026-03-25 16:35:34,737][__main__][INFO] - Starting iteration 122. +[2026-03-25 16:35:34,740][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:35:34,741][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:35:37,956][__main__][INFO] - Number of regex retries in iteration 122: 0 +[2026-03-25 16:35:37,957][__main__][INFO] - agents played in iteration 122 are Bob, Alice +[2026-03-25 16:35:38,517][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:35:39,171][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:35:39,461][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:35:39,781][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:35:40,100][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:35:40,420][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:35:40,740][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:35:41,061][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:35:41,381][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:35:41,701][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:35:42,023][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:35:42,345][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:35:42,667][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:35:42,988][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:35:43,309][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:35:43,629][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:35:43,950][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:35:44,271][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:35:44,592][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:35:44,912][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:35:45,233][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:35:45,555][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:35:45,876][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:35:46,195][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:35:46,514][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:35:46,834][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:35:47,153][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:35:47,473][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:35:47,793][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:35:48,113][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:35:48,433][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:35:48,754][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:35:49,075][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:35:49,395][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:35:49,715][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:35:50,036][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:35:50,357][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:35:50,678][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:35:50,998][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:35:51,319][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:35:51,640][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:35:51,961][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:35:52,280][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:35:52,599][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:35:52,920][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:35:53,241][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:35:53,562][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:35:53,883][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:35:54,203][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:35:54,525][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:35:54,846][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:35:55,166][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:35:55,485][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:35:56,098][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:35:56,419][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:35:56,738][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:35:57,058][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:35:57,377][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:35:57,696][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:35:58,017][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:35:58,337][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:35:58,658][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:35:58,979][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:35:59,299][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:35:59,620][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:35:59,940][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:36:00,595][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:36:01,323][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:36:01,326][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:36:01,327][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:36:01,954][__main__][INFO] - Iteration 123 took 27s (11.82% Gen, 85.87% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 31m 0s. Estimated total time: 7h 33m 34s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 47s. +[2026-03-25 16:36:01,956][__main__][INFO] - Starting iteration 123. 
+[2026-03-25 16:36:01,959][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:36:01,960][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:36:05,178][__main__][INFO] - Number of regex retries in iteration 123: 0 +[2026-03-25 16:36:05,179][__main__][INFO] - agents played in iteration 123 are Bob, Alice +[2026-03-25 16:36:05,724][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:36:06,374][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:36:06,664][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:36:06,985][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:36:07,306][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:36:07,627][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:36:07,949][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:36:08,269][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:36:08,590][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:36:08,911][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:36:09,232][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:36:09,553][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:36:09,874][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:36:10,193][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:36:10,514][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:36:10,834][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:36:11,155][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:36:11,477][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:36:11,798][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:36:12,118][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:36:12,438][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:36:12,759][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:36:13,079][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:36:13,398][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:36:13,719][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:36:14,039][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:36:14,360][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:36:14,679][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:36:15,000][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:36:15,320][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:36:15,641][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:36:15,961][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:36:16,281][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:36:16,601][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:36:16,920][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:36:17,240][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:36:17,562][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:36:17,882][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:36:18,204][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:36:18,524][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:36:18,843][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:36:19,165][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:36:19,488][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:36:19,809][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:36:20,130][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:36:20,451][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:36:20,773][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:36:21,094][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:36:21,414][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:36:21,734][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:36:22,055][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:36:22,375][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:36:22,695][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:36:23,307][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:36:23,627][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:36:23,948][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:36:24,269][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:36:24,588][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:36:24,909][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:36:25,230][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:36:25,550][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:36:25,870][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:36:26,190][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:36:26,509][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:36:26,830][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:36:27,149][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:36:27,803][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:36:28,524][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:36:28,526][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:36:28,528][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:36:29,153][__main__][INFO] - Iteration 124 took 27s (11.84% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 30m 13s. Estimated total time: 7h 33m 14s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 16:36:29,156][__main__][INFO] - Starting iteration 124. +[2026-03-25 16:36:29,158][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:36:29,159][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:36:32,401][__main__][INFO] - Number of regex retries in iteration 124: 0 +[2026-03-25 16:36:32,402][__main__][INFO] - agents played in iteration 124 are Bob, Alice +[2026-03-25 16:36:32,944][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:36:33,594][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:36:33,884][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:36:34,205][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:36:34,527][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:36:34,851][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:36:35,171][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:36:35,492][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:36:35,812][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:36:36,132][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:36:36,453][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:36:36,773][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:36:37,093][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:36:37,413][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:36:37,734][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:36:38,053][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:36:38,373][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:36:38,695][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:36:39,015][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:36:39,334][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:36:39,654][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:36:39,974][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:36:40,295][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:36:40,616][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:36:40,937][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:36:41,257][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:36:41,578][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:36:41,898][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:36:42,218][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:36:42,539][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:36:42,860][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:36:43,180][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:36:43,500][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:36:43,822][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:36:44,142][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:36:44,464][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:36:44,785][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:36:45,107][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:36:45,428][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:36:45,749][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:36:46,071][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:36:46,392][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:36:46,713][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:36:47,034][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:36:47,355][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:36:47,675][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:36:47,996][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:36:48,315][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:36:48,635][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:36:48,955][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:36:49,276][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:36:49,596][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:36:49,917][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:36:50,534][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:36:50,853][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:36:51,174][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:36:51,493][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:36:51,814][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:36:52,134][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:36:52,454][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:36:52,775][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:36:53,097][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:36:53,418][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:36:53,739][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:36:54,060][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:36:54,381][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:36:55,044][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:36:55,765][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:36:55,767][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:36:55,769][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:36:56,396][__main__][INFO] - Iteration 125 took 27s (11.91% Gen, 85.79% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 30m 29s. Estimated total time: 7h 33m 58s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 59s. +[2026-03-25 16:36:56,398][__main__][INFO] - Starting iteration 125. +[2026-03-25 16:36:56,401][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:36:56,402][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:36:59,638][__main__][INFO] - Number of regex retries in iteration 125: 0 +[2026-03-25 16:36:59,639][__main__][INFO] - agents played in iteration 125 are Bob, Alice +[2026-03-25 16:37:00,183][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:37:00,841][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:37:01,131][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:37:01,452][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:37:01,773][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:37:02,092][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:37:02,412][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:37:02,732][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:37:03,053][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:37:03,375][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:37:03,694][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:37:04,014][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:37:04,335][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:37:04,656][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:37:04,978][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:37:05,298][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:37:05,619][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:37:05,940][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:37:06,260][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:37:06,580][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:37:06,899][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:37:07,221][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:37:07,541][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:37:07,861][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:37:08,182][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:37:08,503][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:37:08,823][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:37:09,145][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:37:09,467][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:37:09,789][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:37:10,110][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:37:10,431][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:37:10,752][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:37:11,073][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:37:11,394][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:37:11,715][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:37:12,035][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:37:12,356][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:37:12,678][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:37:12,998][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:37:13,318][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:37:13,638][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:37:13,960][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:37:14,280][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:37:14,599][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:37:14,921][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:37:15,241][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:37:15,562][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:37:15,882][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:37:16,203][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:37:16,524][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:37:16,845][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:37:17,166][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:37:17,784][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:37:18,107][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:37:18,430][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:37:18,749][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:37:19,072][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:37:19,393][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:37:19,713][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:37:20,035][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:37:20,357][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:37:20,678][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:37:21,000][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:37:21,321][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:37:21,643][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:37:22,304][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:37:23,033][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:37:23,036][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:37:23,037][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:37:23,663][__main__][INFO] - Iteration 126 took 27s (11.87% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 30m 27s. Estimated total time: 7h 34m 23s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 11s. +[2026-03-25 16:37:23,666][__main__][INFO] - Starting iteration 126. +[2026-03-25 16:37:23,669][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:37:23,669][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:37:26,886][__main__][INFO] - Number of regex retries in iteration 126: 0 +[2026-03-25 16:37:26,887][__main__][INFO] - agents played in iteration 126 are Bob, Alice +[2026-03-25 16:37:27,422][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:37:28,081][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:37:28,371][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:37:28,691][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:37:29,011][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:37:29,332][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:37:29,653][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:37:29,974][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:37:30,294][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:37:30,614][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:37:30,934][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:37:31,254][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:37:31,573][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:37:31,893][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:37:32,213][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:37:32,534][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:37:32,854][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:37:33,174][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:37:33,493][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:37:33,813][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:37:34,134][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:37:34,454][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:37:34,774][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:37:35,094][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:37:35,414][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:37:35,735][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:37:36,055][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:37:36,374][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:37:36,694][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:37:37,014][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:37:37,334][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:37:37,655][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:37:37,975][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:37:38,296][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:37:38,617][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:37:38,936][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:37:39,257][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:37:39,578][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:37:39,899][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:37:40,219][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:37:40,538][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:37:40,858][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:37:41,180][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:37:41,500][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:37:41,821][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:37:42,141][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:37:42,462][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:37:42,783][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:37:43,103][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:37:43,424][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:37:43,743][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:37:44,064][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:37:44,384][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:37:45,000][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:37:45,322][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:37:45,643][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:37:45,965][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:37:46,288][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:37:46,608][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:37:46,929][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:37:47,250][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:37:47,570][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:37:47,891][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:37:48,211][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:37:48,532][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:37:48,852][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:37:49,506][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:37:50,226][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:37:50,228][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:37:50,230][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:37:50,856][__main__][INFO] - Iteration 127 took 27s (11.83% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 28m 45s. Estimated total time: 7h 33m 8s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 34s. +[2026-03-25 16:37:50,859][__main__][INFO] - Starting iteration 127. 
+[2026-03-25 16:37:50,862][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:37:50,862][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:37:54,082][__main__][INFO] - Number of regex retries in iteration 127: 0 +[2026-03-25 16:37:54,083][__main__][INFO] - agents played in iteration 127 are Bob, Alice +[2026-03-25 16:37:54,619][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:37:55,269][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:37:55,559][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:37:55,880][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:37:56,200][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:37:56,520][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:37:56,840][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:37:57,160][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:37:57,481][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:37:57,802][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:37:58,122][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:37:58,442][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:37:58,763][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:37:59,083][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:37:59,403][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:37:59,723][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:38:00,042][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:38:00,364][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:38:00,685][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:38:01,006][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:38:01,326][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:38:01,649][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:38:01,971][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:38:02,291][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:38:02,613][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:38:02,933][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:38:03,254][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:38:03,575][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:38:03,895][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:38:04,215][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:38:04,534][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:38:04,854][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:38:05,175][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:38:05,495][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:38:05,816][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:38:06,135][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:38:06,455][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:38:06,775][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:38:07,095][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:38:07,416][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:38:07,737][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:38:08,056][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:38:08,379][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:38:08,699][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:38:09,019][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:38:09,339][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:38:09,660][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:38:09,980][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:38:10,301][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:38:10,623][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:38:10,943][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:38:11,264][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:38:11,585][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:38:12,196][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:38:12,516][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:38:12,836][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:38:13,156][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:38:13,478][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:38:13,798][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:38:14,120][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:38:14,440][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:38:14,761][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:38:15,081][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:38:15,401][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:38:15,722][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:38:16,042][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:38:16,695][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:38:17,416][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:38:17,419][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:38:17,420][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:38:18,044][__main__][INFO] - Iteration 128 took 27s (11.85% Gen, 85.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 28m 12s. Estimated total time: 7h 33m 3s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 31s. +[2026-03-25 16:38:18,046][__main__][INFO] - Starting iteration 128. +[2026-03-25 16:38:18,049][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:38:18,049][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:38:21,304][__main__][INFO] - Number of regex retries in iteration 128: 0 +[2026-03-25 16:38:21,305][__main__][INFO] - agents played in iteration 128 are Bob, Alice +[2026-03-25 16:38:21,861][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:38:22,510][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:38:22,800][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:38:23,121][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:38:23,440][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:38:23,760][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:38:24,081][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:38:24,401][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:38:24,722][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:38:25,042][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:38:25,364][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:38:25,686][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:38:26,005][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:38:26,326][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:38:26,645][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:38:26,966][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:38:27,287][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:38:27,608][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:38:27,929][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:38:28,250][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:38:28,572][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:38:28,893][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:38:29,213][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:38:29,533][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:38:29,854][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:38:30,174][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:38:30,494][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:38:30,813][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:38:31,134][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:38:31,455][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:38:31,776][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:38:32,096][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:38:32,415][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:38:32,735][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:38:33,056][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:38:33,377][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:38:33,696][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:38:34,015][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:38:34,335][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:38:34,657][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:38:34,978][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:38:35,298][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:38:35,618][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:38:35,939][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:38:36,258][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:38:36,578][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:38:36,900][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:38:37,219][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:38:37,540][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:38:37,860][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:38:38,180][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:38:38,501][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:38:38,821][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:38:39,431][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:38:39,751][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:38:40,072][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:38:40,392][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:38:40,712][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:38:41,032][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:38:41,351][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:38:41,672][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:38:41,993][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:38:42,314][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:38:42,635][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:38:42,957][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:38:43,279][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:38:43,931][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:38:44,651][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:38:44,654][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:38:44,655][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:38:45,279][__main__][INFO] - Iteration 129 took 27s (11.95% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 28m 33s. Estimated total time: 7h 33m 51s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 16:38:45,281][__main__][INFO] - Starting iteration 129. +[2026-03-25 16:38:45,284][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:38:45,285][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:38:48,511][__main__][INFO] - Number of regex retries in iteration 129: 0 +[2026-03-25 16:38:48,512][__main__][INFO] - agents played in iteration 129 are Bob, Alice +[2026-03-25 16:38:49,058][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:38:49,706][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:38:49,997][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:38:50,318][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:38:50,638][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:38:50,959][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:38:51,279][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:38:51,600][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:38:51,920][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:38:52,242][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:38:52,564][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:38:52,887][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:38:53,207][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:38:53,528][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:38:53,848][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:38:54,170][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:38:54,490][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:38:54,810][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:38:55,130][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:38:55,451][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:38:55,772][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:38:56,093][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:38:56,414][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:38:56,733][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:38:57,054][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:38:57,373][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:38:57,694][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:38:58,014][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:38:58,336][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:38:58,658][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:38:58,977][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:38:59,298][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:38:59,618][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:38:59,940][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:39:00,261][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:39:00,581][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:39:00,901][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:39:01,221][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:39:01,543][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:39:01,864][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:39:02,184][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:39:02,505][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:39:02,826][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:39:03,147][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:39:03,468][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:39:03,789][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:39:04,110][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:39:04,431][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:39:04,752][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:39:05,073][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:39:05,393][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:39:05,713][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:39:06,034][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:39:06,649][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:39:06,969][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:39:07,289][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:39:07,608][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:39:07,929][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:39:08,249][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:39:08,570][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:39:08,891][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:39:09,212][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:39:09,531][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:39:09,851][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:39:10,173][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:39:10,494][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:39:11,146][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:39:11,873][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:39:11,875][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:39:11,877][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:39:12,503][__main__][INFO] - Iteration 130 took 27s (11.86% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 27m 54s. Estimated total time: 7h 33m 39s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 49s. +[2026-03-25 16:39:12,505][__main__][INFO] - Starting iteration 130. +[2026-03-25 16:39:12,508][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:39:12,509][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:39:15,730][__main__][INFO] - Number of regex retries in iteration 130: 0 +[2026-03-25 16:39:15,731][__main__][INFO] - agents played in iteration 130 are Bob, Alice +[2026-03-25 16:39:16,272][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:39:16,930][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:39:17,221][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:39:17,543][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:39:17,864][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:39:18,183][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:39:18,503][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:39:18,823][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:39:19,143][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:39:19,464][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:39:19,786][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:39:20,106][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:39:20,427][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:39:20,748][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:39:21,067][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:39:21,388][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:39:21,708][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:39:22,030][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:39:22,350][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:39:22,670][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:39:22,990][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:39:23,310][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:39:23,632][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:39:23,953][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:39:24,274][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:39:24,594][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:39:24,914][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:39:25,234][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:39:25,555][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:39:25,875][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:39:26,195][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:39:26,516][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:39:26,837][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:39:27,158][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:39:27,479][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:39:27,800][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:39:28,121][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:39:28,441][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:39:28,762][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:39:29,083][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:39:29,404][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:39:29,725][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:39:30,047][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:39:30,368][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:39:30,688][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:39:31,009][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:39:31,330][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:39:31,651][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:39:31,971][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:39:32,291][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:39:32,613][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:39:32,934][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:39:33,255][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:39:33,867][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:39:34,187][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:39:34,509][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:39:34,830][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:39:35,150][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:39:35,471][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:39:35,791][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:39:36,113][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:39:36,434][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:39:36,755][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:39:37,075][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:39:37,395][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:39:37,716][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:39:38,369][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:39:39,091][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:39:39,093][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:39:39,095][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:39:39,721][__main__][INFO] - Iteration 131 took 27s (11.84% Gen, 85.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 27m 21s. Estimated total time: 7h 33m 33s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 46s. +[2026-03-25 16:39:39,723][__main__][INFO] - Starting iteration 131. 
+[2026-03-25 16:39:39,726][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:39:39,727][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:39:42,971][__main__][INFO] - Number of regex retries in iteration 131: 0 +[2026-03-25 16:39:42,972][__main__][INFO] - agents played in iteration 131 are Bob, Alice +[2026-03-25 16:39:43,522][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:39:44,171][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:39:44,463][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:39:44,785][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:39:45,107][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:39:45,428][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:39:45,748][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:39:46,069][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:39:46,389][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:39:46,709][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:39:47,029][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:39:47,351][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:39:47,673][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:39:47,994][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:39:48,316][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:39:48,637][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:39:48,958][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:39:49,279][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:39:49,600][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:39:49,919][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:39:50,241][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:39:50,561][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:39:50,882][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:39:51,203][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:39:51,526][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:39:51,847][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:39:52,168][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:39:52,488][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:39:52,809][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:39:53,130][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:39:53,450][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:39:53,772][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:39:54,093][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:39:54,414][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:39:54,735][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:39:55,056][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:39:55,377][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:39:55,699][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:39:56,020][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:39:56,340][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:39:56,660][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:39:56,980][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:39:57,301][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:39:57,623][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:39:57,944][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:39:58,266][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:39:58,587][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:39:58,908][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:39:59,229][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:39:59,549][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:39:59,869][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:40:00,190][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:40:00,510][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:40:01,123][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:40:01,443][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:40:01,764][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:40:02,086][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:40:02,407][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:40:02,729][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:40:03,049][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:40:03,370][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:40:03,691][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:40:04,012][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:40:04,333][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:40:04,654][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:40:04,974][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:40:05,634][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:40:06,355][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:40:06,357][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:40:06,359][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:40:06,987][__main__][INFO] - Iteration 132 took 27s (11.90% Gen, 85.79% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 27m 42s. Estimated total time: 7h 34m 21s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 10s. +[2026-03-25 16:40:06,989][__main__][INFO] - Starting iteration 132. +[2026-03-25 16:40:06,992][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:40:06,993][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:40:10,226][__main__][INFO] - Number of regex retries in iteration 132: 0 +[2026-03-25 16:40:10,227][__main__][INFO] - agents played in iteration 132 are Bob, Alice +[2026-03-25 16:40:10,773][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:40:11,429][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:40:11,718][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:40:12,040][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:40:12,360][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:40:12,681][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:40:13,002][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:40:13,322][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:40:13,643][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:40:13,964][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:40:14,283][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:40:14,603][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:40:14,922][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:40:15,244][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:40:15,565][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:40:15,886][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:40:16,208][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:40:16,529][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:40:16,849][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:40:17,169][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:40:17,490][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:40:17,811][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:40:18,130][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:40:18,452][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:40:18,773][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:40:19,094][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:40:19,415][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:40:19,736][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:40:20,058][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:40:20,379][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:40:20,699][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:40:21,021][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:40:21,341][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:40:21,663][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:40:21,983][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:40:22,303][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:40:22,622][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:40:22,941][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:40:23,261][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:40:23,582][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:40:23,903][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:40:24,224][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:40:24,544][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:40:24,866][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:40:25,188][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:40:25,508][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:40:25,829][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:40:26,149][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:40:26,471][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:40:26,792][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:40:27,112][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:40:27,432][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:40:27,752][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:40:28,367][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:40:28,688][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:40:29,008][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:40:29,329][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:40:29,649][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:40:29,971][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:40:30,291][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:40:30,611][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:40:30,933][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:40:31,253][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:40:31,573][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:40:31,893][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:40:32,215][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:40:32,880][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:40:33,608][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:40:33,611][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:40:33,612][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:40:34,244][__main__][INFO] - Iteration 133 took 27s (11.87% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 27m 6s. Estimated total time: 7h 34m 12s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 16:40:34,246][__main__][INFO] - Starting iteration 133. +[2026-03-25 16:40:34,249][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:40:34,249][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:40:37,480][__main__][INFO] - Number of regex retries in iteration 133: 0 +[2026-03-25 16:40:37,481][__main__][INFO] - agents played in iteration 133 are Bob, Alice +[2026-03-25 16:40:38,028][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:40:38,679][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:40:38,969][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:40:39,290][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:40:39,610][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:40:39,931][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:40:40,251][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:40:40,572][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:40:40,893][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:40:41,213][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:40:41,533][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:40:41,854][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:40:42,175][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:40:42,494][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:40:42,815][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:40:43,137][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:40:43,458][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:40:43,779][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:40:44,100][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:40:44,420][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:40:44,740][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:40:45,061][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:40:45,383][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:40:45,705][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:40:46,026][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:40:46,348][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:40:46,668][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:40:46,989][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:40:47,311][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:40:47,632][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:40:47,953][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:40:48,273][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:40:48,593][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:40:48,914][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:40:49,236][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:40:49,557][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:40:49,878][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:40:50,198][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:40:50,519][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:40:50,839][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:40:51,160][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:40:51,481][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:40:51,802][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:40:52,122][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:40:52,443][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:40:52,764][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:40:53,085][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:40:53,407][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:40:53,728][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:40:54,050][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:40:54,370][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:40:54,691][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:40:55,013][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:40:55,625][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:40:55,947][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:40:56,267][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:40:56,587][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:40:56,909][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:40:57,230][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:40:57,551][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:40:57,872][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:40:58,192][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:40:58,513][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:40:58,833][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:40:59,153][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:40:59,474][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:41:00,127][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:41:00,855][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:41:00,857][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:41:00,859][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:41:01,483][__main__][INFO] - Iteration 134 took 27s (11.86% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 26m 21s. Estimated total time: 7h 33m 55s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 57s. +[2026-03-25 16:41:01,486][__main__][INFO] - Starting iteration 134. +[2026-03-25 16:41:01,489][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:41:01,490][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:41:04,744][__main__][INFO] - Number of regex retries in iteration 134: 0 +[2026-03-25 16:41:04,745][__main__][INFO] - agents played in iteration 134 are Bob, Alice +[2026-03-25 16:41:05,285][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:41:05,935][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:41:06,225][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:41:06,546][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:41:06,866][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:41:07,188][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:41:07,508][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:41:07,828][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:41:08,150][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:41:08,469][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:41:08,790][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:41:09,111][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:41:09,432][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:41:09,753][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:41:10,074][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:41:10,393][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:41:10,714][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:41:11,033][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:41:11,354][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:41:11,674][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:41:11,994][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:41:12,315][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:41:12,634][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:41:12,954][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:41:13,275][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:41:13,595][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:41:13,915][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:41:14,234][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:41:14,554][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:41:14,875][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:41:15,195][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:41:15,516][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:41:15,837][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:41:16,157][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:41:16,477][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:41:16,798][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:41:17,117][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:41:17,439][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:41:17,759][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:41:18,078][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:41:18,399][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:41:18,720][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:41:19,040][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:41:19,362][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:41:19,682][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:41:20,003][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:41:20,323][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:41:20,644][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:41:20,966][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:41:21,287][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:41:21,609][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:41:21,931][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:41:22,251][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:41:22,867][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:41:23,187][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:41:23,507][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:41:23,828][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:41:24,150][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:41:24,471][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:41:24,793][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:41:25,113][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:41:25,433][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:41:25,753][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:41:26,074][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:41:26,394][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:41:26,714][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:41:27,369][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:41:28,097][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:41:28,100][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:41:28,101][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:41:28,726][__main__][INFO] - Iteration 135 took 27s (11.95% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 25m 56s. Estimated total time: 7h 33m 57s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 58s. +[2026-03-25 16:41:28,728][__main__][INFO] - Starting iteration 135. 
+[2026-03-25 16:41:28,731][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:41:28,732][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:41:31,971][__main__][INFO] - Number of regex retries in iteration 135: 0 +[2026-03-25 16:41:31,972][__main__][INFO] - agents played in iteration 135 are Bob, Alice +[2026-03-25 16:41:32,540][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:41:33,191][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:41:33,481][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:41:33,801][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:41:34,121][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:41:34,442][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:41:34,761][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:41:35,084][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:41:35,406][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:41:35,726][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:41:36,046][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:41:36,367][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:41:36,688][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:41:37,008][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:41:37,329][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:41:37,651][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:41:37,973][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:41:38,294][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:41:38,615][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:41:38,935][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:41:39,255][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:41:39,576][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:41:39,896][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:41:40,216][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:41:40,535][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:41:40,855][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:41:41,175][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:41:41,494][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:41:41,814][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:41:42,133][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:41:42,454][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:41:42,773][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:41:43,093][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:41:43,414][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:41:43,734][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:41:44,055][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:41:44,376][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:41:44,696][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:41:45,016][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:41:45,336][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:41:45,656][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:41:45,976][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:41:46,295][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:41:46,615][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:41:46,935][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:41:47,256][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:41:47,577][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:41:47,897][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:41:48,217][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:41:48,538][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:41:48,858][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:41:49,178][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:41:49,499][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:41:50,111][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:41:50,431][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:41:50,752][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:41:51,073][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:41:51,394][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:41:51,714][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:41:52,035][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:41:52,356][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:41:52,676][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:41:52,996][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:41:53,317][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:41:53,636][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:41:53,957][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:41:54,612][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:41:55,341][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:41:55,343][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:41:55,345][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:41:55,968][__main__][INFO] - Iteration 136 took 27s (11.90% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 25m 29s. Estimated total time: 7h 33m 58s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 59s. +[2026-03-25 16:41:55,971][__main__][INFO] - Starting iteration 136. +[2026-03-25 16:41:55,974][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:41:55,974][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:41:59,253][__main__][INFO] - Number of regex retries in iteration 136: 0 +[2026-03-25 16:41:59,254][__main__][INFO] - agents played in iteration 136 are Bob, Alice +[2026-03-25 16:41:59,803][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:42:00,453][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:42:00,745][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:42:01,066][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:42:01,387][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:42:01,706][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:42:02,027][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:42:02,348][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:42:02,669][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:42:02,991][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:42:03,312][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:42:03,633][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:42:03,953][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:42:04,274][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:42:04,595][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:42:04,915][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:42:05,237][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:42:05,557][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:42:05,876][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:42:06,196][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:42:06,517][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:42:06,838][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:42:07,159][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:42:07,480][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:42:07,800][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:42:08,120][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:42:08,442][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:42:08,762][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:42:09,083][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:42:09,402][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:42:09,721][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:42:10,042][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:42:10,362][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:42:10,682][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:42:11,002][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:42:11,322][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:42:11,643][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:42:11,964][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:42:12,284][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:42:12,605][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:42:12,925][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:42:13,248][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:42:13,569][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:42:13,890][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:42:14,211][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:42:14,532][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:42:14,853][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:42:15,174][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:42:15,493][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:42:15,813][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:42:16,134][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:42:16,453][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:42:16,774][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:42:17,388][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:42:17,707][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:42:18,027][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:42:18,348][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:42:18,668][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:42:18,988][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:42:19,309][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:42:19,631][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:42:19,952][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:42:20,273][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:42:20,592][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:42:20,913][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:42:21,235][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:42:21,890][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:42:22,611][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:42:22,613][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:42:22,615][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:42:23,238][__main__][INFO] - Iteration 137 took 27s (12.03% Gen, 85.68% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 25m 29s. Estimated total time: 7h 34m 25s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 12s. +[2026-03-25 16:42:23,240][__main__][INFO] - Starting iteration 137. +[2026-03-25 16:42:23,243][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:42:23,244][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:42:26,539][__main__][INFO] - Number of regex retries in iteration 137: 0 +[2026-03-25 16:42:26,540][__main__][INFO] - agents played in iteration 137 are Bob, Alice +[2026-03-25 16:42:27,088][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:42:27,740][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:42:28,033][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:42:28,354][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:42:28,675][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:42:28,995][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:42:29,317][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:42:29,637][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:42:29,957][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:42:30,279][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:42:30,599][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:42:30,919][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:42:31,239][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:42:31,560][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:42:31,882][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:42:32,202][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:42:32,521][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:42:32,841][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:42:33,162][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:42:33,482][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:42:33,802][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:42:34,123][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:42:34,442][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:42:34,763][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:42:35,083][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:42:35,403][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:42:35,724][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:42:36,044][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:42:36,364][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:42:36,684][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:42:37,003][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:42:37,323][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:42:37,643][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:42:37,964][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:42:38,283][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:42:38,604][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:42:38,925][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:42:39,247][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:42:39,566][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:42:39,887][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:42:40,208][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:42:40,528][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:42:40,850][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:42:41,170][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:42:41,489][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:42:41,810][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:42:42,130][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:42:42,450][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:42:42,770][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:42:43,089][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:42:43,408][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:42:43,728][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:42:44,049][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:42:44,662][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:42:44,982][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:42:45,302][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:42:45,623][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:42:45,943][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:42:46,263][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:42:46,584][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:42:46,903][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:42:47,222][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:42:47,544][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:42:47,865][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:42:48,186][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:42:48,506][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:42:49,160][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:42:49,923][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:42:49,925][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:42:49,927][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:42:50,555][__main__][INFO] - Iteration 138 took 27s (12.07% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 25m 49s. Estimated total time: 7h 35m 12s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 36s. +[2026-03-25 16:42:50,557][__main__][INFO] - Starting iteration 138. +[2026-03-25 16:42:50,560][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:42:50,560][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:42:53,828][__main__][INFO] - Number of regex retries in iteration 138: 0 +[2026-03-25 16:42:53,829][__main__][INFO] - agents played in iteration 138 are Bob, Alice +[2026-03-25 16:42:54,386][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:42:55,543][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:42:55,833][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:42:56,155][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:42:56,475][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:42:56,794][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:42:57,115][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:42:57,434][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:42:57,755][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:42:58,075][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:42:58,394][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:42:58,715][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:42:59,034][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:42:59,355][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:42:59,676][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:42:59,997][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:43:00,317][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:43:00,637][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:43:00,958][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:43:01,279][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:43:01,599][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:43:01,919][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:43:02,240][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:43:02,560][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:43:02,880][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:43:03,201][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:43:03,521][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:43:03,841][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:43:04,162][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:43:04,484][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:43:04,804][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:43:05,123][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:43:05,444][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:43:05,765][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:43:06,086][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:43:06,408][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:43:06,729][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:43:07,050][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:43:07,371][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:43:07,691][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:43:08,012][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:43:08,331][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:43:08,652][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:43:08,971][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:43:09,291][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:43:09,612][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:43:09,933][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:43:10,254][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:43:10,575][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:43:10,895][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:43:11,214][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:43:11,535][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:43:11,856][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:43:12,473][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:43:12,793][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:43:13,115][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:43:13,434][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:43:13,755][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:43:14,076][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:43:14,396][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:43:14,715][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:43:15,036][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:43:15,356][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:43:15,675][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:43:15,997][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:43:16,319][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:43:16,983][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:43:17,709][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:43:17,711][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:43:17,713][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:43:18,341][__main__][INFO] - Iteration 139 took 27s (11.77% Gen, 85.97% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 33m 11s. Estimated total time: 7h 43m 2s. Time estimates for 10 more iterations: 4m 37s, 100 more iterations: 46m 18s, 500 more iterations: 3h 51m 31s. +[2026-03-25 16:43:18,343][__main__][INFO] - Starting iteration 139. 
+[2026-03-25 16:43:18,346][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:43:18,347][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:43:21,614][__main__][INFO] - Number of regex retries in iteration 139: 0 +[2026-03-25 16:43:21,615][__main__][INFO] - agents played in iteration 139 are Bob, Alice +[2026-03-25 16:43:22,170][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:43:22,829][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:43:23,119][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:43:23,441][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:43:23,761][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:43:24,083][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:43:24,402][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:43:24,722][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:43:25,043][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:43:25,364][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:43:25,684][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:43:26,004][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:43:26,326][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:43:26,648][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:43:26,969][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:43:27,289][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:43:27,610][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:43:27,930][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:43:28,250][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:43:28,571][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:43:28,891][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:43:29,212][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:43:29,533][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:43:29,854][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:43:30,175][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:43:30,495][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:43:30,817][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:43:31,136][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:43:31,456][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:43:31,775][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:43:32,095][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:43:32,416][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:43:32,736][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:43:33,057][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:43:33,376][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:43:33,696][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:43:34,017][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:43:34,337][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:43:34,658][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:43:34,977][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:43:35,298][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:43:35,620][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:43:35,940][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:43:36,261][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:43:36,581][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:43:36,901][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:43:37,221][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:43:37,540][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:43:37,861][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:43:38,182][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:43:38,503][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:43:38,823][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:43:39,144][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:43:39,758][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:43:40,079][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:43:40,400][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:43:40,720][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:43:41,039][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:43:41,360][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:43:41,681][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:43:42,001][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:43:42,322][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:43:42,643][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:43:42,964][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:43:43,283][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:43:43,604][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:43:44,258][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:43:44,988][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:43:44,990][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:43:44,992][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:43:45,618][__main__][INFO] - Iteration 140 took 27s (11.98% Gen, 85.72% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 24m 14s. Estimated total time: 7h 34m 32s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 16s. +[2026-03-25 16:43:45,620][__main__][INFO] - Starting iteration 140. +[2026-03-25 16:43:45,623][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:43:45,624][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:43:48,904][__main__][INFO] - Number of regex retries in iteration 140: 0 +[2026-03-25 16:43:48,905][__main__][INFO] - agents played in iteration 140 are Bob, Alice +[2026-03-25 16:43:49,456][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:43:50,113][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:43:50,402][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:43:50,725][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:43:51,045][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:43:51,366][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:43:51,686][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:43:52,007][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:43:52,328][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:43:52,649][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:43:52,970][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:43:53,290][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:43:53,611][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:43:53,933][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:43:54,255][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:43:54,577][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:43:54,898][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:43:55,218][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:43:55,538][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:43:55,858][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:43:56,179][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:43:56,500][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:43:56,819][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:43:57,139][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:43:57,459][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:43:57,781][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:43:58,102][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:43:58,422][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:43:58,744][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:43:59,063][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:43:59,384][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:43:59,704][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:44:00,024][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:44:00,345][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:44:00,665][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:44:00,986][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:44:01,308][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:44:01,628][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:44:01,948][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:44:02,270][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:44:02,590][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:44:02,911][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:44:03,232][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:44:03,552][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:44:03,873][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:44:04,193][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:44:04,515][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:44:04,835][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:44:05,155][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:44:05,475][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:44:05,796][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:44:06,116][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:44:06,438][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:44:07,057][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:44:07,378][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:44:07,699][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:44:08,020][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:44:08,342][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:44:08,662][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:44:08,983][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:44:09,303][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:44:09,624][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:44:09,944][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:44:10,264][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:44:10,584][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:44:10,905][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:44:11,569][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:44:12,296][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:44:12,298][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:44:12,299][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:44:12,929][__main__][INFO] - Iteration 141 took 27s (12.02% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 24m 21s. Estimated total time: 7h 35m 7s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 33s. +[2026-03-25 16:44:12,932][__main__][INFO] - Starting iteration 141. +[2026-03-25 16:44:12,935][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:44:12,935][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:44:16,210][__main__][INFO] - Number of regex retries in iteration 141: 0 +[2026-03-25 16:44:16,211][__main__][INFO] - agents played in iteration 141 are Bob, Alice +[2026-03-25 16:44:16,769][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:44:17,429][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:44:17,719][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:44:18,041][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:44:18,361][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:44:18,681][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:44:19,001][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:44:19,321][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:44:19,642][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:44:19,961][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:44:20,281][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:44:20,602][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:44:20,923][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:44:21,245][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:44:21,565][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:44:21,887][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:44:22,208][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:44:22,529][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:44:22,850][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:44:23,169][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:44:23,488][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:44:23,809][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:44:24,129][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:44:24,450][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:44:24,771][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:44:25,090][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:44:25,411][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:44:25,731][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:44:26,052][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:44:26,371][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:44:26,691][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:44:27,011][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:44:27,331][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:44:27,651][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:44:27,971][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:44:28,290][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:44:28,610][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:44:28,930][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:44:29,250][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:44:29,570][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:44:29,892][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:44:30,211][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:44:30,533][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:44:30,855][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:44:31,176][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:44:31,497][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:44:31,816][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:44:32,136][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:44:32,456][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:44:32,776][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:44:33,096][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:44:33,416][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:44:33,736][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:44:34,353][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:44:34,673][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:44:34,994][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:44:35,314][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:44:35,635][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:44:35,956][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:44:36,276][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:44:36,597][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:44:36,917][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:44:37,238][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:44:37,558][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:44:37,879][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:44:38,199][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:44:38,865][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:44:39,595][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:44:39,597][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:44:39,599][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:44:40,228][__main__][INFO] - Iteration 142 took 27s (12.00% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 23m 41s. Estimated total time: 7h 34m 54s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 27s. +[2026-03-25 16:44:40,230][__main__][INFO] - Starting iteration 142. +[2026-03-25 16:44:40,233][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:44:40,234][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:44:43,525][__main__][INFO] - Number of regex retries in iteration 142: 0 +[2026-03-25 16:44:43,526][__main__][INFO] - agents played in iteration 142 are Bob, Alice +[2026-03-25 16:44:44,097][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:44:44,761][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:44:45,051][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:44:45,374][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:44:45,693][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:44:46,013][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:44:46,333][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:44:46,654][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:44:46,975][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:44:47,295][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:44:47,614][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:44:47,935][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:44:48,256][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:44:48,575][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:44:48,895][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:44:49,217][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:44:49,536][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:44:49,856][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:44:50,177][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:44:50,498][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:44:50,818][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:44:51,139][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:44:51,460][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:44:51,781][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:44:52,101][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:44:52,421][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:44:52,742][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:44:53,061][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:44:53,382][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:44:53,702][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:44:54,023][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:44:54,343][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:44:54,665][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:44:54,986][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:44:55,309][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:44:55,630][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:44:55,951][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:44:56,273][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:44:56,594][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:44:56,914][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:44:57,234][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:44:57,555][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:44:57,874][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:44:58,194][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:44:58,515][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:44:58,835][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:44:59,157][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:44:59,480][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:44:59,800][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:45:00,120][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:45:00,442][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:45:00,761][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:45:01,081][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:45:01,698][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:45:02,017][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:45:02,337][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:45:02,658][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:45:02,979][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:45:03,301][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:45:03,621][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:45:03,942][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:45:04,262][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:45:04,582][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:45:04,902][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:45:05,222][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:45:05,543][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:45:06,206][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:45:06,934][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:45:06,937][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:45:06,938][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:45:07,567][__main__][INFO] - Iteration 143 took 27s (12.04% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 23m 54s. Estimated total time: 7h 35m 34s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 47s. +[2026-03-25 16:45:07,570][__main__][INFO] - Starting iteration 143. 
+[2026-03-25 16:45:07,573][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:45:07,573][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:45:10,839][__main__][INFO] - Number of regex retries in iteration 143: 0 +[2026-03-25 16:45:10,840][__main__][INFO] - agents played in iteration 143 are Bob, Alice +[2026-03-25 16:45:11,385][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:45:12,044][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:45:12,334][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:45:12,657][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:45:12,979][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:45:13,298][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:45:13,619][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:45:13,940][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:45:14,261][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:45:14,582][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:45:14,903][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:45:15,224][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:45:15,545][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:45:15,867][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:45:16,189][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:45:16,509][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:45:16,830][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:45:17,151][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:45:17,472][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:45:17,793][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:45:18,114][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:45:18,435][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:45:18,757][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:45:19,078][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:45:19,400][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:45:19,720][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:45:20,042][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:45:20,364][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:45:20,685][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:45:21,005][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:45:21,327][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:45:21,648][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:45:21,969][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:45:22,289][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:45:22,610][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:45:22,931][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:45:23,251][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:45:23,572][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:45:23,893][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:45:24,214][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:45:24,534][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:45:24,856][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:45:25,176][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:45:25,495][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:45:25,816][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:45:26,136][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:45:26,457][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:45:26,779][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:45:27,099][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:45:27,420][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:45:27,741][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:45:28,061][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:45:28,382][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:45:29,000][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:45:29,320][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:45:29,640][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:45:29,961][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:45:30,281][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:45:30,600][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:45:30,920][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:45:31,240][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:45:31,561][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:45:31,881][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:45:32,202][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:45:32,523][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:45:32,844][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:45:33,506][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:45:34,242][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:45:34,244][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:45:34,246][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:45:34,867][__main__][INFO] - Iteration 144 took 27s (11.97% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 22m 48s. Estimated total time: 7h 34m 55s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 27s. +[2026-03-25 16:45:34,870][__main__][INFO] - Starting iteration 144. +[2026-03-25 16:45:34,872][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:45:34,873][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:45:38,241][__main__][INFO] - Number of regex retries in iteration 144: 0 +[2026-03-25 16:45:38,242][__main__][INFO] - agents played in iteration 144 are Bob, Alice +[2026-03-25 16:45:38,823][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:45:39,483][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:45:39,773][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:45:40,093][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:45:40,415][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:45:40,735][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:45:41,055][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:45:41,377][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:45:41,696][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:45:42,017][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:45:42,338][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:45:42,658][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:45:42,979][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:45:43,299][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:45:43,620][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:45:43,940][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:45:44,261][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:45:44,580][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:45:44,900][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:45:45,221][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:45:45,541][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:45:45,862][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:45:46,183][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:45:46,504][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:45:46,824][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:45:47,143][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:45:47,465][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:45:47,787][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:45:48,109][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:45:48,428][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:45:48,749][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:45:49,070][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:45:49,391][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:45:49,713][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:45:50,036][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:45:50,358][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:45:50,679][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:45:51,000][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:45:51,322][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:45:51,642][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:45:51,962][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:45:52,283][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:45:52,603][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:45:52,923][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:45:53,244][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:45:53,564][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:45:53,884][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:45:54,204][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:45:54,526][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:45:54,846][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:45:55,168][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:45:55,489][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:45:55,811][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:45:56,428][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:45:56,751][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:45:57,071][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:45:57,392][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:45:57,713][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:45:58,034][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:45:58,354][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:45:58,674][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:45:58,993][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:45:59,312][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:45:59,635][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:45:59,955][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:46:00,276][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:46:00,940][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:46:01,675][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:46:01,677][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:46:01,679][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:46:02,306][__main__][INFO] - Iteration 145 took 27s (12.28% Gen, 85.43% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 24m 40s. Estimated total time: 7h 37m 15s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 43s, 500 more iterations: 3h 48m 37s. +[2026-03-25 16:46:02,309][__main__][INFO] - Starting iteration 145. +[2026-03-25 16:46:02,312][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:46:02,312][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:46:05,591][__main__][INFO] - Number of regex retries in iteration 145: 0 +[2026-03-25 16:46:05,592][__main__][INFO] - agents played in iteration 145 are Bob, Alice +[2026-03-25 16:46:06,134][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:46:06,794][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:46:07,084][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:46:07,406][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:46:07,726][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:46:08,048][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:46:08,370][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:46:08,689][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:46:09,010][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:46:09,331][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:46:09,652][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:46:09,973][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:46:10,295][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:46:10,614][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:46:10,935][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:46:11,255][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:46:11,576][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:46:11,896][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:46:12,216][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:46:12,537][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:46:12,859][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:46:13,181][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:46:13,502][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:46:13,824][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:46:14,144][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:46:14,463][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:46:14,784][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:46:15,104][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:46:15,424][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:46:15,744][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:46:16,065][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:46:16,387][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:46:16,708][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:46:17,029][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:46:17,351][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:46:17,672][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:46:17,993][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:46:18,314][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:46:18,635][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:46:18,956][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:46:19,276][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:46:19,596][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:46:19,917][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:46:20,237][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:46:20,558][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:46:20,880][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:46:21,201][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:46:21,521][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:46:21,841][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:46:22,162][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:46:22,482][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:46:22,802][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:46:23,122][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:46:23,741][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:46:24,061][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:46:24,381][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:46:24,703][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:46:25,024][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:46:25,345][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:46:25,667][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:46:25,989][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:46:26,310][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:46:26,633][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:46:26,954][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:46:27,274][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:46:27,595][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:46:28,262][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:46:28,994][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:46:28,997][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:46:28,998][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:46:29,627][__main__][INFO] - Iteration 146 took 27s (12.01% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 22m 14s. Estimated total time: 7h 35m 16s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 38s. +[2026-03-25 16:46:29,629][__main__][INFO] - Starting iteration 146. +[2026-03-25 16:46:29,632][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:46:29,633][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:46:32,934][__main__][INFO] - Number of regex retries in iteration 146: 0 +[2026-03-25 16:46:32,935][__main__][INFO] - agents played in iteration 146 are Bob, Alice +[2026-03-25 16:46:33,525][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:46:34,184][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:46:34,475][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:46:34,797][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:46:35,119][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:46:35,438][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:46:35,759][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:46:36,081][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:46:36,402][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:46:36,722][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:46:37,042][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:46:37,364][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:46:37,686][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:46:38,008][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:46:38,329][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:46:38,652][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:46:38,973][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:46:39,294][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:46:39,614][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:46:39,936][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:46:40,258][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:46:40,579][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:46:40,899][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:46:41,219][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:46:41,540][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:46:41,860][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:46:42,180][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:46:42,500][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:46:42,821][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:46:43,143][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:46:43,462][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:46:43,783][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:46:44,104][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:46:44,424][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:46:44,744][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:46:45,065][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:46:45,386][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:46:45,708][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:46:46,030][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:46:46,352][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:46:46,674][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:46:46,994][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:46:47,315][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:46:47,635][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:46:47,955][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:46:48,276][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:46:48,596][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:46:48,917][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:46:49,237][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:46:49,558][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:46:49,880][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:46:50,201][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:46:50,522][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:46:51,134][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:46:51,456][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:46:51,776][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:46:52,097][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:46:52,417][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:46:52,736][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:46:53,057][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:46:53,377][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:46:53,697][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:46:54,017][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:46:54,338][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:46:54,659][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:46:54,981][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:46:55,641][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:46:56,372][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:46:56,374][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:46:56,376][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:46:57,002][__main__][INFO] - Iteration 147 took 27s (12.06% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 22m 41s. Estimated total time: 7h 36m 10s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 37s, 500 more iterations: 3h 48m 5s. +[2026-03-25 16:46:57,006][__main__][INFO] - Starting iteration 147. 
+[2026-03-25 16:46:57,009][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:46:57,010][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:47:00,348][__main__][INFO] - Number of regex retries in iteration 147: 0 +[2026-03-25 16:47:00,349][__main__][INFO] - agents played in iteration 147 are Bob, Alice +[2026-03-25 16:47:00,906][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:47:01,556][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:47:01,847][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:47:02,168][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:47:02,487][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:47:02,809][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:47:03,130][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:47:03,451][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:47:03,771][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:47:04,092][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:47:04,414][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:47:04,735][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:47:05,055][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:47:05,376][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:47:05,695][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:47:06,015][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:47:06,337][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:47:06,656][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:47:06,976][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:47:07,297][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:47:07,617][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:47:07,937][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:47:08,257][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:47:08,576][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:47:08,896][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:47:09,217][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:47:09,537][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:47:09,859][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:47:10,179][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:47:10,500][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:47:10,820][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:47:11,140][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:47:11,461][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:47:11,782][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:47:12,103][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:47:12,425][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:47:12,746][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:47:13,067][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:47:13,388][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:47:13,710][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:47:14,030][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:47:14,352][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:47:14,671][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:47:14,992][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:47:15,314][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:47:15,635][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:47:15,956][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:47:16,276][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:47:16,597][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:47:16,917][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:47:17,239][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:47:17,560][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:47:17,880][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:47:18,495][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:47:18,815][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:47:19,135][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:47:19,455][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:47:19,777][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:47:20,098][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:47:20,419][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:47:20,739][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:47:21,060][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:47:21,381][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:47:21,701][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:47:22,022][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:47:22,343][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:47:23,000][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:47:23,726][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:47:23,729][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:47:23,730][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:47:24,355][__main__][INFO] - Iteration 148 took 27s (12.21% Gen, 85.50% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 21m 50s. Estimated total time: 7h 35m 46s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 34s, 500 more iterations: 3h 47m 53s. +[2026-03-25 16:47:24,357][__main__][INFO] - Starting iteration 148. +[2026-03-25 16:47:24,360][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:47:24,360][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:47:27,665][__main__][INFO] - Number of regex retries in iteration 148: 0 +[2026-03-25 16:47:27,666][__main__][INFO] - agents played in iteration 148 are Bob, Alice +[2026-03-25 16:47:28,224][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:47:28,881][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:47:29,171][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:47:29,493][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:47:29,812][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:47:30,134][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:47:30,455][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:47:30,776][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:47:31,096][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:47:31,418][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:47:31,737][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:47:32,057][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:47:32,377][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:47:32,697][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:47:33,017][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:47:33,338][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:47:33,659][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:47:33,979][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:47:34,300][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:47:34,621][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:47:34,940][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:47:35,261][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:47:35,582][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:47:35,901][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:47:36,222][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:47:36,542][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:47:36,863][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:47:37,183][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:47:37,502][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:47:37,822][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:47:38,142][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:47:38,463][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:47:38,784][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:47:39,105][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:47:39,425][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:47:39,746][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:47:40,068][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:47:40,389][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:47:40,710][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:47:41,030][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:47:41,351][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:47:41,672][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:47:41,993][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:47:42,314][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:47:42,633][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:47:42,954][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:47:43,275][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:47:43,594][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:47:43,915][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:47:44,236][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:47:44,556][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:47:44,876][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:47:45,196][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:47:45,810][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:47:46,130][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:47:46,452][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:47:46,773][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:47:47,094][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:47:47,414][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:47:47,735][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:47:48,054][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:47:48,375][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:47:48,695][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:47:49,016][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:47:49,336][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:47:49,656][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:47:50,318][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:47:51,042][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:47:51,044][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:47:51,046][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:47:51,670][__main__][INFO] - Iteration 149 took 27s (12.10% Gen, 85.61% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 20m 46s. Estimated total time: 7h 35m 10s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 35s. +[2026-03-25 16:47:51,672][__main__][INFO] - Starting iteration 149. +[2026-03-25 16:47:51,675][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. +[2026-03-25 16:47:51,675][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:47:54,970][__main__][INFO] - Number of regex retries in iteration 149: 0 +[2026-03-25 16:47:54,971][__main__][INFO] - agents played in iteration 149 are Bob, Alice +[2026-03-25 16:47:55,540][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:47:56,192][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:47:56,482][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:47:56,804][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:47:57,124][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:47:57,444][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:47:57,766][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:47:58,087][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:47:58,407][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:47:58,727][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:47:59,048][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:47:59,369][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:47:59,690][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:48:00,010][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:48:00,331][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:48:00,652][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:48:00,973][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:48:01,294][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:48:01,615][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:48:01,936][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:48:02,255][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:48:02,576][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:48:02,895][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:48:03,215][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:48:03,536][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:48:03,858][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:48:04,178][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:48:04,500][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:48:04,821][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:48:05,141][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:48:05,462][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:48:05,781][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:48:06,101][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:48:06,423][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:48:06,743][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:48:07,065][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:48:07,387][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:48:07,709][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:48:08,030][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:48:08,352][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:48:08,674][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:48:08,995][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:48:09,316][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:48:09,636][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:48:09,956][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:48:10,277][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:48:10,598][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:48:10,919][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:48:11,240][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:48:11,560][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:48:11,881][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:48:12,202][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:48:12,523][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:48:13,137][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:48:13,457][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:48:13,779][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:48:14,101][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:48:14,421][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:48:14,742][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:48:15,064][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:48:15,386][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:48:15,708][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:48:16,030][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:48:16,352][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:48:16,673][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:48:16,995][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:48:17,652][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:48:18,379][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:48:18,382][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:48:18,383][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:48:19,008][__main__][INFO] - Iteration 150 took 27s (12.06% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 20m 42s. Estimated total time: 7h 35m 34s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 47s. +[2026-03-25 16:48:19,010][__main__][INFO] - Starting iteration 150. +[2026-03-25 16:48:19,013][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 2 and human policies 1. 
+[2026-03-25 16:48:19,014][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:48:22,225][__main__][INFO] - Number of regex retries in iteration 150: 0 +[2026-03-25 16:48:22,226][__main__][INFO] - agents played in iteration 150 are Bob, Alice +[2026-03-25 16:48:22,775][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:48:23,427][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:48:23,718][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:48:24,039][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:48:24,358][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:48:24,679][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:48:24,999][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:48:25,320][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:48:25,640][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:48:25,961][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:48:26,280][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:48:26,601][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:48:26,921][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:48:27,242][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:48:27,562][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:48:27,883][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:48:28,204][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:48:28,524][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:48:28,844][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:48:29,164][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:48:29,486][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:48:29,806][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:48:30,126][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:48:30,446][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:48:30,768][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:48:31,089][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:48:31,411][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:48:31,732][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:48:32,053][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:48:32,374][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:48:32,694][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:48:33,015][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:48:33,336][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:48:33,657][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:48:33,978][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:48:34,299][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:48:34,619][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:48:34,938][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:48:35,258][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:48:35,578][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:48:35,898][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:48:36,219][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:48:36,540][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:48:36,860][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:48:37,181][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:48:37,501][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:48:37,821][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:48:38,143][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:48:38,463][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:48:38,784][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:48:39,106][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:48:39,425][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:48:39,747][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:48:40,403][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:48:40,725][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:48:41,045][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:48:41,366][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:48:41,689][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:48:42,008][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:48:42,330][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:48:42,651][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:48:42,971][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:48:43,293][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:48:43,615][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:48:43,937][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:48:44,259][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:48:45,421][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 16:48:46,175][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:48:46,178][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:48:46,179][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:48:47,406][__main__][INFO] - Iteration 151 took 28s (11.31% Gen, 84.36% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 37m 54s. Estimated total time: 7h 53m 13s. Time estimates for 10 more iterations: 4m 43s, 100 more iterations: 47m 19s, 500 more iterations: 3h 56m 36s. +[2026-03-25 16:48:47,411][__main__][INFO] - Starting iteration 151. 
+[2026-03-25 16:48:47,419][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:48:47,420][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:48:50,702][__main__][INFO] - Number of regex retries in iteration 151: 0 +[2026-03-25 16:48:50,702][__main__][INFO] - agents played in iteration 151 are Bob, Alice +[2026-03-25 16:48:51,260][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:48:51,912][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:48:52,213][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:48:52,535][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:48:52,856][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:48:53,176][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:48:53,496][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:48:53,816][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:48:54,136][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:48:54,457][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:48:54,777][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:48:55,099][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:48:55,419][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:48:55,740][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:48:56,062][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:48:56,383][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:48:56,703][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:48:57,025][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:48:57,347][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:48:57,670][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:48:57,992][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:48:58,313][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:48:58,634][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:48:58,955][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:48:59,276][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:48:59,597][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:48:59,917][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:49:00,237][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:49:00,558][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:49:00,878][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:49:01,198][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:49:01,518][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:49:01,840][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:49:02,160][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:49:02,480][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:49:02,800][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:49:03,121][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:49:03,441][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:49:03,762][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:49:04,081][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:49:04,401][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:49:04,722][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:49:05,043][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:49:05,363][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:49:05,684][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:49:06,006][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:49:06,328][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:49:06,648][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:49:06,970][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:49:07,292][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:49:07,614][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:49:07,935][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:49:08,256][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:49:08,872][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:49:09,193][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:49:09,514][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:49:09,834][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:49:10,156][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:49:10,476][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:49:10,797][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:49:11,117][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:49:11,436][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:49:11,756][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:49:12,075][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:49:12,395][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:49:12,716][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:49:13,376][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:49:14,102][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:49:14,105][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:49:14,106][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:49:14,733][__main__][INFO] - Iteration 152 took 27s (12.02% Gen, 85.68% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 19m 28s. Estimated total time: 7h 35m 15s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 37s. +[2026-03-25 16:49:14,737][__main__][INFO] - Starting iteration 152. +[2026-03-25 16:49:14,740][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:49:14,741][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:49:17,999][__main__][INFO] - Number of regex retries in iteration 152: 0 +[2026-03-25 16:49:18,999][__main__][INFO] - agents played in iteration 152 are Bob, Alice +[2026-03-25 16:49:18,562][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:49:19,219][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:49:19,509][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:49:19,832][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:49:20,153][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:49:20,473][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:49:20,795][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:49:21,115][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:49:21,437][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:49:21,757][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:49:22,079][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:49:22,399][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:49:22,720][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:49:23,039][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:49:23,360][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:49:23,681][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:49:24,004][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:49:24,325][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:49:24,646][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:49:24,966][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:49:25,287][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:49:25,607][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:49:25,928][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:49:26,250][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:49:26,570][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:49:26,890][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:49:27,212][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:49:27,532][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:49:27,852][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:49:28,173][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:49:28,495][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:49:28,816][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:49:29,137][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:49:29,459][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:49:29,781][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:49:30,101][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:49:30,421][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:49:30,741][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:49:31,061][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:49:31,383][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:49:31,702][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:49:32,022][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:49:32,343][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:49:32,665][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:49:32,986][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:49:33,305][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:49:33,626][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:49:33,947][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:49:34,269][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:49:34,590][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:49:34,911][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:49:35,232][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:49:35,555][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:49:36,191][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:49:36,511][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:49:36,831][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:49:37,152][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:49:37,472][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:49:37,793][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:49:38,114][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:49:38,433][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:49:38,755][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:49:39,076][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:49:39,395][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:49:39,716][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:49:40,036][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:49:40,705][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:49:41,434][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:49:41,437][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:49:41,438][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:49:42,063][__main__][INFO] - Iteration 153 took 27s (11.93% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 19m 10s. Estimated total time: 7h 35m 24s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 42s. +[2026-03-25 16:49:42,066][__main__][INFO] - Starting iteration 153. +[2026-03-25 16:49:42,069][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:49:42,069][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:49:45,307][__main__][INFO] - Number of regex retries in iteration 153: 0 +[2026-03-25 16:49:45,307][__main__][INFO] - agents played in iteration 153 are Bob, Alice +[2026-03-25 16:49:45,860][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:49:46,512][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:49:46,802][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:49:47,122][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:49:47,443][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:49:47,764][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:49:48,085][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:49:48,406][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:49:48,728][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:49:49,049][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:49:49,370][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:49:49,691][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:49:50,012][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:49:50,333][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:49:50,654][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:49:50,975][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:49:51,296][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:49:51,617][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:49:51,936][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:49:52,257][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:49:52,579][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:49:52,901][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:49:53,220][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:49:53,540][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:49:53,861][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:49:54,182][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:49:54,503][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:49:54,825][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:49:55,145][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:49:55,467][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:49:55,789][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:49:56,110][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:49:56,430][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:49:56,750][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:49:57,071][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:49:57,391][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:49:57,712][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:49:58,033][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:49:58,353][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:49:58,675][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:49:58,996][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:49:59,316][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:49:59,636][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:49:59,956][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:50:00,278][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:50:00,598][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:50:00,917][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:50:01,238][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:50:01,559][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:50:01,879][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:50:02,200][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:50:02,520][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:50:02,841][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:50:03,454][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:50:03,774][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:50:04,095][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:50:04,416][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:50:04,736][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:50:05,057][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:50:05,376][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:50:05,696][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:50:06,016][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:50:06,337][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:50:06,660][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:50:06,980][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:50:07,300][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:50:07,957][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:50:08,678][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:50:08,680][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:50:08,682][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:50:09,322][__main__][INFO] - Iteration 154 took 27s (11.88% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 17m 32s. Estimated total time: 7h 34m 14s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 7s. +[2026-03-25 16:50:09,324][__main__][INFO] - Starting iteration 154. +[2026-03-25 16:50:09,327][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 16:50:09,328][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:50:12,587][__main__][INFO] - Number of regex retries in iteration 154: 0 +[2026-03-25 16:50:12,588][__main__][INFO] - agents played in iteration 154 are Bob, Alice +[2026-03-25 16:50:13,143][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:50:13,792][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:50:14,083][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:50:14,404][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:50:14,723][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:50:15,043][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:50:15,363][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:50:15,683][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:50:16,003][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:50:16,323][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:50:16,644][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:50:16,964][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:50:17,284][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:50:17,603][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:50:17,925][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:50:18,245][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:50:18,565][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:50:18,885][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:50:19,206][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:50:19,525][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:50:19,844][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:50:20,166][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:50:20,487][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:50:20,809][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:50:21,130][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:50:21,451][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:50:21,772][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:50:22,091][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:50:22,412][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:50:22,733][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:50:23,054][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:50:23,376][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:50:23,696][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:50:24,016][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:50:24,336][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:50:24,658][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:50:24,978][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:50:25,298][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:50:25,619][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:50:25,939][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:50:26,259][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:50:26,579][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:50:26,898][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:50:27,219][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:50:27,539][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:50:27,858][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:50:28,179][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:50:28,499][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:50:28,820][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:50:29,141][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:50:29,461][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:50:29,782][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:50:30,103][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:50:30,717][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:50:31,038][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:50:31,358][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:50:31,678][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:50:31,999][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:50:32,319][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:50:32,639][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:50:32,959][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:50:33,280][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:50:33,602][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:50:33,923][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:50:34,243][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:50:34,565][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:50:35,219][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:50:35,950][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:50:35,953][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:50:35,954][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:50:36,581][__main__][INFO] - Iteration 155 took 27s (11.96% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 17m 5s. Estimated total time: 7h 34m 14s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 7s. +[2026-03-25 16:50:36,583][__main__][INFO] - Starting iteration 155. 
+[2026-03-25 16:50:36,587][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:50:36,587][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:50:39,805][__main__][INFO] - Number of regex retries in iteration 155: 0 +[2026-03-25 16:50:39,805][__main__][INFO] - agents played in iteration 155 are Bob, Alice +[2026-03-25 16:50:40,341][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:50:40,992][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:50:41,282][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:50:41,603][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:50:41,922][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:50:42,243][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:50:42,565][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:50:42,886][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:50:43,206][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:50:43,528][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:50:43,850][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:50:44,170][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:50:44,493][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:50:44,814][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:50:45,135][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:50:45,457][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:50:45,778][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:50:46,100][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:50:46,421][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:50:46,742][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:50:47,061][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:50:47,382][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:50:47,702][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:50:48,022][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:50:48,342][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:50:48,662][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:50:48,983][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:50:49,304][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:50:49,624][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:50:49,946][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:50:50,267][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:50:50,589][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:50:50,911][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:50:51,231][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:50:51,552][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:50:51,873][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:50:52,194][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:50:52,516][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:50:52,836][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:50:53,158][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:50:53,477][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:50:53,797][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:50:54,118][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:50:54,437][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:50:54,758][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:50:55,079][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:50:55,399][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:50:55,719][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:50:56,044][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:50:56,365][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:50:56,686][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:50:57,006][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:50:57,328][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:50:57,956][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:50:58,278][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:50:58,598][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:50:58,919][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:50:59,239][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:50:59,561][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:50:59,883][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:51:00,203][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:51:00,523][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:51:00,843][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:51:01,164][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:51:01,485][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:51:01,806][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:51:02,459][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:51:03,185][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:51:03,188][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:51:03,190][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:51:03,816][__main__][INFO] - Iteration 156 took 27s (11.82% Gen, 85.87% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 16m 14s. Estimated total time: 7h 33m 50s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 16:51:03,818][__main__][INFO] - Starting iteration 156. +[2026-03-25 16:51:03,822][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:51:03,822][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:51:07,064][__main__][INFO] - Number of regex retries in iteration 156: 0 +[2026-03-25 16:51:07,065][__main__][INFO] - agents played in iteration 156 are Bob, Alice +[2026-03-25 16:51:07,600][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:51:08,248][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:51:08,538][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:51:08,859][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:51:09,180][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:51:09,500][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:51:09,820][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:51:10,141][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:51:10,462][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:51:10,782][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:51:11,102][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:51:11,422][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:51:11,743][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:51:12,064][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:51:12,385][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:51:12,708][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:51:13,027][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:51:13,349][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:51:13,670][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:51:13,989][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:51:14,311][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:51:14,630][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:51:14,951][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:51:15,272][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:51:15,592][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:51:15,913][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:51:16,234][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:51:16,555][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:51:16,876][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:51:17,196][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:51:17,517][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:51:17,836][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:51:18,157][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:51:18,478][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:51:18,798][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:51:19,117][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:51:19,436][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:51:19,757][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:51:20,079][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:51:20,401][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:51:20,722][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:51:21,043][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:51:21,363][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:51:21,683][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:51:22,002][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:51:22,323][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:51:22,644][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:51:22,965][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:51:23,285][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:51:23,606][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:51:23,926][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:51:24,246][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:51:24,567][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:51:25,183][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:51:25,504][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:51:25,824][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:51:26,145][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:51:26,467][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:51:26,787][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:51:27,106][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:51:27,427][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:51:27,749][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:51:28,070][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:51:28,390][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:51:28,710][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:51:29,031][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:51:29,687][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:51:30,414][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:51:30,416][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:51:30,418][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:51:31,045][__main__][INFO] - Iteration 157 took 27s (11.91% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 15m 41s. Estimated total time: 7h 33m 44s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 52s. +[2026-03-25 16:51:31,048][__main__][INFO] - Starting iteration 157. +[2026-03-25 16:51:31,052][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:51:31,052][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:51:34,316][__main__][INFO] - Number of regex retries in iteration 157: 0 +[2026-03-25 16:51:34,317][__main__][INFO] - agents played in iteration 157 are Bob, Alice +[2026-03-25 16:51:34,883][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:51:35,547][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:51:35,838][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:51:36,162][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:51:36,482][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:51:36,802][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:51:37,124][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:51:37,444][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:51:37,763][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:51:38,083][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:51:38,404][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:51:38,724][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:51:39,044][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:51:39,365][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:51:39,685][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:51:40,006][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:51:40,326][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:51:40,648][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:51:40,970][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:51:41,290][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:51:41,611][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:51:41,931][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:51:42,253][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:51:42,573][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:51:42,895][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:51:43,216][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:51:43,536][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:51:43,858][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:51:44,177][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:51:44,496][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:51:44,816][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:51:45,136][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:51:45,457][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:51:45,778][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:51:46,098][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:51:46,418][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:51:46,739][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:51:47,060][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:51:47,383][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:51:47,703][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:51:48,023][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:51:48,343][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:51:48,664][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:51:48,985][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:51:49,306][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:51:49,626][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:51:49,947][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:51:50,267][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:51:50,588][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:51:50,908][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:51:51,229][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:51:51,551][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:51:51,871][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:51:52,491][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:51:52,811][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:51:53,130][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:51:53,451][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:51:53,772][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:51:54,093][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:51:54,414][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:51:54,735][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:51:55,055][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:51:55,376][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:51:55,695][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:51:56,015][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:51:56,334][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:51:56,989][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:51:57,744][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:51:57,747][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:51:57,748][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:51:58,339][__main__][INFO] - Iteration 158 took 27s (11.96% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 16m 18s. Estimated total time: 7h 34m 48s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 24s. +[2026-03-25 16:51:58,342][__main__][INFO] - Starting iteration 158. +[2026-03-25 16:51:58,345][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 16:51:58,345][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:52:01,566][__main__][INFO] - Number of regex retries in iteration 158: 0 +[2026-03-25 16:52:01,566][__main__][INFO] - agents played in iteration 158 are Bob, Alice +[2026-03-25 16:52:02,094][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:52:02,745][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:52:03,036][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:52:03,359][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:52:03,680][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:52:04,001][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:52:04,322][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:52:04,643][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:52:04,964][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:52:05,283][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:52:05,603][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:52:05,923][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:52:06,242][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:52:06,563][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:52:06,883][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:52:07,203][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:52:07,525][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:52:07,846][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:52:08,167][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:52:08,488][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:52:08,808][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:52:09,130][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:52:09,451][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:52:09,771][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:52:10,092][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:52:10,412][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:52:10,733][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:52:11,053][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:52:11,372][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:52:11,691][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:52:12,011][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:52:12,332][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:52:12,653][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:52:12,973][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:52:13,294][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:52:13,615][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:52:13,936][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:52:14,256][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:52:14,577][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:52:14,896][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:52:15,216][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:52:15,535][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:52:15,856][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:52:16,177][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:52:16,497][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:52:16,817][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:52:17,137][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:52:17,457][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:52:17,777][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:52:18,097][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:52:18,417][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:52:18,738][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:52:19,060][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:52:19,672][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:52:19,993][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:52:20,314][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:52:20,634][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:52:20,955][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:52:21,275][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:52:21,594][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:52:21,914][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:52:22,235][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:52:22,557][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:52:22,877][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:52:23,199][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:52:23,520][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:52:24,175][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:52:24,896][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:52:24,898][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:52:24,900][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:52:25,528][__main__][INFO] - Iteration 159 took 27s (11.85% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 14m 6s. Estimated total time: 7h 33m 4s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 32s. +[2026-03-25 16:52:25,531][__main__][INFO] - Starting iteration 159. 
+[2026-03-25 16:52:25,534][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:52:25,534][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:52:28,794][__main__][INFO] - Number of regex retries in iteration 159: 0 +[2026-03-25 16:52:28,795][__main__][INFO] - agents played in iteration 159 are Bob, Alice +[2026-03-25 16:52:29,363][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:52:30,028][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:52:30,318][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:52:30,639][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:52:30,961][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:52:31,281][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:52:31,601][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:52:31,922][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:52:32,243][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:52:32,565][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:52:32,884][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:52:33,204][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:52:33,524][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:52:33,845][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:52:34,164][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:52:34,483][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:52:34,804][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:52:35,125][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:52:35,446][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:52:35,765][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:52:36,085][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:52:36,405][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:52:36,725][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:52:37,044][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:52:37,363][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:52:37,684][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:52:38,004][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:52:38,323][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:52:38,644][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:52:38,965][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:52:39,284][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:52:39,604][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:52:39,925][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:52:40,247][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:52:40,566][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:52:40,889][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:52:41,209][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:52:41,530][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:52:41,853][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:52:42,173][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:52:42,493][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:52:42,813][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:52:43,135][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:52:43,455][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:52:43,775][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:52:44,096][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:52:44,417][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:52:44,737][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:52:45,058][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:52:45,377][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:52:45,697][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:52:46,019][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:52:46,339][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:52:46,964][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:52:47,287][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:52:47,607][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:52:47,927][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:52:48,246][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:52:48,566][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:52:48,886][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:52:49,206][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:52:49,526][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:52:49,845][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:52:50,167][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:52:50,489][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:52:50,811][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:52:51,466][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:52:52,200][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:52:52,202][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:52:52,204][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:52:52,828][__main__][INFO] - Iteration 160 took 27s (11.95% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 15m 30s. Estimated total time: 7h 34m 55s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 27s. +[2026-03-25 16:52:52,830][__main__][INFO] - Starting iteration 160. +[2026-03-25 16:52:52,834][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:52:52,834][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:52:56,099][__main__][INFO] - Number of regex retries in iteration 160: 0 +[2026-03-25 16:52:56,100][__main__][INFO] - agents played in iteration 160 are Bob, Alice +[2026-03-25 16:52:56,649][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:52:57,299][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:52:57,589][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:52:57,911][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:52:58,231][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:52:58,554][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:52:58,875][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:52:59,194][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:52:59,516][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:52:59,835][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:53:00,155][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:53:00,476][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:53:00,795][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:53:01,116][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:53:01,436][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:53:01,757][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:53:02,077][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:53:02,396][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:53:02,717][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:53:03,038][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:53:03,358][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:53:03,678][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:53:03,997][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:53:04,317][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:53:04,637][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:53:04,957][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:53:05,277][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:53:05,596][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:53:05,917][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:53:06,238][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:53:06,559][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:53:06,878][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:53:07,197][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:53:07,517][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:53:07,839][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:53:08,159][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:53:08,480][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:53:08,799][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:53:09,119][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:53:09,439][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:53:09,759][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:53:10,079][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:53:10,399][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:53:10,719][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:53:11,038][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:53:11,359][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:53:11,680][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:53:11,999][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:53:12,320][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:53:12,640][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:53:12,961][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:53:13,281][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:53:13,601][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:53:14,225][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:53:14,546][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:53:14,867][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:53:15,187][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:53:15,509][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:53:15,830][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:53:16,151][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:53:16,471][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:53:16,793][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:53:17,115][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:53:17,436][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:53:17,757][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:53:18,079][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:53:18,747][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:53:19,475][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:53:19,477][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:53:19,479][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:53:20,115][__main__][INFO] - Iteration 161 took 27s (11.97% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 14m 50s. Estimated total time: 7h 34m 42s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 21s. +[2026-03-25 16:53:20,118][__main__][INFO] - Starting iteration 161. +[2026-03-25 16:53:20,121][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:53:20,121][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:53:23,362][__main__][INFO] - Number of regex retries in iteration 161: 0 +[2026-03-25 16:53:23,362][__main__][INFO] - agents played in iteration 161 are Bob, Alice +[2026-03-25 16:53:23,911][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:53:24,563][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:53:24,852][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:53:25,174][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:53:25,496][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:53:25,817][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:53:26,137][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:53:26,457][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:53:26,777][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:53:27,097][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:53:27,417][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:53:27,737][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:53:28,058][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:53:28,380][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:53:28,699][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:53:29,020][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:53:29,341][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:53:29,662][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:53:29,982][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:53:30,304][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:53:30,625][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:53:30,945][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:53:31,266][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:53:31,585][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:53:31,905][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:53:32,226][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:53:32,546][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:53:32,866][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:53:33,186][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:53:33,508][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:53:33,829][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:53:34,151][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:53:34,471][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:53:34,791][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:53:35,110][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:53:35,432][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:53:35,753][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:53:36,074][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:53:36,393][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:53:36,714][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:53:37,033][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:53:37,355][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:53:37,676][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:53:37,997][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:53:38,318][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:53:38,639][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:53:38,958][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:53:39,277][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:53:39,597][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:53:39,916][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:53:40,237][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:53:40,558][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:53:40,878][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:53:41,491][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:53:41,810][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:53:42,131][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:53:42,452][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:53:42,772][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:53:43,093][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:53:43,414][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:53:43,736][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:53:44,056][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:53:44,377][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:53:44,697][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:53:45,018][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:53:45,338][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:53:45,992][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:53:46,721][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:53:46,723][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:53:46,724][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:53:47,350][__main__][INFO] - Iteration 162 took 27s (11.90% Gen, 85.79% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 13m 30s. Estimated total time: 7h 33m 50s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 16:53:47,352][__main__][INFO] - Starting iteration 162. +[2026-03-25 16:53:47,355][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 16:53:47,356][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:53:50,615][__main__][INFO] - Number of regex retries in iteration 162: 0 +[2026-03-25 16:53:50,616][__main__][INFO] - agents played in iteration 162 are Bob, Alice +[2026-03-25 16:53:51,158][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:53:51,810][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:53:52,100][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:53:52,421][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:53:52,744][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:53:53,064][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:53:53,386][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:53:53,708][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:53:54,029][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:53:54,352][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:53:54,673][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:53:54,993][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:53:55,314][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:53:55,635][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:53:55,955][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:53:56,275][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:53:56,595][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:53:56,917][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:53:57,238][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:53:57,557][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:53:57,878][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:53:58,197][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:53:58,517][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:53:58,838][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:53:59,160][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:53:59,480][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:53:59,801][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:54:00,120][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:54:00,440][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:54:00,761][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:54:01,082][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:54:01,403][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:54:01,724][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:54:02,044][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:54:02,364][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:54:02,686][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:54:03,008][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:54:03,329][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:54:03,651][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:54:03,971][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:54:04,293][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:54:04,614][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:54:04,935][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:54:05,257][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:54:05,577][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:54:05,898][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:54:06,219][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:54:06,541][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:54:06,863][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:54:07,183][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:54:07,505][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:54:07,826][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:54:08,147][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:54:08,761][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:54:09,083][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:54:09,403][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:54:09,724][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:54:10,044][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:54:10,365][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:54:10,686][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:54:11,006][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:54:11,327][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:54:11,647][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:54:11,968][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:54:12,287][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:54:12,608][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:54:13,264][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:54:13,990][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:54:13,992][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:54:13,994][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:54:14,622][__main__][INFO] - Iteration 163 took 27s (11.96% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 13m 41s. Estimated total time: 7h 34m 28s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 14s. +[2026-03-25 16:54:14,625][__main__][INFO] - Starting iteration 163. 
+[2026-03-25 16:54:14,627][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:54:14,628][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:54:17,859][__main__][INFO] - Number of regex retries in iteration 163: 0 +[2026-03-25 16:54:17,860][__main__][INFO] - agents played in iteration 163 are Bob, Alice +[2026-03-25 16:54:18,402][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:54:19,053][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:54:19,345][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:54:19,665][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:54:19,987][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:54:20,309][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:54:20,630][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:54:20,950][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:54:21,272][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:54:21,594][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:54:21,915][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:54:22,235][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:54:22,556][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:54:22,877][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:54:23,196][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:54:23,517][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:54:23,838][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:54:24,159][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:54:24,478][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:54:24,797][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:54:25,118][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:54:25,438][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:54:25,758][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:54:26,077][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:54:26,396][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:54:26,717][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:54:27,037][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:54:27,357][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:54:27,678][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:54:27,997][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:54:28,318][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:54:28,639][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:54:28,958][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:54:29,278][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:54:29,598][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:54:29,920][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:54:30,240][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:54:30,562][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:54:30,884][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:54:31,205][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:54:31,525][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:54:31,846][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:54:32,165][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:54:32,487][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:54:32,808][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:54:33,130][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:54:33,451][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:54:33,773][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:54:34,093][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:54:34,414][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:54:34,736][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:54:35,057][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:54:35,378][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:54:36,003][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:54:36,323][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:54:36,643][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:54:36,963][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:54:37,284][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:54:37,605][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:54:37,925][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:54:38,245][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:54:38,566][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:54:38,886][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:54:39,208][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:54:39,529][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:54:39,851][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:54:40,521][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:54:41,242][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:54:41,244][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:54:41,246][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:54:41,869][__main__][INFO] - Iteration 164 took 27s (11.86% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 12m 48s. Estimated total time: 7h 34m 2s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 1s. +[2026-03-25 16:54:41,872][__main__][INFO] - Starting iteration 164. +[2026-03-25 16:54:41,875][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:54:41,875][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:54:45,114][__main__][INFO] - Number of regex retries in iteration 164: 0 +[2026-03-25 16:54:45,115][__main__][INFO] - agents played in iteration 164 are Bob, Alice +[2026-03-25 16:54:45,659][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:54:46,308][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:54:46,601][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:54:46,924][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:54:47,245][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:54:47,567][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:54:47,889][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:54:48,209][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:54:48,531][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:54:48,853][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:54:49,174][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:54:49,494][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:54:49,815][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:54:50,138][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:54:50,460][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:54:50,780][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:54:51,100][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:54:51,421][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:54:51,740][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:54:52,061][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:54:52,380][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:54:52,700][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:54:53,022][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:54:53,342][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:54:53,663][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:54:53,983][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:54:54,304][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:54:54,624][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:54:54,944][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:54:55,266][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:54:55,588][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:54:55,909][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:54:56,232][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:54:56,553][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:54:56,874][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:54:57,195][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:54:57,517][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:54:57,837][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:54:58,157][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:54:58,478][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:54:58,800][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:54:59,119][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:54:59,440][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:54:59,761][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:55:00,082][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:55:00,403][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:55:00,724][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:55:01,045][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:55:01,366][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:55:01,687][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:55:02,007][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:55:02,327][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:55:02,649][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:55:03,263][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:55:03,585][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:55:03,905][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:55:04,226][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:55:04,548][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:55:04,869][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:55:05,190][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:55:05,512][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:55:05,832][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:55:06,154][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:55:06,476][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:55:06,798][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:55:07,118][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:55:07,774][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:55:08,500][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:55:08,502][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:55:08,504][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:55:09,146][__main__][INFO] - Iteration 165 took 27s (11.88% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 12m 51s. Estimated total time: 7h 34m 32s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 16s. +[2026-03-25 16:55:09,148][__main__][INFO] - Starting iteration 165. +[2026-03-25 16:55:09,151][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:55:09,152][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:55:12,391][__main__][INFO] - Number of regex retries in iteration 165: 0 +[2026-03-25 16:55:12,391][__main__][INFO] - agents played in iteration 165 are Bob, Alice +[2026-03-25 16:55:12,934][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:55:13,584][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:55:13,873][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:55:14,195][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:55:14,517][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:55:14,838][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:55:15,158][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:55:15,479][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:55:15,799][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:55:16,120][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:55:16,441][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:55:16,762][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:55:17,081][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:55:17,400][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:55:17,720][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:55:18,040][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:55:18,361][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:55:18,680][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:55:18,999][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:55:19,320][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:55:19,640][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:55:19,962][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:55:20,283][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:55:20,604][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:55:20,924][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:55:21,245][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:55:21,566][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:55:21,886][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:55:22,205][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:55:22,525][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:55:22,845][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:55:23,166][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:55:23,485][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:55:23,806][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:55:24,125][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:55:24,446][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:55:24,768][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:55:25,088][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:55:25,408][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:55:25,730][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:55:26,051][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:55:26,372][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:55:26,693][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:55:27,014][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:55:27,334][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:55:27,655][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:55:27,975][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:55:28,295][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:55:28,616][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:55:28,938][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:55:29,261][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:55:29,580][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:55:29,902][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:55:30,514][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:55:30,834][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:55:31,155][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:55:31,476][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:55:31,798][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:55:32,118][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:55:32,437][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:55:32,757][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:55:33,077][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:55:33,399][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:55:33,719][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:55:34,039][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:55:34,359][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:55:35,012][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:55:35,739][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:55:35,741][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:55:35,743][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:55:36,372][__main__][INFO] - Iteration 166 took 27s (11.90% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 11m 33s. Estimated total time: 7h 33m 41s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 50s. +[2026-03-25 16:55:36,375][__main__][INFO] - Starting iteration 166. +[2026-03-25 16:55:36,378][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 16:55:36,378][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:55:39,642][__main__][INFO] - Number of regex retries in iteration 166: 0 +[2026-03-25 16:55:39,643][__main__][INFO] - agents played in iteration 166 are Bob, Alice +[2026-03-25 16:55:40,190][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:55:40,842][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:55:41,133][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:55:41,455][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:55:41,774][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:55:42,096][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:55:42,416][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:55:42,737][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:55:43,058][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:55:43,377][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:55:43,696][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:55:44,016][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:55:44,337][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:55:44,657][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:55:44,977][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:55:45,298][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:55:45,619][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:55:45,940][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:55:46,261][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:55:46,583][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:55:46,904][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:55:47,225][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:55:47,546][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:55:47,867][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:55:48,189][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:55:48,510][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:55:48,831][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:55:49,152][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:55:49,475][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:55:49,798][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:55:50,119][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:55:50,440][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:55:50,761][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:55:51,083][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:55:51,404][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:55:51,725][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:55:52,047][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:55:52,367][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:55:52,687][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:55:53,008][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:55:53,328][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:55:53,648][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:55:53,969][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:55:54,289][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:55:54,610][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:55:54,932][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:55:55,252][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:55:55,572][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:55:55,892][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:55:56,212][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:55:56,534][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:55:56,854][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:55:57,176][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:55:57,788][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:55:58,109][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:55:58,431][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:55:58,753][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:55:59,074][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:55:59,394][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:55:59,715][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:56:00,034][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:56:00,354][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:56:00,674][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:56:00,993][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:56:01,314][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:56:01,634][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:56:02,294][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:56:03,027][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:56:03,029][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:56:03,031][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:56:03,657][__main__][INFO] - Iteration 167 took 27s (11.97% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 12m 4s. Estimated total time: 7h 34m 40s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 20s. +[2026-03-25 16:56:03,659][__main__][INFO] - Starting iteration 167. 
+[2026-03-25 16:56:03,662][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:56:03,662][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:56:06,911][__main__][INFO] - Number of regex retries in iteration 167: 0 +[2026-03-25 16:56:06,912][__main__][INFO] - agents played in iteration 167 are Bob, Alice +[2026-03-25 16:56:07,458][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:56:08,109][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:56:08,401][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:56:08,721][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:56:09,042][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:56:09,363][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:56:09,683][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:56:10,004][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:56:10,324][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:56:10,645][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:56:10,967][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:56:11,286][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:56:11,605][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:56:11,926][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:56:12,247][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:56:12,566][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:56:12,885][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:56:13,206][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:56:13,527][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:56:13,847][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:56:14,167][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:56:14,486][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:56:14,807][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:56:15,127][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:56:15,448][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:56:15,769][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:56:16,088][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:56:16,409][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:56:16,729][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:56:17,051][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:56:17,371][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:56:17,691][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:56:18,012][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:56:18,333][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:56:18,653][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:56:18,975][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:56:19,294][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:56:19,614][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:56:19,934][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:56:20,256][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:56:20,578][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:56:20,897][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:56:21,217][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:56:21,537][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:56:21,859][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:56:22,179][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:56:22,499][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:56:22,820][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:56:23,141][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:56:23,461][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:56:23,781][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:56:24,100][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:56:24,421][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:56:25,034][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:56:25,355][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:56:25,677][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:56:25,996][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:56:26,317][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:56:26,638][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:56:26,958][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:56:27,279][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:56:27,599][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:56:27,918][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:56:28,238][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:56:28,559][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:56:28,880][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:56:29,537][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:56:30,266][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:56:30,268][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:56:30,270][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:56:30,879][__main__][INFO] - Iteration 168 took 27s (11.94% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 10m 34s. Estimated total time: 7h 33m 37s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 48s. +[2026-03-25 16:56:30,881][__main__][INFO] - Starting iteration 168. +[2026-03-25 16:56:30,884][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:56:30,885][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:56:34,139][__main__][INFO] - Number of regex retries in iteration 168: 0 +[2026-03-25 16:56:34,140][__main__][INFO] - agents played in iteration 168 are Bob, Alice +[2026-03-25 16:56:34,691][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:56:35,344][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:56:35,634][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:56:35,955][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:56:36,277][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:56:36,599][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:56:36,919][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:56:37,241][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:56:37,562][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:56:37,882][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:56:38,204][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:56:38,527][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:56:38,849][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:56:39,171][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:56:39,493][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:56:39,814][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:56:40,136][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:56:40,457][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:56:40,778][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:56:41,098][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:56:41,420][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:56:41,741][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:56:42,063][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:56:42,383][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:56:42,704][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:56:43,025][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:56:43,346][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:56:43,667][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:56:43,988][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:56:44,310][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:56:44,631][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:56:44,952][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:56:45,272][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:56:45,593][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:56:45,914][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:56:46,234][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:56:46,555][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:56:46,875][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:56:47,195][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:56:47,516][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:56:47,837][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:56:48,157][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:56:48,478][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:56:48,798][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:56:49,117][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:56:49,437][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:56:49,758][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:56:50,079][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:56:50,400][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:56:50,720][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:56:51,040][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:56:51,361][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:56:51,681][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:56:52,294][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:56:52,615][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:56:52,935][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:56:53,256][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:56:53,577][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:56:53,896][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:56:54,216][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:56:54,537][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:56:54,857][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:56:55,177][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:56:55,497][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:56:55,816][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:56:56,136][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:56:56,792][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:56:57,519][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:56:57,522][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:56:57,524][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:56:58,137][__main__][INFO] - Iteration 169 took 27s (11.94% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 10m 43s. Estimated total time: 7h 34m 14s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 7s. +[2026-03-25 16:56:58,140][__main__][INFO] - Starting iteration 169. +[2026-03-25 16:56:58,142][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:56:58,143][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:57:01,408][__main__][INFO] - Number of regex retries in iteration 169: 0 +[2026-03-25 16:57:01,409][__main__][INFO] - agents played in iteration 169 are Bob, Alice +[2026-03-25 16:57:01,962][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:57:02,611][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:57:02,900][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:57:03,219][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:57:03,540][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:57:03,858][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:57:04,178][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:57:04,498][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:57:04,816][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:57:05,137][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:57:05,459][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:57:05,780][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:57:06,101][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:57:06,422][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:57:06,742][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:57:07,062][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:57:07,383][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:57:07,703][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:57:08,024][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:57:08,344][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:57:08,665][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:57:08,985][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:57:09,305][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:57:09,626][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:57:09,946][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:57:10,265][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:57:10,584][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:57:10,903][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:57:11,224][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:57:11,545][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:57:11,866][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:57:12,184][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:57:12,504][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:57:12,824][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:57:13,145][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:57:13,465][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:57:13,784][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:57:14,103][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:57:14,422][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:57:14,743][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:57:15,063][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:57:15,383][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:57:15,703][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:57:16,024][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:57:16,343][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:57:16,664][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:57:16,984][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:57:17,304][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:57:17,623][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:57:17,944][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:57:18,265][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:57:18,583][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:57:18,902][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:57:19,514][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:57:19,834][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:57:20,157][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:57:20,478][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:57:20,797][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:57:21,117][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:57:21,438][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:57:21,758][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:57:22,078][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:57:22,400][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:57:22,721][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:57:23,040][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:57:23,360][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:57:24,014][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:57:24,769][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:57:24,771][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:57:24,773][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:57:25,396][__main__][INFO] - Iteration 170 took 27s (11.98% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 10m 16s. Estimated total time: 7h 34m 14s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 7s. +[2026-03-25 16:57:25,398][__main__][INFO] - Starting iteration 170. +[2026-03-25 16:57:25,402][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 16:57:25,402][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:57:28,779][__main__][INFO] - Number of regex retries in iteration 170: 0 +[2026-03-25 16:57:28,780][__main__][INFO] - agents played in iteration 170 are Bob, Alice +[2026-03-25 16:57:29,346][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:57:29,998][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:57:30,287][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:57:30,609][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:57:30,930][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:57:31,251][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:57:31,570][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:57:31,891][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:57:32,212][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:57:32,532][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:57:32,853][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:57:33,174][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:57:33,495][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:57:33,816][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:57:34,137][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:57:34,458][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:57:34,777][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:57:35,097][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:57:35,418][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:57:35,737][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:57:36,057][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:57:36,377][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:57:36,697][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:57:37,017][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:57:37,336][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:57:37,657][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:57:37,977][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:57:38,298][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:57:38,618][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:57:38,938][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:57:39,258][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:57:39,579][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:57:39,899][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:57:40,219][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:57:40,539][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:57:40,860][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:57:41,181][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:57:41,502][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:57:41,822][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:57:42,141][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:57:42,461][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:57:42,780][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:57:43,100][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:57:43,421][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:57:43,741][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:57:44,063][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:57:44,383][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:57:44,703][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:57:45,023][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:57:45,343][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:57:45,664][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:57:45,984][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:57:46,304][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:57:46,918][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:57:47,237][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:57:47,558][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:57:47,877][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:57:48,198][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:57:48,517][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:57:48,838][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:57:49,157][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:57:49,476][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:57:49,797][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:57:50,117][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:57:50,438][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:57:50,759][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:57:51,415][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:57:52,142][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:57:52,145][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:57:52,146][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:57:52,774][__main__][INFO] - Iteration 171 took 27s (12.34% Gen, 85.36% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 11m 48s. Estimated total time: 7h 36m 13s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 37s, 500 more iterations: 3h 48m 6s. +[2026-03-25 16:57:52,776][__main__][INFO] - Starting iteration 171. 
+[2026-03-25 16:57:52,779][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:57:52,780][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:57:56,031][__main__][INFO] - Number of regex retries in iteration 171: 0 +[2026-03-25 16:57:56,032][__main__][INFO] - agents played in iteration 171 are Bob, Alice +[2026-03-25 16:57:56,614][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:57:57,276][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:57:57,566][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:57:57,887][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:57:58,208][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:57:58,528][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:57:58,848][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:57:59,169][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:57:59,489][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:57:59,809][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:58:00,131][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:58:00,451][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:58:00,772][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:58:01,091][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:58:01,413][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:58:01,733][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:58:02,053][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:58:02,373][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:58:02,693][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:58:03,012][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:58:03,333][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:58:03,653][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:58:03,974][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:58:04,293][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:58:04,614][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:58:04,934][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:58:05,255][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:58:05,575][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:58:05,894][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:58:06,215][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:58:06,535][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:58:06,856][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:58:07,177][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:58:07,497][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:58:07,818][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:58:08,137][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:58:08,458][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:58:08,778][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:58:09,098][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:58:09,417][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:58:09,737][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:58:10,058][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:58:10,378][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:58:10,699][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:58:11,020][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:58:11,339][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:58:11,659][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:58:11,979][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:58:12,299][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:58:12,620][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:58:12,940][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:58:13,261][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:58:13,581][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:58:14,200][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:58:14,522][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:58:14,842][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:58:15,162][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 16:58:15,482][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:58:15,801][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:58:16,123][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:58:16,443][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:58:16,763][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:58:17,084][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:58:17,404][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:58:17,725][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:58:18,046][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:58:18,721][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:58:19,483][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:58:19,485][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:58:19,487][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:58:20,130][__main__][INFO] - Iteration 172 took 27s (11.89% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 10m 59s. Estimated total time: 7h 35m 51s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 55s. +[2026-03-25 16:58:20,132][__main__][INFO] - Starting iteration 172. +[2026-03-25 16:58:20,135][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:58:20,136][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:58:23,441][__main__][INFO] - Number of regex retries in iteration 172: 0 +[2026-03-25 16:58:23,441][__main__][INFO] - agents played in iteration 172 are Bob, Alice +[2026-03-25 16:58:24,017][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:58:24,678][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:58:24,968][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:58:25,290][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:58:25,610][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:58:25,931][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:58:26,250][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:58:26,569][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:58:26,889][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:58:27,209][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:58:27,528][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:58:27,850][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:58:28,170][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:58:28,491][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
16:58:28,811][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:58:29,131][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:58:29,451][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:58:29,772][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:58:30,092][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:58:30,413][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:58:30,733][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:58:31,054][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:58:31,374][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:58:31,695][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:58:32,015][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:58:32,334][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:58:32,655][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:58:32,976][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:58:33,296][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:58:33,617][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:58:33,938][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:58:34,257][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:58:34,577][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:58:34,897][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:58:35,218][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 16:58:35,538][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:58:35,858][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:58:36,177][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:58:36,499][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:58:36,819][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:58:37,140][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:58:37,460][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:58:37,779][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:58:38,099][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:58:38,420][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:58:38,740][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:58:39,060][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:58:39,380][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:58:39,699][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:58:40,019][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:58:40,340][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:58:40,660][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:58:40,980][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:58:41,598][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:58:41,918][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 16:58:42,238][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:58:42,558][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:58:42,878][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:58:43,197][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:58:43,517][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:58:43,838][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:58:44,157][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:58:44,478][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:58:44,797][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:58:45,118][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:58:45,437][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 16:58:46,109][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:58:46,844][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:58:46,846][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:58:46,848][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:58:47,486][__main__][INFO] - Iteration 173 took 27s (12.09% Gen, 85.58% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 10m 31s. Estimated total time: 7h 35m 51s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 55s. +[2026-03-25 16:58:47,488][__main__][INFO] - Starting iteration 173. +[2026-03-25 16:58:47,491][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:58:47,492][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:58:50,771][__main__][INFO] - Number of regex retries in iteration 173: 0 +[2026-03-25 16:58:50,772][__main__][INFO] - agents played in iteration 173 are Bob, Alice +[2026-03-25 16:58:51,329][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 16:58:51,989][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:58:52,280][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:58:52,601][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:58:52,922][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:58:53,242][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:58:53,562][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:58:53,882][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:58:54,203][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:58:54,524][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:58:54,843][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:58:55,164][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:58:55,483][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:58:55,803][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:58:56,123][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:58:56,443][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:58:56,764][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:58:57,084][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:58:57,405][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:58:57,724][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:58:58,044][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:58:58,365][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 16:58:58,684][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:58:59,004][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:58:59,323][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:58:59,644][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:58:59,964][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:59:00,284][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:59:00,603][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:59:00,924][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:59:01,245][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:59:01,564][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:59:01,885][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:59:02,205][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:59:02,527][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:59:02,848][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:59:03,168][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:59:03,489][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:59:03,809][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:59:04,129][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:59:04,450][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:59:04,772][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 16:59:05,091][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:59:05,412][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:59:05,734][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:59:06,054][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:59:06,375][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:59:06,698][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:59:07,020][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:59:07,340][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:59:07,662][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:59:07,983][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:59:08,303][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:59:08,921][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:59:09,242][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:59:09,562][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:59:09,884][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:59:10,203][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 16:59:10,523][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:59:10,843][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:59:11,163][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:59:11,484][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
16:59:11,804][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:59:12,124][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:59:12,446][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:59:12,766][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:59:13,432][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:59:14,164][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:59:14,167][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:59:14,169][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:59:14,806][__main__][INFO] - Iteration 174 took 27s (12.01% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 9m 28s. Estimated total time: 7h 35m 15s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 37s. +[2026-03-25 16:59:14,808][__main__][INFO] - Starting iteration 174. +[2026-03-25 16:59:14,811][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 16:59:14,812][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:59:18,096][__main__][INFO] - Number of regex retries in iteration 174: 0 +[2026-03-25 16:59:18,097][__main__][INFO] - agents played in iteration 174 are Bob, Alice +[2026-03-25 16:59:18,654][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:59:19,312][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:59:19,601][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:59:19,921][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:59:20,241][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:59:20,560][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:59:20,880][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:59:21,202][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:59:21,523][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:59:21,844][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:59:22,165][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:59:22,484][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:59:22,805][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:59:23,124][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:59:23,445][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:59:23,766][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 16:59:24,085][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:59:24,405][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 16:59:24,726][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:59:25,046][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:59:25,366][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:59:25,685][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:59:26,005][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:59:26,325][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:59:26,645][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:59:26,965][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:59:27,285][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:59:27,604][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:59:27,925][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:59:28,245][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:59:28,566][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:59:28,885][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:59:29,205][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:59:29,525][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:59:29,845][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:59:30,165][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:59:30,485][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 16:59:30,804][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
16:59:31,124][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:59:31,445][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:59:31,765][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:59:32,085][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:59:32,404][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 16:59:32,725][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 16:59:33,046][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 16:59:33,367][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 16:59:33,686][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 16:59:34,008][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 16:59:34,328][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 16:59:34,649][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 16:59:34,971][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 16:59:35,290][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 16:59:35,611][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 16:59:36,228][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 16:59:36,549][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 16:59:36,869][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 16:59:37,188][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 16:59:37,508][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
16:59:37,828][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 16:59:38,150][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 16:59:38,472][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 16:59:38,792][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 16:59:39,113][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 16:59:39,434][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 16:59:39,754][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 16:59:40,075][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 16:59:40,739][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 16:59:41,477][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 16:59:41,479][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 16:59:41,481][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 16:59:42,114][__main__][INFO] - Iteration 175 took 27s (12.03% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 8m 48s. Estimated total time: 7h 35m 3s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 31s. +[2026-03-25 16:59:42,116][__main__][INFO] - Starting iteration 175. 
+[2026-03-25 16:59:42,119][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 16:59:42,120][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 16:59:45,405][__main__][INFO] - Number of regex retries in iteration 175: 0 +[2026-03-25 16:59:45,406][__main__][INFO] - agents played in iteration 175 are Bob, Alice +[2026-03-25 16:59:45,948][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 16:59:46,608][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 16:59:46,897][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 16:59:47,218][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 16:59:47,539][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 16:59:47,859][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 16:59:48,181][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 16:59:48,501][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 16:59:48,823][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 16:59:49,144][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 16:59:49,465][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 16:59:49,786][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 16:59:50,106][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 16:59:50,428][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 16:59:50,750][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 16:59:51,071][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
16:59:51,392][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 16:59:51,714][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 16:59:52,033][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 16:59:52,353][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 16:59:52,675][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 16:59:52,995][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 16:59:53,315][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 16:59:53,635][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 16:59:53,956][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 16:59:54,278][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 16:59:54,598][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 16:59:54,917][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 16:59:55,238][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 16:59:55,561][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 16:59:55,881][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 16:59:56,202][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 16:59:56,522][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 16:59:56,842][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 16:59:57,161][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 16:59:57,481][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 16:59:57,801][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 16:59:58,124][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 16:59:58,444][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 16:59:58,765][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 16:59:59,086][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 16:59:59,407][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 16:59:59,727][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:00:00,049][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:00:00,370][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:00:00,690][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:00:01,011][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:00:01,331][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:00:01,652][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:00:01,974][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:00:02,295][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:00:02,617][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:00:02,938][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:00:03,558][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:00:03,879][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:00:04,201][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:00:04,523][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:00:04,844][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:00:05,164][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:00:05,487][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:00:05,807][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:00:06,129][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:00:06,451][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:00:06,772][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:00:07,092][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:00:07,414][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:00:08,086][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:00:08,822][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:00:08,824][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:00:08,826][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:00:09,459][__main__][INFO] - Iteration 176 took 27s (12.02% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 8m 59s. Estimated total time: 7h 35m 41s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 34s, 500 more iterations: 3h 47m 50s. +[2026-03-25 17:00:09,462][__main__][INFO] - Starting iteration 176. +[2026-03-25 17:00:09,465][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:00:09,465][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:00:12,744][__main__][INFO] - Number of regex retries in iteration 176: 0 +[2026-03-25 17:00:12,745][__main__][INFO] - agents played in iteration 176 are Bob, Alice +[2026-03-25 17:00:13,288][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:00:13,951][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:00:14,241][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:00:14,564][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:00:14,883][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:00:15,203][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:00:15,524][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:00:15,845][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:00:16,166][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:00:16,486][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:00:16,806][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:00:17,126][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:00:17,445][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:00:17,765][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:00:18,085][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:00:18,405][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:00:18,727][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:00:19,046][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:00:19,367][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:00:19,689][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:00:20,010][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:00:20,330][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:00:20,652][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:00:20,974][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:00:21,294][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:00:21,615][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:00:21,936][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:00:22,257][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:00:22,577][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:00:22,898][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:00:23,217][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:00:23,538][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:00:23,858][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:00:24,179][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:00:24,498][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:00:24,818][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:00:25,138][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:00:25,458][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:00:25,779][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:00:26,099][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:00:26,421][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:00:26,742][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:00:27,062][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:00:27,383][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:00:27,703][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:00:28,025][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:00:28,344][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:00:28,665][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:00:28,985][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:00:29,305][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:00:29,626][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:00:29,947][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:00:30,267][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:00:30,885][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:00:31,206][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:00:31,526][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:00:31,846][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:00:32,166][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:00:32,486][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:00:32,806][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:00:33,126][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:00:33,446][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:00:33,765][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:00:34,086][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:00:34,406][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:00:34,726][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:00:35,392][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:00:36,123][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:00:36,125][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:00:36,127][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:00:36,763][__main__][INFO] - Iteration 177 took 27s (12.02% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 7m 50s. Estimated total time: 7h 34m 59s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 29s. +[2026-03-25 17:00:36,765][__main__][INFO] - Starting iteration 177. +[2026-03-25 17:00:36,768][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:00:36,769][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:00:40,039][__main__][INFO] - Number of regex retries in iteration 177: 0 +[2026-03-25 17:00:40,040][__main__][INFO] - agents played in iteration 177 are Bob, Alice +[2026-03-25 17:00:40,585][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:00:41,246][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:00:41,536][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:00:41,858][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:00:42,177][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:00:42,497][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:00:42,817][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:00:43,137][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:00:43,458][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:00:43,780][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:00:44,100][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:00:44,421][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:00:44,743][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:00:45,064][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:00:45,384][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:00:45,705][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:00:46,024][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:00:46,344][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:00:46,664][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:00:46,986][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:00:47,306][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:00:47,625][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:00:47,945][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:00:48,267][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:00:48,587][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:00:48,908][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:00:49,227][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:00:49,547][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:00:49,867][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:00:50,187][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:00:50,509][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:00:50,831][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:00:51,153][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:00:51,472][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:00:51,793][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:00:52,114][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:00:52,434][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:00:52,753][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:00:53,075][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:00:53,396][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:00:53,716][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:00:54,037][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:00:54,358][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:00:54,678][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:00:54,999][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:00:55,320][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:00:55,639][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:00:55,959][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:00:56,280][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:00:56,600][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:00:56,921][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:00:57,242][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:00:57,564][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:00:58,183][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:00:58,503][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:00:58,824][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:00:59,143][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:00:59,465][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:00:59,785][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:01:00,104][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:01:00,425][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:01:00,747][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:01:01,067][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:01:01,387][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:01:01,708][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:01:02,030][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:01:02,696][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:01:03,436][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:01:03,438][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:01:03,440][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:01:04,080][__main__][INFO] - Iteration 178 took 27s (11.97% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 7m 37s. Estimated total time: 7h 35m 13s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 36s. +[2026-03-25 17:01:04,083][__main__][INFO] - Starting iteration 178. +[2026-03-25 17:01:04,086][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 17:01:04,086][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:01:07,373][__main__][INFO] - Number of regex retries in iteration 178: 0 +[2026-03-25 17:01:07,373][__main__][INFO] - agents played in iteration 178 are Bob, Alice +[2026-03-25 17:01:07,916][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:01:08,577][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:01:08,869][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:01:09,190][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:01:09,510][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:01:09,831][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:01:10,153][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:01:10,474][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:01:10,795][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:01:11,114][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:01:11,434][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:01:11,754][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:01:12,074][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:01:12,395][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:01:12,715][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:01:13,036][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:01:13,357][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:01:13,678][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:01:13,999][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:01:14,319][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:01:14,639][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:01:14,960][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:01:15,281][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:01:15,602][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:01:15,922][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:01:16,244][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:01:16,563][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:01:16,884][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:01:17,203][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:01:17,523][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:01:17,844][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:01:18,165][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:01:18,486][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:01:18,806][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:01:19,127][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:01:19,448][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:01:19,768][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:01:20,088][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:01:20,409][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:01:20,731][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:01:21,052][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:01:21,373][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:01:21,693][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:01:22,014][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:01:22,335][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:01:22,656][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:01:22,977][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:01:23,298][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:01:23,619][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:01:23,940][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:01:24,261][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:01:24,581][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:01:24,900][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:01:25,518][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:01:25,838][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:01:26,159][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:01:26,478][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:01:26,799][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:01:27,118][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:01:27,439][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:01:27,759][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:01:28,080][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:01:28,401][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:01:28,722][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:01:29,043][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:01:29,364][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:01:30,027][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:01:30,776][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:01:30,778][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:01:30,779][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:01:31,418][__main__][INFO] - Iteration 179 took 27s (12.03% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 7m 30s. Estimated total time: 7h 35m 33s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 46s. +[2026-03-25 17:01:31,421][__main__][INFO] - Starting iteration 179. 
+[2026-03-25 17:01:31,424][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:01:31,425][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:01:34,736][__main__][INFO] - Number of regex retries in iteration 179: 0 +[2026-03-25 17:01:34,737][__main__][INFO] - agents played in iteration 179 are Bob, Alice +[2026-03-25 17:01:35,280][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:01:35,937][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:01:36,227][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:01:36,549][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:01:36,869][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:01:37,189][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:01:37,509][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:01:37,831][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:01:38,153][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:01:38,474][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:01:38,794][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:01:39,116][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:01:39,435][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:01:39,757][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:01:40,078][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:01:40,398][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:01:40,718][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:01:41,038][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:01:41,358][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:01:41,678][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:01:41,997][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:01:42,319][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:01:42,638][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:01:42,959][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:01:43,281][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:01:43,602][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:01:43,922][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:01:44,244][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:01:44,565][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:01:44,886][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:01:45,206][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:01:45,527][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:01:45,850][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:01:46,171][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:01:46,492][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:01:46,812][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:01:47,134][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:01:47,454][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:01:47,775][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:01:48,096][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:01:48,417][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:01:48,738][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:01:49,057][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:01:49,378][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:01:49,699][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:01:50,019][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:01:50,339][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:01:50,659][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:01:50,980][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:01:51,300][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:01:51,619][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:01:51,938][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:01:52,259][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:01:52,874][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:01:53,194][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:01:53,515][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:01:53,835][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:01:54,155][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:01:54,475][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:01:54,796][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:01:55,116][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:01:55,436][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:01:55,757][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:01:56,078][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:01:56,399][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:01:56,720][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:01:57,381][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:01:58,119][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:01:58,121][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:01:58,123][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:01:58,752][__main__][INFO] - Iteration 180 took 27s (12.12% Gen, 85.57% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 6m 58s. Estimated total time: 7h 35m 29s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 44s. +[2026-03-25 17:01:58,755][__main__][INFO] - Starting iteration 180. +[2026-03-25 17:01:58,758][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:01:58,758][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:02:02,101][__main__][INFO] - Number of regex retries in iteration 180: 0 +[2026-03-25 17:02:02,101][__main__][INFO] - agents played in iteration 180 are Bob, Alice +[2026-03-25 17:02:02,654][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:02:03,310][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:02:03,600][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:02:03,921][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:02:04,240][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:02:04,560][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:02:04,879][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:02:05,200][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:02:05,520][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:02:05,841][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:02:06,162][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:02:06,481][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:02:06,803][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:02:07,122][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:02:07,443][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:02:07,762][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:02:08,082][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:02:08,401][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:02:08,720][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:02:09,040][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:02:09,361][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:02:09,682][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:02:10,004][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:02:10,324][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:02:10,645][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:02:10,965][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:02:11,285][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:02:11,607][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:02:11,928][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:02:12,249][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:02:12,568][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:02:12,887][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:02:13,207][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:02:13,527][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:02:13,846][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:02:14,166][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:02:14,485][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:02:14,806][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:02:15,126][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:02:15,446][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:02:15,768][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:02:16,088][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:02:16,409][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:02:16,729][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:02:17,050][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:02:17,372][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:02:17,691][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:02:18,012][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:02:18,331][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:02:18,653][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:02:18,974][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:02:19,295][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:02:19,615][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:02:20,231][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:02:20,553][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:02:20,874][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:02:21,196][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:02:21,518][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:02:21,839][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:02:22,160][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:02:22,482][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:02:22,803][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:02:23,125][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:02:23,444][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:02:23,764][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:02:24,085][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:02:24,747][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:02:25,483][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:02:25,485][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:02:25,487][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:02:26,083][__main__][INFO] - Iteration 181 took 27s (12.23% Gen, 85.58% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 6m 27s. Estimated total time: 7h 35m 26s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 43s. +[2026-03-25 17:02:26,085][__main__][INFO] - Starting iteration 181. +[2026-03-25 17:02:26,088][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:02:26,089][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:02:29,309][__main__][INFO] - Number of regex retries in iteration 181: 0 +[2026-03-25 17:02:29,310][__main__][INFO] - agents played in iteration 181 are Bob, Alice +[2026-03-25 17:02:29,857][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:02:30,515][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:02:30,804][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:02:31,126][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:02:31,447][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:02:31,767][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:02:32,087][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:02:32,407][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:02:32,728][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:02:33,049][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:02:33,368][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:02:33,690][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:02:34,009][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:02:34,331][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:02:34,651][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:02:34,971][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:02:35,292][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:02:35,613][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:02:35,933][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:02:36,254][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:02:36,576][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:02:36,895][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:02:37,217][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:02:37,538][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:02:37,858][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:02:38,179][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:02:38,498][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:02:38,818][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:02:39,138][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:02:39,458][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:02:39,779][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:02:40,098][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:02:40,418][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:02:40,738][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:02:41,058][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:02:41,379][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:02:41,700][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:02:42,021][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:02:42,342][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:02:42,661][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:02:42,981][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:02:43,300][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:02:43,619][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:02:43,938][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:02:44,259][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:02:44,580][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:02:44,900][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:02:45,220][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:02:45,540][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:02:45,861][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:02:46,181][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:02:46,501][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:02:46,820][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:02:47,432][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:02:47,754][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:02:48,073][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:02:48,393][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:02:48,714][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:02:49,035][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:02:49,356][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:02:49,677][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:02:49,996][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:02:50,318][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:02:50,638][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:02:50,959][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:02:51,278][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:02:51,933][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:02:52,664][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:02:52,666][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:02:52,668][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:02:53,297][__main__][INFO] - Iteration 182 took 27s (11.84% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 4m 3s. Estimated total time: 7h 33m 29s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 44s. +[2026-03-25 17:02:53,299][__main__][INFO] - Starting iteration 182. +[2026-03-25 17:02:53,302][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 17:02:53,302][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:02:56,534][__main__][INFO] - Number of regex retries in iteration 182: 0 +[2026-03-25 17:02:56,535][__main__][INFO] - agents played in iteration 182 are Bob, Alice +[2026-03-25 17:02:57,117][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:02:57,773][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:02:58,062][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:02:58,382][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:02:58,704][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:02:59,024][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:02:59,343][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:02:59,664][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:02:59,986][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:03:00,305][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:03:00,625][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:03:00,946][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:03:01,265][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:03:01,587][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:03:01,908][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:03:02,229][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:03:02,551][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:03:02,871][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:03:03,193][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:03:03,513][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:03:03,834][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:03:04,154][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:03:04,474][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:03:04,795][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:03:05,115][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:03:05,435][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:03:05,755][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:03:06,076][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:03:06,397][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:03:06,718][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:03:07,038][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:03:07,358][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:03:07,678][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:03:07,999][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:03:08,319][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:03:08,639][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:03:08,960][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:03:09,281][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:03:09,604][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:03:09,926][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:03:10,247][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:03:10,569][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:03:10,891][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:03:11,212][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:03:11,531][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:03:11,853][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:03:12,174][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:03:12,496][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:03:12,816][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:03:13,136][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:03:13,458][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:03:13,779][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:03:14,101][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:03:14,718][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:03:15,039][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:03:15,359][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:03:15,682][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:03:16,002][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:03:16,322][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:03:16,642][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:03:16,963][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:03:17,284][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:03:17,604][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:03:17,924][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:03:18,245][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:03:18,567][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:03:19,230][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:03:19,957][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:03:19,959][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:03:19,961][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:03:20,592][__main__][INFO] - Iteration 183 took 27s (11.85% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 4m 58s. Estimated total time: 7h 34m 50s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 25s. +[2026-03-25 17:03:20,594][__main__][INFO] - Starting iteration 183. 
+[2026-03-25 17:03:20,597][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:03:20,598][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:03:23,996][__main__][INFO] - Number of regex retries in iteration 183: 0 +[2026-03-25 17:03:23,997][__main__][INFO] - agents played in iteration 183 are Bob, Alice +[2026-03-25 17:03:24,551][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:03:25,207][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:03:25,546][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:03:25,868][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:03:26,188][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:03:26,508][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:03:26,827][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:03:27,147][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:03:27,467][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:03:27,786][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:03:28,107][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:03:28,427][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:03:28,747][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:03:29,067][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:03:29,387][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:03:29,707][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:03:30,026][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:03:30,348][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:03:30,667][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:03:30,987][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:03:31,306][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:03:31,625][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:03:31,945][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:03:32,265][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:03:32,585][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:03:32,905][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:03:33,225][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:03:33,545][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:03:33,865][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:03:34,185][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:03:34,505][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:03:34,825][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:03:35,144][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:03:35,465][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:03:35,786][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:03:36,108][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:03:36,430][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:03:36,752][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:03:37,074][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:03:37,394][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:03:37,715][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:03:38,035][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:03:38,355][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:03:38,676][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:03:38,997][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:03:39,318][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:03:39,639][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:03:39,959][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:03:40,280][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:03:40,599][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:03:40,919][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:03:41,241][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:03:41,562][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:03:42,178][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:03:42,500][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:03:42,819][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:03:43,140][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:03:43,459][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:03:43,779][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:03:44,100][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:03:44,420][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:03:44,740][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:03:45,061][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:03:45,380][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:03:45,699][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:03:46,020][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:03:46,689][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:03:47,429][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:03:47,431][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:03:47,433][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:03:48,059][__main__][INFO] - Iteration 184 took 27s (12.38% Gen, 85.34% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 7m 22s. Estimated total time: 7h 37m 42s. 
Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 46s, 500 more iterations: 3h 48m 51s. +[2026-03-25 17:03:48,061][__main__][INFO] - Starting iteration 184. +[2026-03-25 17:03:48,064][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:03:48,064][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:03:51,339][__main__][INFO] - Number of regex retries in iteration 184: 0 +[2026-03-25 17:03:51,340][__main__][INFO] - agents played in iteration 184 are Bob, Alice +[2026-03-25 17:03:51,940][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:03:52,596][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:03:52,886][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:03:53,206][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:03:53,526][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:03:53,845][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:03:54,166][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:03:54,487][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:03:54,809][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:03:55,128][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:03:55,449][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:03:55,767][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:03:56,088][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:03:56,410][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:03:56,730][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:03:57,052][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:03:57,373][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:03:57,693][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:03:58,014][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:03:58,334][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:03:58,654][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:03:58,976][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:03:59,296][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:03:59,617][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:03:59,939][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:04:00,259][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:04:00,578][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:04:00,899][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:04:01,219][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:04:01,539][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:04:01,859][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:04:02,179][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:04:02,499][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:04:02,819][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:04:03,140][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:04:03,459][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:04:03,780][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:04:04,102][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:04:04,422][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:04:04,741][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:04:05,059][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:04:05,380][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:04:05,699][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:04:06,020][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:04:06,339][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:04:06,659][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:04:06,980][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:04:07,300][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:04:07,621][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:04:07,942][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:04:08,263][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:04:08,582][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:04:08,901][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:04:09,512][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:04:09,833][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:04:10,154][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:04:10,473][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:04:10,792][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:04:11,113][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:04:11,435][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:04:11,756][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:04:12,075][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:04:12,398][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:04:12,720][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:04:13,041][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:04:13,361][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:04:14,018][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:04:14,752][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:04:14,755][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:04:14,756][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:04:15,401][__main__][INFO] - Iteration 185 took 27s (11.98% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 4m 50s. Estimated total time: 7h 35m 38s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 49s. +[2026-03-25 17:04:15,404][__main__][INFO] - Starting iteration 185. +[2026-03-25 17:04:15,407][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:04:15,408][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:04:18,644][__main__][INFO] - Number of regex retries in iteration 185: 0 +[2026-03-25 17:04:18,645][__main__][INFO] - agents played in iteration 185 are Bob, Alice +[2026-03-25 17:04:19,213][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:04:19,869][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:04:20,158][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:04:20,480][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:04:20,799][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:04:21,121][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:04:21,440][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:04:21,761][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:04:22,082][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:04:22,403][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:04:22,722][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:04:23,042][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:04:23,361][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:04:23,681][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:04:24,000][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:04:24,321][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:04:24,642][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:04:24,962][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:04:25,281][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:04:25,602][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:04:25,921][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:04:26,240][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:04:26,560][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:04:26,881][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:04:27,202][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:04:27,523][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:04:27,844][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:04:28,165][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:04:28,485][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:04:28,805][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:04:29,126][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:04:29,446][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:04:29,766][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:04:30,085][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:04:30,405][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:04:30,725][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:04:31,045][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:04:31,366][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:04:31,685][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:04:32,005][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:04:32,324][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:04:32,645][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:04:32,965][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:04:33,285][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:04:33,605][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:04:33,926][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:04:34,246][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:04:34,566][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:04:34,886][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:04:35,205][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:04:35,526][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:04:35,846][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:04:36,165][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:04:36,780][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:04:37,100][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:04:37,419][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:04:37,739][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:04:38,060][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:04:38,380][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:04:38,701][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:04:39,022][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:04:39,342][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:04:39,662][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:04:39,984][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:04:40,304][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:04:40,624][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:04:41,283][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:04:42,015][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:04:42,018][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:04:42,019][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:04:42,646][__main__][INFO] - Iteration 186 took 27s (11.88% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 2m 45s. Estimated total time: 7h 34m 0s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 0s. +[2026-03-25 17:04:42,649][__main__][INFO] - Starting iteration 186. +[2026-03-25 17:04:42,652][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 17:04:42,652][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:04:45,907][__main__][INFO] - Number of regex retries in iteration 186: 0 +[2026-03-25 17:04:45,908][__main__][INFO] - agents played in iteration 186 are Bob, Alice +[2026-03-25 17:04:46,464][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:04:47,113][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:04:47,405][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:04:47,726][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:04:48,047][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:04:48,367][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:04:48,687][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:04:49,006][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:04:49,326][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:04:49,648][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:04:49,968][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:04:50,288][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:04:50,610][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:04:50,930][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:04:51,250][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:04:51,572][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:04:51,892][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:04:52,211][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:04:52,532][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:04:52,853][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:04:53,174][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:04:53,493][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:04:53,813][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:04:54,134][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:04:54,455][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:04:54,775][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:04:55,096][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:04:55,416][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:04:55,736][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:04:56,057][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:04:56,378][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:04:56,699][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:04:57,020][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:04:57,339][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:04:57,659][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:04:57,979][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:04:58,301][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:04:58,622][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:04:58,942][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:04:59,263][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:04:59,584][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:04:59,903][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:05:00,224][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:05:00,546][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:05:00,868][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:05:01,189][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:05:01,509][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:05:01,829][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:05:02,151][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:05:02,472][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:05:02,793][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:05:03,114][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:05:03,435][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:05:04,048][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:05:04,369][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:05:04,689][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:05:05,010][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:05:05,331][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:05:05,651][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:05:05,972][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:05:06,294][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:05:06,613][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:05:06,934][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:05:07,254][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:05:07,574][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:05:07,894][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:05:08,549][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:05:09,292][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:05:09,294][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:05:09,296][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:05:09,963][__main__][INFO] - Iteration 187 took 27s (11.92% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 3m 30s. Estimated total time: 7h 35m 12s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 36s. +[2026-03-25 17:05:09,965][__main__][INFO] - Starting iteration 187. 
+[2026-03-25 17:05:09,968][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:05:09,969][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:05:13,193][__main__][INFO] - Number of regex retries in iteration 187: 0 +[2026-03-25 17:05:13,194][__main__][INFO] - agents played in iteration 187 are Bob, Alice +[2026-03-25 17:05:13,745][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:05:14,395][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:05:14,685][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:05:15,006][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:05:15,327][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:05:15,647][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:05:15,967][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:05:16,289][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:05:16,609][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:05:16,930][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:05:17,253][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:05:17,573][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:05:17,893][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:05:18,214][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:05:18,534][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:05:18,856][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:05:19,176][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:05:19,497][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:05:19,817][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:05:20,138][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:05:20,461][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:05:20,780][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:05:21,101][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:05:21,420][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:05:21,741][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:05:22,062][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:05:22,383][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:05:22,703][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:05:23,024][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:05:23,343][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:05:23,664][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:05:23,984][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:05:24,304][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:05:24,625][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:05:24,946][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:05:25,267][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:05:25,587][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:05:25,907][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:05:26,228][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:05:26,549][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:05:26,868][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:05:27,188][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:05:27,508][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:05:27,827][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:05:28,148][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:05:28,468][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:05:28,787][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:05:29,107][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:05:29,426][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:05:29,747][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:05:30,068][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:05:30,387][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:05:30,706][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:05:31,319][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:05:31,639][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:05:31,959][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:05:32,280][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:05:32,600][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:05:32,921][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:05:33,241][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:05:33,562][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:05:33,881][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:05:34,202][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:05:34,523][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:05:34,844][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:05:35,165][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:05:35,819][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:05:36,549][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:05:36,552][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:05:36,553][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:05:37,218][__main__][INFO] - Iteration 188 took 27s (11.84% Gen, 85.72% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 2m 1s. Estimated total time: 7h 34m 10s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 5s. +[2026-03-25 17:05:37,220][__main__][INFO] - Starting iteration 188. +[2026-03-25 17:05:37,223][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:05:37,224][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:05:40,453][__main__][INFO] - Number of regex retries in iteration 188: 0 +[2026-03-25 17:05:40,454][__main__][INFO] - agents played in iteration 188 are Bob, Alice +[2026-03-25 17:05:40,996][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:05:41,646][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:05:41,935][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:05:42,257][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:05:42,578][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:05:42,898][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:05:43,217][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:05:43,538][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:05:43,859][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:05:44,179][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:05:44,498][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:05:44,819][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:05:45,140][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:05:45,460][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:05:45,781][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:05:46,101][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:05:46,421][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:05:46,742][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:05:47,064][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:05:47,385][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:05:47,705][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:05:48,024][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:05:48,343][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:05:48,665][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:05:48,985][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:05:49,305][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:05:49,627][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:05:49,950][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:05:50,271][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:05:50,592][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:05:50,913][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:05:51,233][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:05:51,554][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:05:51,875][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:05:52,195][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:05:52,517][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:05:52,838][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:05:53,159][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:05:53,479][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:05:53,799][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:05:54,118][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:05:54,439][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:05:54,759][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:05:55,079][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:05:55,399][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:05:55,721][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:05:56,041][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:05:56,361][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:05:56,680][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:05:57,000][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:05:57,320][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:05:57,641][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:05:57,961][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:05:58,581][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:05:58,902][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:05:59,222][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:05:59,543][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:05:59,864][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:06:00,185][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:06:00,505][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:06:00,825][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:06:01,146][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:06:01,467][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:06:01,787][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:06:02,106][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:06:02,426][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:06:03,089][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:06:03,829][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:06:03,832][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:06:03,834][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:06:04,470][__main__][INFO] - Iteration 189 took 27s (11.85% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 1m 31s. Estimated total time: 7h 34m 7s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 3s. +[2026-03-25 17:06:04,472][__main__][INFO] - Starting iteration 189. +[2026-03-25 17:06:04,475][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:06:04,476][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:06:07,723][__main__][INFO] - Number of regex retries in iteration 189: 0 +[2026-03-25 17:06:07,723][__main__][INFO] - agents played in iteration 189 are Bob, Alice +[2026-03-25 17:06:08,274][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:06:08,934][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:06:09,224][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:06:09,546][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:06:09,866][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:06:10,188][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:06:10,509][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:06:10,830][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:06:11,151][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:06:11,472][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:06:11,794][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:06:12,113][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:06:12,435][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:06:12,756][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:06:13,078][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:06:13,399][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:06:13,720][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:06:14,040][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:06:14,361][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:06:14,683][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:06:15,003][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:06:15,324][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:06:15,644][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:06:15,964][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:06:16,286][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:06:16,607][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:06:16,929][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:06:17,249][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:06:17,569][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:06:17,891][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:06:18,212][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:06:18,533][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:06:18,854][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:06:19,175][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:06:19,496][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:06:19,817][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:06:20,138][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:06:20,458][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:06:20,779][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:06:21,098][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:06:21,418][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:06:21,739][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:06:22,059][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:06:22,380][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:06:22,701][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:06:23,022][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:06:23,342][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:06:23,663][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:06:23,984][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:06:24,305][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:06:24,627][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:06:24,948][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:06:25,268][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:06:25,889][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:06:26,211][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:06:26,532][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:06:26,853][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:06:27,174][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:06:27,494][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:06:27,814][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:06:28,136][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:06:28,457][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:06:28,777][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:06:29,097][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:06:29,419][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:06:29,740][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:06:30,408][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:06:31,150][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:06:31,152][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:06:31,154][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:06:31,791][__main__][INFO] - Iteration 190 took 27s (11.89% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 2m 12s. Estimated total time: 7h 35m 16s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 38s. +[2026-03-25 17:06:31,793][__main__][INFO] - Starting iteration 190. +[2026-03-25 17:06:31,796][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 17:06:31,796][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:06:35,055][__main__][INFO] - Number of regex retries in iteration 190: 0 +[2026-03-25 17:06:35,056][__main__][INFO] - agents played in iteration 190 are Bob, Alice +[2026-03-25 17:06:35,615][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:06:36,278][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:06:36,569][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:06:36,891][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:06:37,212][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:06:37,534][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:06:37,854][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:06:38,176][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:06:38,496][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:06:38,818][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:06:39,138][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:06:39,461][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:06:39,781][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:06:40,102][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:06:40,423][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:06:40,744][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:06:41,065][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:06:41,384][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:06:41,706][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:06:42,026][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:06:42,347][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:06:42,668][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:06:42,987][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:06:43,308][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:06:43,628][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:06:43,948][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:06:44,267][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:06:44,588][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:06:44,909][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:06:45,230][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:06:45,551][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:06:45,873][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:06:46,194][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:06:46,513][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:06:46,835][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:06:47,157][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:06:47,478][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:06:47,797][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:06:48,118][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:06:48,439][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:06:48,759][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:06:49,079][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:06:49,400][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:06:49,721][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:06:50,040][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:06:50,362][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:06:50,681][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:06:51,003][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:06:51,322][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:06:51,641][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:06:51,962][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:06:52,282][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:06:52,603][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:06:53,223][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:06:53,544][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:06:53,865][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:06:54,185][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:06:54,506][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:06:54,826][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:06:55,147][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:06:55,468][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:06:55,787][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:06:56,107][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:06:56,428][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:06:56,749][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:06:57,069][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:06:57,734][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:06:58,471][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:06:58,474][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:06:58,475][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:06:59,111][__main__][INFO] - Iteration 191 took 27s (11.93% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 1m 44s. Estimated total time: 7h 35m 16s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 38s. +[2026-03-25 17:06:59,113][__main__][INFO] - Starting iteration 191. 
+[2026-03-25 17:06:59,116][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:06:59,117][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:07:02,366][__main__][INFO] - Number of regex retries in iteration 191: 0 +[2026-03-25 17:07:02,367][__main__][INFO] - agents played in iteration 191 are Bob, Alice +[2026-03-25 17:07:02,922][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:07:03,582][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:07:03,871][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:07:04,193][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:07:04,515][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:07:04,835][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:07:05,155][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:07:05,476][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:07:05,797][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:07:06,118][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:07:06,439][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:07:06,760][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:07:07,081][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:07:07,401][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:07:07,721][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:07:08,040][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:07:08,360][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:07:08,681][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:07:09,001][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:07:09,321][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:07:09,641][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:07:09,962][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:07:10,282][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:07:10,603][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:07:10,924][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:07:11,243][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:07:11,564][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:07:11,885][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:07:12,206][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:07:12,527][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:07:12,848][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:07:13,170][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:07:13,489][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:07:13,810][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:07:14,131][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:07:14,452][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:07:14,774][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:07:15,096][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:07:15,416][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:07:15,738][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:07:16,059][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:07:16,379][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:07:16,699][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:07:17,019][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:07:17,339][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:07:17,659][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:07:17,980][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:07:18,299][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:07:18,619][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:07:18,939][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:07:19,260][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:07:19,580][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:07:19,900][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:07:20,520][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:07:20,840][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:07:21,160][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:07:21,480][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:07:21,800][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:07:22,120][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:07:22,440][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:07:22,761][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:07:23,082][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:07:23,402][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:07:23,722][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:07:24,041][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:07:24,361][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:07:25,033][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:07:25,788][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:07:25,790][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:07:25,792][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:07:26,426][__main__][INFO] - Iteration 192 took 27s (11.90% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 1m 12s. Estimated total time: 7h 35m 10s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 35s. +[2026-03-25 17:07:26,428][__main__][INFO] - Starting iteration 192. +[2026-03-25 17:07:26,431][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:07:26,432][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:07:29,694][__main__][INFO] - Number of regex retries in iteration 192: 0 +[2026-03-25 17:07:29,695][__main__][INFO] - agents played in iteration 192 are Bob, Alice +[2026-03-25 17:07:30,244][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:07:30,905][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:07:31,195][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:07:31,516][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:07:31,835][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:07:32,156][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:07:32,477][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:07:32,798][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:07:33,119][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:07:33,440][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:07:33,761][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:07:34,082][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:07:34,403][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:07:34,724][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:07:35,045][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:07:35,365][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:07:35,686][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:07:36,006][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:07:36,327][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:07:36,647][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:07:36,968][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:07:37,289][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:07:37,610][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:07:37,931][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:07:38,253][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:07:38,575][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:07:38,896][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:07:39,217][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:07:39,539][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:07:39,859][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:07:40,180][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:07:40,500][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:07:40,820][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:07:41,140][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:07:41,461][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:07:41,781][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:07:42,100][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:07:42,420][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:07:42,740][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:07:43,061][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:07:43,382][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:07:43,702][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:07:44,021][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:07:44,341][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:07:44,661][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:07:44,981][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:07:45,303][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:07:45,622][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:07:45,941][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:07:46,262][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:07:46,582][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:07:46,902][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:07:47,222][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:07:47,839][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:07:48,160][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:07:48,479][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:07:48,799][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:07:49,119][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:07:49,440][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:07:49,761][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:07:50,080][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:07:50,402][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:07:50,722][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:07:51,043][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:07:51,364][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:07:51,685][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:07:52,349][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:07:53,080][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:07:53,082][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:07:53,084][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:07:53,713][__main__][INFO] - Iteration 193 took 27s (11.96% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 0m 16s. Estimated total time: 7h 34m 42s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 21s. +[2026-03-25 17:07:53,715][__main__][INFO] - Starting iteration 193. +[2026-03-25 17:07:53,718][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:07:53,719][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:07:56,966][__main__][INFO] - Number of regex retries in iteration 193: 0 +[2026-03-25 17:07:56,967][__main__][INFO] - agents played in iteration 193 are Bob, Alice +[2026-03-25 17:07:57,510][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:07:58,159][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:07:58,449][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:07:58,769][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:07:59,089][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:07:59,409][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:07:59,730][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:08:00,050][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:08:00,371][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:08:00,692][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:08:01,014][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:08:01,334][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:08:01,655][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:08:01,976][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:08:02,296][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:08:02,616][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:08:02,935][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:08:03,255][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:08:03,575][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:08:03,897][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:08:04,219][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:08:04,539][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:08:04,859][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:08:05,180][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:08:05,500][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:08:05,821][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:08:06,142][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:08:06,461][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:08:06,782][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:08:07,102][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:08:07,421][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:08:07,741][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:08:08,061][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:08:08,382][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:08:08,701][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:08:09,021][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:08:09,340][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:08:09,661][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:08:09,982][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:08:10,301][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:08:10,622][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:08:10,941][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:08:11,262][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:08:11,583][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:08:11,902][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:08:12,222][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:08:12,542][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:08:12,861][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:08:13,181][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:08:13,502][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:08:13,823][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:08:14,144][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:08:14,465][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:08:15,080][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:08:15,400][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:08:15,720][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:08:16,041][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:08:16,363][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:08:16,684][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:08:17,004][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:08:17,325][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:08:17,645][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:08:17,966][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:08:18,286][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:08:18,605][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:08:18,926][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:08:19,582][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:08:20,309][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:08:20,312][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:08:20,313][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:08:20,942][__main__][INFO] - Iteration 194 took 27s (11.93% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 58m 51s. Estimated total time: 7h 33m 44s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 52s. +[2026-03-25 17:08:20,944][__main__][INFO] - Starting iteration 194. +[2026-03-25 17:08:20,947][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 17:08:20,948][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:08:24,221][__main__][INFO] - Number of regex retries in iteration 194: 0 +[2026-03-25 17:08:24,222][__main__][INFO] - agents played in iteration 194 are Bob, Alice +[2026-03-25 17:08:24,766][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:08:25,417][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:08:25,707][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:08:26,028][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:08:26,348][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:08:26,669][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:08:26,988][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:08:27,308][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:08:27,630][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:08:27,950][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:08:28,270][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:08:28,591][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:08:28,910][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:08:29,232][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:08:29,552][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:08:29,874][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:08:30,194][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:08:30,514][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:08:30,835][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:08:31,155][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:08:31,475][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:08:31,795][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:08:32,115][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:08:32,434][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:08:32,755][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:08:33,075][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:08:33,394][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:08:33,715][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:08:34,034][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:08:34,355][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:08:34,676][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:08:34,997][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:08:35,317][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:08:35,638][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:08:35,959][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:08:36,278][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:08:36,599][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:08:36,920][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:08:37,240][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:08:37,559][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:08:37,880][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:08:38,201][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:08:38,521][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:08:38,840][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:08:39,161][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:08:39,480][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:08:39,800][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:08:40,121][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:08:40,441][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:08:40,761][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:08:41,082][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:08:41,402][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:08:41,723][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:08:42,341][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:08:42,663][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:08:42,985][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:08:43,306][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:08:43,628][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:08:43,947][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:08:44,267][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:08:44,588][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:08:44,908][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:08:45,228][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:08:45,548][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:08:45,867][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:08:46,187][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:08:46,851][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:08:47,591][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:08:47,594][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:08:47,595][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:08:48,228][__main__][INFO] - Iteration 195 took 27s (12.00% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 59m 21s. Estimated total time: 7h 34m 42s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 21s. +[2026-03-25 17:08:48,231][__main__][INFO] - Starting iteration 195. 
+[2026-03-25 17:08:48,234][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:08:48,235][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:08:51,493][__main__][INFO] - Number of regex retries in iteration 195: 0 +[2026-03-25 17:08:51,494][__main__][INFO] - agents played in iteration 195 are Bob, Alice +[2026-03-25 17:08:52,089][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:08:52,739][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:08:53,030][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:08:53,353][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:08:53,674][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:08:53,995][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:08:54,316][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:08:54,636][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:08:54,955][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:08:55,275][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:08:55,595][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:08:55,916][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:08:56,237][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:08:56,556][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:08:56,877][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:08:57,198][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:08:57,520][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:08:57,840][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:08:58,161][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:08:58,480][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:08:58,801][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:08:59,121][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:08:59,442][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:08:59,763][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:09:00,084][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:09:00,404][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:09:00,724][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:09:01,045][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:09:01,366][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:09:01,686][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:09:02,007][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:09:02,328][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:09:02,649][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:09:02,971][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:09:03,292][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:09:03,613][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:09:03,935][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:09:04,257][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:09:04,579][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:09:04,902][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:09:05,224][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:09:05,545][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:09:05,865][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:09:06,184][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:09:06,504][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:09:06,825][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:09:07,145][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:09:07,465][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:09:07,786][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:09:08,107][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:09:08,427][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:09:08,747][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:09:09,068][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:09:09,682][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:09:10,004][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:09:10,325][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:09:10,646][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:09:10,966][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:09:11,287][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:09:11,608][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:09:11,929][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:09:12,250][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:09:12,571][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:09:12,892][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:09:13,212][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:09:13,534][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:09:14,189][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:09:14,921][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:09:14,923][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:09:14,925][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:09:15,553][__main__][INFO] - Iteration 196 took 27s (11.93% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 59m 32s. Estimated total time: 7h 35m 20s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 40s. +[2026-03-25 17:09:15,555][__main__][INFO] - Starting iteration 196. +[2026-03-25 17:09:15,558][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:09:15,559][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:09:18,910][__main__][INFO] - Number of regex retries in iteration 196: 0 +[2026-03-25 17:09:18,911][__main__][INFO] - agents played in iteration 196 are Bob, Alice +[2026-03-25 17:09:19,463][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:09:20,115][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:09:20,405][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:09:20,725][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:09:21,045][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:09:21,366][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:09:21,685][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:09:22,005][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:09:22,324][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:09:22,645][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:09:22,966][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:09:23,287][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:09:23,608][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:09:23,929][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:09:24,251][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:09:24,571][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:09:24,892][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:09:25,213][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:09:25,536][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:09:25,856][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:09:26,176][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:09:26,497][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:09:26,819][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:09:27,140][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:09:27,460][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:09:27,780][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:09:28,099][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:09:28,420][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:09:28,741][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:09:29,062][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:09:29,381][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:09:29,701][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:09:30,021][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:09:30,341][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:09:30,661][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:09:30,981][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:09:31,300][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:09:31,621][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:09:31,941][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:09:32,261][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:09:32,581][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:09:32,900][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:09:33,221][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:09:33,541][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:09:33,861][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:09:34,181][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:09:34,501][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:09:34,822][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:09:35,141][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:09:35,461][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:09:35,782][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:09:36,102][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:09:36,421][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:09:37,035][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:09:37,356][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:09:37,678][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:09:37,999][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:09:38,320][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:09:38,639][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:09:38,961][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:09:39,282][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:09:39,601][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:09:39,921][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:09:40,241][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:09:40,561][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:09:40,881][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:09:41,535][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:09:42,275][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:09:42,277][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:09:42,279][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:09:42,958][__main__][INFO] - Iteration 197 took 27s (12.23% Gen, 85.28% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 0m 25s. Estimated total time: 7h 36m 40s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 20s. +[2026-03-25 17:09:42,961][__main__][INFO] - Starting iteration 197. +[2026-03-25 17:09:42,964][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:09:42,964][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:09:46,309][__main__][INFO] - Number of regex retries in iteration 197: 0 +[2026-03-25 17:09:46,310][__main__][INFO] - agents played in iteration 197 are Bob, Alice +[2026-03-25 17:09:46,892][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:09:47,544][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:09:47,834][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:09:48,156][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:09:48,477][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:09:48,797][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:09:49,118][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:09:49,440][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:09:49,761][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:09:50,080][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:09:50,400][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:09:50,720][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:09:51,042][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:09:51,364][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:09:51,686][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:09:52,006][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:09:52,327][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:09:52,647][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:09:52,968][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:09:53,289][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:09:53,610][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:09:53,932][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:09:54,252][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:09:54,573][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:09:54,894][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:09:55,215][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:09:55,534][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:09:55,854][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:09:56,175][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:09:56,497][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:09:56,818][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:09:57,139][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:09:57,460][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:09:57,780][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:09:58,101][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:09:58,422][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:09:58,742][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:09:59,063][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:09:59,385][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:09:59,705][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:10:00,025][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:10:00,347][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:10:00,667][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:10:00,987][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:10:01,307][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:10:01,629][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:10:01,949][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:10:02,270][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:10:02,592][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:10:02,912][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:10:03,232][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:10:03,555][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:10:03,874][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:10:04,488][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:10:04,808][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:10:05,128][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:10:05,448][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:10:05,767][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:10:06,087][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:10:06,408][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:10:06,728][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:10:07,049][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:10:07,369][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:10:07,690][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:10:08,012][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:10:08,333][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:10:08,987][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:10:09,732][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:10:09,734][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:10:09,736][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:10:10,365][__main__][INFO] - Iteration 198 took 27s (12.21% Gen, 85.49% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 0m 0s. Estimated total time: 7h 36m 42s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 21s. +[2026-03-25 17:10:10,368][__main__][INFO] - Starting iteration 198. +[2026-03-25 17:10:10,370][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. 
+[2026-03-25 17:10:10,371][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:10:13,693][__main__][INFO] - Number of regex retries in iteration 198: 0 +[2026-03-25 17:10:13,694][__main__][INFO] - agents played in iteration 198 are Bob, Alice +[2026-03-25 17:10:14,343][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:10:15,003][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:10:15,292][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:10:15,613][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:10:15,933][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:10:16,255][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:10:16,575][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:10:16,896][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:10:17,217][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:10:17,538][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:10:17,859][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:10:18,180][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:10:18,500][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:10:18,820][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:10:19,140][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:10:19,461][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:10:19,781][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:10:20,101][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:10:20,423][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:10:20,744][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:10:21,064][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:10:21,383][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:10:21,705][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:10:22,025][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:10:22,346][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:10:22,667][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:10:22,988][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:10:23,308][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:10:23,629][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:10:23,951][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:10:24,274][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:10:24,593][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:10:24,913][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:10:25,234][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:10:25,554][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:10:25,876][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:10:26,198][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:10:26,519][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:10:26,839][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:10:27,160][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:10:27,481][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:10:27,803][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:10:28,122][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:10:28,442][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:10:28,762][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:10:29,082][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:10:29,403][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:10:29,725][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:10:30,044][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:10:30,364][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:10:30,685][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:10:31,005][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:10:31,326][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:10:31,944][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:10:32,264][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:10:32,585][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:10:32,905][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:10:33,226][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:10:33,546][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:10:33,866][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:10:34,187][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:10:34,506][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:10:34,827][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:10:35,148][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:10:35,467][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:10:35,788][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:10:36,443][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:10:37,182][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:10:37,184][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:10:37,186][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:10:37,816][__main__][INFO] - Iteration 199 took 27s (12.11% Gen, 85.59% Train). Generation: 3s, Training: 23s. Estimated remaining time: 6h 0m 16s. Estimated total time: 7h 37m 26s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 44s, 500 more iterations: 3h 48m 43s. +[2026-03-25 17:10:37,819][__main__][INFO] - Starting iteration 199. 
+[2026-03-25 17:10:37,822][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:10:37,822][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:10:41,117][__main__][INFO] - Number of regex retries in iteration 199: 0 +[2026-03-25 17:10:41,118][__main__][INFO] - agents played in iteration 199 are Bob, Alice +[2026-03-25 17:10:41,682][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:10:42,342][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:10:42,632][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:10:42,953][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:10:43,273][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:10:43,593][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:10:43,913][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:10:44,234][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:10:44,555][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:10:44,875][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:10:45,196][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:10:45,515][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:10:45,836][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:10:46,158][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:10:46,479][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:10:46,799][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:10:47,121][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:10:47,442][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:10:47,763][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:10:48,084][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:10:48,404][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:10:48,725][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:10:49,046][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:10:49,368][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:10:49,687][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:10:50,008][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:10:50,328][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:10:50,648][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:10:50,970][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:10:51,290][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:10:51,612][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:10:51,933][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:10:52,254][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:10:52,574][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:10:52,895][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:10:53,215][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:10:53,535][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:10:53,855][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:10:54,176][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:10:54,497][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:10:54,818][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:10:55,139][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:10:55,460][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:10:55,781][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:10:56,100][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:10:56,420][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:10:56,741][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:10:57,061][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:10:57,382][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:10:57,702][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:10:58,023][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:10:58,343][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:10:58,666][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:10:59,277][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:10:59,596][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:10:59,917][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:11:00,238][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:11:00,559][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:11:00,879][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:11:01,198][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:11:01,519][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:11:01,840][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:11:02,162][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:11:02,482][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:11:02,803][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:11:03,124][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:11:03,776][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:11:04,521][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:11:04,523][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:11:04,525][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:11:05,155][__main__][INFO] - Iteration 200 took 27s (12.06% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 57m 56s. Estimated total time: 7h 35m 34s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 47s. +[2026-03-25 17:11:05,157][__main__][INFO] - Starting iteration 200. +[2026-03-25 17:11:05,160][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 3 and human policies 1. +[2026-03-25 17:11:05,161][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:11:08,419][__main__][INFO] - Number of regex retries in iteration 200: 0 +[2026-03-25 17:11:08,420][__main__][INFO] - agents played in iteration 200 are Bob, Alice +[2026-03-25 17:11:08,959][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:11:09,608][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:11:09,897][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:11:10,219][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:11:10,541][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:11:10,862][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:11:11,182][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:11:11,504][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:11:11,825][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:11:12,146][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:11:12,467][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:11:12,788][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:11:13,109][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:11:13,429][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:11:13,749][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:11:14,068][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:11:14,390][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:11:14,712][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:11:15,033][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:11:15,354][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:11:15,675][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:11:15,996][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:11:16,318][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:11:16,638][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:11:16,960][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:11:17,279][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:11:17,599][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:11:17,920][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:11:18,242][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:11:18,563][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:11:18,884][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:11:19,205][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:11:19,527][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:11:19,847][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:11:20,168][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:11:20,489][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:11:20,809][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:11:21,130][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:11:21,451][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:11:21,773][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:11:22,092][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:11:22,414][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:11:22,735][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:11:23,057][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:11:23,379][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:11:23,699][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:11:24,020][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:11:24,342][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:11:24,662][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:11:24,983][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:11:25,302][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:11:25,623][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:11:25,944][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:11:26,556][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:11:26,877][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:11:27,198][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:11:27,519][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:11:27,838][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:11:28,159][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:11:28,480][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:11:28,799][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:11:29,120][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:11:29,441][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:11:29,760][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:11:30,082][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:11:30,403][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:11:31,056][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:11:31,796][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:11:31,798][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:11:31,800][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:11:36,922][__main__][INFO] - Iteration 201 took 31s (10.26% Gen, 73.61% Train). Generation: 3s, Training: 23s. Estimated remaining time: 7h 11m 14s. Estimated total time: 8h 49m 23s. Time estimates for 10 more iterations: 5m 17s, 100 more iterations: 52m 56s, 500 more iterations: 4h 24m 41s. +[2026-03-25 17:11:36,924][__main__][INFO] - Starting iteration 201. +[2026-03-25 17:11:36,928][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:11:36,928][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:11:40,197][__main__][INFO] - Number of regex retries in iteration 201: 0 +[2026-03-25 17:11:40,198][__main__][INFO] - agents played in iteration 201 are Bob, Alice +[2026-03-25 17:11:40,746][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:11:41,407][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:11:41,697][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:11:42,018][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:11:42,338][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:11:42,659][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:11:42,980][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:11:43,300][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:11:43,620][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:11:43,941][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:11:44,260][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:11:44,582][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:11:44,902][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:11:45,222][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:11:45,542][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:11:45,863][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:11:46,184][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:11:46,505][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:11:46,824][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:11:47,145][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:11:47,467][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:11:47,787][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:11:48,107][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:11:48,427][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:11:48,746][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:11:49,067][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:11:49,388][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:11:49,707][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:11:50,028][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:11:50,350][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:11:50,669][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:11:50,991][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:11:51,313][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:11:51,635][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:11:51,956][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:11:52,276][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:11:52,597][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:11:52,918][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:11:53,237][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:11:53,559][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:11:53,880][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:11:54,199][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:11:54,519][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:11:54,840][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:11:55,161][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:11:55,482][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:11:55,802][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:11:56,122][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:11:56,443][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:11:56,764][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:11:57,083][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:11:57,403][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:11:57,723][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:11:58,335][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:11:58,658][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:11:58,979][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:11:59,299][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:11:59,621][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:11:59,941][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:12:00,260][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:12:00,580][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:12:00,902][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:12:01,223][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:12:01,542][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:12:01,862][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:12:02,181][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:12:02,836][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:12:03,576][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:12:03,578][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:12:03,580][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:12:04,253][__main__][INFO] - Iteration 202 took 27s (11.96% Gen, 85.57% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 56m 49s. Estimated total time: 7h 35m 26s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 43s. +[2026-03-25 17:12:04,255][__main__][INFO] - Starting iteration 202. +[2026-03-25 17:12:04,258][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:12:04,259][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:12:07,525][__main__][INFO] - Number of regex retries in iteration 202: 0 +[2026-03-25 17:12:07,526][__main__][INFO] - agents played in iteration 202 are Bob, Alice +[2026-03-25 17:12:08,066][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:12:08,716][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:12:09,006][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:12:09,327][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:12:09,646][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:12:09,966][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:12:10,287][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:12:10,607][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:12:10,928][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:12:11,249][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:12:11,569][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:12:11,889][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:12:12,209][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:12:12,530][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:12:12,850][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:12:13,172][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:12:13,493][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:12:13,814][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:12:14,133][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:12:14,454][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:12:14,775][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:12:15,095][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:12:15,415][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:12:15,736][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:12:16,059][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:12:16,380][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:12:16,700][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:12:17,020][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:12:17,341][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:12:17,663][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:12:17,984][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:12:18,304][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:12:18,625][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:12:18,945][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:12:19,265][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:12:19,587][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:12:19,909][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:12:20,231][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:12:20,552][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:12:20,875][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:12:21,196][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:12:21,517][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:12:21,837][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:12:22,158][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:12:22,479][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:12:22,799][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:12:23,121][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:12:23,441][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:12:23,762][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:12:24,084][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:12:24,407][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:12:24,730][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:12:25,051][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:12:25,670][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:12:25,992][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:12:26,313][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:12:26,635][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:12:26,955][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:12:27,276][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:12:27,596][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:12:27,917][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:12:28,237][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:12:28,559][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:12:28,879][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:12:29,200][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:12:29,521][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:12:30,178][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:12:30,917][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:12:30,919][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:12:30,921][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:12:31,551][__main__][INFO] - Iteration 203 took 27s (11.97% Gen, 85.71% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 55m 50s. Estimated total time: 7h 34m 54s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 27s. +[2026-03-25 17:12:31,554][__main__][INFO] - Starting iteration 203. 
+[2026-03-25 17:12:31,557][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:12:31,557][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:12:34,831][__main__][INFO] - Number of regex retries in iteration 203: 0 +[2026-03-25 17:12:34,831][__main__][INFO] - agents played in iteration 203 are Bob, Alice +[2026-03-25 17:12:35,394][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:12:36,044][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:12:36,335][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:12:36,658][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:12:36,977][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:12:37,297][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:12:37,618][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:12:37,938][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:12:38,258][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:12:38,579][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:12:38,900][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:12:39,221][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:12:39,540][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:12:39,860][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:12:40,181][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:12:40,500][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:12:40,821][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:12:41,140][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:12:41,460][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:12:41,780][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:12:42,099][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:12:42,418][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:12:42,739][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:12:43,059][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:12:43,380][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:12:43,699][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:12:44,019][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:12:44,340][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:12:44,660][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:12:44,981][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:12:45,301][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:12:45,622][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:12:45,942][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:12:46,264][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:12:46,583][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:12:46,904][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:12:47,225][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:12:47,546][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:12:47,867][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:12:48,187][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:12:48,508][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:12:48,828][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:12:49,147][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:12:49,468][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:12:49,788][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:12:50,108][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:12:50,429][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:12:50,750][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:12:51,070][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:12:51,390][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:12:51,711][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:12:52,031][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:12:52,351][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:12:52,963][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:12:53,284][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:12:53,605][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:12:53,925][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:12:54,245][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:12:54,567][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:12:54,888][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:12:55,209][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:12:55,528][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:12:55,850][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:12:56,172][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:12:56,492][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:12:56,813][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:12:57,467][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:12:58,209][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:12:58,212][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:12:58,213][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:12:58,889][__main__][INFO] - Iteration 204 took 27s (11.98% Gen, 85.54% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 56m 2s. Estimated total time: 7h 35m 33s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 46s. +[2026-03-25 17:12:58,891][__main__][INFO] - Starting iteration 204. +[2026-03-25 17:12:58,894][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:12:58,895][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:13:02,118][__main__][INFO] - Number of regex retries in iteration 204: 0 +[2026-03-25 17:13:02,118][__main__][INFO] - agents played in iteration 204 are Bob, Alice +[2026-03-25 17:13:02,666][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:13:03,317][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:13:03,606][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:13:03,928][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:13:04,249][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:13:04,568][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:13:04,889][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:13:05,209][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:13:05,530][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:13:05,850][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:13:06,172][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:13:06,493][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:13:06,815][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:13:07,136][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:13:07,457][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:13:07,778][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:13:08,099][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:13:08,420][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:13:08,741][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:13:09,062][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:13:09,383][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:13:09,704][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:13:10,026][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:13:10,346][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:13:10,667][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:13:10,989][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:13:11,309][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:13:11,629][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:13:11,951][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:13:12,272][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:13:12,595][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:13:12,915][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:13:13,236][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:13:13,559][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:13:13,880][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:13:14,201][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:13:14,522][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:13:14,843][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:13:15,165][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:13:15,486][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:13:15,806][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:13:16,127][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:13:16,448][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:13:16,769][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:13:17,088][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:13:17,410][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:13:17,731][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:13:18,054][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:13:18,374][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:13:18,695][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:13:19,016][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:13:19,338][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:13:19,658][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:13:20,275][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:13:20,597][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:13:20,917][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:13:21,239][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:13:21,561][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:13:21,882][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:13:22,203][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:13:22,523][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:13:22,844][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:13:23,165][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:13:23,487][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:13:23,807][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:13:24,129][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:13:24,782][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:13:25,520][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:13:25,523][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:13:25,525][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:13:26,154][__main__][INFO] - Iteration 205 took 27s (11.82% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 54m 22s. Estimated total time: 7h 34m 21s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 10s. +[2026-03-25 17:13:26,158][__main__][INFO] - Starting iteration 205. +[2026-03-25 17:13:26,161][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:13:26,161][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:13:29,455][__main__][INFO] - Number of regex retries in iteration 205: 0 +[2026-03-25 17:13:29,456][__main__][INFO] - agents played in iteration 205 are Bob, Alice +[2026-03-25 17:13:30,013][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:13:30,661][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:13:30,951][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:13:31,273][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:13:31,593][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:13:31,914][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:13:32,235][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:13:32,555][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:13:32,876][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:13:33,195][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:13:33,516][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:13:33,837][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:13:34,156][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:13:34,477][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:13:34,799][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:13:35,120][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:13:35,441][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:13:35,760][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:13:36,081][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:13:36,403][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:13:36,722][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:13:37,042][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:13:37,363][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:13:37,682][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:13:38,003][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:13:38,323][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:13:38,645][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:13:38,965][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:13:39,286][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:13:39,606][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:13:39,926][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:13:40,246][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:13:40,567][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:13:40,888][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:13:41,209][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:13:41,529][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:13:41,849][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:13:42,169][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:13:42,490][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:13:42,811][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:13:43,130][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:13:43,450][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:13:43,772][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:13:44,091][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:13:44,412][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:13:44,733][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:13:45,055][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:13:45,375][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:13:45,697][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:13:46,019][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:13:46,339][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:13:46,659][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:13:46,979][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:13:47,590][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:13:47,911][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:13:48,230][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:13:48,550][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:13:48,871][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:13:49,191][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:13:49,511][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:13:49,833][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:13:50,153][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:13:50,476][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:13:50,795][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:13:51,116][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:13:51,437][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:13:52,090][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:13:52,831][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:13:52,833][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:13:52,835][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:13:53,511][__main__][INFO] - Iteration 206 took 27s (12.04% Gen, 85.48% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 55m 25s. Estimated total time: 7h 35m 51s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 55s. +[2026-03-25 17:13:53,514][__main__][INFO] - Starting iteration 206. +[2026-03-25 17:13:53,517][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:13:53,517][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:13:56,784][__main__][INFO] - Number of regex retries in iteration 206: 0 +[2026-03-25 17:13:56,785][__main__][INFO] - agents played in iteration 206 are Bob, Alice +[2026-03-25 17:13:57,324][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:13:57,972][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:13:58,262][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:13:58,583][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:13:58,903][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:13:59,223][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:13:59,542][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:13:59,864][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:14:00,185][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:14:00,506][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:14:00,828][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:14:01,148][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:14:01,468][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:14:01,789][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:14:02,110][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:14:02,432][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:14:02,754][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:14:03,076][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:14:03,397][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:14:03,717][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:14:04,038][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:14:04,360][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:14:04,680][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:14:05,001][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:14:05,321][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:14:05,641][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:14:05,960][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:14:06,281][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:14:06,600][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:14:06,921][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:14:07,240][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:14:07,560][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:14:07,880][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:14:08,202][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:14:08,521][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:14:08,842][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:14:09,162][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:14:09,482][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:14:09,801][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:14:10,120][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:14:10,441][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:14:10,763][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:14:11,082][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:14:11,402][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:14:11,722][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:14:12,042][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:14:12,364][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:14:12,685][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:14:13,005][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:14:13,327][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:14:13,647][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:14:13,967][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:14:14,288][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:14:14,900][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:14:15,221][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:14:15,541][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:14:15,861][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:14:16,181][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:14:16,501][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:14:16,821][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:14:17,142][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:14:17,464][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:14:17,784][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:14:18,104][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:14:18,424][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:14:18,743][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:14:19,395][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:14:20,131][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:14:20,133][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:14:20,135][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:14:20,763][__main__][INFO] - Iteration 207 took 27s (11.99% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 53m 14s. Estimated total time: 7h 34m 7s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 3s. +[2026-03-25 17:14:20,765][__main__][INFO] - Starting iteration 207. 
+[2026-03-25 17:14:20,768][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:14:20,769][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:14:24,042][__main__][INFO] - Number of regex retries in iteration 207: 0 +[2026-03-25 17:14:24,043][__main__][INFO] - agents played in iteration 207 are Bob, Alice +[2026-03-25 17:14:24,585][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:14:25,233][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:14:25,522][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:14:25,844][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:14:26,165][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:14:26,485][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:14:26,806][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:14:27,126][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:14:27,447][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:14:27,766][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:14:28,086][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:14:28,407][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:14:28,728][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:14:29,047][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:14:29,366][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:14:29,688][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:14:30,008][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:14:30,329][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:14:30,648][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:14:30,969][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:14:31,289][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:14:31,609][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:14:31,929][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:14:32,250][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:14:32,570][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:14:32,890][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:14:33,211][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:14:33,531][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:14:33,851][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:14:34,172][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:14:34,493][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:14:34,813][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:14:35,134][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:14:35,456][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:14:35,776][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:14:36,097][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:14:36,419][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:14:36,741][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:14:37,062][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:14:37,383][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:14:37,702][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:14:38,022][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:14:38,343][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:14:38,663][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:14:38,982][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:14:39,302][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:14:39,623][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:14:39,944][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:14:40,264][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:14:40,584][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:14:40,905][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:14:41,225][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:14:41,546][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:14:42,161][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:14:42,480][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:14:42,800][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:14:43,121][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:14:43,441][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:14:43,761][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:14:44,083][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:14:44,402][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:14:44,722][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:14:45,042][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:14:45,362][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:14:45,683][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:14:46,002][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:14:46,653][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:14:47,393][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:14:47,396][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:14:47,397][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:14:48,086][__main__][INFO] - Iteration 208 took 27s (11.99% Gen, 85.49% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 53m 58s. Estimated total time: 7h 35m 19s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 39s. +[2026-03-25 17:14:48,089][__main__][INFO] - Starting iteration 208. +[2026-03-25 17:14:48,092][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:14:48,092][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:14:51,444][__main__][INFO] - Number of regex retries in iteration 208: 0 +[2026-03-25 17:14:51,445][__main__][INFO] - agents played in iteration 208 are Bob, Alice +[2026-03-25 17:14:52,000][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:14:52,660][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:14:52,953][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:14:53,274][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:14:53,596][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:14:53,918][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:14:54,239][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:14:54,561][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:14:54,880][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:14:55,200][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:14:55,521][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:14:55,843][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:14:56,164][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:14:56,485][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:14:56,806][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:14:57,128][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:14:57,448][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:14:57,769][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:14:58,089][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:14:58,410][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:14:58,730][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:14:59,051][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:14:59,372][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:14:59,693][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:15:00,014][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:15:00,334][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:15:00,655][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:15:00,976][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:15:01,298][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:15:01,620][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:15:01,941][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:15:02,262][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:15:02,583][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:15:02,905][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:15:03,226][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:15:03,547][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:15:03,867][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:15:04,187][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:15:04,506][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:15:04,827][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:15:05,147][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:15:05,468][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:15:05,787][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:15:06,107][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:15:06,427][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:15:06,747][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:15:07,067][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:15:07,388][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:15:07,709][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:15:08,030][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:15:08,349][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:15:08,670][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:15:08,990][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:15:09,602][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:15:09,922][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:15:10,243][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:15:10,563][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:15:10,884][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:15:11,204][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:15:11,524][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:15:11,844][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:15:12,163][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:15:12,484][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:15:12,804][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:15:13,123][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:15:13,443][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:15:14,096][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:15:14,839][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:15:14,841][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:15:14,843][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:15:15,529][__main__][INFO] - Iteration 209 took 27s (12.22% Gen, 85.27% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 55m 30s. Estimated total time: 7h 37m 18s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 43s, 500 more iterations: 3h 48m 39s. +[2026-03-25 17:15:15,532][__main__][INFO] - Starting iteration 209. +[2026-03-25 17:15:15,535][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:15:15,536][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:15:18,829][__main__][INFO] - Number of regex retries in iteration 209: 0 +[2026-03-25 17:15:18,830][__main__][INFO] - agents played in iteration 209 are Bob, Alice +[2026-03-25 17:15:19,391][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:15:20,050][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:15:20,341][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:15:20,662][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:15:20,981][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:15:21,302][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:15:21,623][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:15:21,944][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:15:22,265][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:15:22,585][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:15:22,905][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:15:23,225][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:15:23,546][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:15:23,868][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:15:24,188][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:15:24,508][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:15:24,828][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:15:25,148][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:15:25,469][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:15:25,790][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:15:26,112][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:15:26,431][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:15:26,751][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:15:27,072][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:15:27,394][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:15:27,715][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:15:28,036][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:15:28,358][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:15:28,679][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:15:28,999][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:15:29,319][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:15:29,639][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:15:29,960][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:15:30,281][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:15:30,601][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:15:30,922][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:15:31,242][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:15:31,563][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:15:31,884][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:15:32,204][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:15:32,524][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:15:32,845][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:15:33,165][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:15:33,485][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:15:33,807][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:15:34,126][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:15:34,447][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:15:34,768][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:15:35,087][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:15:35,409][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:15:35,730][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:15:36,051][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:15:36,372][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:15:36,987][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:15:37,309][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:15:37,630][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:15:37,950][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:15:38,272][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:15:38,593][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:15:38,914][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:15:39,235][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:15:39,557][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:15:39,878][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:15:40,200][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:15:40,521][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:15:40,843][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:15:41,501][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:15:42,239][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:15:42,241][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:15:42,243][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:15:42,917][__main__][INFO] - Iteration 210 took 27s (12.03% Gen, 85.50% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 54m 7s. Estimated total time: 7h 36m 22s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 38s, 500 more iterations: 3h 48m 11s. +[2026-03-25 17:15:42,919][__main__][INFO] - Starting iteration 210. +[2026-03-25 17:15:42,922][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:15:42,923][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:15:46,125][__main__][INFO] - Number of regex retries in iteration 210: 0 +[2026-03-25 17:15:46,125][__main__][INFO] - agents played in iteration 210 are Bob, Alice +[2026-03-25 17:15:46,669][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:15:47,328][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:15:47,619][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:15:47,940][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:15:48,260][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:15:48,581][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:15:48,901][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:15:49,221][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:15:49,541][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:15:49,863][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:15:50,183][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:15:50,505][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:15:50,825][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:15:51,146][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:15:51,467][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:15:51,786][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:15:52,106][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:15:52,428][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:15:52,749][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:15:53,070][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:15:53,391][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:15:53,711][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:15:54,031][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:15:54,353][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:15:54,674][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:15:54,994][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:15:55,314][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:15:55,636][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:15:55,957][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:15:56,278][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:15:56,599][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:15:56,920][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:15:57,239][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:15:57,560][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:15:57,881][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:15:58,203][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:15:58,523][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:15:58,842][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:15:59,163][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:15:59,484][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:15:59,803][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:16:00,123][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:16:00,443][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:16:00,763][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:16:01,082][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:16:01,401][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:16:01,721][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:16:02,041][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:16:02,361][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:16:02,680][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:16:03,001][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:16:03,320][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:16:03,641][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:16:04,257][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:16:04,578][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:16:04,899][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:16:05,218][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:16:05,540][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:16:05,861][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:16:06,180][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:16:06,500][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:16:06,820][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:16:07,140][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:16:07,461][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:16:07,781][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:16:08,102][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:16:08,762][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:16:09,514][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:16:09,516][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:16:09,518][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:16:10,203][__main__][INFO] - Iteration 211 took 27s (11.74% Gen, 85.74% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 51m 59s. Estimated total time: 7h 34m 41s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 20s. +[2026-03-25 17:16:10,205][__main__][INFO] - Starting iteration 211. 
+[2026-03-25 17:16:10,208][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:16:10,209][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:16:13,370][__main__][INFO] - Number of regex retries in iteration 211: 0 +[2026-03-25 17:16:13,371][__main__][INFO] - agents played in iteration 211 are Bob, Alice +[2026-03-25 17:16:13,908][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:16:14,558][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:16:14,846][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:16:15,168][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:16:15,489][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:16:15,811][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:16:16,132][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:16:16,455][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:16:16,776][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:16:17,095][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:16:17,416][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:16:17,736][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:16:18,057][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:16:18,380][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:16:18,700][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:16:19,020][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:16:19,340][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:16:19,662][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:16:19,981][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:16:20,301][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:16:20,622][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:16:20,942][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:16:21,263][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:16:21,583][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:16:21,903][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:16:22,222][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:16:22,543][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:16:22,863][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:16:23,182][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:16:23,503][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:16:23,823][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:16:24,143][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:16:24,463][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:16:24,785][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:16:25,106][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:16:25,427][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:16:25,747][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:16:26,067][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:16:26,388][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:16:26,708][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:16:27,030][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:16:27,349][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:16:27,668][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:16:27,989][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:16:28,311][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:16:28,634][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:16:28,956][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:16:29,279][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:16:29,600][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:16:29,921][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:16:30,242][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:16:30,564][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:16:30,886][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:16:31,500][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:16:31,821][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:16:32,141][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:16:32,461][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:16:32,782][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:16:33,101][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:16:33,421][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:16:33,741][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:16:34,063][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:16:34,382][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:16:34,702][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:16:35,023][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:16:35,343][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:16:35,996][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:16:36,733][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:16:36,735][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:16:36,737][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:16:37,367][__main__][INFO] - Iteration 212 took 27s (11.64% Gen, 86.03% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 49m 29s. Estimated total time: 7h 32m 39s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 15s, 500 more iterations: 3h 46m 19s. +[2026-03-25 17:16:37,369][__main__][INFO] - Starting iteration 212. +[2026-03-25 17:16:37,372][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:16:37,372][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:16:40,590][__main__][INFO] - Number of regex retries in iteration 212: 0 +[2026-03-25 17:16:40,590][__main__][INFO] - agents played in iteration 212 are Bob, Alice +[2026-03-25 17:16:41,205][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:16:41,853][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:16:42,144][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:16:42,465][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:16:42,786][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:16:43,108][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:16:43,428][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:16:43,748][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:16:44,069][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:16:44,389][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:16:44,709][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:16:45,030][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:16:45,350][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:16:45,671][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:16:45,993][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:16:46,313][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:16:46,634][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:16:46,955][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:16:47,277][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:16:47,598][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:16:47,919][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:16:48,240][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:16:48,559][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:16:48,881][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:16:49,200][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:16:49,520][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:16:49,841][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:16:50,164][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:16:50,483][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:16:50,804][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:16:51,125][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:16:51,446][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:16:51,767][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:16:52,087][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:16:52,407][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:16:52,728][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:16:53,049][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:16:53,369][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:16:53,690][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:16:54,010][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:16:54,330][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:16:54,651][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:16:54,972][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:16:55,294][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:16:55,616][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:16:55,937][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:16:56,258][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:16:56,579][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:16:56,900][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:16:57,220][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:16:57,540][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:16:57,861][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:16:58,182][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:16:58,794][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:16:59,114][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:16:59,435][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:16:59,756][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:17:00,076][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:17:00,398][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:17:00,719][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:17:01,040][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:17:01,360][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:17:01,681][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:17:02,001][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:17:02,321][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:17:02,641][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:17:03,293][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:17:04,032][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:17:04,034][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:17:04,036][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:17:04,739][__main__][INFO] - Iteration 213 took 27s (11.76% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 52m 31s. Estimated total time: 7h 36m 8s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 36s, 500 more iterations: 3h 48m 4s. +[2026-03-25 17:17:04,742][__main__][INFO] - Starting iteration 213. +[2026-03-25 17:17:04,745][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:17:04,745][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:17:07,939][__main__][INFO] - Number of regex retries in iteration 213: 0 +[2026-03-25 17:17:07,940][__main__][INFO] - agents played in iteration 213 are Bob, Alice +[2026-03-25 17:17:08,479][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:17:09,126][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:17:09,416][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:17:09,737][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:17:10,058][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:17:10,377][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:17:10,697][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:17:11,018][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:17:11,337][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:17:11,658][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:17:11,980][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:17:12,299][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:17:12,620][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:17:12,941][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:17:13,262][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:17:13,581][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:17:13,902][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:17:14,222][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:17:14,543][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:17:14,865][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:17:15,186][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:17:15,505][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:17:15,826][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:17:16,148][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:17:16,469][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:17:16,789][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:17:17,109][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:17:17,431][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:17:17,753][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:17:18,074][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:17:18,396][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:17:18,718][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:17:19,040][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:17:19,360][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:17:19,681][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:17:20,001][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:17:20,324][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:17:20,645][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:17:20,967][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:17:21,288][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:17:21,609][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:17:21,930][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:17:22,250][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:17:22,571][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:17:22,892][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:17:23,214][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:17:23,536][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:17:23,857][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:17:24,178][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:17:24,498][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:17:24,818][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:17:25,139][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:17:25,460][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:17:26,077][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:17:26,397][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:17:26,718][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:17:27,040][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:17:27,359][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:17:27,680][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:17:28,001][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:17:28,322][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:17:28,643][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:17:28,964][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:17:29,285][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:17:29,606][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:17:29,927][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:17:30,588][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:17:31,334][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:17:31,337][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:17:31,338][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:17:31,974][__main__][INFO] - Iteration 214 took 27s (11.73% Gen, 85.93% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 49m 46s. Estimated total time: 7h 33m 50s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 17:17:31,977][__main__][INFO] - Starting iteration 214. +[2026-03-25 17:17:31,980][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:17:31,980][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:17:35,205][__main__][INFO] - Number of regex retries in iteration 214: 0 +[2026-03-25 17:17:35,206][__main__][INFO] - agents played in iteration 214 are Bob, Alice +[2026-03-25 17:17:35,820][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:17:36,468][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:17:36,761][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:17:37,083][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:17:37,402][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:17:37,723][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:17:38,042][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:17:38,362][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:17:38,684][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:17:39,003][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:17:39,324][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:17:39,643][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:17:39,962][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:17:40,283][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:17:40,603][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:17:40,922][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:17:41,243][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:17:41,562][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:17:41,882][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:17:42,202][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:17:42,522][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:17:42,841][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:17:43,161][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:17:43,482][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:17:43,802][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:17:44,123][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:17:44,442][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:17:44,762][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:17:45,082][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:17:45,402][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:17:45,722][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:17:46,042][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:17:46,363][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:17:46,682][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:17:47,002][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:17:47,321][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:17:47,641][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:17:47,961][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:17:48,282][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:17:48,604][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:17:48,924][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:17:49,245][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:17:49,566][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:17:49,887][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:17:50,209][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:17:50,529][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:17:50,848][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:17:51,169][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:17:51,491][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:17:51,811][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:17:52,131][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:17:52,451][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:17:52,772][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:17:53,383][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:17:53,703][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:17:54,022][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:17:54,342][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:17:54,663][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:17:54,984][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:17:55,304][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:17:55,625][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:17:55,946][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:17:56,265][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:17:56,586][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:17:56,907][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:17:57,229][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:17:57,880][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:17:58,617][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:17:58,619][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:17:58,621][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:17:59,316][__main__][INFO] - Iteration 215 took 27s (11.80% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 51m 6s. Estimated total time: 7h 35m 37s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 48s. +[2026-03-25 17:17:59,319][__main__][INFO] - Starting iteration 215. 
+[2026-03-25 17:17:59,322][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:17:59,323][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:18:02,564][__main__][INFO] - Number of regex retries in iteration 215: 0 +[2026-03-25 17:18:02,565][__main__][INFO] - agents played in iteration 215 are Bob, Alice +[2026-03-25 17:18:03,180][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:18:03,836][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:18:04,129][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:18:04,449][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:18:04,769][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:18:05,090][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:18:05,412][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:18:05,733][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:18:06,054][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:18:06,376][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:18:06,698][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:18:07,019][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:18:07,340][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:18:07,660][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:18:07,979][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:18:08,301][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:18:08,620][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:18:08,941][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:18:09,260][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:18:09,580][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:18:09,901][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:18:10,221][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:18:10,542][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:18:10,862][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:18:11,182][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:18:11,503][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:18:11,824][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:18:12,145][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:18:12,464][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:18:12,783][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:18:13,103][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:18:13,424][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:18:13,744][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:18:14,065][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:18:14,386][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:18:14,706][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:18:15,027][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:18:15,347][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:18:15,668][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:18:15,991][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:18:16,313][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:18:16,634][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:18:16,957][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:18:17,278][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:18:17,600][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:18:17,920][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:18:18,241][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:18:18,562][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:18:18,882][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:18:19,202][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:18:19,522][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:18:19,842][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:18:20,163][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:18:20,780][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:18:21,102][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:18:21,422][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:18:21,742][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:18:22,063][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:18:22,382][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:18:22,703][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:18:23,022][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:18:23,342][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:18:23,663][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:18:23,984][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:18:24,304][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:18:24,625][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:18:25,285][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:18:26,027][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:18:26,029][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:18:26,030][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:18:26,729][__main__][INFO] - Iteration 216 took 27s (11.83% Gen, 85.62% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 51m 48s. Estimated total time: 7h 36m 47s. 
Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 23s. +[2026-03-25 17:18:26,731][__main__][INFO] - Starting iteration 216. +[2026-03-25 17:18:26,734][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:18:26,735][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:18:30,241][__main__][INFO] - Number of regex retries in iteration 216: 0 +[2026-03-25 17:18:30,242][__main__][INFO] - agents played in iteration 216 are Bob, Alice +[2026-03-25 17:18:30,889][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:18:31,538][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:18:31,829][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:18:32,150][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:18:32,469][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:18:32,790][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:18:33,110][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:18:33,431][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:18:33,751][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:18:34,073][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:18:34,394][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:18:34,716][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:18:35,036][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:18:35,357][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:18:35,676][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:18:35,998][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:18:36,319][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:18:36,640][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:18:36,961][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:18:37,281][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:18:37,601][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:18:37,920][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:18:38,241][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:18:38,560][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:18:38,881][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:18:39,202][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:18:39,521][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:18:39,841][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:18:40,161][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:18:40,481][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:18:40,801][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:18:41,121][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:18:41,441][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:18:41,762][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:18:42,082][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:18:42,403][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:18:42,724][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:18:43,044][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:18:43,365][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:18:43,685][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:18:44,005][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:18:44,326][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:18:44,647][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:18:44,968][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:18:45,289][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:18:45,608][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:18:45,929][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:18:46,251][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:18:46,571][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:18:46,890][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:18:47,209][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:18:47,530][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:18:47,849][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:18:48,460][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:18:48,780][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:18:49,101][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:18:49,420][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:18:49,740][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:18:50,061][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:18:50,381][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:18:50,700][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:18:51,020][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:18:51,342][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:18:51,664][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:18:51,984][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:18:52,306][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:18:52,958][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:18:53,690][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:18:53,692][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:18:53,693][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:18:54,373][__main__][INFO] - Iteration 217 took 27s (12.69% Gen, 84.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 55m 12s. Estimated total time: 7h 40m 39s. Time estimates for 10 more iterations: 4m 36s, 100 more iterations: 46m 3s, 500 more iterations: 3h 50m 19s. +[2026-03-25 17:18:54,375][__main__][INFO] - Starting iteration 217. +[2026-03-25 17:18:54,378][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:18:54,378][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:18:57,619][__main__][INFO] - Number of regex retries in iteration 217: 0 +[2026-03-25 17:18:57,619][__main__][INFO] - agents played in iteration 217 are Bob, Alice +[2026-03-25 17:18:58,159][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:18:58,809][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:18:59,098][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:18:59,418][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:18:59,737][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:19:00,058][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:19:00,378][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:19:00,698][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:19:01,020][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:19:01,340][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:19:01,661][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:19:01,982][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:19:02,301][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:19:02,621][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:19:02,941][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:19:03,263][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:19:03,584][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:19:03,903][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:19:04,224][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:19:04,543][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:19:04,865][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:19:05,186][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:19:05,507][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:19:05,827][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:19:06,149][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:19:06,469][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:19:06,789][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:19:07,109][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:19:07,430][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:19:07,749][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:19:08,069][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:19:08,388][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:19:08,707][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:19:09,028][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:19:09,348][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:19:09,668][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:19:09,990][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:19:10,310][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:19:10,631][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:19:10,950][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:19:11,270][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:19:11,591][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:19:11,910][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:19:12,229][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:19:12,550][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:19:12,871][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:19:13,192][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:19:13,513][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:19:13,834][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:19:14,155][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:19:14,476][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:19:14,797][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:19:15,118][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:19:15,728][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:19:16,047][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:19:16,368][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:19:16,690][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:19:17,009][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:19:17,330][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:19:17,650][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:19:17,969][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:19:18,288][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:19:18,608][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:19:18,928][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:19:19,249][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:19:19,569][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:19:20,222][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:19:20,953][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:19:20,955][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:19:20,957][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:19:21,643][__main__][INFO] - Iteration 218 took 27s (11.89% Gen, 85.59% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 48m 32s. Estimated total time: 7h 34m 26s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 13s. +[2026-03-25 17:19:21,646][__main__][INFO] - Starting iteration 218. +[2026-03-25 17:19:21,648][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:19:21,649][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:19:24,914][__main__][INFO] - Number of regex retries in iteration 218: 0 +[2026-03-25 17:19:24,915][__main__][INFO] - agents played in iteration 218 are Bob, Alice +[2026-03-25 17:19:25,522][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:19:26,177][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:19:26,467][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:19:26,788][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:19:27,108][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:19:27,427][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:19:27,746][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:19:28,067][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:19:28,387][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:19:28,707][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:19:29,028][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:19:29,348][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:19:29,667][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:19:29,988][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:19:30,310][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:19:30,629][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:19:30,950][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:19:31,269][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:19:31,589][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:19:31,909][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:19:32,230][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:19:32,549][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:19:32,871][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:19:33,190][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:19:33,510][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:19:33,829][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:19:34,149][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:19:34,468][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:19:34,790][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:19:35,109][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:19:35,429][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:19:35,750][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:19:36,070][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:19:36,389][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:19:36,708][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:19:37,029][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:19:37,348][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:19:37,669][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:19:37,989][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:19:38,309][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:19:38,630][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:19:38,950][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:19:39,271][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:19:39,590][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:19:39,911][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:19:40,230][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:19:40,549][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:19:40,868][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:19:41,187][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:19:41,507][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:19:41,826][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:19:42,145][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:19:42,466][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:19:43,089][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:19:43,410][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:19:43,732][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:19:44,054][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:19:44,376][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:19:44,697][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:19:45,018][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:19:45,338][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:19:45,658][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:19:45,978][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:19:46,298][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:19:46,618][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:19:46,937][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:19:47,590][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:19:48,327][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:19:48,329][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:19:48,331][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:19:49,029][__main__][INFO] - Iteration 219 took 27s (11.93% Gen, 85.52% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 49m 59s. Estimated total time: 7h 36m 21s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 38s, 500 more iterations: 3h 48m 10s. +[2026-03-25 17:19:49,031][__main__][INFO] - Starting iteration 219. 
+[2026-03-25 17:19:49,034][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:19:49,034][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:19:52,254][__main__][INFO] - Number of regex retries in iteration 219: 0 +[2026-03-25 17:19:52,255][__main__][INFO] - agents played in iteration 219 are Bob, Alice +[2026-03-25 17:19:52,797][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:19:53,446][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:19:53,735][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:19:54,057][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:19:54,378][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:19:54,697][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:19:55,017][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:19:55,337][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:19:55,658][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:19:55,977][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:19:56,297][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:19:56,617][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:19:56,937][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:19:57,258][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:19:57,578][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:19:57,897][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:19:58,218][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:19:58,537][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:19:58,858][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:19:59,177][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:19:59,498][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:19:59,819][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:20:00,138][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:20:00,457][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:20:00,778][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:20:01,096][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:20:01,416][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:20:01,736][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:20:02,058][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:20:02,377][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:20:02,698][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:20:03,017][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:20:03,337][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:20:03,658][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:20:03,977][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:20:04,298][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:20:04,619][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:20:04,940][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:20:05,261][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:20:05,581][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:20:05,902][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:20:06,222][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:20:06,543][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:20:06,862][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:20:07,182][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:20:07,503][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:20:07,823][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:20:08,143][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:20:08,464][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:20:08,784][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:20:09,104][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:20:09,423][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:20:09,743][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:20:10,355][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:20:10,675][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:20:10,995][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:20:11,316][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:20:11,637][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:20:11,957][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:20:12,277][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:20:12,597][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:20:12,918][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:20:13,239][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:20:13,560][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:20:13,881][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:20:14,201][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:20:14,853][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:20:15,600][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:20:15,602][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:20:15,604][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:20:16,234][__main__][INFO] - Iteration 220 took 27s (11.84% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 46m 32s. Estimated total time: 7h 33m 21s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 40s. +[2026-03-25 17:20:16,237][__main__][INFO] - Starting iteration 220. +[2026-03-25 17:20:16,240][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:20:16,240][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:20:19,908][__main__][INFO] - Number of regex retries in iteration 220: 0 +[2026-03-25 17:20:19,909][__main__][INFO] - agents played in iteration 220 are Bob, Alice +[2026-03-25 17:20:20,448][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:20:21,096][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:20:21,387][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:20:21,707][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:20:22,025][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:20:22,345][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:20:22,665][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:20:22,985][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:20:23,304][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:20:23,623][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:20:23,942][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:20:24,262][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:20:24,581][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:20:24,901][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:20:25,222][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:20:25,542][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:20:25,862][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:20:26,184][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:20:26,503][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:20:26,823][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:20:27,144][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:20:27,464][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:20:27,785][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:20:28,106][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:20:28,428][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:20:28,747][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:20:29,068][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:20:29,389][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:20:29,709][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:20:30,030][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:20:30,350][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:20:30,672][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:20:30,993][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:20:31,314][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:20:31,635][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:20:31,958][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:20:32,279][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:20:32,599][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:20:32,919][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:20:33,240][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:20:33,559][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:20:33,880][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:20:34,201][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:20:34,523][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:20:34,843][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:20:35,165][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:20:35,485][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:20:35,806][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:20:36,126][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:20:36,447][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:20:36,766][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:20:37,087][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:20:37,407][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:20:38,023][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:20:38,344][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:20:38,665][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:20:38,986][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:20:39,309][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:20:39,628][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:20:39,947][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:20:40,268][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:20:40,588][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:20:40,907][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:20:41,228][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:20:41,547][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:20:41,866][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:20:42,529][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:20:43,273][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:20:43,275][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:20:43,277][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:20:43,915][__main__][INFO] - Iteration 221 took 27s (13.26% Gen, 84.43% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 54m 0s. Estimated total time: 7h 41m 16s. Time estimates for 10 more iterations: 4m 36s, 100 more iterations: 46m 7s, 500 more iterations: 3h 50m 38s. +[2026-03-25 17:20:43,917][__main__][INFO] - Starting iteration 221. +[2026-03-25 17:20:43,920][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:20:43,921][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:20:47,169][__main__][INFO] - Number of regex retries in iteration 221: 0 +[2026-03-25 17:20:47,170][__main__][INFO] - agents played in iteration 221 are Bob, Alice +[2026-03-25 17:20:47,717][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:20:48,373][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:20:48,664][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:20:48,986][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:20:49,307][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:20:49,626][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:20:49,946][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:20:50,268][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:20:50,589][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:20:50,908][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:20:51,228][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:20:51,548][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:20:51,868][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:20:52,187][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:20:52,507][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:20:52,827][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:20:53,147][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:20:53,468][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:20:53,787][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:20:54,108][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:20:54,427][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:20:54,746][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:20:55,066][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:20:55,385][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:20:55,704][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:20:56,024][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:20:56,343][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:20:56,664][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:20:56,983][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:20:57,303][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:20:57,622][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:20:57,942][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:20:58,262][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:20:58,582][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:20:58,902][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:20:59,223][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:20:59,542][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:20:59,864][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:21:00,184][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:21:00,505][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:21:00,825][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:21:01,145][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:21:01,464][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:21:01,784][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:21:02,105][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:21:02,426][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:21:02,746][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:21:03,067][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:21:03,388][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:21:03,708][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:21:04,029][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:21:04,350][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:21:04,669][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:21:05,284][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:21:05,604][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:21:05,923][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:21:06,243][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:21:06,563][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:21:06,883][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:21:07,203][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:21:07,523][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:21:07,842][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:21:08,163][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:21:08,484][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:21:08,803][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:21:09,124][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:21:09,784][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:21:10,525][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:21:10,527][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:21:10,529][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:21:11,165][__main__][INFO] - Iteration 222 took 27s (11.93% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 46m 22s. Estimated total time: 7h 34m 5s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 2s. +[2026-03-25 17:21:11,167][__main__][INFO] - Starting iteration 222. +[2026-03-25 17:21:11,170][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:21:11,171][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:21:14,409][__main__][INFO] - Number of regex retries in iteration 222: 0 +[2026-03-25 17:21:14,410][__main__][INFO] - agents played in iteration 222 are Bob, Alice +[2026-03-25 17:21:14,956][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:21:15,614][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:21:16,415][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:21:16,735][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:21:17,057][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:21:17,377][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:21:17,697][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:21:18,019][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:21:18,339][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:21:18,660][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:21:18,980][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:21:19,300][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:21:19,619][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:21:19,939][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:21:20,258][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:21:20,579][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:21:20,899][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:21:21,220][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:21:21,541][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:21:21,863][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:21:22,185][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:21:22,506][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:21:22,827][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:21:23,148][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:21:23,469][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:21:23,790][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:21:24,110][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:21:24,429][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:21:24,750][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:21:25,071][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:21:25,393][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:21:25,714][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:21:26,035][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:21:26,355][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:21:26,676][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:21:26,996][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:21:27,315][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:21:27,636][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:21:27,956][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:21:28,277][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:21:28,597][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:21:28,918][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:21:29,239][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:21:29,560][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:21:29,881][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:21:30,202][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:21:30,522][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:21:30,842][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:21:31,163][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:21:31,483][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:21:31,802][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:21:32,123][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:21:32,444][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:21:33,060][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:21:33,382][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:21:33,704][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:21:34,024][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:21:34,345][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:21:34,665][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:21:34,986][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:21:35,305][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:21:35,626][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:21:35,946][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:21:36,267][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:21:36,588][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:21:36,907][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:21:37,566][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:21:38,303][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:21:38,306][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:21:38,307][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:21:38,938][__main__][INFO] - Iteration 223 took 27s (11.66% Gen, 86.06% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 54m 37s. Estimated total time: 7h 42m 48s. Time estimates for 10 more iterations: 4m 37s, 100 more iterations: 46m 16s, 500 more iterations: 3h 51m 24s. +[2026-03-25 17:21:38,940][__main__][INFO] - Starting iteration 223. 
+[2026-03-25 17:21:38,943][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:21:38,944][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:21:42,196][__main__][INFO] - Number of regex retries in iteration 223: 0 +[2026-03-25 17:21:42,197][__main__][INFO] - agents played in iteration 223 are Bob, Alice +[2026-03-25 17:21:42,736][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:21:43,385][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:21:43,675][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:21:43,997][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:21:44,318][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:21:44,638][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:21:44,958][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:21:45,278][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:21:45,598][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:21:45,918][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:21:46,238][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:21:46,559][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:21:46,881][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:21:47,202][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:21:47,522][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:21:47,840][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:21:48,161][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:21:48,481][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:21:48,801][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:21:49,121][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:21:49,440][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:21:49,762][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:21:50,083][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:21:50,403][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:21:50,721][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:21:51,041][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:21:51,362][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:21:51,681][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:21:52,001][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:21:52,321][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:21:52,641][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:21:52,961][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:21:53,280][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:21:53,601][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:21:53,923][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:21:54,244][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:21:54,564][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:21:54,883][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:21:55,204][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:21:55,525][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:21:55,845][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:21:56,164][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:21:56,487][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:21:56,807][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:21:57,127][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:21:57,448][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:21:57,767][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:21:58,088][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:21:58,408][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:21:58,728][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:21:59,048][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:21:59,369][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:21:59,689][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:22:00,299][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:22:00,619][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:22:00,940][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:22:01,261][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:22:01,581][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:22:01,900][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:22:02,220][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:22:02,541][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:22:02,861][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:22:03,180][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:22:03,500][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:22:03,819][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:22:04,139][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:22:05,511][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 17:22:06,248][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:22:06,250][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:22:06,252][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:22:06,881][__main__][INFO] - Iteration 224 took 27s (11.64% Gen, 86.10% Train). Generation: 3s, Training: 24s. Estimated remaining time: 5h 56m 59s. Estimated total time: 7h 45m 38s. 
Time estimates for 10 more iterations: 4m 39s, 100 more iterations: 46m 33s, 500 more iterations: 3h 52m 49s. +[2026-03-25 17:22:06,883][__main__][INFO] - Starting iteration 224. +[2026-03-25 17:22:06,886][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:22:06,887][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:22:10,131][__main__][INFO] - Number of regex retries in iteration 224: 0 +[2026-03-25 17:22:10,131][__main__][INFO] - agents played in iteration 224 are Bob, Alice +[2026-03-25 17:22:10,704][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:22:11,353][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:22:11,643][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:22:11,965][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:22:12,286][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:22:12,607][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:22:12,930][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:22:13,252][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:22:13,573][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:22:13,893][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:22:14,214][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:22:14,534][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:22:14,855][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:22:15,176][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:22:15,496][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:22:15,816][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:22:16,136][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:22:16,456][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:22:16,777][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:22:17,097][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:22:17,416][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:22:17,735][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:22:18,056][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:22:18,379][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:22:18,700][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:22:19,020][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:22:19,339][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:22:19,660][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:22:19,980][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:22:20,300][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:22:20,621][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:22:20,941][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:22:21,262][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:22:21,582][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:22:21,901][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:22:22,221][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:22:22,542][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:22:22,862][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:22:23,183][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:22:23,503][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:22:23,822][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:22:24,142][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:22:24,463][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:22:24,784][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:22:25,103][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:22:25,424][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:22:25,744][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:22:26,065][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:22:26,384][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:22:26,703][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:22:27,023][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:22:27,342][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:22:27,663][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:22:28,273][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:22:28,594][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:22:28,915][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:22:29,235][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:22:29,556][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:22:29,877][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:22:30,198][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:22:30,521][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:22:30,841][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:22:31,162][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:22:31,481][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:22:31,801][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:22:32,120][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:22:32,775][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:22:33,511][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:22:33,513][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:22:33,515][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:22:34,184][__main__][INFO] - Iteration 225 took 27s (11.89% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 45m 52s. Estimated total time: 7h 34m 59s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 29s. +[2026-03-25 17:22:34,187][__main__][INFO] - Starting iteration 225. +[2026-03-25 17:22:34,190][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:22:34,190][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:22:37,509][__main__][INFO] - Number of regex retries in iteration 225: 0 +[2026-03-25 17:22:37,510][__main__][INFO] - agents played in iteration 225 are Bob, Alice +[2026-03-25 17:22:38,080][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:22:38,726][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:22:39,016][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:22:39,337][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:22:39,656][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:22:39,975][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:22:40,295][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:22:40,616][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:22:40,935][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:22:41,255][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:22:41,575][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:22:41,894][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:22:42,213][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:22:42,534][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:22:42,854][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:22:43,173][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:22:43,494][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:22:43,815][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:22:44,135][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:22:44,457][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:22:44,778][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:22:45,096][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:22:45,416][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:22:45,736][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:22:46,057][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:22:46,377][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:22:46,696][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:22:47,015][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:22:47,335][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:22:47,656][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:22:47,976][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:22:48,297][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:22:48,617][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:22:48,937][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:22:49,258][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:22:49,578][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:22:49,897][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:22:50,218][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:22:50,537][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:22:50,858][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:22:51,178][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:22:51,499][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:22:51,819][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:22:52,139][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:22:52,460][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:22:52,779][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:22:53,098][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:22:53,419][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:22:53,739][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:22:54,060][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:22:54,381][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:22:54,700][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:22:55,021][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:22:55,631][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:22:55,952][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:22:56,272][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:22:56,591][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:22:56,910][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:22:57,229][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:22:57,548][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:22:57,868][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:22:58,189][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:22:58,509][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:22:58,828][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:22:59,150][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:22:59,470][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:23:00,121][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:23:00,858][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:23:00,860][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:23:00,862][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:23:01,494][__main__][INFO] - Iteration 226 took 27s (12.16% Gen, 85.52% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 45m 31s. Estimated total time: 7h 35m 5s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 32s. +[2026-03-25 17:23:01,496][__main__][INFO] - Starting iteration 226. +[2026-03-25 17:23:01,499][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:23:01,500][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:23:04,757][__main__][INFO] - Number of regex retries in iteration 226: 0 +[2026-03-25 17:23:04,758][__main__][INFO] - agents played in iteration 226 are Bob, Alice +[2026-03-25 17:23:05,338][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:23:05,996][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:23:06,286][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:23:06,607][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:23:06,928][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:23:07,248][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:23:07,569][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:23:07,889][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:23:08,210][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:23:08,532][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:23:08,851][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:23:09,173][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:23:09,496][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:23:09,816][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:23:10,137][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:23:10,458][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:23:10,780][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:23:11,099][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:23:11,420][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:23:11,742][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:23:12,061][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:23:12,381][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:23:12,700][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:23:13,020][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:23:13,341][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:23:13,662][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:23:13,983][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:23:14,303][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:23:14,624][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:23:14,942][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:23:15,262][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:23:15,583][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:23:15,902][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:23:16,221][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:23:16,540][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:23:16,860][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:23:17,178][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:23:17,500][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:23:17,819][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:23:18,138][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:23:18,458][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:23:18,778][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:23:19,099][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:23:19,418][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:23:19,737][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:23:20,058][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:23:20,379][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:23:20,701][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:23:21,022][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:23:21,342][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:23:21,661][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:23:21,982][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:23:22,301][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:23:22,910][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:23:23,230][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:23:23,550][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:23:23,870][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:23:24,191][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:23:24,509][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:23:24,829][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:23:25,149][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:23:25,470][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:23:25,789][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:23:26,109][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:23:26,428][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:23:26,748][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:23:27,398][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:23:28,132][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:23:28,134][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:23:28,136][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:23:28,813][__main__][INFO] - Iteration 227 took 27s (11.93% Gen, 85.59% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 45m 13s. Estimated total time: 7h 35m 14s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 37s. +[2026-03-25 17:23:28,815][__main__][INFO] - Starting iteration 227. 
+[2026-03-25 17:23:28,818][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:23:28,819][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:23:32,905][__main__][INFO] - Number of regex retries in iteration 227: 0 +[2026-03-25 17:23:32,906][__main__][INFO] - agents played in iteration 227 are Bob, Alice +[2026-03-25 17:23:33,466][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:23:34,111][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:23:34,402][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:23:34,724][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:23:35,043][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:23:35,362][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:23:35,683][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:23:36,002][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:23:36,323][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:23:36,642][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:23:36,962][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:23:37,281][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:23:37,600][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:23:37,921][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:23:38,241][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:23:38,562][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:23:38,882][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:23:39,202][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:23:39,521][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:23:39,841][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:23:40,160][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:23:40,481][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:23:40,800][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:23:41,120][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:23:41,441][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:23:41,761][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:23:42,081][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:23:42,400][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:23:42,722][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:23:43,042][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:23:43,362][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:23:43,682][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:23:44,002][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:23:44,321][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:23:44,641][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:23:44,961][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:23:45,281][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:23:45,601][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:23:45,922][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:23:46,243][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:23:46,562][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:23:46,883][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:23:47,204][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:23:47,524][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:23:47,843][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:23:48,164][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:23:48,483][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:23:48,802][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:23:49,124][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:23:49,445][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:23:49,764][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:23:50,085][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:23:50,406][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:23:51,021][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:23:51,342][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:23:51,663][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:23:51,983][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:23:52,302][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:23:52,623][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:23:52,943][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:23:53,264][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:23:53,583][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:23:53,902][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:23:54,223][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:23:54,544][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:23:54,863][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:23:55,522][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:23:56,268][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:23:56,271][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:23:56,273][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:23:56,902][__main__][INFO] - Iteration 228 took 28s (14.55% Gen, 83.20% Train). Generation: 4s, Training: 23s. Estimated remaining time: 5h 57m 35s. Estimated total time: 7h 48m 4s. 
Time estimates for 10 more iterations: 4m 40s, 100 more iterations: 46m 48s, 500 more iterations: 3h 54m 2s. +[2026-03-25 17:23:56,904][__main__][INFO] - Starting iteration 228. +[2026-03-25 17:23:56,907][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:23:56,908][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:24:00,150][__main__][INFO] - Number of regex retries in iteration 228: 0 +[2026-03-25 17:24:00,151][__main__][INFO] - agents played in iteration 228 are Bob, Alice +[2026-03-25 17:24:00,692][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:24:01,337][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:24:01,628][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:24:01,950][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:24:02,270][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:24:02,590][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:24:02,912][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:24:03,232][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:24:03,552][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:24:03,873][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:24:04,193][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:24:04,514][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:24:04,835][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:24:05,156][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:24:05,477][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:24:05,799][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:24:06,120][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:24:06,441][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:24:06,762][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:24:07,082][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:24:07,401][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:24:07,721][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:24:08,042][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:24:08,362][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:24:08,681][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:24:09,001][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:24:09,320][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:24:09,640][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:24:09,960][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:24:10,279][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:24:10,600][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:24:10,921][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:24:11,242][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:24:11,561][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:24:11,882][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:24:12,202][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:24:12,522][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:24:12,842][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:24:13,163][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:24:13,483][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:24:13,802][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:24:14,122][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:24:14,441][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:24:14,761][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:24:15,081][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:24:15,401][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:24:15,722][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:24:16,042][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:24:16,362][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:24:16,683][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:24:17,004][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:24:17,323][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:24:17,643][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:24:18,254][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:24:18,575][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:24:18,896][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:24:19,216][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:24:19,538][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:24:19,858][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:24:20,179][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:24:20,498][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:24:20,817][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:24:21,138][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:24:21,458][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:24:21,777][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:24:22,097][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:24:22,746][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:24:23,482][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:24:23,485][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:24:23,486][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:24:24,114][__main__][INFO] - Iteration 229 took 27s (11.92% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 42m 31s. Estimated total time: 7h 33m 28s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 44s. +[2026-03-25 17:24:24,117][__main__][INFO] - Starting iteration 229. +[2026-03-25 17:24:24,120][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:24:24,120][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:24:27,368][__main__][INFO] - Number of regex retries in iteration 229: 0 +[2026-03-25 17:24:27,368][__main__][INFO] - agents played in iteration 229 are Bob, Alice +[2026-03-25 17:24:27,917][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:24:28,563][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:24:28,852][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:24:29,174][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:24:29,492][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:24:29,812][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:24:30,132][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:24:30,452][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:24:30,772][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:24:31,092][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:24:31,411][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:24:31,732][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:24:32,053][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:24:32,373][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:24:32,693][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:24:33,013][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:24:33,334][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:24:33,655][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:24:33,977][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:24:34,297][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:24:34,618][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:24:34,938][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:24:35,258][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:24:35,578][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:24:35,899][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:24:36,220][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:24:36,542][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:24:36,862][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:24:37,184][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:24:37,505][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:24:37,825][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:24:38,147][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:24:38,467][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:24:38,789][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:24:39,109][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:24:39,429][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:24:39,749][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:24:40,068][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:24:40,389][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:24:40,710][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:24:41,029][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:24:41,350][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:24:41,671][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:24:41,990][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:24:42,310][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:24:42,630][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:24:42,950][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:24:43,270][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:24:43,591][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:24:43,912][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:24:44,232][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:24:44,553][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:24:44,873][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:24:45,483][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:24:45,803][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:24:46,123][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:24:46,442][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:24:46,762][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:24:47,083][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:24:47,404][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:24:47,724][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:24:48,045][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:24:48,365][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:24:48,686][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:24:49,007][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:24:49,327][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:24:49,979][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:24:50,716][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:24:50,719][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:24:50,721][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:24:51,396][__main__][INFO] - Iteration 230 took 27s (11.91% Gen, 85.61% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 43m 13s. Estimated total time: 7h 34m 36s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 18s. +[2026-03-25 17:24:51,398][__main__][INFO] - Starting iteration 230. +[2026-03-25 17:24:51,401][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:24:51,401][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:24:54,716][__main__][INFO] - Number of regex retries in iteration 230: 0 +[2026-03-25 17:24:54,717][__main__][INFO] - agents played in iteration 230 are Bob, Alice +[2026-03-25 17:24:55,266][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:24:55,910][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:24:56,203][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:24:56,523][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:24:56,843][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:24:57,164][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:24:57,482][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:24:57,802][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:24:58,122][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:24:58,441][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:24:58,760][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:24:59,081][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:24:59,402][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:24:59,723][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:25:00,043][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:25:00,364][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:25:00,685][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:25:01,003][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:25:01,323][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:25:01,643][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:25:01,964][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:25:02,285][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:25:02,604][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:25:02,924][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:25:03,243][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:25:03,562][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:25:03,881][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:25:04,202][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:25:04,524][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:25:04,844][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:25:05,165][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:25:05,484][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:25:05,804][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:25:06,124][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:25:06,444][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:25:06,764][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:25:07,084][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:25:07,405][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:25:07,725][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:25:08,045][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:25:08,366][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:25:08,687][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:25:09,006][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:25:09,326][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:25:09,646][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:25:09,967][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:25:10,288][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:25:10,609][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:25:10,928][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:25:11,248][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:25:11,567][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:25:11,886][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:25:12,207][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:25:12,818][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:25:13,138][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:25:13,458][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:25:13,778][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:25:14,098][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:25:14,418][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:25:14,738][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:25:15,059][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:25:15,381][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:25:15,701][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:25:16,021][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:25:16,342][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:25:16,662][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:25:17,313][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:25:18,043][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:25:18,045][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:25:18,046][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:25:18,727][__main__][INFO] - Iteration 231 took 27s (12.13% Gen, 85.37% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 43m 36s. Estimated total time: 7h 35m 27s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 43s. +[2026-03-25 17:25:18,731][__main__][INFO] - Starting iteration 231. 
+[2026-03-25 17:25:18,734][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:25:18,735][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:25:22,028][__main__][INFO] - Number of regex retries in iteration 231: 0 +[2026-03-25 17:25:22,029][__main__][INFO] - agents played in iteration 231 are Bob, Alice +[2026-03-25 17:25:22,573][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:25:23,218][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:25:23,508][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:25:23,829][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:25:24,149][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:25:24,471][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:25:24,790][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:25:25,112][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:25:25,432][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:25:25,753][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:25:26,074][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:25:26,397][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:25:26,718][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:25:27,040][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:25:27,362][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:25:27,684][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:25:28,004][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:25:28,326][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:25:28,646][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:25:28,967][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:25:29,288][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:25:29,608][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:25:29,927][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:25:30,246][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:25:30,568][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:25:30,889][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:25:31,209][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:25:31,530][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:25:31,850][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:25:32,170][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:25:32,491][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:25:32,810][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:25:33,131][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:25:33,453][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:25:33,774][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:25:34,096][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:25:34,417][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:25:34,738][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:25:35,060][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:25:35,381][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:25:35,703][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:25:36,023][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:25:36,343][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:25:36,665][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:25:36,987][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:25:37,307][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:25:37,628][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:25:37,949][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:25:38,269][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:25:38,589][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:25:38,911][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:25:39,230][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:25:39,551][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:25:40,161][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:25:40,481][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:25:40,800][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:25:41,122][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:25:41,442][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:25:41,764][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:25:42,084][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:25:42,403][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:25:42,723][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:25:43,044][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:25:43,364][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:25:43,685][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:25:44,004][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:25:44,655][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:25:45,393][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:25:45,396][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:25:45,397][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:25:46,025][__main__][INFO] - Iteration 232 took 27s (12.07% Gen, 85.62% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 42m 33s. Estimated total time: 7h 34m 51s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 25s. +[2026-03-25 17:25:46,027][__main__][INFO] - Starting iteration 232. +[2026-03-25 17:25:46,030][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:25:46,030][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:25:49,296][__main__][INFO] - Number of regex retries in iteration 232: 0 +[2026-03-25 17:25:49,297][__main__][INFO] - agents played in iteration 232 are Bob, Alice +[2026-03-25 17:25:49,839][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:25:50,490][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:25:50,780][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:25:51,100][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:25:51,420][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:25:51,740][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:25:52,061][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:25:52,382][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:25:52,702][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:25:53,023][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:25:53,344][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:25:53,665][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:25:53,984][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:25:54,303][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:25:54,624][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:25:54,943][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:25:55,263][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:25:55,583][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:25:55,903][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:25:56,224][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:25:56,545][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:25:56,866][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:25:57,187][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:25:57,508][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:25:57,830][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:25:58,150][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:25:58,470][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:25:58,789][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:25:59,110][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:25:59,431][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:25:59,752][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:26:00,073][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:26:00,395][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:26:00,716][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:26:01,036][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:26:01,357][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:26:01,678][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:26:01,999][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:26:02,322][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:26:02,643][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:26:02,963][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:26:03,283][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:26:03,603][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:26:03,924][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:26:04,244][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:26:04,565][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:26:04,885][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:26:05,204][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:26:05,524][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:26:05,845][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:26:06,167][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:26:06,486][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:26:06,807][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:26:07,417][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:26:07,738][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:26:08,058][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:26:08,380][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:26:08,701][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:26:09,020][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:26:09,341][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:26:09,662][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:26:09,983][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:26:10,302][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:26:10,622][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:26:10,942][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:26:11,262][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:26:11,916][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:26:12,653][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:26:12,655][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:26:12,657][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:26:13,337][__main__][INFO] - Iteration 233 took 27s (11.96% Gen, 85.54% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 42m 22s. Estimated total time: 7h 35m 8s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 34s. +[2026-03-25 17:26:13,339][__main__][INFO] - Starting iteration 233. +[2026-03-25 17:26:13,342][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:26:13,343][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:26:16,636][__main__][INFO] - Number of regex retries in iteration 233: 0 +[2026-03-25 17:26:16,637][__main__][INFO] - agents played in iteration 233 are Bob, Alice +[2026-03-25 17:26:17,181][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:26:17,829][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:26:18,119][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:26:18,441][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:26:18,761][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:26:19,082][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:26:19,403][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:26:19,723][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:26:20,044][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:26:20,366][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:26:20,687][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:26:21,007][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:26:21,328][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:26:21,649][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:26:21,971][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:26:22,290][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:26:22,610][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:26:22,931][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:26:23,250][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:26:23,570][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:26:23,890][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:26:24,210][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:26:24,531][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:26:24,852][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:26:25,171][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:26:25,491][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:26:25,811][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:26:26,131][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:26:26,453][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:26:26,774][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:26:27,093][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:26:27,414][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:26:27,735][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:26:28,055][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:26:28,376][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:26:28,697][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:26:29,018][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:26:29,338][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:26:29,659][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:26:29,980][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:26:30,300][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:26:30,620][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:26:30,940][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:26:31,262][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:26:31,583][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:26:31,902][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:26:32,223][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:26:32,542][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:26:32,862][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:26:33,183][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:26:33,503][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:26:33,823][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:26:34,144][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:26:34,754][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:26:35,075][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:26:35,396][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:26:35,717][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:26:36,036][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:26:36,357][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:26:36,677][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:26:36,998][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:26:37,318][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:26:37,638][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:26:37,959][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:26:38,280][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:26:38,601][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:26:39,251][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:26:39,976][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:26:39,978][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:26:39,979][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:26:40,675][__main__][INFO] - Iteration 234 took 27s (12.05% Gen, 85.40% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 42m 20s. Estimated total time: 7h 35m 33s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 46s. +[2026-03-25 17:26:40,677][__main__][INFO] - Starting iteration 234. +[2026-03-25 17:26:40,680][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:26:40,681][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:26:43,945][__main__][INFO] - Number of regex retries in iteration 234: 0 +[2026-03-25 17:26:43,946][__main__][INFO] - agents played in iteration 234 are Bob, Alice +[2026-03-25 17:26:44,484][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:26:45,129][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:26:45,420][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:26:45,742][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:26:46,063][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:26:46,382][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:26:46,702][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:26:47,022][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:26:47,343][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:26:47,663][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:26:47,984][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:26:48,304][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:26:48,624][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:26:48,943][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:26:49,264][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:26:49,585][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:26:49,904][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:26:50,226][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:26:50,547][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:26:50,869][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:26:51,189][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:26:51,510][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:26:51,831][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:26:52,150][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:26:52,470][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:26:52,789][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:26:53,110][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:26:53,430][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:26:53,750][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:26:54,070][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:26:54,390][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:26:54,711][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:26:55,033][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:26:55,354][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:26:55,675][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:26:55,996][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:26:56,317][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:26:56,638][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:26:56,958][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:26:57,278][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:26:57,597][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:26:57,917][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:26:58,239][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:26:58,559][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:26:58,879][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:26:59,200][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:26:59,521][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:26:59,842][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:27:00,164][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:27:00,485][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:27:00,806][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:27:01,126][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:27:01,447][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:27:02,057][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:27:02,380][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:27:02,703][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:27:03,024][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:27:03,346][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:27:03,668][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:27:03,989][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:27:04,312][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:27:04,632][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:27:04,954][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:27:05,274][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:27:05,595][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:27:05,916][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:27:06,568][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:27:07,297][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:27:07,299][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:27:07,301][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:27:07,931][__main__][INFO] - Iteration 235 took 27s (11.98% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 40m 31s. Estimated total time: 7h 34m 11s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 5s. +[2026-03-25 17:27:07,933][__main__][INFO] - Starting iteration 235. 
+[2026-03-25 17:27:07,936][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:27:07,937][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:27:11,156][__main__][INFO] - Number of regex retries in iteration 235: 0 +[2026-03-25 17:27:11,157][__main__][INFO] - agents played in iteration 235 are Bob, Alice +[2026-03-25 17:27:11,701][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:27:12,347][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:27:12,638][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:27:12,960][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:27:13,280][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:27:13,600][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:27:13,921][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:27:14,242][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:27:14,562][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:27:14,882][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:27:15,202][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:27:15,523][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:27:15,843][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:27:16,164][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:27:16,485][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:27:16,806][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:27:17,126][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:27:17,447][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:27:17,766][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:27:18,087][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:27:18,407][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:27:18,728][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:27:19,049][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:27:19,369][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:27:19,690][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:27:20,011][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:27:20,332][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:27:20,654][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:27:20,975][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:27:21,295][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:27:21,617][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:27:21,938][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:27:22,257][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:27:22,577][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:27:22,898][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:27:23,219][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:27:23,540][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:27:23,861][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:27:24,183][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:27:24,502][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:27:24,822][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:27:25,143][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:27:25,465][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:27:25,786][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:27:26,107][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:27:26,428][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:27:26,749][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:27:27,068][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:27:27,388][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:27:27,708][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:27:28,029][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:27:28,350][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:27:28,671][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:27:29,280][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:27:29,602][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:27:29,923][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:27:30,243][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:27:30,563][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:27:30,884][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:27:31,204][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:27:31,524][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:27:31,845][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:27:32,166][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:27:32,485][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:27:32,804][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:27:33,124][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:27:33,775][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:27:34,511][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:27:34,513][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:27:34,515][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:27:35,202][__main__][INFO] - Iteration 236 took 27s (11.81% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 40m 19s. Estimated total time: 7h 34m 26s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 13s. +[2026-03-25 17:27:35,204][__main__][INFO] - Starting iteration 236. +[2026-03-25 17:27:35,207][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:27:35,208][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:27:38,464][__main__][INFO] - Number of regex retries in iteration 236: 0 +[2026-03-25 17:27:38,464][__main__][INFO] - agents played in iteration 236 are Bob, Alice +[2026-03-25 17:27:39,020][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:27:39,664][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:27:39,954][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:27:40,275][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:27:40,596][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:27:40,917][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:27:41,238][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:27:41,559][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:27:41,879][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:27:42,200][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:27:42,521][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:27:42,841][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:27:43,161][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:27:43,482][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:27:43,803][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:27:44,123][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:27:44,443][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:27:44,765][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:27:45,087][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:27:45,407][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:27:45,726][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:27:46,048][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:27:46,369][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:27:46,688][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:27:47,008][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:27:47,328][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:27:47,648][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:27:47,969][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:27:48,289][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:27:48,610][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:27:48,931][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:27:49,250][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:27:49,571][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:27:49,892][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:27:50,213][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:27:50,535][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:27:50,857][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:27:51,177][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:27:51,499][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:27:51,820][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:27:52,141][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:27:52,462][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:27:52,783][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:27:53,105][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:27:53,426][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:27:53,747][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:27:54,069][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:27:54,389][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:27:54,710][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:27:55,032][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:27:55,353][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:27:55,674][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:27:55,996][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:27:56,606][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:27:56,926][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:27:57,247][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:27:57,568][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:27:57,888][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:27:58,208][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:27:58,528][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:27:58,848][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:27:59,169][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:27:59,489][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:27:59,809][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:28:00,130][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:28:00,450][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:28:01,102][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:28:01,829][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:28:01,832][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:28:01,833][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:28:02,534][__main__][INFO] - Iteration 237 took 27s (11.92% Gen, 85.51% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 40m 52s. Estimated total time: 7h 35m 27s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 43s. +[2026-03-25 17:28:02,536][__main__][INFO] - Starting iteration 237. +[2026-03-25 17:28:02,539][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:28:02,539][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:28:05,852][__main__][INFO] - Number of regex retries in iteration 237: 0 +[2026-03-25 17:28:05,852][__main__][INFO] - agents played in iteration 237 are Bob, Alice +[2026-03-25 17:28:06,405][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:28:07,050][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:28:07,342][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:28:07,662][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:28:07,981][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:28:08,302][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:28:08,622][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:28:08,942][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:28:09,263][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:28:09,584][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:28:09,903][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:28:10,223][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:28:10,544][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:28:10,865][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:28:11,185][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:28:11,507][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:28:11,826][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:28:12,146][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:28:12,465][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:28:12,784][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:28:13,104][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:28:13,425][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:28:13,745][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:28:14,066][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:28:14,385][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:28:14,706][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:28:15,027][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:28:15,347][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:28:15,667][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:28:15,987][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:28:16,306][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:28:16,627][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:28:16,946][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:28:17,267][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:28:17,588][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:28:17,908][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:28:18,227][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:28:18,548][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:28:18,869][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:28:19,190][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:28:19,510][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:28:19,832][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:28:20,152][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:28:20,474][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:28:20,796][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:28:21,117][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:28:21,437][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:28:21,758][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:28:22,079][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:28:22,400][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:28:22,721][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:28:23,042][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:28:23,362][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:28:23,973][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:28:24,294][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:28:24,614][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:28:24,935][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:28:25,256][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:28:25,577][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:28:25,896][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:28:26,217][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:28:26,537][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:28:26,857][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:28:27,179][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:28:27,498][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:28:27,819][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:28:28,469][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:28:29,203][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:28:29,205][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:28:29,207][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:28:29,833][__main__][INFO] - Iteration 238 took 27s (12.14% Gen, 85.56% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 39m 53s. Estimated total time: 7h 34m 55s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 27s. +[2026-03-25 17:28:29,836][__main__][INFO] - Starting iteration 238. +[2026-03-25 17:28:29,839][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:28:29,839][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:28:33,111][__main__][INFO] - Number of regex retries in iteration 238: 0 +[2026-03-25 17:28:33,112][__main__][INFO] - agents played in iteration 238 are Bob, Alice +[2026-03-25 17:28:33,654][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:28:34,301][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:28:34,591][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:28:34,912][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:28:35,233][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:28:35,553][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:28:35,874][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:28:36,195][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:28:36,516][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:28:36,836][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:28:37,157][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:28:37,477][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:28:37,798][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:28:38,119][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:28:38,440][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:28:38,761][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:28:39,082][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:28:39,402][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:28:39,722][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:28:40,041][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:28:40,361][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:28:40,682][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:28:41,001][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:28:41,322][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:28:41,642][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:28:41,963][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:28:42,285][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:28:42,606][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:28:42,926][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:28:43,247][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:28:43,566][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:28:43,887][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:28:44,206][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:28:44,527][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:28:44,847][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:28:45,168][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:28:45,489][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:28:45,810][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:28:46,130][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:28:46,449][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:28:46,770][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:28:47,090][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:28:47,410][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:28:47,730][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:28:48,051][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:28:48,371][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:28:48,690][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:28:49,010][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:28:49,331][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:28:49,652][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:28:49,973][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:28:50,295][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:28:50,616][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:28:51,228][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:28:51,548][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:28:51,869][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:28:52,189][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:28:52,510][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:28:52,831][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:28:53,151][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:28:53,471][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:28:53,791][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:28:54,109][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:28:54,429][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:28:54,749][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:28:55,069][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:28:55,719][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:28:56,448][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:28:56,450][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:28:56,452][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:28:57,080][__main__][INFO] - Iteration 239 took 27s (12.01% Gen, 85.68% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 38m 32s. Estimated total time: 7h 34m 1s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 0s. +[2026-03-25 17:28:57,082][__main__][INFO] - Starting iteration 239. 
+[2026-03-25 17:28:57,085][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:28:57,086][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:29:00,365][__main__][INFO] - Number of regex retries in iteration 239: 0 +[2026-03-25 17:29:00,366][__main__][INFO] - agents played in iteration 239 are Bob, Alice +[2026-03-25 17:29:00,915][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:29:01,561][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:29:01,851][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:29:02,172][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:29:02,491][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:29:02,811][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:29:03,132][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:29:03,452][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:29:03,774][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:29:04,095][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:29:04,416][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:29:04,736][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:29:05,057][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:29:05,378][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:29:05,699][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:29:06,018][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:29:06,339][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:29:06,660][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:29:06,981][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:29:07,300][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:29:07,621][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:29:07,942][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:29:08,262][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:29:08,583][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:29:08,903][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:29:09,223][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:29:09,544][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:29:09,864][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:29:10,184][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:29:10,503][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:29:10,825][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:29:11,143][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:29:11,464][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:29:11,783][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:29:12,103][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:29:12,423][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:29:12,744][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:29:13,063][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:29:13,383][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:29:13,703][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:29:14,023][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:29:14,343][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:29:14,663][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:29:14,983][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:29:15,302][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:29:15,621][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:29:15,942][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:29:16,261][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:29:16,581][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:29:16,900][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:29:17,221][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:29:17,542][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:29:17,863][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:29:18,473][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:29:18,795][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:29:19,114][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:29:19,435][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:29:19,755][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:29:20,077][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:29:20,398][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:29:20,718][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:29:21,037][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:29:21,358][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:29:21,677][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:29:21,997][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:29:22,317][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:29:22,968][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:29:23,693][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:29:23,695][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:29:23,697][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:29:24,326][__main__][INFO] - Iteration 240 took 27s (12.04% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 38m 5s. Estimated total time: 7h 34m 2s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 1s. +[2026-03-25 17:29:24,329][__main__][INFO] - Starting iteration 240. +[2026-03-25 17:29:24,332][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:29:24,332][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:29:27,533][__main__][INFO] - Number of regex retries in iteration 240: 0 +[2026-03-25 17:29:27,534][__main__][INFO] - agents played in iteration 240 are Bob, Alice +[2026-03-25 17:29:28,080][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:29:28,724][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:29:29,014][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:29:29,337][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:29:29,657][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:29:29,977][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:29:30,299][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:29:30,619][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:29:30,939][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:29:31,261][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:29:31,581][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:29:31,900][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:29:32,221][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:29:32,541][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:29:32,861][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:29:33,183][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:29:33,504][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:29:33,826][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:29:34,147][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:29:34,467][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:29:34,787][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:29:35,107][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:29:35,428][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:29:35,749][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:29:36,070][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:29:36,391][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:29:36,711][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:29:37,031][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:29:37,351][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:29:37,672][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:29:37,994][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:29:38,315][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:29:38,637][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:29:38,959][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:29:39,280][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:29:39,602][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:29:39,923][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:29:40,244][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:29:40,563][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:29:40,884][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:29:41,203][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:29:41,523][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:29:41,843][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:29:42,164][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:29:42,485][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:29:42,806][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:29:43,126][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:29:43,447][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:29:43,769][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:29:44,090][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:29:44,409][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:29:44,730][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:29:45,052][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:29:45,661][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:29:45,983][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:29:46,304][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:29:46,624][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:29:46,943][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:29:47,263][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:29:47,584][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:29:47,904][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:29:48,223][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:29:48,543][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:29:48,863][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:29:49,185][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:29:49,505][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:29:50,156][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:29:50,884][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:29:50,886][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:29:50,887][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:29:51,517][__main__][INFO] - Iteration 241 took 27s (11.78% Gen, 85.90% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 36m 42s. Estimated total time: 7h 33m 5s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 32s. +[2026-03-25 17:29:51,519][__main__][INFO] - Starting iteration 241. +[2026-03-25 17:29:51,522][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:29:51,523][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:29:54,724][__main__][INFO] - Number of regex retries in iteration 241: 0 +[2026-03-25 17:29:54,725][__main__][INFO] - agents played in iteration 241 are Bob, Alice +[2026-03-25 17:29:55,265][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:29:55,911][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:29:56,200][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:29:56,520][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:29:56,840][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:29:57,162][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:29:57,483][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:29:57,803][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:29:58,124][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:29:58,445][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:29:58,764][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:29:59,085][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:29:59,405][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:29:59,726][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:30:00,045][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:30:00,368][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:30:00,688][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:30:01,009][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:30:01,329][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:30:01,649][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:30:01,968][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:30:02,288][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:30:02,609][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:30:02,928][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:30:03,249][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:30:03,569][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:30:03,890][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:30:04,209][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:30:04,529][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:30:04,849][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:30:05,170][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:30:05,489][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:30:05,809][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:30:06,130][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:30:06,450][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:30:06,770][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:30:07,092][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:30:07,411][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:30:07,730][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:30:08,051][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:30:08,372][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:30:08,691][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:30:09,011][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:30:09,330][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:30:09,651][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:30:09,971][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:30:10,291][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:30:10,610][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:30:10,930][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:30:11,249][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:30:11,571][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:30:11,891][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:30:12,210][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:30:12,820][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:30:13,141][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:30:13,460][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:30:13,780][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:30:14,101][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:30:14,421][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:30:14,742][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:30:15,063][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:30:15,382][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:30:15,703][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:30:16,023][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:30:16,344][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:30:16,664][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:30:17,314][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:30:18,047][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:30:18,050][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:30:18,051][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:30:18,678][__main__][INFO] - Iteration 242 took 27s (11.79% Gen, 85.89% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 35m 46s. Estimated total time: 7h 32m 37s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 15s, 500 more iterations: 3h 46m 18s. +[2026-03-25 17:30:18,681][__main__][INFO] - Starting iteration 242. +[2026-03-25 17:30:18,684][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:30:18,684][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:30:21,923][__main__][INFO] - Number of regex retries in iteration 242: 0 +[2026-03-25 17:30:21,923][__main__][INFO] - agents played in iteration 242 are Bob, Alice +[2026-03-25 17:30:22,482][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:30:23,139][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:30:23,429][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:30:23,751][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:30:24,072][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:30:24,392][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:30:24,713][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:30:25,035][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:30:25,358][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:30:25,679][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:30:25,998][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:30:26,317][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:30:26,638][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:30:26,959][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:30:27,279][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:30:27,599][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:30:27,920][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:30:28,239][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:30:28,560][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:30:28,882][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:30:29,201][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:30:29,523][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:30:29,844][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:30:30,165][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:30:30,486][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:30:30,807][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:30:31,128][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:30:31,449][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:30:31,770][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:30:32,091][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:30:32,411][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:30:32,732][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:30:33,056][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:30:33,378][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:30:33,698][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:30:34,017][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:30:34,337][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:30:34,658][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:30:34,979][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:30:35,299][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:30:35,619][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:30:35,940][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:30:36,262][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:30:36,580][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:30:36,900][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:30:37,220][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:30:37,539][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:30:37,860][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:30:38,181][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:30:38,501][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:30:38,821][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:30:39,142][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:30:39,464][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:30:40,076][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:30:40,396][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:30:40,716][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:30:41,038][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:30:41,358][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:30:41,677][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:30:41,997][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:30:42,317][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:30:42,637][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:30:42,958][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:30:43,279][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:30:43,599][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:30:43,918][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:30:44,570][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:30:45,297][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:30:45,299][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:30:45,301][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:30:45,982][__main__][INFO] - Iteration 243 took 27s (11.87% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 37m 40s. Estimated total time: 7h 34m 59s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 29s. +[2026-03-25 17:30:45,984][__main__][INFO] - Starting iteration 243. 
+[2026-03-25 17:30:45,987][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:30:45,988][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:30:49,159][__main__][INFO] - Number of regex retries in iteration 243: 0 +[2026-03-25 17:30:49,159][__main__][INFO] - agents played in iteration 243 are Bob, Alice +[2026-03-25 17:30:49,698][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:30:50,350][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:30:50,641][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:30:50,961][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:30:51,282][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:30:51,601][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:30:51,922][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:30:52,242][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:30:52,562][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:30:52,882][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:30:53,204][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:30:53,523][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:30:53,844][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:30:54,163][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:30:54,483][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:30:54,803][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:30:55,124][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:30:55,444][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:30:55,763][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:30:56,084][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:30:56,404][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:30:56,724][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:30:57,044][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:30:57,363][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:30:57,683][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:30:58,004][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:30:58,324][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:30:58,644][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:30:58,964][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:30:59,284][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:30:59,604][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:30:59,925][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:31:00,244][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:31:00,565][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:31:00,885][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:31:01,204][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:31:01,525][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:31:01,844][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:31:02,164][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:31:02,485][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:31:02,804][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:31:03,124][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:31:03,445][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:31:03,765][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:31:04,084][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:31:04,403][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:31:04,724][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:31:05,043][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:31:05,363][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:31:05,684][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:31:06,003][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:31:06,323][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:31:06,644][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:31:07,255][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:31:07,575][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:31:07,897][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:31:08,218][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:31:08,539][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:31:08,859][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:31:09,180][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:31:09,501][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:31:09,822][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:31:10,141][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:31:10,463][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:31:10,784][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:31:11,105][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:31:11,766][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:31:12,494][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:31:12,496][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:31:12,498][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:31:13,128][__main__][INFO] - Iteration 244 took 27s (11.69% Gen, 85.99% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 34m 35s. Estimated total time: 7h 32m 21s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 14s, 500 more iterations: 3h 46m 10s. +[2026-03-25 17:31:13,130][__main__][INFO] - Starting iteration 244. +[2026-03-25 17:31:13,133][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:31:13,133][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:31:16,300][__main__][INFO] - Number of regex retries in iteration 244: 0 +[2026-03-25 17:31:16,301][__main__][INFO] - agents played in iteration 244 are Bob, Alice +[2026-03-25 17:31:16,844][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:31:17,493][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:31:17,784][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:31:18,105][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:31:18,428][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:31:18,748][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:31:19,068][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:31:19,388][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:31:19,707][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:31:20,028][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:31:20,348][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:31:20,669][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:31:20,989][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:31:21,309][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:31:21,631][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:31:21,953][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:31:22,273][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:31:22,592][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:31:22,912][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:31:23,232][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:31:23,553][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:31:23,876][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:31:24,195][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:31:24,514][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:31:24,833][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:31:25,154][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:31:25,474][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:31:25,794][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:31:26,113][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:31:26,433][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:31:26,754][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:31:27,073][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:31:27,393][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:31:27,713][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:31:28,033][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:31:28,353][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:31:28,674][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:31:28,993][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:31:29,313][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:31:29,633][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:31:29,954][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:31:30,274][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:31:30,594][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:31:30,915][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:31:31,234][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:31:31,554][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:31:31,875][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:31:32,195][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:31:32,517][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:31:32,837][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:31:33,158][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:31:33,478][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:31:33,798][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:31:34,409][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:31:34,730][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:31:35,049][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:31:35,367][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:31:35,689][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:31:36,009][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:31:36,328][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:31:36,649][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:31:36,968][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:31:37,288][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:31:37,609][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:31:37,930][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:31:38,249][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:31:38,900][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:31:39,635][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:31:39,637][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:31:39,638][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:31:40,354][__main__][INFO] - Iteration 245 took 27s (11.64% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 35m 29s. Estimated total time: 7h 33m 42s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 51s. +[2026-03-25 17:31:40,356][__main__][INFO] - Starting iteration 245. +[2026-03-25 17:31:40,359][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:31:40,360][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:31:43,582][__main__][INFO] - Number of regex retries in iteration 245: 0 +[2026-03-25 17:31:43,583][__main__][INFO] - agents played in iteration 245 are Bob, Alice +[2026-03-25 17:31:44,124][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:31:44,770][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:31:45,060][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:31:45,382][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:31:45,703][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:31:46,022][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:31:46,342][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:31:46,662][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:31:46,982][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:31:47,302][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:31:47,622][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:31:47,943][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:31:48,264][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:31:48,583][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:31:48,902][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:31:49,223][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:31:49,543][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:31:49,864][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:31:50,185][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:31:50,506][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:31:50,826][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:31:51,146][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:31:51,465][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:31:51,784][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:31:52,105][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:31:52,424][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:31:52,744][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:31:53,065][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:31:53,384][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:31:53,703][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:31:54,023][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:31:54,343][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:31:54,664][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:31:54,985][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:31:55,304][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:31:55,623][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:31:55,943][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:31:56,264][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:31:56,585][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:31:56,906][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:31:57,228][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:31:57,548][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:31:57,868][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:31:58,190][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:31:58,510][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:31:58,831][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:31:59,152][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:31:59,474][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:31:59,797][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:32:00,119][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:32:00,440][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:32:00,761][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:32:01,083][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:32:01,699][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:32:02,021][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:32:02,342][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:32:02,662][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:32:02,983][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:32:03,304][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:32:03,625][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:32:03,946][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:32:04,267][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:32:04,587][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:32:04,908][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:32:05,229][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:32:05,549][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:32:06,202][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:32:06,937][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:32:06,939][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:32:06,942][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:32:07,577][__main__][INFO] - Iteration 246 took 27s (11.84% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 34m 58s. Estimated total time: 7h 33m 38s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 49s. +[2026-03-25 17:32:07,579][__main__][INFO] - Starting iteration 246. +[2026-03-25 17:32:07,582][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:32:07,583][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:32:10,784][__main__][INFO] - Number of regex retries in iteration 246: 0 +[2026-03-25 17:32:10,785][__main__][INFO] - agents played in iteration 246 are Bob, Alice +[2026-03-25 17:32:11,334][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:32:11,979][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:32:12,270][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:32:12,591][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:32:12,913][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:32:13,234][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:32:13,555][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:32:13,875][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:32:14,195][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:32:14,514][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:32:14,836][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:32:15,157][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:32:15,477][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:32:15,797][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:32:16,118][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:32:16,438][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:32:16,759][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:32:17,080][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:32:17,399][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:32:17,719][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:32:18,038][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:32:18,359][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:32:18,679][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:32:18,999][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:32:19,318][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:32:19,638][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:32:19,958][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:32:20,280][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:32:20,601][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:32:20,921][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:32:21,243][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:32:21,563][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:32:21,883][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:32:22,203][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:32:22,523][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:32:22,843][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:32:23,162][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:32:23,482][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:32:23,803][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:32:24,123][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:32:24,442][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:32:24,763][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:32:25,084][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:32:25,403][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:32:25,722][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:32:26,043][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:32:26,362][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:32:26,683][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:32:27,004][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:32:27,323][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:32:27,643][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:32:27,963][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:32:28,282][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:32:28,891][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:32:29,213][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:32:29,533][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:32:29,852][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:32:30,174][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:32:30,493][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:32:30,814][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:32:31,134][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:32:31,455][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:32:31,776][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:32:32,096][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:32:32,416][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:32:32,736][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:32:33,386][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:32:34,112][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:32:34,114][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:32:34,116][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:32:34,744][__main__][INFO] - Iteration 247 took 27s (11.79% Gen, 85.89% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 33m 35s. Estimated total time: 7h 32m 42s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 16s, 500 more iterations: 3h 46m 21s. +[2026-03-25 17:32:34,746][__main__][INFO] - Starting iteration 247. 
+[2026-03-25 17:32:34,749][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:32:34,750][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:32:37,951][__main__][INFO] - Number of regex retries in iteration 247: 0 +[2026-03-25 17:32:37,952][__main__][INFO] - agents played in iteration 247 are Bob, Alice +[2026-03-25 17:32:38,494][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:32:39,140][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:32:39,432][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:32:39,754][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:32:40,074][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:32:40,393][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:32:40,715][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:32:41,034][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:32:41,357][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:32:41,678][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:32:41,999][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:32:42,319][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:32:42,639][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:32:42,959][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:32:43,280][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:32:43,601][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:32:43,921][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:32:44,242][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:32:44,562][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:32:44,881][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:32:45,202][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:32:45,522][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:32:45,843][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:32:46,163][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:32:46,484][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:32:46,803][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:32:47,125][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:32:47,447][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:32:47,767][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:32:48,089][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:32:48,410][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:32:48,732][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:32:49,054][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:32:49,375][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:32:49,696][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:32:50,017][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:32:50,338][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:32:50,659][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:32:50,978][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:32:51,299][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:32:51,619][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:32:51,941][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:32:52,261][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:32:52,581][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:32:52,902][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:32:53,222][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:32:53,543][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:32:53,862][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:32:54,183][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:32:54,503][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:32:54,823][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:32:55,142][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:32:55,463][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:32:56,075][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:32:56,396][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:32:56,716][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:32:57,037][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:32:57,358][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:32:57,678][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:32:57,997][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:32:58,318][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:32:58,639][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:32:58,960][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:32:59,281][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:32:59,602][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:32:59,923][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:33:00,577][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:33:01,337][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:33:01,339][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:33:01,341][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:33:01,986][__main__][INFO] - Iteration 248 took 27s (11.76% Gen, 85.87% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 34m 23s. Estimated total time: 7h 33m 58s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 59s. +[2026-03-25 17:33:01,989][__main__][INFO] - Starting iteration 248. +[2026-03-25 17:33:01,992][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:33:01,992][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:33:05,212][__main__][INFO] - Number of regex retries in iteration 248: 0 +[2026-03-25 17:33:05,213][__main__][INFO] - agents played in iteration 248 are Bob, Alice +[2026-03-25 17:33:05,777][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:33:06,424][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:33:06,715][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:33:07,038][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:33:07,359][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:33:07,679][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:33:07,998][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:33:08,319][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:33:08,640][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:33:08,960][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:33:09,281][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:33:09,602][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:33:09,923][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:33:10,244][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:33:10,565][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:33:10,884][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:33:11,203][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:33:11,524][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:33:11,843][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:33:12,164][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:33:12,485][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:33:12,805][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:33:13,125][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:33:13,445][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:33:13,764][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:33:14,085][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:33:14,404][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:33:14,725][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:33:15,045][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:33:15,364][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:33:15,686][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:33:16,007][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:33:16,327][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:33:16,648][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:33:16,969][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:33:17,289][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:33:17,611][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:33:17,931][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:33:18,251][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:33:18,571][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:33:18,890][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:33:19,209][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:33:19,530][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:33:19,851][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:33:20,172][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:33:20,493][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:33:20,814][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:33:21,134][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:33:21,455][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:33:21,776][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:33:22,097][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:33:22,418][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:33:22,738][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:33:23,346][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:33:23,666][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:33:23,985][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:33:24,304][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:33:24,625][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:33:24,943][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:33:25,262][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:33:25,583][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:33:25,903][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:33:26,224][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:33:26,545][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:33:26,865][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:33:27,185][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:33:27,835][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:33:28,561][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:33:28,563][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:33:28,565][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:33:29,255][__main__][INFO] - Iteration 249 took 27s (11.81% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 34m 22s. Estimated total time: 7h 34m 24s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 12s. +[2026-03-25 17:33:29,257][__main__][INFO] - Starting iteration 249. +[2026-03-25 17:33:29,260][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. +[2026-03-25 17:33:29,261][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:33:32,479][__main__][INFO] - Number of regex retries in iteration 249: 0 +[2026-03-25 17:33:32,479][__main__][INFO] - agents played in iteration 249 are Bob, Alice +[2026-03-25 17:33:33,024][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:33:33,668][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:33:33,959][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:33:34,283][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:33:34,604][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:33:34,925][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:33:35,246][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:33:35,566][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:33:35,886][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:33:36,207][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:33:36,529][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:33:36,851][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:33:37,173][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:33:37,495][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:33:37,815][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:33:38,137][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:33:38,458][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:33:38,778][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:33:39,097][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:33:39,418][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:33:39,737][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:33:40,059][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:33:40,379][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:33:40,699][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:33:41,021][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:33:41,341][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:33:41,662][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:33:41,983][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:33:42,303][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:33:42,623][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:33:42,944][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:33:43,265][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:33:43,586][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:33:43,906][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:33:44,227][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:33:44,547][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:33:44,868][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:33:45,188][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:33:45,509][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:33:45,831][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:33:46,151][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:33:46,471][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:33:46,792][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:33:47,112][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:33:47,432][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:33:47,752][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:33:48,073][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:33:48,393][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:33:48,713][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:33:49,034][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:33:49,354][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:33:49,675][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:33:49,994][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:33:50,605][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:33:50,924][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:33:51,244][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:33:51,565][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:33:51,884][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:33:52,203][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:33:52,525][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:33:52,845][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:33:53,165][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:33:53,484][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:33:53,804][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:33:54,125][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:33:54,444][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:33:55,094][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:33:55,819][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:33:55,821][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:33:55,823][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:33:56,492][__main__][INFO] - Iteration 250 took 27s (11.82% Gen, 85.72% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 33m 23s. Estimated total time: 7h 33m 52s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 56s. +[2026-03-25 17:33:56,494][__main__][INFO] - Starting iteration 250. +[2026-03-25 17:33:56,497][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 4 and human policies 1. 
+[2026-03-25 17:33:56,498][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:33:59,734][__main__][INFO] - Number of regex retries in iteration 250: 0 +[2026-03-25 17:33:59,734][__main__][INFO] - agents played in iteration 250 are Bob, Alice +[2026-03-25 17:34:00,287][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:34:00,934][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:34:01,223][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:34:01,543][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:34:01,862][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:34:02,183][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:34:02,502][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:34:02,821][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:34:03,142][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:34:03,463][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:34:03,783][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:34:04,103][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:34:04,425][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:34:04,744][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:34:05,063][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:34:05,384][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:34:05,704][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:34:06,025][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:34:06,345][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:34:06,665][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:34:06,984][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:34:07,304][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:34:07,624][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:34:07,945][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:34:08,266][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:34:08,587][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:34:08,908][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:34:09,228][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:34:09,549][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:34:09,868][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:34:10,187][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:34:10,508][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:34:10,828][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:34:11,149][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:34:11,469][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:34:11,789][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:34:12,108][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:34:12,428][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:34:12,748][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:34:13,068][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:34:13,387][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:34:13,707][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:34:14,027][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:34:14,348][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:34:14,668][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:34:14,988][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:34:15,307][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:34:15,628][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:34:15,947][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:34:16,266][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:34:16,585][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:34:16,904][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:34:17,225][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:34:17,836][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:34:18,157][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:34:18,477][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:34:18,797][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:34:19,116][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:34:19,437][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:34:19,757][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:34:20,078][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:34:20,400][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:34:20,719][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:34:21,040][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:34:21,360][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:34:21,680][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:34:22,335][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:34:23,064][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:34:23,067][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:34:23,068][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:34:24,348][__main__][INFO] - Iteration 251 took 27s (11.62% Gen, 83.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 43m 15s. Estimated total time: 7h 44m 11s. Time estimates for 10 more iterations: 4m 38s, 100 more iterations: 46m 25s, 500 more iterations: 3h 52m 5s. +[2026-03-25 17:34:24,350][__main__][INFO] - Starting iteration 251. 
+[2026-03-25 17:34:24,353][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:34:24,354][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:34:27,638][__main__][INFO] - Number of regex retries in iteration 251: 0 +[2026-03-25 17:34:27,638][__main__][INFO] - agents played in iteration 251 are Bob, Alice +[2026-03-25 17:34:28,201][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:34:28,850][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:34:29,141][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:34:29,462][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:34:29,783][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:34:30,103][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:34:30,424][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:34:30,744][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:34:31,065][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:34:31,385][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:34:31,704][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:34:32,023][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:34:32,344][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:34:32,663][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:34:32,983][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:34:33,304][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:34:33,625][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:34:33,945][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:34:34,265][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:34:34,586][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:34:34,905][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:34:35,225][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:34:35,545][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:34:35,864][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:34:36,183][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:34:36,504][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:34:36,824][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:34:37,144][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:34:37,464][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:34:37,785][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:34:38,105][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:34:38,424][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:34:38,743][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:34:39,064][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:34:39,384][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:34:39,703][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:34:40,025][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:34:40,345][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:34:40,664][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:34:40,985][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:34:41,304][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:34:41,624][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:34:41,945][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:34:42,266][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:34:42,586][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:34:42,905][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:34:43,225][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:34:43,545][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:34:43,865][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:34:44,184][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:34:44,506][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:34:44,825][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:34:45,145][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:34:45,756][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:34:46,077][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:34:46,398][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:34:46,717][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:34:47,038][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:34:47,358][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:34:47,678][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:34:47,998][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:34:48,319][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:34:48,638][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:34:48,958][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:34:49,278][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:34:49,598][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:34:50,256][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:34:50,988][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:34:50,990][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:34:50,992][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:34:51,620][__main__][INFO] - Iteration 252 took 27s (12.05% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 33m 3s. Estimated total time: 7h 34m 27s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 13s. +[2026-03-25 17:34:51,622][__main__][INFO] - Starting iteration 252. +[2026-03-25 17:34:51,625][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:34:51,626][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:34:54,854][__main__][INFO] - Number of regex retries in iteration 252: 0 +[2026-03-25 17:34:54,855][__main__][INFO] - agents played in iteration 252 are Bob, Alice +[2026-03-25 17:34:55,414][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:34:56,062][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:34:56,353][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:34:56,674][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:34:56,994][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:34:57,316][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:34:57,635][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:34:57,956][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:34:58,276][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:34:58,597][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:34:58,917][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:34:59,238][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:34:59,558][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:34:59,877][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:35:00,197][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:35:00,518][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:35:00,838][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:35:01,158][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:35:01,479][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:35:01,798][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:35:02,119][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:35:02,438][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:35:02,760][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:35:03,080][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:35:03,401][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:35:03,721][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:35:04,042][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:35:04,363][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:35:04,684][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:35:05,003][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:35:05,324][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:35:05,644][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:35:05,964][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:35:06,285][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:35:06,605][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:35:06,924][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:35:07,243][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:35:07,563][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:35:07,882][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:35:08,202][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:35:08,522][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:35:08,843][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:35:09,163][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:35:09,483][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:35:09,803][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:35:10,122][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:35:10,441][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:35:10,762][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:35:11,083][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:35:11,402][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:35:11,724][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:35:12,044][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:35:12,364][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:35:12,976][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:35:13,296][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:35:13,616][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:35:13,938][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:35:14,261][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:35:14,582][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:35:14,903][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:35:15,224][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:35:15,547][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:35:15,867][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:35:16,188][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:35:16,508][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:35:16,829][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:35:17,483][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:35:18,206][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:35:18,208][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:35:18,210][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:35:18,838][__main__][INFO] - Iteration 253 took 27s (11.87% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 31m 42s. Estimated total time: 7h 33m 33s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 46s. +[2026-03-25 17:35:18,840][__main__][INFO] - Starting iteration 253. +[2026-03-25 17:35:18,843][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:35:18,843][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:35:22,081][__main__][INFO] - Number of regex retries in iteration 253: 0 +[2026-03-25 17:35:22,082][__main__][INFO] - agents played in iteration 253 are Bob, Alice +[2026-03-25 17:35:22,635][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:35:23,284][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:35:23,574][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:35:23,896][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:35:24,217][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:35:24,538][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:35:24,859][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:35:25,178][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:35:25,499][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:35:25,819][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:35:26,141][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:35:26,461][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:35:26,782][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:35:27,102][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:35:27,421][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:35:27,742][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:35:28,062][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:35:28,383][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:35:28,702][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:35:29,022][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:35:29,343][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:35:29,663][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:35:29,983][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:35:30,302][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:35:30,622][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:35:30,942][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:35:31,262][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:35:31,583][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:35:31,902][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:35:32,223][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:35:32,543][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:35:32,863][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:35:33,183][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:35:33,503][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:35:33,824][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:35:34,143][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:35:34,464][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:35:34,784][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:35:35,103][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:35:35,424][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:35:35,743][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:35:36,064][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:35:36,384][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:35:36,703][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:35:37,024][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:35:37,344][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:35:37,663][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:35:37,984][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:35:38,303][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:35:38,622][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:35:38,943][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:35:39,264][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:35:39,584][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:35:40,195][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:35:40,515][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:35:40,836][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:35:41,157][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:35:41,477][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:35:41,799][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:35:42,119][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:35:42,440][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:35:42,759][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:35:43,080][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:35:43,400][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:35:43,721][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:35:44,040][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:35:44,694][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:35:45,443][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:35:45,446][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:35:45,448][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:35:46,180][__main__][INFO] - Iteration 254 took 27s (11.85% Gen, 85.47% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 33m 19s. Estimated total time: 7h 35m 38s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 49s. +[2026-03-25 17:35:46,182][__main__][INFO] - Starting iteration 254. +[2026-03-25 17:35:46,185][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:35:46,185][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:35:49,393][__main__][INFO] - Number of regex retries in iteration 254: 0 +[2026-03-25 17:35:49,393][__main__][INFO] - agents played in iteration 254 are Bob, Alice +[2026-03-25 17:35:49,940][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:35:50,587][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:35:50,876][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:35:51,198][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:35:51,517][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:35:51,838][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:35:52,157][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:35:52,479][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:35:52,799][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:35:53,118][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:35:53,438][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:35:53,760][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:35:54,080][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:35:54,402][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:35:54,722][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:35:55,043][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:35:55,364][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:35:55,684][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:35:56,006][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:35:56,325][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:35:56,646][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:35:56,965][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:35:57,284][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:35:57,606][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:35:57,927][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:35:58,245][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:35:58,565][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:35:58,886][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:35:59,205][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:35:59,526][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:35:59,846][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:36:00,167][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:36:00,487][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:36:00,806][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:36:01,126][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:36:01,446][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:36:01,767][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:36:02,089][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:36:02,410][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:36:02,731][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:36:03,052][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:36:03,374][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:36:03,697][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:36:04,019][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:36:04,340][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:36:04,660][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:36:04,981][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:36:05,300][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:36:05,620][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:36:05,942][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:36:06,262][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:36:06,583][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:36:06,902][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:36:07,514][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:36:07,834][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:36:08,154][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:36:08,474][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:36:08,795][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:36:09,117][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:36:09,438][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:36:09,760][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:36:10,079][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:36:10,400][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:36:10,722][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:36:11,043][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:36:11,364][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:36:12,016][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:36:12,748][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:36:12,750][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:36:12,752][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:36:13,379][__main__][INFO] - Iteration 255 took 27s (11.80% Gen, 85.89% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 30m 29s. Estimated total time: 7h 33m 15s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 17:36:13,382][__main__][INFO] - Starting iteration 255. 
+[2026-03-25 17:36:13,385][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:36:13,386][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:36:16,637][__main__][INFO] - Number of regex retries in iteration 255: 0 +[2026-03-25 17:36:16,638][__main__][INFO] - agents played in iteration 255 are Bob, Alice +[2026-03-25 17:36:17,188][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:36:17,834][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:36:18,123][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:36:18,444][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:36:18,765][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:36:19,087][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:36:19,406][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:36:19,724][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:36:20,043][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:36:20,365][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:36:20,684][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:36:21,003][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:36:21,324][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:36:21,645][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:36:21,965][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:36:22,286][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:36:22,604][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:36:22,926][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:36:23,245][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:36:23,565][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:36:23,885][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:36:24,203][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:36:24,523][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:36:24,842][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:36:25,163][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:36:25,484][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:36:25,803][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:36:26,123][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:36:26,442][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:36:26,761][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:36:27,082][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:36:27,402][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:36:27,721][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:36:28,042][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:36:28,361][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:36:28,681][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:36:29,001][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:36:29,321][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:36:29,640][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:36:29,961][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:36:30,279][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:36:30,599][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:36:30,919][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:36:31,240][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:36:31,559][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:36:31,880][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:36:32,200][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:36:32,518][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:36:32,838][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:36:33,159][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:36:33,480][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:36:33,799][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:36:34,119][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:36:34,729][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:36:35,049][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:36:35,368][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:36:35,689][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:36:36,009][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:36:36,330][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:36:36,652][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:36:36,972][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:36:37,292][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:36:37,612][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:36:37,931][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:36:38,252][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:36:38,573][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:36:39,223][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:36:39,954][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:36:39,956][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:36:39,957][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:36:40,581][__main__][INFO] - Iteration 256 took 27s (11.96% Gen, 85.74% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 30m 4s. Estimated total time: 7h 33m 16s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 38s. +[2026-03-25 17:36:40,583][__main__][INFO] - Starting iteration 256. +[2026-03-25 17:36:40,586][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:36:40,586][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:36:43,884][__main__][INFO] - Number of regex retries in iteration 256: 0 +[2026-03-25 17:36:43,885][__main__][INFO] - agents played in iteration 256 are Bob, Alice +[2026-03-25 17:36:44,450][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:36:45,099][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:36:45,389][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:36:45,709][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:36:46,030][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:36:46,350][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:36:46,671][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:36:46,991][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:36:47,311][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:36:47,632][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:36:47,951][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:36:48,271][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:36:48,590][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:36:48,910][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:36:49,231][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:36:49,553][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:36:49,875][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:36:50,196][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:36:50,517][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:36:50,838][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:36:51,159][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:36:51,479][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:36:51,802][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:36:52,123][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:36:52,446][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:36:52,766][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:36:53,088][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:36:53,410][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:36:53,731][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:36:54,050][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:36:54,370][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:36:54,689][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:36:55,010][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:36:55,331][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:36:55,651][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:36:55,970][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:36:56,289][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:36:56,610][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:36:56,929][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:36:57,250][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:36:57,570][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:36:57,891][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:36:58,213][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:36:58,533][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:36:58,852][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:36:59,172][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:36:59,492][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:36:59,811][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:37:00,132][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:37:00,451][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:37:00,771][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:37:01,091][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:37:01,411][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:37:02,021][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:37:02,344][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:37:02,664][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:37:02,985][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:37:03,305][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:37:03,624][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:37:03,945][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:37:04,266][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:37:04,586][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:37:04,906][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:37:05,226][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:37:05,546][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:37:05,866][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:37:06,518][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:37:07,246][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:37:07,248][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:37:07,249][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:37:07,950][__main__][INFO] - Iteration 257 took 27s (12.05% Gen, 85.38% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 32m 25s. Estimated total time: 7h 36m 5s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 36s, 500 more iterations: 3h 48m 2s. +[2026-03-25 17:37:07,953][__main__][INFO] - Starting iteration 257. +[2026-03-25 17:37:07,955][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:37:07,956][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:37:11,179][__main__][INFO] - Number of regex retries in iteration 257: 0 +[2026-03-25 17:37:11,180][__main__][INFO] - agents played in iteration 257 are Bob, Alice +[2026-03-25 17:37:11,743][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:37:12,390][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:37:12,680][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:37:13,002][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:37:13,323][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:37:13,642][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:37:13,961][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:37:14,282][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:37:14,601][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:37:14,921][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:37:15,240][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:37:15,561][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:37:15,881][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:37:16,200][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:37:16,521][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:37:16,841][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:37:17,162][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:37:17,481][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:37:17,800][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:37:18,119][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:37:18,440][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:37:18,759][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:37:19,080][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:37:19,400][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:37:19,720][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:37:20,039][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:37:20,360][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:37:20,681][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:37:21,002][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:37:21,323][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:37:21,643][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:37:21,963][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:37:22,284][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:37:22,605][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:37:22,924][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:37:23,246][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:37:23,566][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:37:23,885][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:37:24,204][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:37:24,524][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:37:24,845][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:37:25,166][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:37:25,487][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:37:25,808][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:37:26,127][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:37:26,448][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:37:26,770][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:37:27,090][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:37:27,409][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:37:27,729][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:37:28,048][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:37:28,369][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:37:28,690][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:37:29,300][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:37:29,621][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:37:29,942][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:37:30,264][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:37:30,584][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:37:30,904][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:37:31,223][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:37:31,542][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:37:31,863][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:37:32,184][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:37:32,503][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:37:32,823][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:37:33,143][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:37:33,794][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:37:34,525][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:37:34,527][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:37:34,529][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:37:35,154][__main__][INFO] - Iteration 258 took 27s (11.85% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 29m 12s. Estimated total time: 7h 33m 19s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 39s. +[2026-03-25 17:37:35,156][__main__][INFO] - Starting iteration 258. +[2026-03-25 17:37:35,159][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:37:35,160][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:37:38,406][__main__][INFO] - Number of regex retries in iteration 258: 0 +[2026-03-25 17:37:38,406][__main__][INFO] - agents played in iteration 258 are Bob, Alice +[2026-03-25 17:37:38,963][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:37:39,611][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:37:39,903][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:37:40,224][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:37:40,544][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:37:40,865][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:37:41,187][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:37:41,507][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:37:41,828][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:37:42,150][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:37:42,471][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:37:42,794][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:37:43,116][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:37:43,437][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:37:43,759][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:37:44,080][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:37:44,399][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:37:44,720][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:37:45,041][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:37:45,362][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:37:45,684][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:37:46,006][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:37:46,326][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:37:46,646][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:37:46,967][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:37:47,288][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:37:47,608][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:37:47,929][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:37:48,249][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:37:48,571][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:37:48,891][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:37:49,210][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:37:49,533][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:37:49,853][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:37:50,174][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:37:50,494][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:37:50,814][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:37:51,135][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:37:51,455][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:37:51,776][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:37:52,097][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:37:52,416][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:37:52,737][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:37:53,058][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:37:53,380][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:37:53,700][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:37:54,020][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:37:54,341][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:37:54,662][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:37:54,983][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:37:55,303][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:37:55,624][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:37:55,943][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:37:56,552][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:37:56,873][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:37:57,194][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:37:57,515][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:37:57,835][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:37:58,157][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:37:58,478][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:37:58,799][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:37:59,120][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:37:59,440][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:37:59,759][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:38:00,080][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:38:00,399][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:38:01,051][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:38:01,779][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:38:01,781][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:38:01,782][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:38:02,485][__main__][INFO] - Iteration 259 took 27s (11.88% Gen, 85.54% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 30m 51s. Estimated total time: 7h 35m 26s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 43s. +[2026-03-25 17:38:02,487][__main__][INFO] - Starting iteration 259. 
+[2026-03-25 17:38:02,490][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:38:02,490][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:38:05,728][__main__][INFO] - Number of regex retries in iteration 259: 0 +[2026-03-25 17:38:05,729][__main__][INFO] - agents played in iteration 259 are Bob, Alice +[2026-03-25 17:38:06,277][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:38:06,924][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:38:07,214][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:38:07,535][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:38:07,857][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:38:08,177][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:38:08,498][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:38:08,819][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:38:09,138][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:38:09,458][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:38:09,779][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:38:10,098][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:38:10,419][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:38:10,740][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:38:11,060][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:38:11,381][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:38:11,701][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:38:12,020][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:38:12,339][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:38:12,660][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:38:12,979][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:38:13,299][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:38:13,620][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:38:13,941][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:38:14,263][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:38:14,583][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:38:14,902][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:38:15,221][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:38:15,542][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:38:15,864][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:38:16,183][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:38:16,504][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:38:16,825][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:38:17,146][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:38:17,465][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:38:17,786][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:38:18,107][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:38:18,428][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:38:18,750][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:38:19,071][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:38:19,392][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:38:19,712][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:38:20,032][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:38:20,353][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:38:20,673][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:38:20,993][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:38:21,313][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:38:21,634][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:38:21,956][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:38:22,277][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:38:22,597][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:38:22,917][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:38:23,238][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:38:23,849][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:38:24,169][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:38:24,490][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:38:24,809][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:38:25,129][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:38:25,448][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:38:25,769][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:38:26,090][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:38:26,411][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:38:26,732][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:38:27,052][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:38:27,374][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:38:27,695][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:38:28,346][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:38:29,065][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:38:29,068][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:38:29,069][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:38:29,699][__main__][INFO] - Iteration 260 took 27s (11.90% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 28m 28s. Estimated total time: 7h 33m 30s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 45s. +[2026-03-25 17:38:29,701][__main__][INFO] - Starting iteration 260. +[2026-03-25 17:38:29,704][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:38:29,705][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:38:32,986][__main__][INFO] - Number of regex retries in iteration 260: 0 +[2026-03-25 17:38:32,986][__main__][INFO] - agents played in iteration 260 are Bob, Alice +[2026-03-25 17:38:33,546][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:38:34,193][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:38:34,483][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:38:34,804][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:38:35,124][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:38:35,445][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:38:35,765][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:38:36,085][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:38:36,405][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:38:36,725][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:38:37,045][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:38:37,365][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:38:37,686][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:38:38,007][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:38:38,326][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:38:38,647][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:38:38,968][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:38:39,289][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:38:39,610][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:38:39,930][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:38:40,250][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:38:40,571][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:38:40,891][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:38:41,212][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:38:41,532][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:38:41,854][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:38:42,175][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:38:42,494][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:38:42,814][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:38:43,135][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:38:43,457][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:38:43,778][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:38:44,097][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:38:44,419][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:38:44,739][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:38:45,059][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:38:45,379][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:38:45,700][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:38:46,019][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:38:46,341][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:38:46,663][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:38:46,984][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:38:47,305][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:38:47,625][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:38:47,944][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:38:48,265][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:38:48,584][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:38:48,905][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:38:49,225][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:38:49,546][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:38:49,866][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:38:50,187][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:38:50,507][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:38:51,116][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:38:51,437][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:38:51,758][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:38:52,079][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:38:52,399][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:38:52,720][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:38:53,040][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:38:53,361][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:38:53,683][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:38:54,004][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:38:54,324][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:38:54,645][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:38:54,965][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:38:55,616][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:38:56,341][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:38:56,343][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:38:56,345][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:38:56,971][__main__][INFO] - Iteration 261 took 27s (12.04% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 28m 59s. Estimated total time: 7h 34m 28s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 14s. +[2026-03-25 17:38:56,974][__main__][INFO] - Starting iteration 261. +[2026-03-25 17:38:56,977][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:38:56,977][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:39:00,213][__main__][INFO] - Number of regex retries in iteration 261: 0 +[2026-03-25 17:39:00,214][__main__][INFO] - agents played in iteration 261 are Bob, Alice +[2026-03-25 17:39:00,753][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:39:01,399][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:39:01,690][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:39:02,011][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:39:02,330][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:39:02,651][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:39:02,971][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:39:03,293][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:39:03,613][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:39:03,934][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:39:04,253][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:39:04,573][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:39:04,893][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:39:05,213][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:39:05,533][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:39:05,853][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:39:06,174][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:39:06,493][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:39:06,813][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:39:07,134][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:39:07,454][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:39:07,774][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:39:08,095][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:39:08,417][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:39:08,738][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:39:09,058][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:39:09,379][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:39:09,699][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:39:10,020][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:39:10,340][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:39:10,661][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:39:10,981][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:39:11,302][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:39:11,623][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:39:11,944][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:39:12,265][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:39:12,585][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:39:12,904][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:39:13,225][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:39:13,544][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:39:13,865][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:39:14,185][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:39:14,505][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:39:14,824][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:39:15,145][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:39:15,464][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:39:15,784][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:39:16,105][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:39:16,426][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:39:16,747][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:39:17,069][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:39:17,390][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:39:17,709][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:39:18,320][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:39:18,641][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:39:18,963][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:39:19,285][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:39:19,607][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:39:19,930][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:39:20,252][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:39:20,575][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:39:20,897][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:39:21,219][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:39:21,540][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:39:21,860][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:39:22,179][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:39:22,831][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:39:23,557][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:39:23,559][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:39:23,561][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:39:24,239][__main__][INFO] - Iteration 262 took 27s (11.87% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 28m 26s. Estimated total time: 7h 34m 23s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 11s. +[2026-03-25 17:39:24,241][__main__][INFO] - Starting iteration 262. +[2026-03-25 17:39:24,244][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:39:24,245][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:39:27,497][__main__][INFO] - Number of regex retries in iteration 262: 0 +[2026-03-25 17:39:27,497][__main__][INFO] - agents played in iteration 262 are Bob, Alice +[2026-03-25 17:39:28,048][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:39:28,696][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:39:28,987][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:39:29,308][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:39:29,629][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:39:29,950][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:39:30,269][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:39:30,591][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:39:30,911][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:39:31,231][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:39:31,551][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:39:31,871][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:39:32,191][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:39:32,513][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:39:32,833][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:39:33,155][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:39:33,475][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:39:33,795][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:39:34,115][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:39:34,434][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:39:34,754][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:39:35,076][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:39:35,398][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:39:35,720][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:39:36,040][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:39:36,361][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:39:36,683][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:39:37,003][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:39:37,324][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:39:37,645][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:39:37,966][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:39:38,286][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:39:38,605][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:39:38,925][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:39:39,245][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:39:39,566][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:39:39,886][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:39:40,207][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:39:40,527][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:39:40,848][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:39:41,169][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:39:41,489][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:39:41,809][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:39:42,130][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:39:42,451][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:39:42,772][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:39:43,091][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:39:43,411][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:39:43,732][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:39:44,053][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:39:44,374][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:39:44,694][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:39:45,015][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:39:45,624][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:39:45,944][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:39:46,263][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:39:46,584][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:39:46,904][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:39:47,225][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:39:47,547][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:39:47,868][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:39:48,189][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:39:48,510][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:39:48,830][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:39:49,151][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:39:49,470][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:39:50,124][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:39:50,855][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:39:50,857][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:39:50,859][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:39:51,486][__main__][INFO] - Iteration 263 took 27s (11.94% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 27m 39s. Estimated total time: 7h 34m 3s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 1s. +[2026-03-25 17:39:51,489][__main__][INFO] - Starting iteration 263. 
+[2026-03-25 17:39:51,492][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:39:51,492][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:39:54,813][__main__][INFO] - Number of regex retries in iteration 263: 0 +[2026-03-25 17:39:54,814][__main__][INFO] - agents played in iteration 263 are Bob, Alice +[2026-03-25 17:39:55,369][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:39:56,015][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:39:56,304][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:39:56,625][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:39:56,944][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:39:57,263][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:39:57,583][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:39:57,902][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:39:58,224][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:39:58,545][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:39:58,866][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:39:59,185][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:39:59,504][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:39:59,824][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:40:00,144][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:40:00,464][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:40:00,784][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:40:01,103][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:40:01,423][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:40:01,745][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:40:02,065][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:40:02,385][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:40:02,706][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:40:03,025][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:40:03,346][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:40:03,668][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:40:03,989][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:40:04,310][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:40:04,630][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:40:04,950][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:40:05,271][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:40:05,594][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:40:05,914][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:40:06,236][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:40:06,556][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:40:06,878][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:40:07,199][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:40:07,519][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:40:07,841][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:40:08,161][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:40:08,482][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:40:08,804][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:40:09,125][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:40:09,447][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:40:09,769][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:40:10,089][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:40:10,410][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:40:10,732][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:40:11,053][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:40:11,374][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:40:11,694][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:40:12,014][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:40:12,334][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:40:12,943][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:40:13,264][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:40:13,584][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:40:13,906][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:40:14,226][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:40:14,546][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:40:14,868][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:40:15,187][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:40:15,507][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:40:15,826][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:40:16,146][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:40:16,467][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:40:16,788][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:40:17,440][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:40:18,175][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:40:18,177][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:40:18,179][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:40:18,912][__main__][INFO] - Iteration 264 took 27s (12.11% Gen, 85.20% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 30m 10s. Estimated total time: 7h 37m 1s. 
Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 42s, 500 more iterations: 3h 48m 30s. +[2026-03-25 17:40:18,915][__main__][INFO] - Starting iteration 264. +[2026-03-25 17:40:18,918][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:40:18,919][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:40:22,205][__main__][INFO] - Number of regex retries in iteration 264: 0 +[2026-03-25 17:40:22,206][__main__][INFO] - agents played in iteration 264 are Bob, Alice +[2026-03-25 17:40:22,759][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:40:23,405][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:40:23,695][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:40:24,017][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:40:24,338][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:40:24,659][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:40:24,979][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:40:25,299][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:40:25,618][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:40:25,939][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:40:26,259][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:40:26,579][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:40:26,899][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:40:27,220][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:40:27,541][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:40:27,860][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:40:28,181][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:40:28,501][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:40:28,822][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:40:29,143][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:40:29,464][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:40:29,784][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:40:30,105][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:40:30,425][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:40:30,746][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:40:31,066][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:40:31,386][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:40:31,707][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:40:32,028][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:40:32,347][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:40:32,667][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:40:32,987][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:40:33,306][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:40:33,627][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:40:33,946][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:40:34,267][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:40:34,586][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:40:34,906][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:40:35,227][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:40:35,547][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:40:35,867][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:40:36,186][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:40:36,506][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:40:36,827][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:40:37,147][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:40:37,467][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:40:37,786][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:40:38,107][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:40:38,426][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:40:38,746][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:40:39,066][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:40:39,385][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:40:39,705][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:40:40,316][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:40:40,637][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:40:40,958][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:40:41,279][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:40:41,599][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:40:41,920][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:40:42,241][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:40:42,560][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:40:42,879][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:40:43,199][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:40:43,520][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:40:43,840][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:40:44,161][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:40:44,813][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:40:45,542][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:40:45,544][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:40:45,545][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:40:46,172][__main__][INFO] - Iteration 265 took 27s (12.06% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 26m 56s. Estimated total time: 7h 34m 14s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 7s. +[2026-03-25 17:40:46,174][__main__][INFO] - Starting iteration 265. +[2026-03-25 17:40:46,177][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:40:46,178][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:40:49,471][__main__][INFO] - Number of regex retries in iteration 265: 0 +[2026-03-25 17:40:49,471][__main__][INFO] - agents played in iteration 265 are Bob, Alice +[2026-03-25 17:40:50,021][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:40:50,670][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:40:50,960][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:40:51,281][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:40:51,603][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:40:51,922][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:40:52,241][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:40:52,561][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:40:52,881][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:40:53,202][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:40:53,521][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:40:53,842][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:40:54,162][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:40:54,484][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:40:54,804][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:40:55,125][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:40:55,447][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:40:55,768][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:40:56,088][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:40:56,409][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:40:56,731][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:40:57,053][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:40:57,374][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:40:57,694][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:40:58,015][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:40:58,336][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:40:58,657][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:40:58,980][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:40:59,300][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:40:59,622][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:40:59,943][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:41:00,264][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:41:00,584][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:41:00,904][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:41:01,226][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:41:01,545][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:41:01,866][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:41:02,185][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:41:02,504][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:41:02,825][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:41:03,145][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:41:03,466][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:41:03,785][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:41:04,104][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:41:04,426][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:41:04,746][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:41:05,066][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:41:05,387][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:41:05,708][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:41:06,029][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:41:06,349][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:41:06,671][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:41:06,990][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:41:07,602][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:41:07,922][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:41:08,243][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:41:08,564][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:41:08,884][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:41:09,203][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:41:09,525][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:41:09,846][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:41:10,166][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:41:10,486][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:41:10,805][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:41:11,126][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:41:11,447][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:41:12,097][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:41:12,820][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:41:12,822][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:41:12,823][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:41:13,506][__main__][INFO] - Iteration 266 took 27s (12.05% Gen, 85.44% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 27m 44s. Estimated total time: 7h 35m 30s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 45s. +[2026-03-25 17:41:13,509][__main__][INFO] - Starting iteration 266. +[2026-03-25 17:41:13,512][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:41:13,512][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:41:16,814][__main__][INFO] - Number of regex retries in iteration 266: 0 +[2026-03-25 17:41:16,814][__main__][INFO] - agents played in iteration 266 are Bob, Alice +[2026-03-25 17:41:17,370][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:41:18,016][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:41:18,308][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:41:18,629][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:41:18,949][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:41:19,269][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:41:19,588][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:41:19,907][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:41:20,228][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:41:20,548][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:41:20,869][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:41:21,189][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:41:21,509][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:41:21,830][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:41:22,151][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:41:22,472][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:41:22,792][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:41:23,112][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:41:23,433][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:41:23,752][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:41:24,072][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:41:24,392][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:41:24,711][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:41:25,031][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:41:25,350][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:41:25,671][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:41:25,992][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:41:26,313][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:41:26,632][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:41:26,952][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:41:27,272][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:41:27,592][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:41:27,912][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:41:28,233][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:41:28,555][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:41:28,876][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:41:29,197][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:41:29,518][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:41:29,839][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:41:30,160][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:41:30,480][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:41:30,800][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:41:31,120][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:41:31,441][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:41:31,762][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:41:32,082][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:41:32,404][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:41:32,724][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:41:33,044][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:41:33,365][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:41:33,685][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:41:34,004][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:41:34,325][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:41:34,935][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:41:35,257][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:41:35,577][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:41:35,899][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:41:36,219][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:41:36,540][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:41:36,860][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:41:37,180][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:41:37,500][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:41:37,820][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:41:38,140][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:41:38,463][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:41:38,784][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:41:39,435][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:41:40,157][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:41:40,160][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:41:40,161][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:41:40,833][__main__][INFO] - Iteration 267 took 27s (12.09% Gen, 85.45% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 27m 9s. Estimated total time: 7h 35m 22s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 41s. +[2026-03-25 17:41:40,836][__main__][INFO] - Starting iteration 267. 
+[2026-03-25 17:41:40,839][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:41:40,839][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:41:44,182][__main__][INFO] - Number of regex retries in iteration 267: 0 +[2026-03-25 17:41:44,183][__main__][INFO] - agents played in iteration 267 are Bob, Alice +[2026-03-25 17:41:44,761][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:41:45,421][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:41:45,711][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:41:46,032][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:41:46,353][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:41:46,675][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:41:46,996][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:41:47,317][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:41:47,638][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:41:47,958][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:41:48,278][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:41:48,598][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:41:48,919][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:41:49,239][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:41:49,560][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:41:49,880][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:41:50,201][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:41:50,522][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:41:50,842][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:41:51,163][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:41:51,483][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:41:51,804][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:41:52,123][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:41:52,444][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:41:52,764][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:41:53,083][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:41:53,403][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:41:53,722][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:41:54,043][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:41:54,364][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:41:54,685][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:41:55,004][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:41:55,325][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:41:55,646][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:41:55,967][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:41:56,288][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:41:56,608][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:41:56,928][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:41:57,247][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:41:57,569][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:41:57,890][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:41:58,210][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:41:58,531][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:41:58,851][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:41:59,172][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:41:59,494][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:41:59,813][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:42:00,132][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:42:00,452][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:42:00,771][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:42:01,090][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:42:01,411][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:42:01,730][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:42:02,341][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:42:02,663][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:42:02,984][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:42:03,304][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:42:03,623][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:42:03,944][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:42:04,263][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:42:04,584][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:42:04,903][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:42:05,224][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:42:05,544][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:42:05,864][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:42:06,185][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:42:06,837][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:42:07,596][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:42:07,599][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:42:07,600][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:42:08,239][__main__][INFO] - Iteration 268 took 27s (12.20% Gen, 85.46% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 28m 1s. Estimated total time: 7h 36m 41s. 
Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 20s. +[2026-03-25 17:42:08,242][__main__][INFO] - Starting iteration 268. +[2026-03-25 17:42:08,245][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:42:08,245][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:42:11,483][__main__][INFO] - Number of regex retries in iteration 268: 0 +[2026-03-25 17:42:11,483][__main__][INFO] - agents played in iteration 268 are Bob, Alice +[2026-03-25 17:42:12,024][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:42:12,670][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:42:12,961][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:42:13,282][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:42:13,604][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:42:13,924][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:42:14,243][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:42:14,564][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:42:14,884][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:42:15,205][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:42:15,525][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:42:15,847][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:42:16,167][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:42:16,486][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:42:16,806][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:42:17,125][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:42:17,444][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:42:17,764][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:42:18,083][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:42:18,403][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:42:18,723][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:42:19,044][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:42:19,364][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:42:19,684][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:42:20,003][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:42:20,324][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:42:20,644][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:42:20,964][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:42:21,284][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:42:21,604][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:42:21,924][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:42:22,245][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:42:22,564][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:42:22,885][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:42:23,206][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:42:23,526][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:42:23,845][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:42:24,165][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:42:24,484][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:42:24,805][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:42:25,124][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:42:25,445][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:42:25,765][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:42:26,084][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:42:26,404][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:42:26,726][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:42:27,046][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:42:27,366][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:42:27,687][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:42:28,008][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:42:28,328][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:42:28,648][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:42:28,968][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:42:29,578][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:42:29,898][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:42:30,219][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:42:30,538][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:42:30,858][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:42:31,179][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:42:31,500][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:42:31,820][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:42:32,141][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:42:32,461][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:42:32,782][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:42:33,102][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:42:33,423][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:42:34,080][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:42:34,817][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:42:34,819][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:42:34,821][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:42:35,453][__main__][INFO] - Iteration 269 took 27s (11.90% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 24m 21s. Estimated total time: 7h 33m 29s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 44s. +[2026-03-25 17:42:35,455][__main__][INFO] - Starting iteration 269. +[2026-03-25 17:42:35,458][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:42:35,459][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:42:38,774][__main__][INFO] - Number of regex retries in iteration 269: 0 +[2026-03-25 17:42:38,775][__main__][INFO] - agents played in iteration 269 are Bob, Alice +[2026-03-25 17:42:39,326][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:42:39,973][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:42:40,264][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:42:40,585][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:42:40,905][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:42:41,225][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:42:41,546][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:42:41,868][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:42:42,189][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:42:42,509][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:42:42,830][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:42:43,151][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:42:43,472][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:42:43,792][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:42:44,113][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:42:44,433][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:42:44,755][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:42:45,076][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:42:45,397][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:42:45,718][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:42:46,040][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:42:46,361][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:42:46,682][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:42:47,002][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:42:47,323][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:42:47,644][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:42:47,963][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:42:48,283][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:42:48,602][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:42:48,924][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:42:49,244][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:42:49,565][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:42:49,884][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:42:50,206][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:42:50,526][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:42:50,845][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:42:51,166][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:42:51,485][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:42:51,806][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:42:52,126][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:42:52,446][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:42:52,766][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:42:53,085][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:42:53,405][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:42:53,725][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:42:54,045][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:42:54,366][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:42:54,686][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:42:55,006][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:42:55,326][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:42:55,647][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:42:55,967][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:42:56,287][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:42:56,897][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:42:57,217][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:42:57,538][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:42:57,858][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:42:58,179][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:42:58,500][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:42:58,820][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:42:59,141][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:42:59,460][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:42:59,780][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:43:00,100][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:43:00,419][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:43:00,740][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:43:01,390][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:43:02,116][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:43:02,118][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:43:02,119][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:43:02,744][__main__][INFO] - Iteration 270 took 27s (12.15% Gen, 85.55% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 25m 11s. Estimated total time: 7h 34m 46s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 23s. +[2026-03-25 17:43:02,746][__main__][INFO] - Starting iteration 270. +[2026-03-25 17:43:02,749][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:43:02,749][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:43:05,990][__main__][INFO] - Number of regex retries in iteration 270: 0 +[2026-03-25 17:43:05,991][__main__][INFO] - agents played in iteration 270 are Bob, Alice +[2026-03-25 17:43:06,551][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:43:07,198][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:43:07,489][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:43:07,810][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:43:08,131][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:43:08,452][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:43:08,771][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:43:09,092][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:43:09,413][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:43:09,732][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:43:10,052][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:43:10,371][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:43:10,693][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:43:11,012][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:43:11,333][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:43:11,655][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:43:11,974][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:43:12,293][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:43:12,615][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:43:12,937][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:43:13,258][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:43:13,578][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:43:13,898][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:43:14,219][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:43:14,539][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:43:14,859][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:43:15,180][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:43:15,500][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:43:15,821][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:43:16,139][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:43:16,460][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:43:16,781][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:43:17,102][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:43:17,421][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:43:17,741][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:43:18,061][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:43:18,382][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:43:18,701][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:43:19,021][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:43:19,342][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:43:19,661][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:43:19,982][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:43:20,304][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:43:20,625][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:43:20,946][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:43:21,266][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:43:21,586][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:43:21,906][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:43:22,226][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:43:22,546][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:43:22,866][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:43:23,187][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:43:23,506][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:43:24,121][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:43:24,443][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:43:24,763][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:43:25,085][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:43:25,407][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:43:25,728][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:43:26,048][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:43:26,368][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:43:26,689][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:43:27,010][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:43:27,330][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:43:27,649][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:43:27,970][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:43:28,622][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:43:29,342][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:43:29,344][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:43:29,346][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:43:29,972][__main__][INFO] - Iteration 271 took 27s (11.91% Gen, 85.79% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 23m 41s. Estimated total time: 7h 33m 43s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 51s. +[2026-03-25 17:43:29,974][__main__][INFO] - Starting iteration 271. 
+[2026-03-25 17:43:29,977][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:43:29,977][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:43:33,184][__main__][INFO] - Number of regex retries in iteration 271: 0 +[2026-03-25 17:43:33,185][__main__][INFO] - agents played in iteration 271 are Bob, Alice +[2026-03-25 17:43:33,745][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:43:34,393][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:43:34,683][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:43:35,003][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:43:35,325][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:43:35,646][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:43:35,966][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:43:36,286][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:43:36,606][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:43:36,924][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:43:37,244][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:43:37,565][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:43:37,885][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:43:38,205][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:43:38,525][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:43:38,845][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:43:39,165][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:43:39,485][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:43:39,805][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:43:40,125][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:43:40,445][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:43:40,766][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:43:41,086][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:43:41,406][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:43:41,726][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:43:42,045][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:43:42,366][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:43:42,685][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:43:43,004][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:43:43,325][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:43:43,644][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:43:43,965][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:43:44,286][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:43:44,606][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:43:44,926][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:43:45,246][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:43:45,567][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:43:45,887][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:43:46,206][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:43:46,527][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:43:46,847][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:43:47,166][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:43:47,486][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:43:47,805][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:43:48,125][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:43:48,445][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:43:48,766][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:43:49,086][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:43:49,405][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:43:49,725][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:43:50,046][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:43:50,366][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:43:50,687][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:43:51,297][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:43:51,617][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:43:51,939][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:43:52,260][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:43:52,580][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:43:52,899][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:43:53,220][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:43:53,539][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:43:53,859][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:43:54,180][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:43:54,499][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:43:54,819][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:43:55,140][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:43:55,792][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:43:56,515][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:43:56,517][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:43:56,519][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:43:57,141][__main__][INFO] - Iteration 272 took 27s (11.81% Gen, 85.90% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 22m 16s. Estimated total time: 7h 32m 45s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 16s, 500 more iterations: 3h 46m 22s. +[2026-03-25 17:43:57,144][__main__][INFO] - Starting iteration 272. +[2026-03-25 17:43:57,147][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:43:57,147][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:44:00,351][__main__][INFO] - Number of regex retries in iteration 272: 0 +[2026-03-25 17:44:00,352][__main__][INFO] - agents played in iteration 272 are Bob, Alice +[2026-03-25 17:44:00,921][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:44:01,569][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:44:01,858][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:44:02,180][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:44:02,499][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:44:02,819][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:44:03,140][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:44:03,460][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:44:03,781][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:44:04,102][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:44:04,423][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:44:04,743][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:44:05,064][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:44:05,385][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:44:05,704][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:44:06,024][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:44:06,343][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:44:06,664][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:44:06,984][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:44:07,304][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:44:07,623][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:44:07,944][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:44:08,265][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:44:08,584][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:44:08,903][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:44:09,223][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:44:09,544][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:44:09,865][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:44:10,184][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:44:10,504][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:44:10,824][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:44:11,147][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:44:11,467][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:44:11,788][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:44:12,110][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:44:12,430][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:44:12,751][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:44:13,072][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:44:13,393][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:44:13,716][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:44:14,036][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:44:14,358][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:44:14,680][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:44:15,001][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:44:15,321][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:44:15,642][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:44:15,963][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:44:16,283][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:44:16,603][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:44:16,924][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:44:17,243][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:44:17,564][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:44:17,883][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:44:18,494][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:44:18,813][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:44:19,132][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:44:19,453][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:44:19,772][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:44:20,092][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:44:20,413][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:44:20,735][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:44:21,056][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:44:21,376][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:44:21,697][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:44:22,018][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:44:22,339][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:44:22,993][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:44:23,727][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:44:23,729][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:44:23,731][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:44:24,355][__main__][INFO] - Iteration 273 took 27s (11.78% Gen, 85.92% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 22m 33s. Estimated total time: 7h 33m 29s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 44s. +[2026-03-25 17:44:24,358][__main__][INFO] - Starting iteration 273. +[2026-03-25 17:44:24,361][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:44:24,361][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:44:27,651][__main__][INFO] - Number of regex retries in iteration 273: 0 +[2026-03-25 17:44:27,652][__main__][INFO] - agents played in iteration 273 are Bob, Alice +[2026-03-25 17:44:28,199][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:44:28,846][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:44:29,135][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:44:29,457][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:44:29,776][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:44:30,095][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:44:30,416][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:44:30,737][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:44:31,056][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:44:31,377][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:44:31,697][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:44:32,017][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:44:32,339][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:44:32,660][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:44:32,980][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:44:33,299][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:44:33,621][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:44:33,941][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:44:34,261][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:44:34,580][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:44:34,899][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:44:35,220][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:44:35,540][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:44:35,860][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:44:36,180][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:44:36,499][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:44:36,819][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:44:37,139][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:44:37,460][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:44:37,780][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:44:38,101][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:44:38,420][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:44:38,741][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:44:39,060][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:44:39,380][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:44:39,699][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:44:40,020][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:44:40,340][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:44:40,661][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:44:40,982][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:44:41,301][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:44:41,621][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:44:41,941][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:44:42,262][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:44:42,583][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:44:42,904][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:44:43,223][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:44:43,544][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:44:43,865][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:44:44,185][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:44:44,504][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:44:44,824][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:44:45,144][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:44:45,755][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:44:46,077][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:44:46,398][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:44:46,718][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:44:47,037][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:44:47,358][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:44:47,679][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:44:47,999][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:44:48,320][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:44:48,641][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:44:48,963][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:44:49,285][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:44:49,605][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:44:50,261][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:44:51,047][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:44:51,049][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:44:51,051][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:44:51,698][__main__][INFO] - Iteration 274 took 27s (12.04% Gen, 85.59% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 24m 14s. Estimated total time: 7h 35m 38s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 49s. +[2026-03-25 17:44:51,701][__main__][INFO] - Starting iteration 274. +[2026-03-25 17:44:51,704][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:44:51,705][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:44:54,939][__main__][INFO] - Number of regex retries in iteration 274: 0 +[2026-03-25 17:44:54,940][__main__][INFO] - agents played in iteration 274 are Bob, Alice +[2026-03-25 17:44:55,485][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:44:56,132][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:44:56,422][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:44:56,743][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:44:57,064][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:44:57,384][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:44:57,705][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:44:58,025][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:44:58,345][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:44:58,666][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:44:58,987][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:44:59,308][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:44:59,629][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:44:59,949][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:45:00,271][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:45:00,590][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:45:00,910][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:45:01,230][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:45:01,551][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:45:01,873][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:45:02,196][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:45:02,516][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:45:02,837][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:45:03,158][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:45:03,481][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:45:03,802][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:45:04,121][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:45:04,442][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:45:04,762][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:45:05,083][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:45:05,402][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:45:05,723][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:45:06,044][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:45:06,364][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:45:06,685][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:45:07,006][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:45:07,325][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:45:07,644][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:45:07,964][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:45:08,284][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:45:08,605][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:45:08,925][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:45:09,244][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:45:09,564][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:45:09,885][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:45:10,204][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:45:10,525][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:45:10,845][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:45:11,166][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:45:11,486][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:45:11,806][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:45:12,126][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:45:12,446][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:45:13,059][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:45:13,380][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:45:13,699][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:45:14,018][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:45:14,340][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:45:14,661][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:45:14,981][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:45:15,302][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:45:15,622][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:45:15,943][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:45:16,265][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:45:16,585][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:45:16,907][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:45:17,558][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:45:18,292][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:45:18,294][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:45:18,296][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:45:18,958][__main__][INFO] - Iteration 275 took 27s (11.87% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 22m 24s. Estimated total time: 7h 34m 15s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 7s. +[2026-03-25 17:45:18,961][__main__][INFO] - Starting iteration 275. 
+[2026-03-25 17:45:18,963][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:45:18,964][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:45:22,191][__main__][INFO] - Number of regex retries in iteration 275: 0 +[2026-03-25 17:45:22,192][__main__][INFO] - agents played in iteration 275 are Bob, Alice +[2026-03-25 17:45:22,762][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:45:23,409][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:45:23,701][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:45:24,021][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:45:24,341][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:45:24,661][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:45:24,981][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:45:25,301][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:45:25,621][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:45:25,942][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:45:26,263][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:45:26,584][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:45:26,903][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:45:27,224][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:45:27,545][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:45:27,865][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:45:28,185][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:45:28,505][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:45:28,826][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:45:29,145][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:45:29,465][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:45:29,786][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:45:30,106][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:45:30,426][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:45:30,747][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:45:31,066][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:45:31,386][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:45:31,706][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:45:32,025][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:45:32,346][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:45:32,666][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:45:32,985][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:45:33,306][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:45:33,625][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:45:33,946][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:45:34,267][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:45:34,587][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:45:34,906][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:45:35,225][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:45:35,545][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:45:35,864][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:45:36,185][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:45:36,505][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:45:36,825][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:45:37,146][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:45:37,467][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:45:37,787][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:45:38,107][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:45:38,427][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:45:38,747][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:45:39,067][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:45:39,388][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:45:39,707][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:45:40,319][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:45:40,639][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:45:40,960][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:45:41,280][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:45:41,600][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:45:41,919][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:45:42,240][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:45:42,560][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:45:42,880][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:45:43,201][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:45:43,521][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:45:43,840][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:45:44,160][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:45:44,812][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:45:45,543][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:45:45,545][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:45:45,547][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:45:46,174][__main__][INFO] - Iteration 276 took 27s (11.86% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 21m 12s. Estimated total time: 7h 33m 31s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 45s. +[2026-03-25 17:45:46,176][__main__][INFO] - Starting iteration 276. +[2026-03-25 17:45:46,179][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:45:46,179][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:45:49,413][__main__][INFO] - Number of regex retries in iteration 276: 0 +[2026-03-25 17:45:49,414][__main__][INFO] - agents played in iteration 276 are Bob, Alice +[2026-03-25 17:45:49,963][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:45:50,616][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:45:50,906][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:45:51,229][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:45:51,551][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:45:51,873][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:45:52,194][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:45:52,515][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:45:52,836][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:45:53,159][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:45:53,479][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:45:53,801][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:45:54,122][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:45:54,443][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:45:54,764][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:45:55,085][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:45:55,406][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:45:55,725][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:45:56,046][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:45:56,367][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:45:56,687][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:45:57,007][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:45:57,326][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:45:57,645][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:45:57,966][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:45:58,287][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:45:58,607][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:45:58,928][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:45:59,247][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:45:59,569][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:45:59,890][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:46:00,211][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:46:00,530][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:46:00,851][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:46:01,172][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:46:01,491][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:46:01,811][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:46:02,133][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:46:02,454][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:46:02,775][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:46:03,095][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:46:03,416][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:46:03,737][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:46:04,059][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:46:04,381][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:46:04,700][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:46:05,020][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:46:05,341][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:46:05,662][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:46:05,982][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:46:06,302][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:46:06,624][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:46:06,943][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:46:07,553][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:46:07,872][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:46:08,192][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:46:08,513][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:46:08,833][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:46:09,154][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:46:09,473][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:46:09,792][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:46:10,112][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:46:10,431][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:46:10,751][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:46:11,071][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:46:11,391][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:46:12,042][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:46:12,781][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:46:12,784][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:46:12,785][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:46:13,464][__main__][INFO] - Iteration 277 took 27s (11.86% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 22m 0s. Estimated total time: 7h 34m 46s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 23s. +[2026-03-25 17:46:13,466][__main__][INFO] - Starting iteration 277. +[2026-03-25 17:46:13,469][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:46:13,470][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:46:16,690][__main__][INFO] - Number of regex retries in iteration 277: 0 +[2026-03-25 17:46:16,691][__main__][INFO] - agents played in iteration 277 are Bob, Alice +[2026-03-25 17:46:17,239][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:46:17,885][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:46:18,175][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:46:18,497][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:46:18,818][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:46:19,139][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:46:19,461][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:46:19,781][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:46:20,101][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:46:20,422][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:46:20,741][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:46:21,064][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:46:21,384][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:46:21,704][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:46:22,025][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:46:22,345][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:46:22,665][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:46:22,985][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:46:23,305][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:46:23,624][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:46:23,945][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:46:24,266][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:46:24,585][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:46:24,906][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:46:25,226][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:46:25,546][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:46:25,867][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:46:26,186][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:46:26,505][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:46:26,826][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:46:27,146][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:46:27,465][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:46:27,786][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:46:28,105][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:46:28,426][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:46:28,746][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:46:29,066][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:46:29,386][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:46:29,705][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:46:30,025][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:46:30,345][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:46:30,666][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:46:30,987][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:46:31,307][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:46:31,626][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:46:31,945][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:46:32,265][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:46:32,587][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:46:32,906][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:46:33,227][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:46:33,547][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:46:33,868][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:46:34,189][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:46:34,799][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:46:35,119][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:46:35,441][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:46:35,760][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:46:36,080][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:46:36,400][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:46:36,720][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:46:37,040][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:46:37,361][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:46:37,682][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:46:38,002][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:46:38,323][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:46:38,644][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:46:39,296][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:46:40,030][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:46:40,032][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:46:40,034][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:46:40,663][__main__][INFO] - Iteration 278 took 27s (11.84% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 20m 1s. Estimated total time: 7h 33m 14s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 17:46:40,665][__main__][INFO] - Starting iteration 278. +[2026-03-25 17:46:40,668][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:46:40,669][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:46:43,906][__main__][INFO] - Number of regex retries in iteration 278: 0 +[2026-03-25 17:46:43,907][__main__][INFO] - agents played in iteration 278 are Bob, Alice +[2026-03-25 17:46:44,469][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:46:45,122][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:46:45,412][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:46:45,734][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:46:46,055][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:46:46,375][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:46:46,695][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:46:47,017][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:46:47,337][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:46:47,659][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:46:47,980][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:46:48,299][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:46:48,620][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:46:48,941][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:46:49,263][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:46:49,583][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:46:49,903][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:46:50,222][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:46:50,543][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:46:50,865][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:46:51,185][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:46:51,505][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:46:51,826][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:46:52,145][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:46:52,465][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:46:52,785][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:46:53,105][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:46:53,426][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:46:53,746][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:46:54,068][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:46:54,389][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:46:54,709][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:46:55,030][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:46:55,351][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:46:55,672][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:46:55,993][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:46:56,313][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:46:56,632][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:46:56,952][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:46:57,273][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:46:57,592][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:46:57,912][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:46:58,233][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:46:58,554][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:46:58,873][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:46:59,194][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:46:59,513][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:46:59,834][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:47:00,153][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:47:00,473][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:47:00,793][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:47:01,112][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:47:01,433][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:47:02,048][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:47:02,370][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:47:02,690][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:47:03,012][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:47:03,332][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:47:03,652][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:47:03,973][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:47:04,294][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:47:04,615][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:47:04,936][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:47:05,257][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:47:05,578][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:47:05,898][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:47:06,556][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:47:07,297][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:47:07,299][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:47:07,301][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:47:07,980][__main__][INFO] - Iteration 279 took 27s (11.85% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 21m 32s. Estimated total time: 7h 35m 13s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 36s. +[2026-03-25 17:47:07,983][__main__][INFO] - Starting iteration 279. 
+[2026-03-25 17:47:07,986][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:47:07,986][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:47:11,225][__main__][INFO] - Number of regex retries in iteration 279: 0 +[2026-03-25 17:47:11,226][__main__][INFO] - agents played in iteration 279 are Bob, Alice +[2026-03-25 17:47:11,786][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:47:12,440][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:47:12,731][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:47:13,052][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:47:13,372][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:47:13,694][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:47:14,014][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:47:14,334][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:47:14,655][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:47:14,975][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:47:15,295][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:47:15,616][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:47:15,936][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:47:16,257][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:47:16,576][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:47:16,897][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:47:17,217][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:47:17,536][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:47:17,858][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:47:18,178][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:47:18,498][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:47:18,818][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:47:19,139][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:47:19,460][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:47:19,780][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:47:20,101][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:47:20,422][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:47:20,743][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:47:21,063][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:47:21,384][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:47:21,704][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:47:22,024][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:47:22,345][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:47:22,665][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:47:22,986][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:47:23,305][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:47:23,626][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:47:23,947][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:47:24,267][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:47:24,587][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:47:24,906][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:47:25,226][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:47:25,545][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:47:25,865][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:47:26,185][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:47:26,505][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:47:26,827][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:47:27,148][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:47:27,469][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:47:27,790][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:47:28,111][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:47:28,431][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:47:28,751][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:47:29,376][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:47:29,697][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:47:30,020][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:47:30,342][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:47:30,662][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:47:30,983][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:47:31,303][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:47:31,624][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:47:31,944][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:47:32,264][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:47:32,584][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:47:32,906][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:47:33,226][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:47:33,884][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:47:34,660][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:47:34,663][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:47:34,664][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:47:35,304][__main__][INFO] - Iteration 280 took 27s (11.86% Gen, 85.79% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 21m 11s. Estimated total time: 7h 35m 18s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 39s. +[2026-03-25 17:47:35,306][__main__][INFO] - Starting iteration 280. +[2026-03-25 17:47:35,309][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:47:35,309][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:47:38,539][__main__][INFO] - Number of regex retries in iteration 280: 0 +[2026-03-25 17:47:38,539][__main__][INFO] - agents played in iteration 280 are Bob, Alice +[2026-03-25 17:47:39,111][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:47:39,765][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:47:40,057][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:47:40,381][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:47:40,702][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:47:41,022][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:47:41,342][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:47:41,663][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:47:41,984][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:47:42,304][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:47:42,625][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:47:42,945][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:47:43,265][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:47:43,586][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:47:43,907][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:47:44,228][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:47:44,548][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:47:44,867][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:47:45,186][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:47:45,506][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:47:45,827][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:47:46,146][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:47:46,466][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:47:46,785][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:47:47,105][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:47:47,427][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:47:47,748][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:47:48,068][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:47:48,388][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:47:48,708][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:47:49,029][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:47:49,349][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:47:49,670][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:47:49,990][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:47:50,311][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:47:50,632][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:47:50,952][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:47:51,273][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:47:51,594][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:47:51,913][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:47:52,233][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:47:52,554][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:47:52,873][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:47:53,194][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:47:53,513][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:47:53,835][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:47:54,156][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:47:54,477][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:47:54,797][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:47:55,117][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:47:55,438][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:47:55,760][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:47:56,080][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:47:56,695][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:47:57,015][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:47:57,336][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:47:57,657][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:47:57,977][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:47:58,298][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:47:58,620][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:47:58,941][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:47:59,261][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:47:59,581][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:47:59,901][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:48:00,221][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:48:00,541][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:48:01,198][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:48:01,934][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:48:01,936][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:48:01,937][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:48:02,586][__main__][INFO] - Iteration 281 took 27s (11.84% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 20m 3s. Estimated total time: 7h 34m 38s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 19s. +[2026-03-25 17:48:02,588][__main__][INFO] - Starting iteration 281. +[2026-03-25 17:48:02,591][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:48:02,591][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:48:05,818][__main__][INFO] - Number of regex retries in iteration 281: 0 +[2026-03-25 17:48:05,818][__main__][INFO] - agents played in iteration 281 are Bob, Alice +[2026-03-25 17:48:06,372][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:48:07,025][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:48:07,318][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:48:07,639][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:48:07,959][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:48:08,281][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:48:08,600][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:48:08,919][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:48:09,241][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:48:09,562][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:48:09,883][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:48:10,203][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:48:10,523][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:48:10,843][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:48:11,164][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:48:11,484][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:48:11,804][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:48:12,123][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:48:12,443][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:48:12,764][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:48:13,085][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:48:13,405][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:48:13,725][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:48:14,045][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:48:14,365][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:48:14,685][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:48:15,004][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:48:15,324][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:48:15,644][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:48:15,967][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:48:16,287][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:48:16,606][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:48:16,926][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:48:17,247][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:48:17,569][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:48:17,889][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:48:18,210][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:48:18,532][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:48:18,854][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:48:19,177][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:48:19,497][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:48:19,819][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:48:20,141][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:48:20,461][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:48:20,783][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:48:21,103][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:48:21,424][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:48:21,745][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:48:22,065][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:48:22,385][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:48:22,706][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:48:23,027][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:48:23,348][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:48:23,963][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:48:24,283][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:48:24,602][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:48:24,922][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:48:25,242][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:48:25,562][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:48:25,882][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:48:26,203][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:48:26,524][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:48:26,844][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:48:27,164][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:48:27,484][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:48:27,805][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:48:28,463][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:48:29,194][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:48:29,196][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:48:29,197][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:48:29,870][__main__][INFO] - Iteration 282 took 27s (11.83% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 19m 38s. Estimated total time: 7h 34m 40s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 20s. +[2026-03-25 17:48:29,872][__main__][INFO] - Starting iteration 282. +[2026-03-25 17:48:29,876][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:48:29,876][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:48:33,105][__main__][INFO] - Number of regex retries in iteration 282: 0 +[2026-03-25 17:48:33,106][__main__][INFO] - agents played in iteration 282 are Bob, Alice +[2026-03-25 17:48:33,692][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:48:34,345][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:48:34,636][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:48:34,959][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:48:35,279][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:48:35,598][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:48:35,919][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:48:36,240][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:48:36,560][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:48:36,880][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:48:37,200][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:48:37,519][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:48:37,840][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:48:38,161][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:48:38,480][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:48:38,799][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:48:39,121][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:48:39,441][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:48:39,762][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:48:40,082][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:48:40,400][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:48:40,722][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:48:41,044][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:48:41,364][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:48:41,685][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:48:42,004][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:48:42,324][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:48:42,644][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:48:42,965][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:48:43,284][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:48:43,605][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:48:43,925][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:48:44,245][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:48:44,566][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:48:44,887][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:48:45,206][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:48:45,526][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:48:45,847][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:48:46,167][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:48:46,487][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:48:46,807][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:48:47,127][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:48:47,447][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:48:47,768][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:48:48,088][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:48:48,408][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:48:48,727][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:48:49,047][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:48:49,367][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:48:49,688][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:48:50,008][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:48:50,329][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:48:50,648][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:48:51,261][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:48:51,581][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:48:51,901][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:48:52,220][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:48:52,540][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:48:52,861][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:48:53,181][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:48:53,500][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:48:53,819][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:48:54,140][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:48:54,460][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:48:54,780][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:48:55,099][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:48:55,757][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:48:56,488][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:48:56,490][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:48:56,492][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:48:57,114][__main__][INFO] - Iteration 283 took 27s (11.86% Gen, 85.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 18m 30s. Estimated total time: 7h 33m 59s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 59s. +[2026-03-25 17:48:57,117][__main__][INFO] - Starting iteration 283. 
+[2026-03-25 17:48:57,120][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:48:57,120][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:49:00,408][__main__][INFO] - Number of regex retries in iteration 283: 0 +[2026-03-25 17:49:00,408][__main__][INFO] - agents played in iteration 283 are Bob, Alice +[2026-03-25 17:49:00,973][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:49:01,625][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:49:01,916][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:49:02,237][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:49:02,557][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:49:02,877][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:49:03,196][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:49:03,515][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:49:03,835][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:49:04,155][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:49:04,476][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:49:04,796][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:49:05,118][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:49:05,440][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:49:05,761][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:49:06,082][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:49:06,403][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:49:06,723][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:49:07,044][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:49:07,364][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:49:07,686][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:49:08,007][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:49:08,327][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:49:08,649][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:49:08,970][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:49:09,292][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:49:09,613][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:49:09,934][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:49:10,255][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:49:10,576][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:49:10,896][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:49:11,218][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:49:11,539][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:49:11,858][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:49:12,179][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:49:12,498][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:49:12,819][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:49:13,141][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:49:13,462][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:49:13,781][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:49:14,100][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:49:14,421][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:49:14,741][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:49:15,063][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:49:15,384][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:49:15,705][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:49:16,025][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:49:16,346][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:49:16,667][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:49:16,987][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:49:17,307][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:49:17,627][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:49:17,947][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:49:18,562][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:49:18,882][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:49:19,201][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:49:19,522][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:49:19,842][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:49:20,161][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:49:20,483][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:49:20,805][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:49:21,125][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:49:21,445][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:49:21,765][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:49:22,084][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:49:22,404][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:49:23,060][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:49:23,792][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:49:23,794][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:49:23,796][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:49:24,460][__main__][INFO] - Iteration 284 took 27s (12.03% Gen, 85.54% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 19m 44s. Estimated total time: 7h 35m 41s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 34s, 500 more iterations: 3h 47m 50s. +[2026-03-25 17:49:24,462][__main__][INFO] - Starting iteration 284. +[2026-03-25 17:49:24,465][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:49:24,466][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:49:27,711][__main__][INFO] - Number of regex retries in iteration 284: 0 +[2026-03-25 17:49:27,712][__main__][INFO] - agents played in iteration 284 are Bob, Alice +[2026-03-25 17:49:28,252][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:49:28,906][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:49:29,196][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:49:29,516][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:49:29,837][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:49:30,156][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:49:30,477][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:49:30,798][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:49:31,119][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:49:31,439][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:49:31,761][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:49:32,081][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:49:32,402][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:49:32,723][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:49:33,043][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:49:33,364][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:49:33,685][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:49:34,005][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:49:34,326][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:49:34,647][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:49:34,968][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:49:35,289][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:49:35,610][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:49:35,931][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:49:36,252][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:49:36,574][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:49:36,894][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:49:37,214][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:49:37,533][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:49:37,854][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:49:38,174][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:49:38,495][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:49:38,815][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:49:39,135][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:49:39,456][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:49:39,777][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:49:40,097][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:49:40,417][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:49:40,737][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:49:41,059][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:49:41,379][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:49:41,700][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:49:42,020][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:49:42,340][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:49:42,660][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:49:42,980][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:49:43,300][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:49:43,621][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:49:43,941][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:49:44,262][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:49:44,583][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:49:44,903][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:49:45,224][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:49:45,838][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:49:46,161][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:49:46,481][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:49:46,802][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:49:47,121][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:49:47,441][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:49:47,762][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:49:48,083][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:49:48,403][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:49:48,723][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:49:49,043][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:49:49,364][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:49:49,685][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:49:50,345][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:49:51,075][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:49:51,077][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:49:51,079][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:49:51,752][__main__][INFO] - Iteration 285 took 27s (11.90% Gen, 85.63% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 18m 23s. Estimated total time: 7h 34m 47s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 23s. +[2026-03-25 17:49:51,754][__main__][INFO] - Starting iteration 285. +[2026-03-25 17:49:51,758][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:49:51,758][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:49:55,001][__main__][INFO] - Number of regex retries in iteration 285: 0 +[2026-03-25 17:49:55,002][__main__][INFO] - agents played in iteration 285 are Bob, Alice +[2026-03-25 17:49:55,567][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:49:56,222][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:49:56,514][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:49:56,837][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:49:57,157][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:49:57,478][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:49:57,799][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:49:58,121][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:49:58,443][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:49:58,765][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:49:59,087][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:49:59,408][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:49:59,727][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:50:00,047][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:50:00,368][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:50:00,688][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:50:01,009][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:50:01,329][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:50:01,648][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:50:01,968][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:50:02,287][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:50:02,607][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:50:02,928][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:50:03,248][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:50:03,570][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:50:03,890][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:50:04,212][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:50:04,532][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:50:04,852][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:50:05,173][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:50:05,494][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:50:05,814][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:50:06,135][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:50:06,454][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:50:06,776][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:50:07,095][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:50:07,415][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:50:07,735][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:50:08,057][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:50:08,376][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:50:08,696][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:50:09,015][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:50:09,336][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:50:09,658][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:50:09,980][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:50:10,301][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:50:10,622][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:50:10,942][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:50:11,263][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:50:11,582][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:50:11,901][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:50:12,222][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:50:12,541][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:50:13,155][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:50:13,475][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:50:13,795][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:50:14,114][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:50:14,434][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:50:14,753][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:50:15,075][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:50:15,395][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:50:15,715][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:50:16,036][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:50:16,355][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:50:16,675][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:50:16,994][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:50:17,650][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:50:18,385][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:50:18,387][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:50:18,389][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:50:19,034][__main__][INFO] - Iteration 286 took 27s (11.89% Gen, 85.74% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 17m 46s. Estimated total time: 7h 34m 37s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 18s. +[2026-03-25 17:50:19,036][__main__][INFO] - Starting iteration 286. +[2026-03-25 17:50:19,039][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:50:19,040][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:50:22,359][__main__][INFO] - Number of regex retries in iteration 286: 0 +[2026-03-25 17:50:22,360][__main__][INFO] - agents played in iteration 286 are Bob, Alice +[2026-03-25 17:50:22,910][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:50:23,562][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:50:23,852][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:50:24,172][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:50:24,493][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:50:24,812][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:50:25,132][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:50:25,452][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:50:25,772][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:50:26,091][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:50:26,411][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:50:26,732][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:50:27,051][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:50:27,372][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:50:27,693][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:50:28,014][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:50:28,335][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:50:28,657][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:50:28,978][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:50:29,299][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:50:29,620][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:50:29,940][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:50:30,262][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:50:30,582][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:50:30,904][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:50:31,224][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:50:31,545][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:50:31,865][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:50:32,187][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:50:32,507][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:50:32,828][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:50:33,149][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:50:33,469][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:50:33,789][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:50:34,108][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:50:34,428][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:50:34,747][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:50:35,067][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:50:35,388][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:50:35,707][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:50:36,027][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:50:36,348][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:50:36,670][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:50:36,991][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:50:37,310][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:50:37,630][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:50:37,950][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:50:38,271][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:50:38,591][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:50:38,912][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:50:39,232][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:50:39,551][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:50:39,872][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:50:40,485][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:50:40,805][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:50:41,126][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:50:41,446][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:50:41,766][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:50:42,087][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:50:42,408][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:50:42,728][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:50:43,049][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:50:43,369][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:50:43,690][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:50:44,012][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:50:44,332][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:50:44,989][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:50:45,722][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:50:45,724][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:50:45,726][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:50:46,398][__main__][INFO] - Iteration 287 took 27s (12.14% Gen, 85.40% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 18m 41s. Estimated total time: 7h 35m 59s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 59s. +[2026-03-25 17:50:46,400][__main__][INFO] - Starting iteration 287. 
+[2026-03-25 17:50:46,403][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:50:46,404][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:50:49,714][__main__][INFO] - Number of regex retries in iteration 287: 0 +[2026-03-25 17:50:49,714][__main__][INFO] - agents played in iteration 287 are Bob, Alice +[2026-03-25 17:50:50,271][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:50:50,927][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:50:51,217][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:50:51,539][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:50:51,861][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:50:52,180][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:50:52,500][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:50:52,821][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:50:53,142][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:50:53,462][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:50:53,784][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:50:54,105][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:50:54,425][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:50:54,746][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:50:55,066][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:50:55,386][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:50:55,707][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:50:56,027][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:50:56,347][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:50:56,668][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:50:56,989][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:50:57,308][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:50:57,628][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:50:57,949][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:50:58,270][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:50:58,590][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:50:58,911][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:50:59,231][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:50:59,552][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:50:59,873][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:51:00,193][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:51:00,513][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:51:00,832][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:51:01,153][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:51:01,473][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:51:01,793][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:51:02,114][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:51:02,433][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:51:02,754][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:51:03,074][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:51:03,394][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:51:03,714][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:51:04,034][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:51:04,354][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:51:04,675][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:51:04,994][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:51:05,313][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:51:05,635][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:51:05,956][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:51:06,277][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:51:06,598][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:51:06,918][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:51:07,239][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:51:07,854][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:51:08,175][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:51:08,494][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:51:08,815][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:51:09,135][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:51:09,456][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:51:09,775][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:51:10,095][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:51:10,415][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:51:10,735][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:51:11,056][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:51:11,375][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:51:11,695][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:51:12,351][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:51:13,078][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:51:13,080][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:51:13,082][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:51:13,706][__main__][INFO] - Iteration 288 took 27s (12.12% Gen, 85.58% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 17m 17s. Estimated total time: 7h 35m 3s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 31s. +[2026-03-25 17:51:13,708][__main__][INFO] - Starting iteration 288. +[2026-03-25 17:51:13,711][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:51:13,712][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:51:16,965][__main__][INFO] - Number of regex retries in iteration 288: 0 +[2026-03-25 17:51:16,966][__main__][INFO] - agents played in iteration 288 are Bob, Alice +[2026-03-25 17:51:17,511][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:51:18,158][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:51:18,448][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:51:18,769][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:51:19,088][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:51:19,407][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:51:19,726][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:51:20,047][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:51:20,369][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:51:20,690][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:51:21,010][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:51:21,332][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:51:21,651][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:51:21,972][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:51:22,291][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:51:22,612][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:51:22,933][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:51:23,253][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:51:23,575][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:51:23,894][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:51:24,215][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:51:24,535][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:51:24,855][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:51:25,175][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:51:25,495][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:51:25,815][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:51:26,136][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:51:26,455][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:51:26,774][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:51:27,094][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:51:27,414][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:51:27,733][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:51:28,052][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:51:28,372][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:51:28,692][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:51:29,011][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:51:29,332][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:51:29,652][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:51:29,972][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:51:30,293][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:51:30,613][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:51:30,934][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:51:31,255][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:51:31,576][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:51:31,896][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:51:32,217][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:51:32,538][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:51:32,859][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:51:33,180][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:51:33,500][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:51:33,821][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:51:34,140][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:51:34,461][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:51:35,073][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:51:35,394][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:51:35,716][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:51:36,038][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:51:36,359][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:51:36,680][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:51:37,000][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:51:37,322][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:51:37,642][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:51:37,964][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:51:38,286][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:51:38,607][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:51:38,928][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:51:39,585][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:51:40,317][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:51:40,319][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:51:40,320][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:51:40,944][__main__][INFO] - Iteration 289 took 27s (11.95% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 15m 40s. Estimated total time: 7h 33m 54s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 57s. +[2026-03-25 17:51:40,947][__main__][INFO] - Starting iteration 289. +[2026-03-25 17:51:40,949][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:51:40,950][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:51:44,202][__main__][INFO] - Number of regex retries in iteration 289: 0 +[2026-03-25 17:51:44,202][__main__][INFO] - agents played in iteration 289 are Bob, Alice +[2026-03-25 17:51:44,754][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:51:45,412][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:51:45,703][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:51:46,025][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:51:46,346][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:51:46,665][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:51:46,986][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:51:47,306][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:51:47,625][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:51:47,946][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:51:48,267][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:51:48,588][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:51:48,907][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:51:49,228][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:51:49,548][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:51:49,870][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:51:50,191][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:51:50,513][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:51:50,833][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:51:51,153][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:51:51,473][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:51:51,794][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:51:52,115][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:51:52,436][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:51:52,758][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:51:53,079][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:51:53,399][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:51:53,721][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:51:54,042][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:51:54,362][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:51:54,683][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:51:55,002][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:51:55,324][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:51:55,645][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:51:55,966][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:51:56,286][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:51:56,606][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:51:56,926][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:51:57,247][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:51:57,568][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:51:57,889][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:51:58,209][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:51:58,529][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:51:58,848][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:51:59,168][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:51:59,488][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:51:59,807][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:52:00,130][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:52:00,451][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:52:00,772][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:52:01,093][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:52:01,413][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:52:01,733][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:52:02,348][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:52:02,669][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:52:02,989][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:52:03,308][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:52:03,627][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:52:03,948][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:52:04,268][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:52:04,587][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:52:04,909][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:52:05,229][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:52:05,549][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:52:05,870][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:52:06,190][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:52:06,851][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:52:07,589][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:52:07,591][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:52:07,593][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:52:08,220][__main__][INFO] - Iteration 290 took 27s (11.93% Gen, 85.77% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 15m 51s. Estimated total time: 7h 34m 31s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 15s. +[2026-03-25 17:52:08,223][__main__][INFO] - Starting iteration 290. +[2026-03-25 17:52:08,226][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:52:08,226][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:52:11,470][__main__][INFO] - Number of regex retries in iteration 290: 0 +[2026-03-25 17:52:11,471][__main__][INFO] - agents played in iteration 290 are Bob, Alice +[2026-03-25 17:52:12,012][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:52:12,669][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:52:12,963][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:52:13,285][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:52:13,604][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:52:13,924][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:52:14,244][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:52:14,564][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:52:14,885][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:52:15,204][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:52:15,526][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:52:15,845][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:52:16,166][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:52:16,487][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:52:16,810][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:52:17,131][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:52:17,451][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:52:17,773][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:52:18,094][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:52:18,414][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:52:18,734][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:52:19,053][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:52:19,373][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:52:19,693][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:52:20,013][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:52:20,335][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:52:20,655][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:52:20,975][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:52:21,295][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:52:21,616][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:52:21,937][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:52:22,259][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:52:22,580][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:52:22,900][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:52:23,221][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:52:23,541][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:52:23,863][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:52:24,184][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:52:24,505][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:52:24,825][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:52:25,146][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:52:25,467][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:52:25,788][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:52:26,110][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:52:26,430][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:52:26,753][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:52:27,074][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:52:27,395][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:52:27,716][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:52:28,038][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:52:28,360][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:52:28,681][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:52:29,004][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:52:29,622][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:52:29,942][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:52:30,263][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:52:30,583][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:52:30,904][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:52:31,226][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:52:31,547][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:52:31,866][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:52:32,186][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:52:32,505][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:52:32,826][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:52:33,146][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:52:33,468][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:52:34,132][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:52:34,876][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:52:34,878][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:52:34,879][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:52:35,508][__main__][INFO] - Iteration 291 took 27s (11.89% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 15m 35s. Estimated total time: 7h 34m 43s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 21s. +[2026-03-25 17:52:35,510][__main__][INFO] - Starting iteration 291. 
+[2026-03-25 17:52:35,513][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:52:35,514][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:52:38,771][__main__][INFO] - Number of regex retries in iteration 291: 0 +[2026-03-25 17:52:38,772][__main__][INFO] - agents played in iteration 291 are Bob, Alice +[2026-03-25 17:52:39,318][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:52:39,978][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:52:40,269][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:52:40,590][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:52:40,910][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:52:41,229][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:52:41,548][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:52:41,869][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:52:42,189][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:52:42,508][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:52:42,827][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:52:43,147][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:52:43,469][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:52:43,789][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:52:44,108][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:52:44,428][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:52:44,749][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:52:45,069][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:52:45,388][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:52:45,709][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:52:46,028][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:52:46,349][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:52:46,669][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:52:46,988][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:52:47,308][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:52:47,627][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:52:47,949][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:52:48,269][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:52:48,590][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:52:48,909][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:52:49,228][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:52:49,549][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:52:49,868][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:52:50,189][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:52:50,510][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:52:50,832][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:52:51,153][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:52:51,473][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:52:51,794][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:52:52,114][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:52:52,435][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:52:52,755][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:52:53,074][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:52:53,394][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:52:53,714][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:52:54,034][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:52:54,354][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:52:54,673][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:52:54,994][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:52:55,315][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:52:55,635][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:52:55,957][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:52:56,279][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:52:56,895][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:52:57,216][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:52:57,537][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:52:57,857][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:52:58,179][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:52:58,498][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:52:58,820][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:52:59,140][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:52:59,461][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:52:59,783][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:53:00,103][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:53:00,423][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:53:00,743][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:53:01,404][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:53:02,140][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:53:02,143][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:53:02,144][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:53:02,822][__main__][INFO] - Iteration 292 took 27s (11.93% Gen, 85.58% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 15m 34s. Estimated total time: 7h 35m 9s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 34s. +[2026-03-25 17:53:02,824][__main__][INFO] - Starting iteration 292. +[2026-03-25 17:53:02,827][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:53:02,827][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:53:06,080][__main__][INFO] - Number of regex retries in iteration 292: 0 +[2026-03-25 17:53:06,080][__main__][INFO] - agents played in iteration 292 are Bob, Alice +[2026-03-25 17:53:06,624][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:53:07,280][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:53:07,572][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:53:07,892][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:53:08,212][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:53:08,533][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:53:08,854][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:53:09,174][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:53:09,494][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:53:09,814][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:53:10,135][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:53:10,455][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:53:10,775][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:53:11,095][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:53:11,414][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:53:11,734][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:53:12,054][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:53:12,375][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:53:12,694][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:53:13,013][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:53:13,335][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:53:13,657][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:53:13,978][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:53:14,298][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:53:14,619][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:53:14,939][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:53:15,261][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:53:15,581][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:53:15,901][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:53:16,222][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:53:16,542][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:53:16,862][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:53:17,183][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:53:17,504][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:53:17,824][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:53:18,143][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:53:18,464][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:53:18,785][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:53:19,106][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:53:19,426][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:53:19,747][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:53:20,067][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:53:20,389][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:53:20,710][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:53:21,032][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:53:21,353][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:53:21,676][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:53:21,997][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:53:22,318][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:53:22,639][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:53:22,961][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:53:23,282][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:53:23,602][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:53:24,220][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:53:24,541][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:53:24,861][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:53:25,183][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:53:25,504][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:53:25,824][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:53:26,144][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:53:26,465][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:53:26,786][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:53:27,107][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:53:27,426][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:53:27,745][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:53:28,066][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:53:28,728][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:53:29,460][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:53:29,462][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:53:29,464][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:53:30,094][__main__][INFO] - Iteration 293 took 27s (11.93% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 14m 25s. Estimated total time: 7h 34m 27s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 13s. +[2026-03-25 17:53:30,097][__main__][INFO] - Starting iteration 293. +[2026-03-25 17:53:30,100][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:53:30,101][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:53:33,359][__main__][INFO] - Number of regex retries in iteration 293: 0 +[2026-03-25 17:53:33,359][__main__][INFO] - agents played in iteration 293 are Bob, Alice +[2026-03-25 17:53:33,910][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:53:34,569][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:53:34,860][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:53:35,182][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:53:35,503][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:53:35,824][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:53:36,145][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:53:36,465][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:53:36,785][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:53:37,106][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:53:37,427][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:53:37,747][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:53:38,068][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:53:38,388][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:53:38,708][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:53:39,027][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:53:39,348][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:53:39,667][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:53:39,986][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:53:40,306][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:53:40,626][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:53:40,946][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:53:41,266][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:53:41,587][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:53:41,906][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:53:42,225][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:53:42,546][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:53:42,867][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:53:43,188][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:53:43,507][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:53:43,828][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:53:44,148][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:53:44,468][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:53:44,788][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:53:45,107][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:53:45,427][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:53:45,747][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:53:46,067][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:53:46,385][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:53:46,706][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:53:47,026][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:53:47,347][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:53:47,668][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:53:47,987][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:53:48,308][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:53:48,629][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:53:48,950][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:53:49,270][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:53:49,591][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:53:49,910][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:53:50,231][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:53:50,551][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:53:50,872][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:53:51,511][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:53:51,830][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:53:52,149][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:53:52,470][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:53:52,791][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:53:53,111][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:53:53,431][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:53:53,751][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:53:54,072][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:53:54,392][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:53:54,711][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:53:55,032][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:53:55,352][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:53:56,015][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:53:56,764][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:53:56,766][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:53:56,768][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:53:57,416][__main__][INFO] - Iteration 294 took 27s (11.93% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 14m 47s. Estimated total time: 7h 35m 16s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 38s. +[2026-03-25 17:53:57,421][__main__][INFO] - Starting iteration 294. +[2026-03-25 17:53:57,427][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:53:57,428][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:54:00,815][__main__][INFO] - Number of regex retries in iteration 294: 0 +[2026-03-25 17:54:00,815][__main__][INFO] - agents played in iteration 294 are Bob, Alice +[2026-03-25 17:54:01,359][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:54:02,018][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:54:02,309][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:54:02,631][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:54:02,951][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:54:03,272][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:54:03,592][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:54:03,913][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:54:04,234][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:54:04,555][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:54:04,874][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:54:05,194][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:54:05,514][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:54:05,834][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:54:06,155][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:54:06,476][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:54:06,798][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:54:07,119][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:54:07,440][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:54:07,762][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:54:08,084][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:54:08,405][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:54:08,725][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:54:09,045][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:54:09,366][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:54:09,688][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:54:10,009][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:54:10,329][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:54:10,650][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:54:10,973][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:54:11,295][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:54:11,617][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:54:11,939][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:54:12,259][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:54:12,578][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:54:12,899][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:54:13,218][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:54:13,540][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:54:13,860][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:54:14,182][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:54:14,505][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:54:14,825][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:54:15,145][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:54:15,466][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:54:15,788][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:54:16,108][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:54:16,428][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:54:16,748][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:54:17,067][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:54:17,387][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:54:17,706][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:54:18,026][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:54:18,346][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:54:18,963][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:54:19,283][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:54:19,604][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:54:19,924][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:54:20,247][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:54:20,569][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:54:20,888][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:54:21,208][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:54:21,529][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:54:21,850][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:54:22,171][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:54:22,491][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:54:22,813][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:54:23,475][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:54:24,221][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:54:24,223][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:54:24,225][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:54:24,904][__main__][INFO] - Iteration 295 took 27s (12.33% Gen, 85.19% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 16m 59s. Estimated total time: 7h 37m 57s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 47s, 500 more iterations: 3h 48m 58s. +[2026-03-25 17:54:24,906][__main__][INFO] - Starting iteration 295. 
+[2026-03-25 17:54:24,909][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:54:24,909][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:54:28,176][__main__][INFO] - Number of regex retries in iteration 295: 0 +[2026-03-25 17:54:28,176][__main__][INFO] - agents played in iteration 295 are Bob, Alice +[2026-03-25 17:54:28,718][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:54:29,375][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:54:29,665][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:54:29,986][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:54:30,305][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:54:30,626][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:54:30,945][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:54:31,267][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:54:31,588][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:54:31,909][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:54:32,230][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:54:32,551][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:54:32,871][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:54:33,192][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:54:33,511][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:54:33,832][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:54:34,153][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:54:34,473][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:54:34,794][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:54:35,114][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:54:35,435][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:54:35,756][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:54:36,078][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:54:36,399][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:54:36,721][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:54:37,041][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:54:37,361][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:54:37,681][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:54:38,001][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:54:38,321][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:54:38,642][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:54:38,964][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:54:39,285][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:54:39,606][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:54:39,927][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:54:40,247][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:54:40,567][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:54:40,887][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:54:41,208][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:54:41,528][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:54:41,847][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:54:42,168][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:54:42,487][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:54:42,807][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:54:43,128][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:54:43,449][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:54:43,770][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:54:44,090][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:54:44,412][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:54:44,733][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:54:45,054][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:54:45,375][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:54:45,694][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:54:46,311][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:54:46,632][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:54:46,952][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:54:47,273][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:54:47,592][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:54:47,911][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:54:48,233][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:54:48,553][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:54:48,874][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:54:49,194][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:54:49,514][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:54:49,835][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:54:50,154][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:54:50,816][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:54:51,554][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:54:51,556][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:54:51,558][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:54:52,188][__main__][INFO] - Iteration 296 took 27s (11.98% Gen, 85.71% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 13m 15s. Estimated total time: 7h 34m 40s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 20s. +[2026-03-25 17:54:52,190][__main__][INFO] - Starting iteration 296. +[2026-03-25 17:54:52,193][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:54:52,194][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:54:55,460][__main__][INFO] - Number of regex retries in iteration 296: 0 +[2026-03-25 17:54:55,461][__main__][INFO] - agents played in iteration 296 are Bob, Alice +[2026-03-25 17:54:56,002][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:54:56,659][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:54:56,949][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:54:57,270][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:54:57,591][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:54:57,911][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:54:58,231][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:54:58,551][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:54:58,871][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:54:59,191][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:54:59,510][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:54:59,830][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:55:00,151][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:55:00,471][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:55:00,792][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:55:01,113][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:55:01,435][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:55:01,757][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:55:02,079][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:55:02,401][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:55:02,722][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:55:03,042][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:55:03,363][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:55:03,683][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:55:04,002][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:55:04,323][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:55:04,644][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:55:04,965][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:55:05,286][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:55:05,605][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:55:05,926][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:55:06,247][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:55:06,567][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:55:06,888][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:55:07,207][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:55:07,527][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:55:07,846][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:55:08,167][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:55:08,487][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:55:08,806][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:55:09,126][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:55:09,447][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:55:09,766][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:55:10,086][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:55:10,406][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:55:10,727][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:55:11,047][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:55:11,369][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:55:11,692][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:55:12,013][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:55:12,333][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:55:12,655][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:55:12,976][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:55:13,592][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:55:13,913][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:55:14,233][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:55:14,553][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:55:14,873][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:55:15,194][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:55:15,515][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:55:15,834][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:55:16,155][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:55:16,475][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:55:16,795][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:55:17,115][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:55:17,434][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:55:18,096][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:55:18,852][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:55:18,855][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:55:18,856][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:55:19,476][__main__][INFO] - Iteration 297 took 27s (11.97% Gen, 85.75% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 12m 52s. Estimated total time: 7h 34m 44s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 22s. +[2026-03-25 17:55:19,479][__main__][INFO] - Starting iteration 297. +[2026-03-25 17:55:19,482][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:55:19,483][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:55:22,745][__main__][INFO] - Number of regex retries in iteration 297: 0 +[2026-03-25 17:55:22,746][__main__][INFO] - agents played in iteration 297 are Bob, Alice +[2026-03-25 17:55:23,295][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:55:23,950][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:55:24,243][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:55:24,565][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:55:24,884][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:55:25,205][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:55:25,525][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:55:25,844][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:55:26,165][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:55:26,485][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:55:26,804][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:55:27,126][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:55:27,445][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:55:27,766][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:55:28,086][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:55:28,407][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:55:28,727][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:55:29,047][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:55:29,368][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:55:29,687][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:55:30,007][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:55:30,326][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:55:30,647][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:55:30,967][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:55:31,288][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:55:31,607][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:55:31,927][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:55:32,246][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:55:32,566][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:55:32,887][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:55:33,207][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:55:33,526][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:55:33,846][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:55:34,165][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:55:34,485][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:55:34,806][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:55:35,126][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:55:35,448][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:55:35,769][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:55:36,089][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:55:36,408][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:55:36,729][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:55:37,049][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:55:37,368][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:55:37,687][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:55:38,007][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:55:38,327][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:55:38,647][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:55:38,967][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:55:39,288][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:55:39,609][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:55:39,930][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:55:40,249][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:55:40,859][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:55:41,181][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:55:41,501][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:55:41,821][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:55:42,141][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:55:42,461][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:55:42,781][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:55:43,102][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:55:43,421][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:55:43,740][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:55:44,062][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:55:44,381][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:55:44,702][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:55:45,353][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:55:46,090][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:55:46,092][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:55:46,094][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:55:46,768][__main__][INFO] - Iteration 298 took 27s (11.96% Gen, 85.56% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 12m 27s. Estimated total time: 7h 34m 46s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 23s. +[2026-03-25 17:55:46,770][__main__][INFO] - Starting iteration 298. +[2026-03-25 17:55:46,773][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. 
+[2026-03-25 17:55:46,774][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:55:50,038][__main__][INFO] - Number of regex retries in iteration 298: 0 +[2026-03-25 17:55:50,039][__main__][INFO] - agents played in iteration 298 are Bob, Alice +[2026-03-25 17:55:50,587][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:55:51,244][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:55:51,535][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:55:51,856][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:55:52,175][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:55:52,494][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:55:52,813][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:55:53,133][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:55:53,452][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:55:53,772][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:55:54,091][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:55:54,412][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:55:54,731][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:55:55,051][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:55:55,372][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:55:55,693][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:55:56,014][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:55:56,336][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:55:56,655][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:55:56,975][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:55:57,295][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:55:57,615][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:55:57,935][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:55:58,255][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:55:58,575][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:55:58,897][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:55:59,218][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:55:59,539][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:55:59,861][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:56:00,184][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:56:00,506][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:56:00,827][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:56:01,149][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:56:01,469][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:56:01,789][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:56:02,110][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:56:02,429][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:56:02,750][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:56:03,070][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:56:03,389][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:56:03,709][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:56:04,029][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:56:04,350][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:56:04,671][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:56:04,990][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:56:05,311][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:56:05,631][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:56:05,952][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:56:06,272][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:56:06,592][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:56:06,912][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:56:07,233][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:56:07,553][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:56:08,165][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:56:08,486][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:56:08,806][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:56:09,126][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:56:09,447][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:56:09,767][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:56:10,087][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:56:10,407][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:56:10,727][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:56:11,046][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:56:11,366][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:56:11,688][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:56:12,007][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:56:12,660][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:56:13,388][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:56:13,391][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:56:13,392][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:56:14,020][__main__][INFO] - Iteration 299 took 27s (11.98% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 11m 21s. Estimated total time: 7h 34m 8s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 4s. +[2026-03-25 17:56:14,023][__main__][INFO] - Starting iteration 299. 
+[2026-03-25 17:56:14,025][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:56:14,026][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:56:17,326][__main__][INFO] - Number of regex retries in iteration 299: 0 +[2026-03-25 17:56:17,326][__main__][INFO] - agents played in iteration 299 are Bob, Alice +[2026-03-25 17:56:17,867][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:56:18,515][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:56:18,805][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:56:19,126][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:56:19,446][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:56:19,766][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:56:20,087][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:56:20,407][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:56:20,727][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:56:21,046][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:56:21,368][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:56:21,688][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:56:22,008][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:56:22,328][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:56:22,648][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:56:22,968][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:56:23,287][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:56:23,607][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:56:23,927][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:56:24,248][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:56:24,569][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:56:24,889][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:56:25,208][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:56:25,528][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:56:25,848][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:56:26,167][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:56:26,488][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:56:26,807][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:56:27,127][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:56:27,448][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:56:27,768][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:56:28,088][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:56:28,408][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:56:28,728][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:56:29,048][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:56:29,368][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:56:29,689][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:56:30,008][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:56:30,329][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:56:30,649][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:56:30,969][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:56:31,289][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:56:31,610][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:56:31,930][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:56:32,252][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:56:32,573][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:56:32,893][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:56:33,214][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:56:33,535][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:56:33,855][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:56:34,176][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:56:34,495][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:56:34,816][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:56:35,433][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:56:35,754][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:56:36,074][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:56:36,393][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:56:36,714][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:56:37,034][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:56:37,357][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:56:37,678][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:56:37,999][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:56:38,321][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:56:38,642][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:56:38,962][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:56:39,284][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:56:39,936][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:56:40,674][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:56:40,676][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:56:40,678][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:56:41,303][__main__][INFO] - Iteration 300 took 27s (12.10% Gen, 85.60% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 11m 25s. Estimated total time: 7h 34m 38s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 19s. +[2026-03-25 17:56:41,306][__main__][INFO] - Starting iteration 300. +[2026-03-25 17:56:41,310][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 5 and human policies 1. +[2026-03-25 17:56:41,310][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:56:44,476][__main__][INFO] - Number of regex retries in iteration 300: 0 +[2026-03-25 17:56:44,477][__main__][INFO] - agents played in iteration 300 are Bob, Alice +[2026-03-25 17:56:45,025][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:56:45,672][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:56:45,962][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:56:46,285][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:56:46,605][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:56:46,925][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:56:47,245][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:56:47,566][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:56:47,886][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:56:48,208][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:56:48,529][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:56:48,850][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:56:49,172][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:56:49,493][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:56:49,815][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:56:50,137][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:56:50,460][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:56:50,780][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:56:51,100][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:56:51,421][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:56:51,742][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:56:52,063][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:56:52,383][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:56:52,703][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:56:53,022][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:56:53,343][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:56:53,665][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:56:53,986][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:56:54,304][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:56:54,624][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:56:54,945][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:56:55,266][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:56:55,587][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:56:55,906][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:56:56,227][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:56:56,547][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:56:56,867][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:56:57,187][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:56:57,506][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:56:57,826][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:56:58,145][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:56:58,465][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:56:58,786][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:56:59,107][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:56:59,429][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:56:59,750][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:57:00,072][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:57:00,393][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:57:00,712][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:57:01,032][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:57:01,355][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:57:01,675][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:57:01,995][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:57:02,608][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:57:02,929][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:57:03,248][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:57:03,568][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:57:03,888][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:57:04,207][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:57:04,528][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:57:04,849][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:57:05,170][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:57:05,491][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:57:05,811][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:57:06,132][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:57:06,453][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:57:07,104][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:57:07,841][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:57:07,843][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:57:07,845][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:57:09,055][__main__][INFO] - Iteration 301 took 27s (11.41% Gen, 84.22% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 18m 44s. Estimated total time: 7h 42m 25s. Time estimates for 10 more iterations: 4m 37s, 100 more iterations: 46m 14s, 500 more iterations: 3h 51m 12s. +[2026-03-25 17:57:09,057][__main__][INFO] - Starting iteration 301. +[2026-03-25 17:57:09,060][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 17:57:09,060][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:57:12,284][__main__][INFO] - Number of regex retries in iteration 301: 0 +[2026-03-25 17:57:12,285][__main__][INFO] - agents played in iteration 301 are Bob, Alice +[2026-03-25 17:57:12,830][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:57:13,480][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:57:13,770][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:57:14,093][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:57:14,412][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:57:14,732][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:57:15,053][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:57:15,374][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:57:15,692][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:57:16,014][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:57:16,334][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:57:16,655][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:57:16,974][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:57:17,296][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:57:17,616][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:57:17,936][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:57:18,256][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:57:18,576][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:57:18,896][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:57:19,217][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:57:19,537][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:57:19,858][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:57:20,181][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:57:20,503][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:57:20,824][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:57:21,144][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:57:21,464][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:57:21,785][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:57:22,105][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:57:22,426][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:57:22,746][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:57:23,066][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:57:23,386][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:57:23,706][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:57:24,025][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:57:24,346][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:57:24,664][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:57:24,985][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:57:25,305][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:57:25,626][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:57:25,946][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:57:26,266][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:57:26,587][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:57:26,908][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:57:27,228][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:57:27,547][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:57:27,869][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:57:28,188][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:57:28,507][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:57:28,827][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:57:29,147][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:57:29,469][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:57:29,789][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:57:30,408][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:57:30,727][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:57:31,048][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:57:31,370][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:57:31,691][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:57:32,011][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:57:32,332][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:57:32,653][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:57:32,974][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:57:33,293][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:57:33,613][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:57:33,933][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:57:34,253][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:57:34,915][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:57:35,697][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:57:35,700][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:57:35,701][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:57:36,462][__main__][INFO] - Iteration 302 took 27s (11.77% Gen, 85.45% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 12m 34s. Estimated total time: 7h 36m 43s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 21s. +[2026-03-25 17:57:36,464][__main__][INFO] - Starting iteration 302. +[2026-03-25 17:57:36,467][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 17:57:36,468][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:57:39,705][__main__][INFO] - Number of regex retries in iteration 302: 0 +[2026-03-25 17:57:39,706][__main__][INFO] - agents played in iteration 302 are Bob, Alice +[2026-03-25 17:57:40,258][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:57:40,909][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:57:41,200][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:57:41,521][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:57:41,842][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:57:42,163][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:57:42,484][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:57:42,805][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:57:43,125][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:57:43,446][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:57:43,766][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:57:44,087][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:57:44,408][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:57:44,728][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:57:45,048][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:57:45,368][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:57:45,688][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:57:46,009][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:57:46,329][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:57:46,649][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:57:46,968][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:57:47,288][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:57:47,607][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:57:47,929][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:57:48,250][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:57:48,570][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:57:48,891][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:57:49,212][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:57:49,531][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:57:49,852][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:57:50,172][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:57:50,494][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:57:50,816][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:57:51,135][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:57:51,455][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:57:51,775][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:57:52,095][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:57:52,415][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:57:52,736][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:57:53,055][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:57:53,375][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:57:53,695][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:57:54,014][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:57:54,334][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:57:54,655][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:57:54,977][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:57:55,298][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:57:55,618][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:57:55,939][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:57:56,260][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:57:56,581][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:57:56,903][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:57:57,223][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:57:57,834][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:57:58,154][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:57:58,474][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:57:58,794][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:57:59,114][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:57:59,434][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:57:59,755][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:58:00,077][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:58:00,397][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:58:00,719][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:58:01,040][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:58:01,361][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:58:01,682][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:58:02,335][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:58:03,068][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:58:03,070][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:58:03,072][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:58:03,738][__main__][INFO] - Iteration 303 took 27s (11.87% Gen, 85.68% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 9m 55s. Estimated total time: 7h 34m 31s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 15s. +[2026-03-25 17:58:03,740][__main__][INFO] - Starting iteration 303. 
+[2026-03-25 17:58:03,743][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 17:58:03,744][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:58:06,942][__main__][INFO] - Number of regex retries in iteration 303: 0 +[2026-03-25 17:58:06,943][__main__][INFO] - agents played in iteration 303 are Bob, Alice +[2026-03-25 17:58:07,493][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:58:08,142][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:58:08,433][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:58:08,755][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:58:09,075][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:58:09,395][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:58:09,715][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:58:10,036][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:58:10,355][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:58:10,676][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:58:10,995][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:58:11,315][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:58:11,636][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:58:11,956][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:58:12,278][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:58:12,600][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
17:58:12,920][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:58:13,242][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:58:13,563][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:58:13,882][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:58:14,201][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:58:14,522][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:58:14,843][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:58:15,162][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:58:15,482][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:58:15,803][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:58:16,125][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:58:16,446][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:58:16,767][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:58:17,088][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:58:17,408][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:58:17,728][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:58:18,048][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:58:18,369][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:58:18,689][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:58:19,008][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:58:19,329][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 17:58:19,649][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:58:19,970][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:58:20,291][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:58:20,611][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:58:20,932][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:58:21,251][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:58:21,572][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:58:21,893][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:58:22,213][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:58:22,534][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:58:22,854][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:58:23,175][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:58:23,494][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:58:23,815][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:58:24,136][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:58:24,455][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:58:25,066][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:58:25,388][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:58:25,708][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:58:26,029][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 17:58:26,350][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:58:26,672][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:58:26,993][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:58:27,314][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:58:27,635][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:58:27,956][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:58:28,278][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:58:28,599][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:58:28,919][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:58:29,574][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:58:30,306][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:58:30,309][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:58:30,310][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:58:30,983][__main__][INFO] - Iteration 304 took 27s (11.74% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 8m 57s. Estimated total time: 7h 34m 0s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 0s. +[2026-03-25 17:58:30,985][__main__][INFO] - Starting iteration 304. +[2026-03-25 17:58:30,988][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 17:58:30,989][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:58:34,208][__main__][INFO] - Number of regex retries in iteration 304: 0 +[2026-03-25 17:58:34,209][__main__][INFO] - agents played in iteration 304 are Bob, Alice +[2026-03-25 17:58:34,757][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:58:35,411][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:58:35,702][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:58:36,023][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:58:36,343][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:58:36,663][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:58:36,982][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:58:37,302][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:58:37,623][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:58:37,944][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:58:38,265][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:58:38,586][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:58:38,907][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:58:39,228][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
17:58:39,549][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:58:39,870][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:58:40,191][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:58:40,511][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:58:40,832][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:58:41,153][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:58:41,474][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:58:41,794][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:58:42,115][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:58:42,435][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:58:42,756][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:58:43,075][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:58:43,394][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:58:43,714][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:58:44,034][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:58:44,355][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:58:44,675][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:58:44,994][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:58:45,313][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:58:45,635][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:58:45,955][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 17:58:46,274][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:58:46,595][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:58:46,915][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:58:47,236][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:58:47,557][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:58:47,879][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:58:48,198][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:58:48,520][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:58:48,841][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:58:49,161][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:58:49,481][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:58:49,801][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:58:50,122][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:58:50,442][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:58:50,763][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:58:51,082][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:58:51,403][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:58:51,721][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:58:52,334][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:58:52,654][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 17:58:52,976][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:58:53,295][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:58:53,614][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:58:53,933][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:58:54,253][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:58:54,573][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:58:54,892][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:58:55,212][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:58:55,531][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:58:55,851][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:58:56,172][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 17:58:56,825][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:58:57,553][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:58:57,555][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:58:57,557][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:58:58,182][__main__][INFO] - Iteration 305 took 27s (11.84% Gen, 85.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 7m 44s. Estimated total time: 7h 33m 15s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 17:58:58,186][__main__][INFO] - Starting iteration 305. +[2026-03-25 17:58:58,189][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 17:58:58,189][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:59:01,405][__main__][INFO] - Number of regex retries in iteration 305: 0 +[2026-03-25 17:59:01,405][__main__][INFO] - agents played in iteration 305 are Bob, Alice +[2026-03-25 17:59:01,942][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 17:59:02,592][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:59:02,884][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:59:03,205][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:59:03,525][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:59:03,846][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:59:04,164][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:59:04,483][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:59:04,804][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:59:05,125][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:59:05,446][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:59:05,767][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:59:06,086][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:59:06,406][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:59:06,728][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:59:07,048][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:59:07,368][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:59:07,688][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 17:59:08,007][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:59:08,327][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:59:08,647][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:59:08,967][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 17:59:09,288][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:59:09,610][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:59:09,930][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:59:10,251][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:59:10,571][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:59:10,890][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:59:11,211][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:59:11,531][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:59:11,850][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:59:12,169][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:59:12,488][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:59:12,807][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:59:13,127][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:59:13,448][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:59:13,768][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:59:14,089][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 17:59:14,409][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:59:14,728][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:59:15,049][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:59:15,370][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 17:59:15,692][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:59:16,012][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:59:16,333][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:59:16,655][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:59:16,977][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:59:17,297][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:59:17,618][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:59:17,940][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:59:18,261][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:59:18,582][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:59:18,902][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:59:19,514][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:59:19,833][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:59:20,154][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:59:20,473][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:59:20,793][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 17:59:21,113][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:59:21,433][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:59:21,753][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:59:22,074][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
17:59:22,393][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:59:22,714][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:59:23,035][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:59:23,354][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:59:24,010][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:59:24,746][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:59:24,748][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:59:24,750][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:59:25,375][__main__][INFO] - Iteration 306 took 27s (11.83% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 7m 9s. Estimated total time: 7h 33m 7s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 33s. +[2026-03-25 17:59:25,378][__main__][INFO] - Starting iteration 306. +[2026-03-25 17:59:25,381][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 17:59:25,381][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:59:28,589][__main__][INFO] - Number of regex retries in iteration 306: 0 +[2026-03-25 17:59:28,590][__main__][INFO] - agents played in iteration 306 are Bob, Alice +[2026-03-25 17:59:29,134][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:59:29,784][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:59:30,074][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:59:30,395][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:59:30,715][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:59:31,035][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:59:31,355][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:59:31,676][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:59:31,995][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 17:59:32,316][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 17:59:32,636][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 17:59:32,957][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 17:59:33,277][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 17:59:33,596][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 17:59:33,916][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 17:59:34,236][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 17:59:34,556][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 17:59:34,876][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 17:59:35,195][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 17:59:35,515][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 17:59:35,836][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 17:59:36,157][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 17:59:36,477][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 17:59:36,797][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 17:59:37,118][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 17:59:37,439][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 17:59:37,760][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 17:59:38,082][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 17:59:38,402][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 17:59:38,723][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 17:59:39,043][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 17:59:39,362][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 17:59:39,683][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 17:59:40,003][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 17:59:40,322][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 17:59:40,641][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 17:59:40,962][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 17:59:41,283][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
17:59:41,603][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 17:59:41,924][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 17:59:42,243][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 17:59:42,563][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 17:59:42,882][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 17:59:43,201][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 17:59:43,521][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 17:59:43,842][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 17:59:44,163][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 17:59:44,483][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 17:59:44,805][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 17:59:45,124][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 17:59:45,445][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 17:59:45,765][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 17:59:46,086][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 17:59:46,697][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 17:59:47,017][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 17:59:47,337][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 17:59:47,658][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 17:59:47,980][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
17:59:48,300][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 17:59:48,619][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 17:59:48,940][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 17:59:49,260][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 17:59:49,581][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 17:59:49,900][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 17:59:50,222][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 17:59:50,543][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 17:59:51,766][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 17:59:52,498][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 17:59:52,500][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 17:59:52,502][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 17:59:53,127][__main__][INFO] - Iteration 307 took 27s (11.56% Gen, 86.17% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 16m 2s. Estimated total time: 7h 42m 27s. Time estimates for 10 more iterations: 4m 37s, 100 more iterations: 46m 14s, 500 more iterations: 3h 51m 13s. +[2026-03-25 17:59:53,130][__main__][INFO] - Starting iteration 307. 
+[2026-03-25 17:59:53,133][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 17:59:53,133][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 17:59:56,333][__main__][INFO] - Number of regex retries in iteration 307: 0 +[2026-03-25 17:59:56,334][__main__][INFO] - agents played in iteration 307 are Bob, Alice +[2026-03-25 17:59:56,871][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 17:59:57,525][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 17:59:57,817][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 17:59:58,138][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 17:59:58,461][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 17:59:58,779][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 17:59:59,100][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 17:59:59,421][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 17:59:59,742][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:00:00,063][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:00:00,383][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:00:00,703][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:00:01,023][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:00:01,344][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:00:01,665][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:00:01,987][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:00:02,307][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:00:02,627][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:00:02,947][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:00:03,268][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:00:03,588][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:00:03,907][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:00:04,227][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:00:04,548][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:00:04,870][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:00:05,191][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:00:05,511][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:00:05,833][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:00:06,153][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:00:06,476][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:00:06,797][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:00:07,119][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:00:07,440][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:00:07,761][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:00:08,082][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:00:08,401][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:00:08,722][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:00:09,043][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:00:09,363][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:00:09,684][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:00:10,004][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:00:10,325][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:00:10,646][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:00:10,967][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:00:11,288][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:00:11,607][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:00:11,928][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:00:12,248][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:00:12,568][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:00:12,890][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:00:13,211][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:00:13,532][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:00:13,853][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:00:14,470][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:00:14,792][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:00:15,113][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:00:15,435][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:00:15,754][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:00:16,074][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:00:16,395][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:00:16,716][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:00:17,037][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:00:17,357][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:00:17,679][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:00:17,999][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:00:18,318][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:00:18,978][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:00:19,716][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:00:19,718][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:00:19,719][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:00:20,348][__main__][INFO] - Iteration 308 took 27s (11.76% Gen, 85.92% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 6m 44s. Estimated total time: 7h 33m 36s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 48s. +[2026-03-25 18:00:20,351][__main__][INFO] - Starting iteration 308. +[2026-03-25 18:00:20,353][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:00:20,354][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:00:23,570][__main__][INFO] - Number of regex retries in iteration 308: 0 +[2026-03-25 18:00:23,570][__main__][INFO] - agents played in iteration 308 are Bob, Alice +[2026-03-25 18:00:24,115][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:00:24,770][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:00:25,062][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:00:25,384][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:00:25,706][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:00:26,026][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:00:26,346][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:00:26,665][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:00:26,986][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:00:27,306][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:00:27,626][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:00:27,947][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:00:28,267][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:00:28,587][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:00:28,908][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:00:29,229][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:00:29,548][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:00:29,867][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:00:30,186][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:00:30,507][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:00:30,827][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:00:31,147][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:00:31,467][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:00:31,787][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:00:32,108][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:00:32,428][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:00:32,748][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:00:33,068][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:00:33,389][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:00:33,708][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:00:34,027][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:00:34,347][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:00:34,667][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:00:34,988][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:00:35,307][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:00:35,629][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:00:35,948][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:00:36,269][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:00:36,590][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:00:36,909][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:00:37,230][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:00:37,550][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:00:37,870][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:00:38,190][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:00:38,509][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:00:38,828][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:00:39,148][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:00:39,469][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:00:39,790][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:00:40,109][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:00:40,429][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:00:40,749][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:00:41,068][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:00:41,684][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:00:42,005][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:00:42,325][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:00:42,645][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:00:42,964][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:00:43,285][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:00:43,606][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:00:43,926][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:00:44,247][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:00:44,569][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:00:44,890][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:00:45,209][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:00:45,530][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:00:46,190][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:00:46,932][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:00:46,934][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:00:46,936][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:00:47,709][__main__][INFO] - Iteration 309 took 27s (11.76% Gen, 85.41% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 8m 36s. Estimated total time: 7h 35m 56s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 58s. +[2026-03-25 18:00:47,711][__main__][INFO] - Starting iteration 309. +[2026-03-25 18:00:47,714][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:00:47,714][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:00:50,923][__main__][INFO] - Number of regex retries in iteration 309: 0 +[2026-03-25 18:00:50,924][__main__][INFO] - agents played in iteration 309 are Bob, Alice +[2026-03-25 18:00:51,472][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:00:52,128][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:00:52,420][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:00:52,743][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:00:53,064][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:00:53,385][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:00:53,706][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:00:54,029][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:00:54,350][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:00:54,671][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:00:54,992][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:00:55,315][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:00:55,637][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:00:55,960][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:00:56,281][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:00:56,601][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:00:56,922][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:00:57,242][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:00:57,563][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:00:57,884][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:00:58,205][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:00:58,526][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:00:58,846][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:00:59,166][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:00:59,487][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:00:59,807][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:01:00,128][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:01:00,450][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:01:00,771][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:01:01,092][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:01:01,413][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:01:01,733][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:01:02,054][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:01:02,374][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:01:02,695][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:01:03,016][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:01:03,336][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:01:03,656][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:01:03,976][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:01:04,298][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:01:04,620][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:01:04,940][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:01:05,262][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:01:05,583][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:01:05,904][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:01:06,224][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:01:06,543][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:01:06,864][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:01:07,186][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:01:07,507][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:01:07,827][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:01:08,148][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:01:08,468][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:01:09,083][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:01:09,402][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:01:09,723][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:01:10,043][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:01:10,364][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:01:10,685][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:01:11,006][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:01:11,326][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:01:11,647][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:01:11,969][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:01:12,288][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:01:12,607][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:01:12,928][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:01:13,588][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:01:14,346][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:01:14,349][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:01:14,351][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:01:15,147][__main__][INFO] - Iteration 310 took 27s (11.70% Gen, 85.40% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 9m 26s. Estimated total time: 7h 37m 13s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 43s, 500 more iterations: 3h 48m 36s. +[2026-03-25 18:01:15,149][__main__][INFO] - Starting iteration 310. +[2026-03-25 18:01:15,152][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:01:15,153][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:01:18,372][__main__][INFO] - Number of regex retries in iteration 310: 0 +[2026-03-25 18:01:18,373][__main__][INFO] - agents played in iteration 310 are Bob, Alice +[2026-03-25 18:01:18,936][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:01:19,592][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:01:19,881][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:01:20,203][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:01:20,523][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:01:20,843][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:01:21,164][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:01:21,485][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:01:21,805][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:01:22,126][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:01:22,447][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:01:22,768][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:01:23,088][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:01:23,407][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:01:23,728][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:01:24,048][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:01:24,368][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:01:24,689][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:01:25,008][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:01:25,328][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:01:25,647][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:01:25,967][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:01:26,287][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:01:26,607][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:01:26,926][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:01:27,246][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:01:27,567][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:01:27,886][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:01:28,207][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:01:28,527][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:01:28,847][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:01:29,168][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:01:29,488][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:01:29,807][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:01:30,127][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:01:30,448][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:01:30,769][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:01:31,089][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:01:31,409][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:01:31,729][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:01:32,050][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:01:32,372][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:01:32,690][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:01:33,009][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:01:33,330][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:01:33,649][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:01:33,968][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:01:34,290][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:01:34,609][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:01:34,928][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:01:35,249][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:01:35,568][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:01:35,887][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:01:36,499][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:01:36,820][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:01:37,141][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:01:37,462][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:01:37,782][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:01:38,102][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:01:38,423][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:01:38,742][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:01:39,064][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:01:39,387][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:01:39,707][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:01:40,027][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:01:40,349][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:01:41,012][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:01:41,762][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:01:41,765][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:01:41,767][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:01:42,405][__main__][INFO] - Iteration 311 took 27s (11.82% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 5m 59s. Estimated total time: 7h 34m 13s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 6s. +[2026-03-25 18:01:42,407][__main__][INFO] - Starting iteration 311. 
+[2026-03-25 18:01:42,410][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:01:42,411][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:01:45,651][__main__][INFO] - Number of regex retries in iteration 311: 0 +[2026-03-25 18:01:45,652][__main__][INFO] - agents played in iteration 311 are Bob, Alice +[2026-03-25 18:01:46,191][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:01:46,845][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:01:47,135][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:01:47,455][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:01:47,776][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:01:48,098][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:01:48,418][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:01:48,738][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:01:49,059][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:01:49,380][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:01:49,700][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:01:50,023][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:01:50,345][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:01:50,668][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:01:50,989][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:01:51,309][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:01:51,630][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:01:51,951][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:01:52,272][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:01:52,592][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:01:52,913][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:01:53,234][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:01:53,554][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:01:53,874][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:01:54,194][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:01:54,514][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:01:54,833][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:01:55,154][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:01:55,475][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:01:55,796][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:01:56,116][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:01:56,436][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:01:56,757][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:01:57,078][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:01:57,398][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:01:57,719][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:01:58,039][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:01:58,361][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:01:58,682][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:01:59,001][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:01:59,321][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:01:59,642][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:01:59,962][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:02:00,282][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:02:00,602][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:02:00,924][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:02:01,246][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:02:01,565][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:02:01,885][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:02:02,207][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:02:02,525][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:02:02,846][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:02:03,167][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:02:03,783][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:02:04,102][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:02:04,422][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:02:04,742][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:02:05,063][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:02:05,384][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:02:05,705][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:02:06,026][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:02:06,346][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:02:06,666][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:02:06,987][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:02:07,308][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:02:07,628][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:02:08,288][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:02:09,014][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:02:09,016][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:02:09,018][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:02:09,799][__main__][INFO] - Iteration 312 took 27s (11.83% Gen, 85.31% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 7m 47s. Estimated total time: 7h 36m 29s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 38s, 500 more iterations: 3h 48m 14s. +[2026-03-25 18:02:09,802][__main__][INFO] - Starting iteration 312. +[2026-03-25 18:02:09,805][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:02:09,805][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:02:13,047][__main__][INFO] - Number of regex retries in iteration 312: 0 +[2026-03-25 18:02:13,048][__main__][INFO] - agents played in iteration 312 are Bob, Alice +[2026-03-25 18:02:13,601][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:02:14,257][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:02:14,549][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:02:14,871][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:02:15,190][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:02:15,509][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:02:15,829][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:02:16,148][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:02:16,469][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:02:16,788][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:02:17,107][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:02:17,428][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:02:17,747][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:02:18,068][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:02:18,389][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:02:18,709][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:02:19,029][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:02:19,350][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:02:19,669][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:02:19,990][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:02:20,312][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:02:20,632][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:02:20,952][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:02:21,273][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:02:21,593][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:02:21,913][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:02:22,234][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:02:22,555][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:02:22,875][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:02:23,194][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:02:23,515][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:02:23,835][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:02:24,155][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:02:24,476][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:02:24,796][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:02:25,116][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:02:25,436][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:02:25,757][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:02:26,076][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:02:26,396][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:02:26,716][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:02:27,034][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:02:27,355][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:02:27,675][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:02:27,995][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:02:28,315][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:02:28,635][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:02:28,956][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:02:29,277][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:02:29,597][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:02:29,917][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:02:30,237][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:02:30,557][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:02:31,172][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:02:31,493][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:02:31,814][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:02:32,133][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:02:32,453][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:02:32,774][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:02:33,093][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:02:33,413][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:02:33,733][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:02:34,054][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:02:34,374][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:02:34,694][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:02:35,014][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:02:35,665][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:02:36,388][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:02:36,390][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:02:36,392][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:02:37,164][__main__][INFO] - Iteration 313 took 27s (11.85% Gen, 85.32% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 6m 50s. Estimated total time: 7h 35m 59s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 35s, 500 more iterations: 3h 47m 59s. +[2026-03-25 18:02:37,166][__main__][INFO] - Starting iteration 313. +[2026-03-25 18:02:37,169][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:02:37,169][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:02:40,458][__main__][INFO] - Number of regex retries in iteration 313: 0 +[2026-03-25 18:02:40,458][__main__][INFO] - agents played in iteration 313 are Bob, Alice +[2026-03-25 18:02:41,035][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:02:41,695][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:02:41,986][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:02:42,308][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:02:42,630][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:02:42,950][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:02:43,270][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:02:43,589][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:02:43,908][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:02:44,230][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:02:44,550][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:02:44,871][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:02:45,193][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:02:45,514][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:02:45,835][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:02:46,156][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:02:46,477][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:02:46,796][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:02:47,116][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:02:47,436][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:02:47,757][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:02:48,078][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:02:48,398][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:02:48,720][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:02:49,039][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:02:49,361][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:02:49,682][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:02:50,002][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:02:50,323][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:02:50,643][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:02:50,965][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:02:51,286][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:02:51,606][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:02:51,928][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:02:52,248][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:02:52,567][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:02:52,888][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:02:53,208][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:02:53,527][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:02:53,847][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:02:54,169][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:02:54,490][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:02:54,810][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:02:55,129][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:02:55,449][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:02:55,769][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:02:56,089][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:02:56,407][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:02:56,728][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:02:57,048][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:02:57,369][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:02:57,689][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:02:58,009][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:02:58,621][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:02:58,941][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:02:59,262][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:02:59,582][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:02:59,902][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:03:00,223][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:03:00,542][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:03:00,863][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:03:01,183][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:03:01,503][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:03:01,824][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:03:02,144][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:03:02,466][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:03:03,714][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 18:03:04,447][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:03:04,449][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:03:04,451][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:03:05,082][__main__][INFO] - Iteration 314 took 27s (11.78% Gen, 85.95% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 15m 37s. Estimated total time: 7h 45m 14s. Time estimates for 10 more iterations: 4m 39s, 100 more iterations: 46m 31s, 500 more iterations: 3h 52m 37s. +[2026-03-25 18:03:05,084][__main__][INFO] - Starting iteration 314. +[2026-03-25 18:03:05,087][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:03:05,088][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:03:08,311][__main__][INFO] - Number of regex retries in iteration 314: 0 +[2026-03-25 18:03:08,312][__main__][INFO] - agents played in iteration 314 are Bob, Alice +[2026-03-25 18:03:08,854][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:03:09,510][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:03:09,803][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:03:10,124][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:03:10,445][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:03:10,765][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:03:11,086][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:03:11,407][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:03:11,726][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:03:12,045][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:03:12,366][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:03:12,687][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:03:13,006][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:03:13,327][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:03:13,647][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:03:13,968][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:03:14,288][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:03:14,609][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:03:14,929][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:03:15,249][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:03:15,569][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:03:15,889][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:03:16,208][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:03:16,527][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:03:16,848][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:03:17,169][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:03:17,490][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:03:17,809][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:03:18,130][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:03:18,449][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:03:18,770][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:03:19,089][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:03:19,410][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:03:19,730][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:03:20,051][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:03:20,371][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:03:20,691][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:03:21,012][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:03:21,332][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:03:21,653][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:03:21,974][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:03:22,295][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:03:22,616][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:03:22,935][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:03:23,256][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:03:23,577][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:03:23,897][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:03:24,218][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:03:24,540][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:03:24,861][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:03:25,181][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:03:25,501][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:03:25,822][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:03:26,437][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:03:26,758][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:03:27,079][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:03:27,398][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:03:27,718][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:03:28,040][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:03:28,362][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:03:28,682][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:03:29,002][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:03:29,322][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:03:29,643][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:03:29,964][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:03:30,286][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:03:30,951][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:03:31,678][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:03:31,680][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:03:31,682][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:03:32,312][__main__][INFO] - Iteration 315 took 27s (11.84% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 3m 41s. Estimated total time: 7h 33m 46s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 53s. +[2026-03-25 18:03:32,315][__main__][INFO] - Starting iteration 315. 
+[2026-03-25 18:03:32,318][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:03:32,318][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:03:35,571][__main__][INFO] - Number of regex retries in iteration 315: 0 +[2026-03-25 18:03:35,572][__main__][INFO] - agents played in iteration 315 are Bob, Alice +[2026-03-25 18:03:36,129][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:03:36,784][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:03:37,074][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:03:37,395][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:03:37,714][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:03:38,033][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:03:38,353][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:03:38,674][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:03:38,995][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:03:39,315][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:03:39,635][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:03:39,954][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:03:40,274][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:03:40,594][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:03:40,916][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:03:41,238][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:03:41,558][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:03:41,880][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:03:42,200][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:03:42,520][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:03:42,841][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:03:43,162][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:03:43,484][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:03:43,803][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:03:44,125][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:03:44,445][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:03:44,765][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:03:45,086][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:03:45,406][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:03:45,726][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:03:46,047][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:03:46,368][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:03:46,688][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:03:47,008][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:03:47,327][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:03:47,647][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:03:47,967][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:03:48,288][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:03:48,609][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:03:48,929][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:03:49,249][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:03:49,569][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:03:49,890][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:03:50,210][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:03:50,530][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:03:50,850][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:03:51,171][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:03:51,491][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:03:51,811][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:03:52,130][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:03:52,449][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:03:52,770][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:03:53,091][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:03:53,703][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:03:54,023][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:03:54,344][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:03:54,666][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:03:54,985][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:03:55,306][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:03:55,627][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:03:55,947][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:03:56,267][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:03:56,588][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:03:56,909][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:03:57,228][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:03:57,548][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:03:58,204][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:03:58,934][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:03:58,937][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:03:58,938][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:03:59,544][__main__][INFO] - Iteration 316 took 27s (11.95% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 3m 15s. Estimated total time: 7h 33m 47s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 53s. +[2026-03-25 18:03:59,546][__main__][INFO] - Starting iteration 316. +[2026-03-25 18:03:59,549][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:03:59,550][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:04:02,806][__main__][INFO] - Number of regex retries in iteration 316: 0 +[2026-03-25 18:04:02,807][__main__][INFO] - agents played in iteration 316 are Bob, Alice +[2026-03-25 18:04:03,368][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:04:04,021][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:04:04,312][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:04:04,632][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:04:04,952][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:04:05,272][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:04:05,593][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:04:05,914][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:04:06,233][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:04:06,553][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:04:06,874][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:04:07,194][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:04:07,514][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:04:07,835][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:04:08,154][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:04:08,474][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:04:08,795][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:04:09,115][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:04:09,434][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:04:09,754][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:04:10,075][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:04:10,396][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:04:10,715][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:04:11,034][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:04:11,354][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:04:11,673][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:04:11,994][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:04:12,315][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:04:12,634][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:04:12,955][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:04:13,275][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:04:13,595][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:04:13,915][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:04:14,236][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:04:14,555][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:04:14,874][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:04:15,195][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:04:15,515][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:04:15,834][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:04:16,154][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:04:16,475][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:04:16,795][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:04:17,116][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:04:17,435][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:04:17,756][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:04:18,077][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:04:18,396][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:04:18,717][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:04:19,038][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:04:19,358][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:04:19,679][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:04:20,001][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:04:20,323][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:04:20,946][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:04:21,268][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:04:21,588][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:04:21,909][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:04:22,231][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:04:22,551][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:04:22,872][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:04:23,192][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:04:23,511][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:04:23,831][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:04:24,151][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:04:24,471][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:04:24,790][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:04:25,444][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:04:26,166][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:04:26,168][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:04:26,170][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:04:26,795][__main__][INFO] - Iteration 317 took 27s (11.95% Gen, 85.74% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 3m 8s. Estimated total time: 7h 34m 7s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 3s. +[2026-03-25 18:04:26,798][__main__][INFO] - Starting iteration 317. +[2026-03-25 18:04:26,800][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:04:26,801][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:04:30,244][__main__][INFO] - Number of regex retries in iteration 317: 0 +[2026-03-25 18:04:30,245][__main__][INFO] - agents played in iteration 317 are Bob, Alice +[2026-03-25 18:04:30,799][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:04:31,446][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:04:31,737][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:04:32,059][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:04:32,380][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:04:32,700][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:04:33,021][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:04:33,341][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:04:33,663][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:04:33,983][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:04:34,304][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:04:34,624][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:04:34,942][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:04:35,263][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:04:35,583][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:04:35,902][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:04:36,223][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:04:36,543][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:04:36,863][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:04:37,182][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:04:37,501][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:04:37,823][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:04:38,144][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:04:38,465][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:04:38,787][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:04:39,106][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:04:39,427][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:04:39,746][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:04:40,065][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:04:40,386][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:04:40,707][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:04:41,027][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:04:41,346][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:04:41,666][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:04:41,987][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:04:42,306][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:04:42,627][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:04:42,947][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:04:43,267][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:04:43,587][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:04:43,908][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:04:44,228][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:04:44,549][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:04:44,869][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:04:45,189][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:04:45,509][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:04:45,829][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:04:46,149][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:04:46,470][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:04:46,789][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:04:47,108][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:04:47,428][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:04:47,749][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:04:48,359][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:04:48,681][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:04:49,001][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:04:49,322][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:04:49,643][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:04:49,963][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:04:50,284][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:04:50,604][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:04:50,924][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:04:51,244][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:04:51,564][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:04:51,885][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:04:52,204][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:04:52,856][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:04:53,589][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:04:53,592][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:04:53,593][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:04:54,206][__main__][INFO] - Iteration 318 took 27s (12.57% Gen, 85.19% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 5m 20s. Estimated total time: 7h 36m 46s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 40s, 500 more iterations: 3h 48m 23s. +[2026-03-25 18:04:54,209][__main__][INFO] - Starting iteration 318. +[2026-03-25 18:04:54,212][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:04:54,212][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:04:57,420][__main__][INFO] - Number of regex retries in iteration 318: 0 +[2026-03-25 18:04:57,421][__main__][INFO] - agents played in iteration 318 are Bob, Alice +[2026-03-25 18:04:57,980][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:04:58,629][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:04:58,920][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:04:59,239][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:04:59,560][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:04:59,880][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:05:00,201][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:05:00,522][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:05:00,841][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:05:01,160][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:05:01,481][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:05:01,802][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:05:02,122][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:05:02,442][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:05:02,763][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:05:03,083][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:05:03,402][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:05:03,723][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:05:04,043][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:05:04,363][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:05:04,682][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:05:05,001][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:05:05,322][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:05:05,643][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:05:05,962][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:05:06,283][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:05:06,602][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:05:06,923][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:05:07,244][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:05:07,563][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:05:07,884][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:05:08,204][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:05:08,526][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:05:08,847][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:05:09,171][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:05:09,493][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:05:09,813][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:05:10,136][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:05:10,458][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:05:10,781][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:05:11,103][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:05:11,423][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:05:11,744][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:05:12,064][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:05:12,384][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:05:12,706][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:05:13,027][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:05:13,348][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:05:13,669][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:05:13,989][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:05:14,311][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:05:14,631][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:05:14,951][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:05:15,563][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:05:15,885][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:05:16,205][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:05:16,526][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:05:16,847][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:05:17,169][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:05:17,489][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:05:17,809][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:05:18,130][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:05:18,451][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:05:18,771][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:05:19,092][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:05:19,412][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:05:20,069][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:05:20,796][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:05:20,799][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:05:20,800][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:05:21,483][__main__][INFO] - Iteration 319 took 27s (11.76% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 2m 39s. Estimated total time: 7h 34m 32s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 16s. +[2026-03-25 18:05:21,486][__main__][INFO] - Starting iteration 319. 
+[2026-03-25 18:05:21,489][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:05:21,490][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:05:24,722][__main__][INFO] - Number of regex retries in iteration 319: 0 +[2026-03-25 18:05:24,723][__main__][INFO] - agents played in iteration 319 are Bob, Alice +[2026-03-25 18:05:25,263][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:05:25,919][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:05:26,208][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:05:26,529][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:05:26,849][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:05:27,169][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:05:27,490][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:05:27,809][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:05:28,130][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:05:28,451][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:05:28,772][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:05:29,091][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:05:29,410][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:05:29,731][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:05:30,051][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:05:30,371][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:05:30,690][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:05:31,010][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:05:31,330][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:05:31,650][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:05:31,970][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:05:32,289][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:05:32,608][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:05:32,928][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:05:33,247][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:05:33,567][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:05:33,887][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:05:34,208][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:05:34,528][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:05:34,849][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:05:35,168][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:05:35,490][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:05:35,809][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:05:36,128][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:05:36,448][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:05:36,768][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:05:37,089][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:05:37,410][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:05:37,730][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:05:38,049][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:05:38,370][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:05:38,690][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:05:39,010][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:05:39,329][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:05:39,650][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:05:39,970][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:05:40,289][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:05:40,609][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:05:40,929][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:05:41,249][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:05:41,569][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:05:41,888][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:05:42,208][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:05:42,824][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:05:43,143][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:05:43,463][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:05:43,782][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:05:44,103][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:05:44,424][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:05:44,744][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:05:45,064][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:05:45,384][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:05:45,706][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:05:46,026][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:05:46,345][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:05:46,666][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:05:47,326][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:05:48,052][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:05:48,055][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:05:48,056][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:05:48,683][__main__][INFO] - Iteration 320 took 27s (11.89% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 0m 54s. Estimated total time: 7h 33m 15s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 18:05:48,685][__main__][INFO] - Starting iteration 320. +[2026-03-25 18:05:48,688][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:05:48,689][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:05:51,933][__main__][INFO] - Number of regex retries in iteration 320: 0 +[2026-03-25 18:05:51,934][__main__][INFO] - agents played in iteration 320 are Bob, Alice +[2026-03-25 18:05:52,482][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:05:53,138][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:05:53,428][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:05:53,750][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:05:54,069][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:05:54,389][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:05:54,709][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:05:55,028][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:05:55,348][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:05:55,668][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:05:55,989][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:05:56,309][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:05:56,628][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:05:56,947][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:05:57,269][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:05:57,591][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:05:57,911][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:05:58,233][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:05:58,553][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:05:58,873][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:05:59,195][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:05:59,518][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:05:59,839][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:06:00,161][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:06:00,482][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:06:00,803][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:06:01,124][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:06:01,444][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:06:01,766][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:06:02,085][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:06:02,405][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:06:02,725][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:06:03,045][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:06:03,366][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:06:03,686][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:06:04,007][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:06:04,328][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:06:04,649][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:06:04,969][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:06:05,290][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:06:05,610][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:06:05,929][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:06:06,249][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:06:06,570][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:06:06,890][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:06:07,211][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:06:07,530][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:06:07,849][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:06:08,169][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:06:08,490][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:06:08,811][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:06:09,131][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:06:09,450][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:06:10,068][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:06:10,389][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:06:10,709][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:06:11,029][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:06:11,348][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:06:11,669][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:06:11,989][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:06:12,310][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:06:12,630][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:06:12,951][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:06:13,272][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:06:13,593][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:06:13,914][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:06:15,173][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 18:06:15,903][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:06:15,905][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:06:15,907][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:06:16,572][__main__][INFO] - Iteration 321 took 27s (11.64% Gen, 85.97% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 11m 55s. Estimated total time: 7h 44m 44s. Time estimates for 10 more iterations: 4m 38s, 100 more iterations: 46m 28s, 500 more iterations: 3h 52m 22s. +[2026-03-25 18:06:16,574][__main__][INFO] - Starting iteration 321. +[2026-03-25 18:06:16,578][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:06:16,578][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:06:19,831][__main__][INFO] - Number of regex retries in iteration 321: 0 +[2026-03-25 18:06:19,832][__main__][INFO] - agents played in iteration 321 are Bob, Alice +[2026-03-25 18:06:20,376][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:06:21,032][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:06:21,321][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:06:21,643][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:06:21,964][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:06:22,283][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:06:22,602][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:06:22,921][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:06:23,240][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:06:23,561][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:06:23,881][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:06:24,203][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:06:24,522][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:06:24,843][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:06:25,164][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:06:25,485][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:06:25,804][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:06:26,125][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:06:26,444][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:06:26,765][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:06:27,084][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:06:27,406][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:06:27,727][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:06:28,047][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:06:28,367][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:06:28,687][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:06:29,007][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:06:29,327][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:06:29,648][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:06:29,970][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:06:30,290][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:06:30,609][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:06:30,929][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:06:31,249][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:06:31,568][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:06:31,887][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:06:32,206][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:06:32,525][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:06:32,845][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:06:33,166][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:06:33,487][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:06:33,806][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:06:34,126][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:06:34,447][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:06:34,768][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:06:35,088][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:06:35,408][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:06:35,728][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:06:36,048][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:06:36,369][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:06:36,690][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:06:37,010][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:06:37,330][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:06:37,943][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:06:38,264][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:06:38,584][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:06:38,906][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:06:39,226][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:06:39,547][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:06:39,866][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:06:40,187][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:06:40,509][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:06:40,828][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:06:41,147][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:06:41,468][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:06:41,788][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:06:42,448][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:06:43,182][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:06:43,185][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:06:43,186][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:06:43,855][__main__][INFO] - Iteration 322 took 27s (11.93% Gen, 85.61% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 1m 22s. Estimated total time: 7h 34m 38s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 19s. +[2026-03-25 18:06:43,858][__main__][INFO] - Starting iteration 322. +[2026-03-25 18:06:43,861][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:06:43,861][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:06:47,117][__main__][INFO] - Number of regex retries in iteration 322: 0 +[2026-03-25 18:06:47,118][__main__][INFO] - agents played in iteration 322 are Bob, Alice +[2026-03-25 18:06:47,671][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:06:48,339][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:06:48,629][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:06:48,950][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:06:49,271][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:06:49,592][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:06:49,914][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:06:50,234][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:06:50,555][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:06:50,875][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:06:51,195][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:06:51,516][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:06:51,836][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:06:52,157][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:06:52,478][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:06:52,799][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:06:53,118][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:06:53,438][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:06:53,759][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:06:54,080][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:06:54,402][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:06:54,723][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:06:55,043][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:06:55,364][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:06:55,685][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:06:56,006][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:06:56,326][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:06:56,647][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:06:56,968][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:06:57,288][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:06:57,608][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:06:57,929][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:06:58,248][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:06:58,568][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:06:58,889][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:06:59,208][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:06:59,527][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:06:59,848][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:07:00,167][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:07:00,487][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:07:00,807][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:07:01,128][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:07:01,449][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:07:01,769][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:07:02,088][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:07:02,408][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:07:02,728][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:07:03,047][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:07:03,367][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:07:03,688][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:07:04,008][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:07:04,327][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:07:04,647][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:07:05,263][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:07:05,582][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:07:05,903][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:07:06,223][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:07:06,543][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:07:06,864][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:07:07,185][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:07:07,505][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:07:07,825][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:07:08,144][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:07:08,465][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:07:08,785][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:07:09,106][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:07:09,766][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:07:10,495][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:07:10,498][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:07:10,499][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:07:11,191][__main__][INFO] - Iteration 323 took 27s (11.92% Gen, 85.55% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 1m 48s. Estimated total time: 7h 35m 31s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 45s. +[2026-03-25 18:07:11,194][__main__][INFO] - Starting iteration 323. 
+[2026-03-25 18:07:11,197][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:07:11,197][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:07:14,446][__main__][INFO] - Number of regex retries in iteration 323: 0 +[2026-03-25 18:07:14,447][__main__][INFO] - agents played in iteration 323 are Bob, Alice +[2026-03-25 18:07:14,987][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:07:15,644][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:07:15,932][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:07:16,253][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:07:16,574][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:07:16,894][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:07:17,213][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:07:17,532][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:07:17,853][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:07:18,173][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:07:18,493][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:07:18,813][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:07:19,133][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:07:19,452][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:07:19,772][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:07:20,091][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:07:20,411][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:07:20,731][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:07:21,050][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:07:21,370][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:07:21,691][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:07:22,010][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:07:22,331][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:07:22,651][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:07:22,971][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:07:23,290][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:07:23,609][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:07:23,929][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:07:24,250][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:07:24,569][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:07:24,889][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:07:25,208][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:07:25,527][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:07:25,847][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:07:26,167][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:07:26,488][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:07:26,808][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:07:27,127][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:07:27,447][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:07:27,768][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:07:28,089][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:07:28,409][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:07:28,728][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:07:29,049][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:07:29,369][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:07:29,690][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:07:30,009][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:07:30,330][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:07:30,649][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:07:30,968][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:07:31,288][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:07:31,608][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:07:31,928][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:07:32,543][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:07:32,863][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:07:33,184][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:07:33,505][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:07:33,826][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:07:34,147][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:07:34,467][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:07:34,789][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:07:35,108][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:07:35,428][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:07:35,750][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:07:36,070][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:07:36,392][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:07:37,061][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:07:37,794][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:07:37,796][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:07:37,798][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:07:38,491][__main__][INFO] - Iteration 324 took 27s (11.91% Gen, 85.55% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 0m 44s. Estimated total time: 7h 34m 55s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 27s. +[2026-03-25 18:07:38,494][__main__][INFO] - Starting iteration 324. +[2026-03-25 18:07:38,497][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:07:38,498][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:07:41,755][__main__][INFO] - Number of regex retries in iteration 324: 0 +[2026-03-25 18:07:41,756][__main__][INFO] - agents played in iteration 324 are Bob, Alice +[2026-03-25 18:07:42,297][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:07:42,955][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:07:43,248][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:07:43,569][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:07:43,889][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:07:44,211][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:07:44,530][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:07:44,850][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:07:45,171][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:07:45,491][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:07:45,810][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:07:46,130][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:07:46,450][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:07:46,770][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:07:47,089][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:07:47,409][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:07:47,730][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:07:48,051][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:07:48,370][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:07:48,690][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:07:49,009][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:07:49,329][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:07:49,649][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:07:49,970][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:07:50,291][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:07:50,611][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:07:50,932][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:07:51,253][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:07:51,572][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:07:51,893][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:07:52,212][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:07:52,533][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:07:52,854][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:07:53,174][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:07:53,493][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:07:53,814][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:07:54,134][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:07:54,454][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:07:54,774][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:07:55,094][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:07:55,414][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:07:55,734][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:07:56,056][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:07:56,377][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:07:56,696][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:07:57,017][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:07:57,337][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:07:57,657][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:07:57,977][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:07:58,297][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:07:58,617][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:07:58,936][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:07:59,256][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:07:59,872][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:08:00,191][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:08:00,510][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:08:00,830][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:08:01,149][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:08:01,471][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:08:01,791][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:08:02,112][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:08:02,433][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:08:02,753][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:08:03,074][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:08:03,394][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:08:03,715][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:08:04,376][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:08:05,102][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:08:05,105][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:08:05,106][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:08:05,736][__main__][INFO] - Iteration 325 took 27s (11.96% Gen, 85.72% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 59m 21s. Estimated total time: 7h 33m 59s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 59s. +[2026-03-25 18:08:05,739][__main__][INFO] - Starting iteration 325. +[2026-03-25 18:08:05,741][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:08:05,742][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:08:08,992][__main__][INFO] - Number of regex retries in iteration 325: 0 +[2026-03-25 18:08:08,993][__main__][INFO] - agents played in iteration 325 are Bob, Alice +[2026-03-25 18:08:09,535][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:08:10,191][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:08:10,481][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:08:10,802][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:08:11,123][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:08:11,444][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:08:11,766][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:08:12,086][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:08:12,406][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:08:12,727][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:08:13,047][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:08:13,368][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:08:13,688][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:08:14,007][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:08:14,326][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:08:14,647][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:08:14,968][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:08:15,288][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:08:15,609][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:08:15,928][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:08:16,247][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:08:16,567][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:08:16,886][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:08:17,207][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:08:17,526][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:08:17,847][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:08:18,169][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:08:18,489][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:08:18,810][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:08:19,131][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:08:19,450][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:08:19,771][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:08:20,091][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:08:20,413][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:08:20,733][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:08:21,054][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:08:21,374][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:08:21,695][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:08:22,014][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:08:22,335][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:08:22,655][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:08:22,976][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:08:23,297][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:08:23,618][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:08:23,939][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:08:24,261][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:08:24,581][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:08:24,901][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:08:25,223][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:08:25,545][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:08:25,867][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:08:26,188][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:08:26,509][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:08:27,129][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:08:27,449][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:08:27,771][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:08:28,093][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:08:28,414][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:08:28,733][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:08:29,052][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:08:29,372][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:08:29,694][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:08:30,015][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:08:30,336][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:08:30,656][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:08:30,975][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:08:31,639][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:08:32,371][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:08:32,374][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:08:32,375][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:08:33,005][__main__][INFO] - Iteration 326 took 27s (11.92% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 59m 19s. Estimated total time: 7h 34m 24s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 12s. +[2026-03-25 18:08:33,007][__main__][INFO] - Starting iteration 326. +[2026-03-25 18:08:33,010][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:08:33,011][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:08:36,272][__main__][INFO] - Number of regex retries in iteration 326: 0 +[2026-03-25 18:08:36,273][__main__][INFO] - agents played in iteration 326 are Bob, Alice +[2026-03-25 18:08:36,816][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:08:37,474][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:08:37,765][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:08:38,086][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:08:38,408][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:08:38,728][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:08:39,048][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:08:39,369][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:08:39,689][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:08:40,009][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:08:40,330][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:08:40,651][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:08:40,971][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:08:41,290][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:08:41,610][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:08:41,930][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:08:42,250][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:08:42,570][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:08:42,890][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:08:43,211][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:08:43,531][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:08:43,852][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:08:44,172][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:08:44,493][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:08:44,815][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:08:45,134][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:08:45,453][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:08:45,774][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:08:46,095][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:08:46,414][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:08:46,736][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:08:47,056][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:08:47,377][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:08:47,696][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:08:48,015][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:08:48,336][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:08:48,657][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:08:48,977][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:08:49,298][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:08:49,618][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:08:49,937][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:08:50,257][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:08:50,577][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:08:50,897][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:08:51,218][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:08:51,538][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:08:51,859][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:08:52,181][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:08:52,501][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:08:52,822][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:08:53,143][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:08:53,465][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:08:53,787][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:08:54,403][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:08:54,723][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:08:55,044][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:08:55,365][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:08:55,686][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:08:56,006][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:08:56,327][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:08:56,648][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:08:56,969][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:08:57,289][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:08:57,609][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:08:57,930][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:08:58,251][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:08:58,911][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:08:59,640][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:08:59,643][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:08:59,644][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:09:00,313][__main__][INFO] - Iteration 327 took 27s (11.95% Gen, 85.60% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 59m 31s. Estimated total time: 7h 35m 3s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 31s. +[2026-03-25 18:09:00,316][__main__][INFO] - Starting iteration 327. 
+[2026-03-25 18:09:00,319][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:09:00,319][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:09:03,595][__main__][INFO] - Number of regex retries in iteration 327: 0 +[2026-03-25 18:09:03,596][__main__][INFO] - agents played in iteration 327 are Bob, Alice +[2026-03-25 18:09:04,146][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:09:04,803][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:09:05,094][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:09:05,415][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:09:05,735][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:09:06,056][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:09:06,378][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:09:06,697][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:09:07,018][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:09:07,338][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:09:07,658][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:09:07,978][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:09:08,298][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:09:08,618][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:09:08,939][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:09:09,260][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:09:09,581][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:09:09,902][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:09:10,221][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:09:10,543][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:09:10,864][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:09:11,185][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:09:11,505][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:09:11,827][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:09:12,147][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:09:12,468][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:09:12,789][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:09:13,109][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:09:13,429][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:09:13,751][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:09:14,073][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:09:14,393][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:09:14,714][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:09:15,036][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:09:15,359][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:09:15,680][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:09:16,002][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:09:16,324][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:09:16,645][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:09:16,967][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:09:17,288][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:09:17,608][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:09:17,928][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:09:18,249][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:09:18,569][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:09:18,890][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:09:19,209][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:09:19,531][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:09:19,850][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:09:20,170][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:09:20,492][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:09:20,813][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:09:21,134][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:09:21,751][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:09:22,072][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:09:22,394][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:09:22,714][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:09:23,035][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:09:23,356][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:09:23,677][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:09:23,998][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:09:24,318][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:09:24,639][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:09:24,959][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:09:25,279][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:09:25,600][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:09:26,750][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:09:27,484][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:09:27,486][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:09:27,488][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:09:28,117][__main__][INFO] - Iteration 328 took 27s (11.79% Gen, 85.94% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 7m 19s. Estimated total time: 7h 43m 19s. 
Time estimates for 10 more iterations: 4m 37s, 100 more iterations: 46m 19s, 500 more iterations: 3h 51m 39s. +[2026-03-25 18:09:28,120][__main__][INFO] - Starting iteration 328. +[2026-03-25 18:09:28,123][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:09:28,123][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:09:31,394][__main__][INFO] - Number of regex retries in iteration 328: 0 +[2026-03-25 18:09:31,395][__main__][INFO] - agents played in iteration 328 are Bob, Alice +[2026-03-25 18:09:31,945][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:09:32,601][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:09:32,891][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:09:33,212][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:09:33,531][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:09:33,852][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:09:34,172][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:09:34,492][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:09:34,813][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:09:35,133][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:09:35,454][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:09:35,774][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:09:36,094][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:09:36,414][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:09:36,735][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:09:37,054][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:09:37,375][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:09:37,695][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:09:38,015][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:09:38,336][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:09:38,656][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:09:38,977][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:09:39,296][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:09:39,617][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:09:39,937][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:09:40,256][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:09:40,577][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:09:40,897][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:09:41,217][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:09:41,538][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:09:41,858][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:09:42,178][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:09:42,499][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:09:42,820][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:09:43,139][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:09:43,460][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:09:43,781][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:09:44,102][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:09:44,423][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:09:44,744][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:09:45,064][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:09:45,385][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:09:45,705][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:09:46,027][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:09:46,348][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:09:46,669][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:09:46,991][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:09:47,310][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:09:47,630][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:09:47,949][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:09:48,270][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:09:48,591][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:09:48,910][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:09:49,525][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:09:49,846][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:09:50,168][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:09:50,490][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:09:50,811][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:09:51,131][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:09:51,451][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:09:51,772][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:09:52,092][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:09:52,413][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:09:52,734][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:09:53,055][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:09:53,375][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:09:54,035][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:09:54,770][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:09:54,772][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:09:54,774][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:09:55,402][__main__][INFO] - Iteration 329 took 27s (11.99% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 58m 12s. Estimated total time: 7h 34m 40s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 20s. +[2026-03-25 18:09:55,405][__main__][INFO] - Starting iteration 329. +[2026-03-25 18:09:55,408][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:09:55,409][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:09:58,691][__main__][INFO] - Number of regex retries in iteration 329: 0 +[2026-03-25 18:09:58,692][__main__][INFO] - agents played in iteration 329 are Bob, Alice +[2026-03-25 18:09:59,242][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:09:59,892][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:10:00,717][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:10:01,037][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:10:01,358][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:10:01,679][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:10:02,001][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:10:02,322][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:10:02,642][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:10:02,964][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:10:03,283][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:10:03,605][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:10:03,926][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:10:04,248][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:10:04,570][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:10:04,891][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:10:05,212][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:10:05,534][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:10:05,855][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:10:06,178][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:10:06,499][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:10:06,820][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:10:07,143][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:10:07,465][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:10:07,785][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:10:08,106][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:10:08,426][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:10:08,747][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:10:09,069][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:10:09,389][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:10:09,710][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:10:10,030][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:10:10,351][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:10:10,672][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:10:10,994][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:10:11,315][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:10:11,636][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:10:11,957][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:10:12,279][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:10:12,601][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:10:12,921][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:10:13,241][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:10:13,562][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:10:13,883][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:10:14,204][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:10:14,524][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:10:14,843][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:10:15,163][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:10:15,483][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:10:15,803][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:10:16,123][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:10:16,444][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:10:16,765][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:10:17,378][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:10:17,700][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:10:18,022][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:10:18,343][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:10:18,664][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:10:18,984][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:10:19,305][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:10:19,625][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:10:19,946][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:10:20,267][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:10:20,588][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:10:20,910][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:10:21,230][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:10:21,883][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:22 +[2026-03-25 18:10:22,619][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:10:22,621][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:10:22,623][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:10:23,248][__main__][INFO] - Iteration 330 took 27s (11.79% Gen, 85.96% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 7m 5s. Estimated total time: 7h 44m 0s. Time estimates for 10 more iterations: 4m 38s, 100 more iterations: 46m 24s, 500 more iterations: 3h 52m 0s. +[2026-03-25 18:10:23,250][__main__][INFO] - Starting iteration 330. +[2026-03-25 18:10:23,253][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:10:23,254][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:10:26,452][__main__][INFO] - Number of regex retries in iteration 330: 0 +[2026-03-25 18:10:26,453][__main__][INFO] - agents played in iteration 330 are Bob, Alice +[2026-03-25 18:10:26,998][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:10:27,647][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:10:27,938][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:10:28,261][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:10:28,582][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:10:28,902][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:10:29,223][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:10:29,543][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:10:29,865][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:10:30,186][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:10:30,507][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:10:30,827][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:10:31,148][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:10:31,468][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:10:31,788][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:10:32,109][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:10:32,428][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:10:32,749][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:10:33,070][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:10:33,390][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:10:33,710][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:10:34,030][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:10:34,351][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:10:34,671][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:10:34,991][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:10:35,311][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:10:35,631][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:10:35,951][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:10:36,273][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:10:36,594][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:10:36,915][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:10:37,235][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:10:37,557][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:10:37,878][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:10:38,198][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:10:38,518][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:10:38,838][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:10:39,159][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:10:39,480][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:10:39,801][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:10:40,123][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:10:40,445][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:10:40,765][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:10:41,085][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:10:41,406][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:10:41,727][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:10:42,046][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:10:42,366][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:10:42,686][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:10:43,007][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:10:43,326][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:10:43,647][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:10:43,968][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:10:44,580][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:10:44,902][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:10:45,223][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:10:45,545][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:10:45,866][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:10:46,187][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:10:46,507][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:10:46,828][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:10:47,149][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:10:47,471][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:10:47,790][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:10:48,110][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:10:48,431][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:10:49,084][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:10:49,817][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:10:49,820][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:10:49,821][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:10:50,450][__main__][INFO] - Iteration 331 took 27s (11.76% Gen, 85.92% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 55m 55s. Estimated total time: 7h 33m 17s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 38s. +[2026-03-25 18:10:50,452][__main__][INFO] - Starting iteration 331. 
+[2026-03-25 18:10:50,455][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:10:50,456][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:10:53,687][__main__][INFO] - Number of regex retries in iteration 331: 0 +[2026-03-25 18:10:53,687][__main__][INFO] - agents played in iteration 331 are Bob, Alice +[2026-03-25 18:10:54,252][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:10:54,923][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:10:55,216][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:10:55,537][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:10:55,860][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:10:56,180][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:10:56,501][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:10:56,822][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:10:57,145][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:10:57,467][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:10:57,788][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:10:58,109][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:10:58,431][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:10:58,751][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:10:59,072][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:10:59,394][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:10:59,715][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:11:00,035][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:11:00,357][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:11:00,678][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:11:00,999][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:11:01,321][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:11:01,642][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:11:01,963][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:11:02,285][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:11:02,605][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:11:02,926][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:11:03,247][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:11:03,568][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:11:03,889][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:11:04,210][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:11:04,531][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:11:04,851][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:11:05,173][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:11:05,494][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:11:05,815][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:11:06,135][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:11:06,455][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:11:06,776][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:11:07,096][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:11:07,416][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:11:07,736][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:11:08,055][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:11:08,375][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:11:08,694][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:11:09,014][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:11:09,333][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:11:09,653][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:11:09,974][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:11:10,294][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:11:10,614][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:11:10,935][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:11:11,256][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:11:11,871][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:11:12,191][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:11:12,510][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:11:12,829][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:11:13,150][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:11:13,471][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:11:13,792][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:11:14,114][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:11:14,435][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:11:14,755][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:11:15,077][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:11:15,397][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:11:15,718][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:11:16,379][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:11:17,120][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:11:17,122][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:11:17,124][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:11:17,756][__main__][INFO] - Iteration 332 took 27s (11.84% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 57m 12s. Estimated total time: 7h 35m 2s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 31s. +[2026-03-25 18:11:17,759][__main__][INFO] - Starting iteration 332. +[2026-03-25 18:11:17,762][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:11:17,762][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:11:20,976][__main__][INFO] - Number of regex retries in iteration 332: 0 +[2026-03-25 18:11:20,977][__main__][INFO] - agents played in iteration 332 are Bob, Alice +[2026-03-25 18:11:21,525][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:11:22,183][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:11:22,474][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:11:22,796][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:11:23,117][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:11:23,438][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:11:23,757][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:11:24,077][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:11:24,398][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:11:24,719][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:11:25,041][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:11:25,363][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:11:25,685][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:11:26,006][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:11:26,327][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:11:26,648][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:11:26,968][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:11:27,288][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:11:27,609][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:11:27,929][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:11:28,249][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:11:28,571][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:11:28,891][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:11:29,211][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:11:29,530][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:11:29,851][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:11:30,171][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:11:30,494][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:11:30,815][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:11:31,134][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:11:31,455][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:11:31,775][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:11:32,095][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:11:32,416][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:11:32,736][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:11:33,056][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:11:33,377][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:11:33,697][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:11:34,018][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:11:34,338][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:11:34,658][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:11:34,980][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:11:35,300][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:11:35,619][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:11:35,941][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:11:36,262][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:11:36,582][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:11:36,902][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:11:37,223][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:11:37,544][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:11:37,864][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:11:38,184][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:11:38,504][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:11:39,118][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:11:39,438][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:11:39,760][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:11:40,082][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:11:40,403][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:11:40,724][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:11:41,045][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:11:41,367][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:11:41,688][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:11:42,007][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:11:42,328][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:11:42,647][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:11:42,968][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:11:43,642][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:11:44,374][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:11:44,377][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:11:44,378][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:11:45,066][__main__][INFO] - Iteration 333 took 27s (11.77% Gen, 85.70% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 56m 47s. Estimated total time: 7h 35m 4s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 30s, 500 more iterations: 3h 47m 32s. +[2026-03-25 18:11:45,068][__main__][INFO] - Starting iteration 333. +[2026-03-25 18:11:45,071][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:11:45,071][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:11:48,298][__main__][INFO] - Number of regex retries in iteration 333: 0 +[2026-03-25 18:11:48,299][__main__][INFO] - agents played in iteration 333 are Bob, Alice +[2026-03-25 18:11:48,849][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:11:49,505][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:11:49,796][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:11:50,117][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:11:50,436][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:11:50,757][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:11:51,077][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:11:51,396][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:11:51,717][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:11:52,039][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:11:52,360][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:11:52,680][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:11:52,999][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:11:53,321][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:11:53,642][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:11:53,963][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:11:54,284][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:11:54,604][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:11:54,925][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:11:55,246][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:11:55,567][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:11:55,887][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:11:56,208][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:11:56,527][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:11:56,847][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:11:57,167][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:11:57,486][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:11:57,807][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:11:58,127][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:11:58,447][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:11:58,768][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:11:59,089][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:11:59,410][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:11:59,731][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:12:00,050][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:12:00,370][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:12:00,691][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:12:01,013][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:12:01,334][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:12:01,654][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:12:01,976][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:12:02,296][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:12:02,616][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:12:02,937][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:12:03,258][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:12:03,577][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:12:03,897][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:12:04,216][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:12:04,537][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:12:04,858][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:12:05,177][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:12:05,497][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:12:05,817][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:12:06,433][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:12:06,753][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:12:07,073][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:12:07,392][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:12:07,713][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:12:08,035][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:12:08,355][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:12:08,675][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:12:08,995][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:12:09,315][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:12:09,636][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:12:09,957][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:12:10,279][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:12:10,939][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:12:11,697][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:12:11,699][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:12:11,701][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:12:12,492][__main__][INFO] - Iteration 334 took 27s (11.77% Gen, 85.34% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 58m 17s. Estimated total time: 7h 37m 2s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 42s, 500 more iterations: 3h 48m 31s. +[2026-03-25 18:12:12,494][__main__][INFO] - Starting iteration 334. +[2026-03-25 18:12:12,498][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:12:12,498][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:12:15,715][__main__][INFO] - Number of regex retries in iteration 334: 0 +[2026-03-25 18:12:15,716][__main__][INFO] - agents played in iteration 334 are Bob, Alice +[2026-03-25 18:12:16,278][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:12:16,931][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:12:17,222][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:12:17,544][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:12:17,863][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:12:18,184][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:12:18,505][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:12:18,827][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:12:19,148][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:12:19,468][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:12:19,787][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:12:20,108][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:12:20,428][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:12:20,746][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:12:21,065][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:12:21,386][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:12:21,708][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:12:22,027][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:12:22,347][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:12:22,669][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:12:22,990][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:12:23,310][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:12:23,629][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:12:23,950][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:12:24,269][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:12:24,590][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:12:24,909][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:12:25,229][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:12:25,550][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:12:25,870][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:12:26,190][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:12:26,510][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:12:26,830][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:12:27,149][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:12:27,469][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:12:27,789][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:12:28,109][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:12:28,429][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:12:28,749][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:12:29,069][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:12:29,389][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:12:29,711][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:12:30,031][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:12:30,352][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:12:30,672][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:12:30,991][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:12:31,311][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:12:31,631][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:12:31,951][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:12:32,273][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:12:32,594][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:12:32,915][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:12:33,236][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:12:33,855][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:12:34,178][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:12:34,499][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:12:34,818][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:12:35,138][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:12:35,458][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:12:35,778][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:12:36,097][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:12:36,416][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:12:36,738][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:12:37,059][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:12:37,380][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:12:37,703][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:12:38,377][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:12:39,108][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:12:39,111][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:12:39,113][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:12:39,742][__main__][INFO] - Iteration 335 took 27s (11.81% Gen, 85.87% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 54m 53s. Estimated total time: 7h 34m 6s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 3s. +[2026-03-25 18:12:39,745][__main__][INFO] - Starting iteration 335. 
+[2026-03-25 18:12:39,748][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:12:39,748][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:12:42,978][__main__][INFO] - Number of regex retries in iteration 335: 0 +[2026-03-25 18:12:42,979][__main__][INFO] - agents played in iteration 335 are Bob, Alice +[2026-03-25 18:12:43,550][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:12:44,208][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:12:44,499][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:12:44,821][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:12:45,143][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:12:45,465][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:12:45,785][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:12:46,107][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:12:46,428][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:12:46,747][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:12:47,069][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:12:47,389][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:12:47,709][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:12:48,029][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:12:48,351][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:12:48,671][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:12:48,990][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:12:49,310][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:12:49,629][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:12:49,950][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:12:50,272][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:12:50,591][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:12:50,911][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:12:51,232][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:12:51,553][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:12:51,872][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:12:52,193][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:12:52,512][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:12:52,831][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:12:53,151][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:12:53,472][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:12:53,792][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:12:54,112][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:12:54,431][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:12:54,752][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:12:55,071][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:12:55,391][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:12:55,711][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:12:56,031][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:12:56,352][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:12:56,672][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:12:56,991][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:12:57,311][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:12:57,631][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:12:57,951][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:12:58,271][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:12:58,591][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:12:58,910][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:12:59,229][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:12:59,549][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:12:59,869][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:13:00,189][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:13:00,508][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:13:01,126][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:13:01,446][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:13:01,768][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:13:02,088][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:13:02,408][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:13:02,728][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:13:03,049][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:13:03,369][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:13:03,690][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:13:04,010][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:13:04,331][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:13:04,650][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:13:04,969][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:13:05,632][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:13:06,366][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:13:06,368][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:13:06,370][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:13:06,994][__main__][INFO] - Iteration 336 took 27s (11.86% Gen, 85.85% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 54m 28s. Estimated total time: 7h 34m 7s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 3s. +[2026-03-25 18:13:06,996][__main__][INFO] - Starting iteration 336. +[2026-03-25 18:13:06,999][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:13:07,000][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:13:10,187][__main__][INFO] - Number of regex retries in iteration 336: 0 +[2026-03-25 18:13:10,188][__main__][INFO] - agents played in iteration 336 are Bob, Alice +[2026-03-25 18:13:10,748][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:13:11,407][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:13:11,697][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:13:12,018][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:13:12,337][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:13:12,660][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:13:12,981][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:13:13,301][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:13:13,622][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:13:13,942][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:13:14,262][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:13:14,583][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:13:14,904][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:13:15,224][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:13:15,546][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:13:15,866][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:13:16,187][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:13:16,508][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:13:16,826][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:13:17,147][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:13:17,466][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:13:17,786][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:13:18,108][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:13:18,428][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:13:18,749][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:13:19,068][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:13:19,388][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:13:19,707][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:13:20,026][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:13:20,347][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:13:20,668][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:13:20,988][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:13:21,307][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:13:21,627][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:13:21,948][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:13:22,268][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:13:22,589][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:13:22,910][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:13:23,231][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:13:23,552][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:13:23,874][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:13:24,193][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:13:24,514][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:13:24,835][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:13:25,157][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:13:25,476][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:13:25,796][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:13:26,119][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:13:26,440][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:13:26,761][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:13:27,082][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:13:27,402][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:13:27,723][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:13:28,339][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:13:28,661][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:13:28,982][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:13:29,304][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:13:29,626][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:13:29,946][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:13:30,266][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:13:30,589][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:13:30,910][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:13:31,231][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:13:31,550][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:13:31,871][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:13:32,192][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:13:32,848][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:13:33,579][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:13:33,581][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:13:33,583][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:13:34,314][__main__][INFO] - Iteration 337 took 27s (11.67% Gen, 85.65% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 55m 9s. Estimated total time: 7h 35m 15s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 37s. +[2026-03-25 18:13:34,316][__main__][INFO] - Starting iteration 337. +[2026-03-25 18:13:34,319][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:13:34,320][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:13:37,528][__main__][INFO] - Number of regex retries in iteration 337: 0 +[2026-03-25 18:13:37,529][__main__][INFO] - agents played in iteration 337 are Bob, Alice +[2026-03-25 18:13:38,081][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:13:38,730][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:13:39,019][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:13:39,340][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:13:39,660][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:13:39,983][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:13:40,304][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:13:40,624][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:13:40,944][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:13:41,265][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:13:41,585][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:13:41,906][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:13:42,227][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:13:42,547][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:13:42,868][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:13:43,189][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:13:43,509][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:13:43,829][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:13:44,149][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:13:44,470][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:13:44,790][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:13:45,110][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:13:45,428][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:13:45,749][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:13:46,070][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:13:46,391][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:13:46,709][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:13:47,030][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:13:47,350][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:13:47,670][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:13:47,991][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:13:48,310][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:13:48,629][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:13:48,949][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:13:49,269][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:13:49,589][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:13:49,910][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:13:50,229][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:13:50,550][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:13:50,870][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:13:51,190][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:13:51,510][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:13:51,829][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:13:52,149][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:13:52,470][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:13:52,790][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:13:53,109][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:13:53,428][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:13:53,749][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:13:54,070][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:13:54,389][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:13:54,709][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:13:55,029][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:13:55,640][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:13:55,961][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:13:56,282][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:13:56,601][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:13:56,920][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:13:57,242][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:13:57,563][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:13:57,883][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:13:58,203][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:13:58,525][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:13:58,845][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:13:59,165][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:13:59,486][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:14:00,139][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:14:00,870][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:14:00,873][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:14:00,874][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:14:01,492][__main__][INFO] - Iteration 338 took 27s (11.81% Gen, 85.91% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 52m 19s. Estimated total time: 7h 32m 53s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 17s, 500 more iterations: 3h 46m 26s. +[2026-03-25 18:14:01,494][__main__][INFO] - Starting iteration 338. +[2026-03-25 18:14:01,497][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:14:01,498][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:14:04,700][__main__][INFO] - Number of regex retries in iteration 338: 0 +[2026-03-25 18:14:04,701][__main__][INFO] - agents played in iteration 338 are Bob, Alice +[2026-03-25 18:14:05,245][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:14:05,893][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:14:06,184][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:14:06,504][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:14:06,824][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:14:07,145][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:14:07,466][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:14:07,785][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:14:08,105][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:14:08,426][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:14:08,747][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:14:09,067][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:14:09,388][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:14:09,709][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:14:10,030][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:14:10,351][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:14:10,672][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:14:10,991][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:14:11,312][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:14:11,633][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:14:11,953][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:14:12,274][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:14:12,594][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:14:12,914][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:14:13,233][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:14:13,554][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:14:13,875][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:14:14,196][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:14:14,516][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:14:14,835][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:14:15,155][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:14:15,475][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:14:15,796][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:14:16,115][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:14:16,436][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:14:16,754][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:14:17,075][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:14:17,395][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:14:17,714][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:14:18,034][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:14:18,354][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:14:18,674][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:14:18,995][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:14:19,315][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:14:19,636][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:14:19,956][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:14:20,276][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:14:20,597][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:14:20,916][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:14:21,237][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:14:21,557][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:14:21,878][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:14:22,197][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:14:22,824][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:14:23,144][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:14:23,465][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:14:23,786][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:14:24,105][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:14:24,424][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:14:24,745][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:14:25,064][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:14:25,385][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:14:25,706][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:14:26,026][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:14:26,345][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:14:26,666][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:14:27,317][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:14:28,046][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:14:28,048][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:14:28,049][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:14:28,665][__main__][INFO] - Iteration 339 took 27s (11.79% Gen, 85.94% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 51m 47s. Estimated total time: 7h 32m 48s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 16s, 500 more iterations: 3h 46m 24s. +[2026-03-25 18:14:28,667][__main__][INFO] - Starting iteration 339. 
+[2026-03-25 18:14:28,670][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:14:28,671][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:14:31,931][__main__][INFO] - Number of regex retries in iteration 339: 0 +[2026-03-25 18:14:31,932][__main__][INFO] - agents played in iteration 339 are Bob, Alice +[2026-03-25 18:14:32,481][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:14:33,131][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:14:33,422][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:14:33,744][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:14:34,064][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:14:34,384][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:14:34,704][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:14:35,025][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:14:35,346][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:14:35,667][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:14:35,987][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:14:36,308][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:14:36,629][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:14:36,949][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:14:37,269][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:14:37,589][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:14:37,908][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:14:38,228][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:14:38,549][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:14:38,868][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:14:39,189][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:14:39,510][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:14:39,829][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:14:40,149][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:14:40,469][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:14:40,789][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:14:41,110][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:14:41,431][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:14:41,750][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:14:42,069][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:14:42,390][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:14:42,709][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:14:43,028][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:14:43,348][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:14:43,668][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:14:43,988][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:14:44,309][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:14:44,630][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:14:44,949][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:14:45,269][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:14:45,588][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:14:45,908][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:14:46,228][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:14:46,547][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:14:46,868][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:14:47,187][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:14:47,508][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:14:47,827][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:14:48,148][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:14:48,467][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:14:48,788][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:14:49,109][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:14:49,427][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:14:50,038][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:14:50,358][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:14:50,677][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:14:50,997][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:14:51,316][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:14:51,637][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:14:51,957][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:14:52,277][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:14:52,597][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:14:52,916][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:14:53,235][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:14:53,555][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:14:53,876][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:14:54,578][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:14:55,311][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:14:55,313][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:14:55,315][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:14:55,927][__main__][INFO] - Iteration 340 took 27s (11.96% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 52m 49s. Estimated total time: 7h 34m 18s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 25s, 500 more iterations: 3h 47m 9s. +[2026-03-25 18:14:55,930][__main__][INFO] - Starting iteration 340. +[2026-03-25 18:14:55,932][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:14:55,933][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:14:59,146][__main__][INFO] - Number of regex retries in iteration 340: 0 +[2026-03-25 18:14:59,147][__main__][INFO] - agents played in iteration 340 are Bob, Alice +[2026-03-25 18:14:59,694][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:15:00,341][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:15:00,631][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:15:00,951][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:15:01,272][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:15:01,591][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:15:01,910][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:15:02,230][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:15:02,549][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:15:02,869][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:15:03,188][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:15:03,507][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:15:03,826][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:15:04,146][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:15:04,465][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:15:04,785][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:15:05,106][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:15:05,428][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:15:05,748][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:15:06,067][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:15:06,388][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:15:06,708][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:15:07,027][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:15:07,347][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:15:07,666][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:15:07,987][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:15:08,306][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:15:08,627][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:15:08,946][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:15:09,267][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:15:09,587][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:15:09,908][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:15:10,226][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:15:10,546][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:15:10,865][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:15:11,186][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:15:11,505][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:15:11,825][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:15:12,144][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:15:12,464][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:15:12,783][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:15:13,103][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:15:13,423][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:15:13,744][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:15:14,065][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:15:14,385][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:15:14,705][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:15:15,026][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:15:15,347][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:15:15,667][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:15:15,987][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:15:16,308][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:15:16,628][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:15:17,247][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:15:17,568][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:15:17,889][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:15:18,210][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:15:18,531][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:15:18,851][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:15:19,171][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:15:19,491][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:15:19,811][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:15:20,130][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:15:20,450][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:15:20,770][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:15:21,092][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:15:21,763][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:15:22,492][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:15:22,494][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:15:22,496][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:15:23,116][__main__][INFO] - Iteration 341 took 27s (11.82% Gen, 85.89% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 51m 8s. Estimated total time: 7h 33m 4s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 32s. +[2026-03-25 18:15:23,118][__main__][INFO] - Starting iteration 341. +[2026-03-25 18:15:23,121][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:15:23,122][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:15:26,314][__main__][INFO] - Number of regex retries in iteration 341: 0 +[2026-03-25 18:15:26,315][__main__][INFO] - agents played in iteration 341 are Bob, Alice +[2026-03-25 18:15:26,872][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:15:27,520][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:15:27,809][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:15:28,130][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:15:28,449][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:15:28,768][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:15:29,088][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:15:29,408][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:15:29,727][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:15:30,046][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:15:30,367][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:15:30,688][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:15:31,007][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:15:31,327][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:15:31,648][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:15:31,968][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:15:32,289][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:15:32,609][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:15:32,929][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:15:33,248][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:15:33,569][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:15:33,888][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:15:34,208][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:15:34,529][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:15:34,849][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:15:35,170][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:15:35,490][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:15:35,811][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:15:36,131][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:15:36,450][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:15:36,772][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:15:37,090][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:15:37,410][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:15:37,729][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:15:38,048][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:15:38,368][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:15:38,687][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:15:39,007][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:15:39,328][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:15:39,648][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:15:39,968][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:15:40,289][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:15:40,609][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:15:40,928][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:15:41,248][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:15:41,569][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:15:41,888][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:15:42,207][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:15:42,528][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:15:42,848][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:15:43,169][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:15:43,489][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:15:43,810][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:15:44,423][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:15:44,743][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:15:45,065][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:15:45,385][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:15:45,704][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:15:46,026][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:15:46,345][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:15:46,666][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:15:46,987][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:15:47,306][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:15:47,625][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:15:47,945][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:15:48,266][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:15:48,918][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:15:49,649][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:15:49,652][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:15:49,653][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:15:50,265][__main__][INFO] - Iteration 342 took 27s (11.76% Gen, 85.98% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 50m 1s. Estimated total time: 7h 32m 24s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 14s, 500 more iterations: 3h 46m 12s. +[2026-03-25 18:15:50,267][__main__][INFO] - Starting iteration 342. +[2026-03-25 18:15:50,270][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:15:50,271][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:15:53,510][__main__][INFO] - Number of regex retries in iteration 342: 0 +[2026-03-25 18:15:53,510][__main__][INFO] - agents played in iteration 342 are Bob, Alice +[2026-03-25 18:15:54,072][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:15:54,727][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:15:55,017][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:15:55,339][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:15:55,658][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:15:55,979][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:15:56,299][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:15:56,619][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:15:56,939][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:15:57,259][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:15:57,579][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:15:57,899][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:15:58,219][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:15:58,539][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:15:58,860][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:15:59,180][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:15:59,500][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:15:59,820][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:16:00,142][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:16:00,463][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:16:00,784][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:16:01,103][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:16:01,425][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:16:01,745][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:16:02,065][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:16:02,387][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:16:02,708][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:16:03,027][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:16:03,347][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:16:03,668][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:16:03,989][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:16:04,308][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:16:04,628][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:16:04,949][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:16:05,270][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:16:05,589][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:16:05,908][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:16:06,229][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:16:06,550][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:16:06,870][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:16:07,190][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:16:07,510][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:16:07,832][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:16:08,151][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:16:08,472][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:16:08,792][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:16:09,112][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:16:09,433][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:16:09,754][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:16:10,077][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:16:10,397][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:16:10,719][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:16:11,041][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:16:11,663][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:16:11,985][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:16:12,307][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:16:12,629][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:16:12,949][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:16:13,269][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:16:13,589][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:16:13,909][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:16:14,229][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:16:14,549][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:16:14,870][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:16:15,190][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:16:15,510][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:16:16,170][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:16:16,903][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:16:16,906][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:16:16,907][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:16:17,695][__main__][INFO] - Iteration 343 took 27s (11.81% Gen, 85.31% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 54m 16s. Estimated total time: 7h 37m 6s. Time estimates for 10 more iterations: 4m 34s, 100 more iterations: 45m 42s, 500 more iterations: 3h 48m 33s. +[2026-03-25 18:16:17,698][__main__][INFO] - Starting iteration 343. 
+[2026-03-25 18:16:17,700][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:16:17,701][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:16:20,937][__main__][INFO] - Number of regex retries in iteration 343: 0 +[2026-03-25 18:16:20,938][__main__][INFO] - agents played in iteration 343 are Bob, Alice +[2026-03-25 18:16:21,479][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:16:22,132][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:16:22,421][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:16:22,743][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:16:23,063][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:16:23,384][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:16:23,703][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:16:24,023][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:16:24,343][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:16:24,664][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:16:24,985][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:16:25,306][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:16:25,626][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:16:25,947][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:16:26,268][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:16:26,589][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:16:26,908][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:16:27,229][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:16:27,549][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:16:27,868][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:16:28,190][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:16:28,510][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:16:28,830][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:16:29,149][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:16:29,471][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:16:29,791][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:16:30,110][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:16:30,431][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:16:30,751][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:16:31,072][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:16:31,392][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:16:31,711][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:16:32,030][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:16:32,351][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:16:32,671][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:16:32,991][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:16:33,311][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:16:33,631][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:16:33,950][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:16:34,269][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:16:34,589][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:16:34,909][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:16:35,229][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:16:35,549][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:16:35,870][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:16:36,189][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:16:36,510][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:16:36,830][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:16:37,150][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:16:37,471][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:16:37,792][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:16:38,113][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:16:38,434][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:16:39,048][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:16:39,369][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:16:39,689][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:16:40,010][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:16:40,330][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:16:40,649][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:16:40,970][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:16:41,291][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:16:41,610][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:16:41,930][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:16:42,250][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:16:42,572][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:16:42,892][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:16:43,551][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:16:44,279][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:16:44,282][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:16:44,283][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:16:44,888][__main__][INFO] - Iteration 344 took 27s (11.90% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 49m 51s. Estimated total time: 7h 33m 8s. 
Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 34s. +[2026-03-25 18:16:44,891][__main__][INFO] - Starting iteration 344. +[2026-03-25 18:16:44,894][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:16:44,894][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:16:48,106][__main__][INFO] - Number of regex retries in iteration 344: 0 +[2026-03-25 18:16:48,107][__main__][INFO] - agents played in iteration 344 are Bob, Alice +[2026-03-25 18:16:48,638][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:16:49,291][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:16:49,581][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:16:49,903][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:16:50,224][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:16:50,545][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:16:50,867][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:16:51,187][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:16:51,508][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:16:51,828][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:16:52,147][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:16:52,468][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:16:52,788][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:16:53,108][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:16:53,428][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:16:53,749][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:16:54,070][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:16:54,391][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:16:54,709][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:16:55,030][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:16:55,350][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:16:55,670][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:16:55,989][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:16:56,309][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:16:56,628][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:16:56,949][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:16:57,269][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:16:57,589][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:16:57,910][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:16:58,229][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:16:58,549][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:16:58,869][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:16:59,189][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:16:59,509][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:16:59,830][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:17:00,152][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:17:00,475][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:17:00,798][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:17:01,118][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:17:01,440][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:17:01,762][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:17:02,085][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:17:02,406][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:17:02,729][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:17:03,050][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:17:03,370][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:17:03,691][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:17:04,011][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:17:04,332][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:17:04,652][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:17:04,972][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:17:05,292][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:17:05,611][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:17:06,228][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:17:06,550][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:17:06,870][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:17:07,190][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:17:07,510][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:17:07,832][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:17:08,152][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:17:08,473][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:17:08,794][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:17:09,115][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:17:09,435][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:17:09,757][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:17:10,078][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:17:10,739][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:17:11,465][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:17:11,467][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:17:11,469][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:17:12,086][__main__][INFO] - Iteration 345 took 27s (11.81% Gen, 85.91% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 49m 28s. Estimated total time: 7h 33m 13s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 36s. +[2026-03-25 18:17:12,088][__main__][INFO] - Starting iteration 345. +[2026-03-25 18:17:12,091][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:17:12,092][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:17:15,347][__main__][INFO] - Number of regex retries in iteration 345: 0 +[2026-03-25 18:17:15,347][__main__][INFO] - agents played in iteration 345 are Bob, Alice +[2026-03-25 18:17:15,901][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:17:16,555][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:17:16,845][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:17:17,167][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:17:17,488][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:17:17,807][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:17:18,127][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:17:18,448][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:17:18,767][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:17:19,088][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:17:19,408][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:17:19,728][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:17:20,049][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:17:20,371][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:17:20,692][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:17:21,013][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:17:21,333][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:17:21,652][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:17:21,971][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:17:22,291][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:17:22,611][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:17:22,931][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:17:23,252][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:17:23,571][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:17:23,892][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:17:24,212][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:17:24,532][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:17:24,853][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:17:25,175][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:17:25,496][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:17:25,816][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:17:26,136][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:17:26,456][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:17:26,776][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:17:27,095][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:17:27,415][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:17:27,736][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:17:28,058][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:17:28,379][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:17:28,699][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:17:29,020][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:17:29,340][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:17:29,662][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:17:29,983][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:17:30,301][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:17:30,623][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:17:30,944][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:17:31,263][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:17:31,583][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:17:31,904][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:17:32,225][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:17:32,546][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:17:32,866][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:17:33,481][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:17:33,802][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:17:34,121][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:17:34,442][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:17:34,763][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:17:35,083][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:17:35,404][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:17:35,724][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:17:36,045][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:17:36,366][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:17:36,686][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:17:37,008][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:17:37,328][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:17:37,986][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:17:38,714][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:17:38,716][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:17:38,718][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:17:39,335][__main__][INFO] - Iteration 346 took 27s (11.95% Gen, 85.78% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 49m 53s. Estimated total time: 7h 34m 4s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 2s. +[2026-03-25 18:17:39,337][__main__][INFO] - Starting iteration 346. +[2026-03-25 18:17:39,340][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:17:39,341][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:17:42,540][__main__][INFO] - Number of regex retries in iteration 346: 0 +[2026-03-25 18:17:42,541][__main__][INFO] - agents played in iteration 346 are Bob, Alice +[2026-03-25 18:17:43,069][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:17:43,724][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:17:44,015][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:17:44,337][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:17:44,657][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:17:44,977][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:17:45,298][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:17:45,616][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:17:45,937][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:17:46,259][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:17:46,580][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:17:46,900][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:17:47,220][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:17:47,539][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:17:47,860][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:17:48,181][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:17:48,503][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:17:48,823][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:17:49,144][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:17:49,464][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:17:49,786][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:17:50,107][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:17:50,428][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:17:50,750][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:17:51,071][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:17:51,393][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:17:51,713][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:17:52,033][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:17:52,355][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:17:52,677][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:17:52,996][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:17:53,316][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:17:53,635][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:17:53,956][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:17:54,277][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:17:54,597][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:17:54,918][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:17:55,238][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:17:55,559][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:17:55,880][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:17:56,200][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:17:56,520][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:17:56,841][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:17:57,163][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:17:57,483][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:17:57,804][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:17:58,125][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:17:58,444][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:17:58,765][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:17:59,085][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:17:59,406][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:17:59,728][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:18:00,047][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:18:00,662][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:18:00,982][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:18:01,302][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:18:01,623][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:18:01,944][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:18:02,263][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:18:02,585][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:18:02,906][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:18:03,227][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:18:03,548][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:18:03,869][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:18:04,190][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:18:04,508][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:18:05,166][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:18:05,897][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:18:05,899][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:18:05,901][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:18:06,518][__main__][INFO] - Iteration 347 took 27s (11.77% Gen, 85.95% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 48m 19s. Estimated total time: 7h 32m 58s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 17s, 500 more iterations: 3h 46m 29s. +[2026-03-25 18:18:06,520][__main__][INFO] - Starting iteration 347. 
+[2026-03-25 18:18:06,523][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:18:06,524][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:18:09,761][__main__][INFO] - Number of regex retries in iteration 347: 0 +[2026-03-25 18:18:09,762][__main__][INFO] - agents played in iteration 347 are Bob, Alice +[2026-03-25 18:18:10,326][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:18:10,980][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:18:11,271][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:18:11,591][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:18:11,910][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:18:12,230][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:18:12,552][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:18:12,873][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:18:13,192][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:18:13,511][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:18:13,831][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:18:14,151][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:18:14,472][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:18:14,792][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:18:15,113][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:18:15,434][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:18:15,755][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:18:16,077][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:18:16,398][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:18:16,717][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:18:17,038][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:18:17,358][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:18:17,679][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:18:17,999][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:18:18,319][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:18:18,638][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:18:18,959][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:18:19,280][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:18:19,600][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:18:19,921][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:18:20,242][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:18:20,563][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:18:20,884][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:18:21,204][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:18:21,525][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:18:21,846][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:18:22,168][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:18:22,487][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:18:22,809][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:18:23,130][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:18:23,451][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:18:23,771][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:18:24,091][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:18:24,412][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:18:24,733][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:18:25,052][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:18:25,374][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:18:25,695][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:18:26,016][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:18:26,336][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:18:26,656][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:18:26,975][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:18:27,296][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:18:27,912][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:18:28,232][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:18:28,552][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:18:28,873][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:18:29,194][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:18:29,516][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:18:29,837][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:18:30,157][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:18:30,478][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:18:30,798][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:18:31,118][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:18:31,438][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:18:31,759][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:18:32,417][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:18:33,147][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:18:33,149][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:18:33,151][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:18:33,769][__main__][INFO] - Iteration 348 took 27s (11.88% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 49m 1s. Estimated total time: 7h 34m 7s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 3s. +[2026-03-25 18:18:33,772][__main__][INFO] - Starting iteration 348. +[2026-03-25 18:18:33,775][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:18:33,776][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:18:37,016][__main__][INFO] - Number of regex retries in iteration 348: 0 +[2026-03-25 18:18:37,016][__main__][INFO] - agents played in iteration 348 are Bob, Alice +[2026-03-25 18:18:37,566][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:18:38,219][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:18:38,508][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:18:38,829][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:18:39,150][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:18:39,470][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:18:39,790][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:18:40,109][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:18:40,428][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:18:40,748][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:18:41,069][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:18:41,390][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:18:41,710][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:18:42,031][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:18:42,352][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:18:42,673][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:18:42,994][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:18:43,315][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:18:43,635][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:18:43,956][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:18:44,276][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:18:44,597][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:18:44,917][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:18:45,238][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:18:45,559][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:18:45,879][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:18:46,199][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:18:46,520][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:18:46,840][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:18:47,160][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:18:47,481][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:18:47,802][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:18:48,122][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:18:48,444][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:18:48,763][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:18:49,084][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:18:49,404][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:18:49,724][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:18:50,045][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:18:50,366][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:18:50,688][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:18:51,009][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:18:51,329][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:18:51,650][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:18:51,970][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:18:52,291][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:18:52,612][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:18:52,932][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:18:53,253][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:18:53,573][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:18:53,894][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:18:54,215][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:18:54,537][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:18:55,150][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:18:55,469][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:18:55,790][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:18:56,110][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:18:56,429][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:18:56,749][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:18:57,069][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:18:57,390][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:18:57,710][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:18:58,030][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:18:58,350][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:18:58,671][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:18:58,990][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:18:59,650][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:19:00,372][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:19:00,374][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:19:00,376][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:19:00,992][__main__][INFO] - Iteration 349 took 27s (11.91% Gen, 85.82% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 48m 4s. Estimated total time: 7h 33m 38s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 49s. +[2026-03-25 18:19:00,994][__main__][INFO] - Starting iteration 349. +[2026-03-25 18:19:00,997][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. +[2026-03-25 18:19:00,998][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:19:04,235][__main__][INFO] - Number of regex retries in iteration 349: 0 +[2026-03-25 18:19:04,236][__main__][INFO] - agents played in iteration 349 are Bob, Alice +[2026-03-25 18:19:04,812][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:19:05,467][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:19:05,757][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:19:06,078][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:19:06,400][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:19:06,720][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:19:07,042][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:19:07,362][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:19:07,684][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:19:08,003][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:19:08,323][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:19:08,644][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:19:08,964][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:19:09,284][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:19:09,604][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:19:09,925][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:19:10,247][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:19:10,566][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:19:10,885][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:19:11,205][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:19:11,526][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:19:11,845][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:19:12,165][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:19:12,487][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:19:12,806][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:19:13,126][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:19:13,448][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:19:13,769][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:19:14,090][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:19:14,409][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:19:14,731][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:19:15,051][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:19:15,372][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:19:15,693][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:19:16,012][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:19:16,334][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:19:16,654][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:19:16,975][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:19:17,296][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:19:17,615][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:19:17,935][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:19:18,255][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:19:18,576][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:19:18,895][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:19:19,216][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:19:19,535][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:19:19,856][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:19:20,177][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:19:20,497][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:19:20,818][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:19:21,139][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:19:21,460][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:19:21,779][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:19:22,394][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:19:22,714][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:19:23,035][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:19:23,356][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:19:23,675][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:19:23,995][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:19:24,316][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:19:24,637][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:19:24,956][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:19:25,276][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:19:25,597][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:19:25,917][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:19:26,238][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:19:26,897][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:19:27,631][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:19:27,633][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:19:27,634][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:19:28,312][__main__][INFO] - Iteration 350 took 27s (11.85% Gen, 85.66% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 49m 15s. Estimated total time: 7h 35m 15s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 37s. +[2026-03-25 18:19:28,314][__main__][INFO] - Starting iteration 350. +[2026-03-25 18:19:28,317][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 6 and human policies 1. 
+[2026-03-25 18:19:28,318][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:19:31,600][__main__][INFO] - Number of regex retries in iteration 350: 0 +[2026-03-25 18:19:31,601][__main__][INFO] - agents played in iteration 350 are Bob, Alice +[2026-03-25 18:19:32,153][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:19:32,823][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:19:33,112][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:19:33,435][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:19:33,755][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:19:34,076][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:19:34,396][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:19:34,716][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:19:35,038][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:19:35,358][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:19:35,677][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:19:35,999][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:19:36,320][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:19:36,640][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:19:36,962][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:19:37,283][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:19:37,604][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:19:37,925][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:19:38,246][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:19:38,566][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:19:38,885][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:19:39,206][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:19:39,527][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:19:39,848][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:19:40,169][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:19:40,489][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:19:40,810][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:19:41,129][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:19:41,449][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:19:41,768][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:19:42,089][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:19:42,409][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:19:42,728][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:19:43,048][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:19:43,368][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:19:43,689][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:19:44,010][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:19:44,330][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:19:44,652][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:19:44,971][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:19:45,291][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:19:45,611][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:19:45,933][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:19:46,254][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:19:46,574][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:19:46,895][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:19:47,216][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:19:47,536][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:19:47,856][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:19:48,175][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:19:48,495][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:19:48,816][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:19:49,137][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:19:49,751][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:19:50,073][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:19:50,393][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:19:50,715][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:19:51,036][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:19:51,355][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:19:51,675][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:19:51,994][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:19:52,313][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:19:52,635][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:19:52,956][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:19:53,276][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:19:53,596][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:19:54,255][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:19:54,985][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:19:54,987][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:19:54,989][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:19:56,514][__main__][INFO] - Iteration 351 took 28s (11.64% Gen, 82.94% Train). Generation: 3s, Training: 23s. Estimated remaining time: 5h 3m 29s. Estimated total time: 7h 49m 57s. Time estimates for 10 more iterations: 4m 41s, 100 more iterations: 46m 59s, 500 more iterations: 3h 54m 58s. +[2026-03-25 18:19:56,517][__main__][INFO] - Starting iteration 351. 
+[2026-03-25 18:19:56,520][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:19:56,520][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:19:59,782][__main__][INFO] - Number of regex retries in iteration 351: 0 +[2026-03-25 18:19:59,783][__main__][INFO] - agents played in iteration 351 are Bob, Alice +[2026-03-25 18:20:00,328][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:20:00,982][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:20:01,272][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:20:01,593][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:20:01,913][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:20:02,232][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:20:02,553][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:20:02,873][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:20:03,192][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:20:03,511][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:20:03,830][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:20:04,151][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:20:04,471][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:20:04,790][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:20:05,110][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:20:05,430][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:20:05,751][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:20:06,072][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:20:06,392][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:20:06,712][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:20:07,032][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:20:07,353][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:20:07,672][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:20:07,993][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:20:08,315][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:20:08,634][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:20:08,955][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:20:09,275][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:20:09,597][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:20:09,918][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:20:10,238][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:20:10,558][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:20:10,879][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:20:11,199][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:20:11,519][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:20:11,838][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:20:12,159][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:20:12,480][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:20:12,800][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:20:13,120][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:20:13,441][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:20:13,762][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:20:14,082][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:20:14,404][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:20:14,725][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:20:15,046][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:20:15,367][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:20:15,688][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:20:16,008][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:20:16,327][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:20:16,647][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:20:16,969][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:20:17,288][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:20:17,901][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:20:18,223][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:20:18,544][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:20:18,863][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:20:19,184][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:20:19,505][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:20:19,824][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:20:20,146][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:20:20,470][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:20:20,791][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:20:21,112][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:20:21,433][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:20:21,755][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:20:22,432][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:20:23,161][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:20:23,163][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:20:23,165][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:20:23,781][__main__][INFO] - Iteration 352 took 27s (11.97% Gen, 85.76% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 47m 26s. Estimated total time: 7h 34m 22s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 26s, 500 more iterations: 3h 47m 11s. +[2026-03-25 18:20:23,784][__main__][INFO] - Starting iteration 352. +[2026-03-25 18:20:23,787][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:20:23,787][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:20:27,001][__main__][INFO] - Number of regex retries in iteration 352: 0 +[2026-03-25 18:20:27,002][__main__][INFO] - agents played in iteration 352 are Bob, Alice +[2026-03-25 18:20:27,546][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:20:28,203][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:20:28,492][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:20:28,813][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:20:29,132][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:20:29,452][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:20:29,774][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:20:30,093][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:20:30,413][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:20:30,732][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:20:31,052][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:20:31,373][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:20:31,695][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:20:32,015][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:20:32,336][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:20:32,657][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:20:32,977][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:20:33,297][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:20:33,617][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:20:33,937][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:20:34,256][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:20:34,576][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:20:34,896][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:20:35,217][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:20:35,539][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:20:35,859][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:20:36,179][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:20:36,499][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:20:36,819][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:20:37,138][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:20:37,460][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:20:37,781][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:20:38,102][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:20:38,424][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:20:38,745][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:20:39,066][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:20:39,386][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:20:39,707][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:20:40,027][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:20:40,347][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:20:40,668][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:20:40,989][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:20:41,309][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:20:41,629][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:20:41,950][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:20:42,270][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:20:42,590][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:20:42,910][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:20:43,229][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:20:43,550][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:20:43,871][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:20:44,190][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:20:44,510][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:20:45,125][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:20:45,445][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:20:45,766][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:20:46,087][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:20:46,408][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:20:46,729][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:20:47,049][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:20:47,369][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:20:47,689][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:20:48,010][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:20:48,330][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:20:48,651][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:20:48,971][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:20:49,629][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:20:50,324][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:20:50,326][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:20:50,328][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:20:51,125][__main__][INFO] - Iteration 353 took 27s (11.76% Gen, 85.32% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 48m 15s. Estimated total time: 7h 35m 39s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 49s. +[2026-03-25 18:20:51,127][__main__][INFO] - Starting iteration 353. +[2026-03-25 18:20:51,130][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:20:51,130][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:20:54,395][__main__][INFO] - Number of regex retries in iteration 353: 0 +[2026-03-25 18:20:54,396][__main__][INFO] - agents played in iteration 353 are Bob, Alice +[2026-03-25 18:20:54,939][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:20:55,593][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:20:55,884][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:20:56,206][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:20:56,527][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:20:56,848][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:20:57,169][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:20:57,489][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:20:57,808][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:20:58,129][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:20:58,450][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:20:58,770][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:20:59,091][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:20:59,410][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:20:59,730][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:21:00,050][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:21:00,372][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:21:00,692][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:21:01,014][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:21:01,335][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:21:01,655][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:21:01,975][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:21:02,296][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:21:02,616][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:21:02,934][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:21:03,255][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:21:03,575][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:21:03,896][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:21:04,215][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:21:04,534][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:21:04,854][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:21:05,175][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:21:05,496][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:21:05,816][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:21:06,135][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:21:06,456][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:21:06,777][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:21:07,097][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:21:07,417][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:21:07,737][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:21:08,058][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:21:08,378][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:21:08,699][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:21:09,019][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:21:09,340][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:21:09,662][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:21:09,984][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:21:10,306][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:21:10,628][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:21:10,949][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:21:11,271][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:21:11,591][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:21:11,913][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:21:12,531][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:21:12,850][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:21:13,172][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:21:13,492][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:21:13,812][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:21:14,131][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:21:14,451][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:21:14,772][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:21:15,092][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:21:15,413][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:21:15,734][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:21:16,056][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:21:16,377][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:21:17,031][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:21:17,750][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:21:17,752][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:21:17,754][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:21:18,373][__main__][INFO] - Iteration 354 took 27s (11.99% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 46m 13s. Estimated total time: 7h 34m 4s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 2s. +[2026-03-25 18:21:18,375][__main__][INFO] - Starting iteration 354. +[2026-03-25 18:21:18,379][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:21:18,379][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:21:21,613][__main__][INFO] - Number of regex retries in iteration 354: 0 +[2026-03-25 18:21:21,613][__main__][INFO] - agents played in iteration 354 are Bob, Alice +[2026-03-25 18:21:22,157][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:21:22,805][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:21:23,096][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:21:23,419][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:21:23,737][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:21:24,057][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:21:24,377][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:21:24,698][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:21:25,017][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:21:25,336][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:21:25,658][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:21:25,979][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:21:26,298][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:21:26,619][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:21:26,939][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:21:27,260][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:21:27,580][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:21:27,900][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:21:28,219][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:21:28,540][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:21:28,859][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:21:29,179][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:21:29,499][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:21:29,819][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:21:30,139][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:21:30,460][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:21:30,780][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:21:31,100][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:21:31,420][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:21:31,741][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:21:32,061][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:21:32,382][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:21:32,704][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:21:33,025][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:21:33,345][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:21:33,664][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:21:33,985][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:21:34,304][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:21:34,626][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:21:34,946][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:21:35,267][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:21:35,588][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:21:35,909][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:21:36,229][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:21:36,550][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:21:36,870][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:21:37,190][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:21:37,509][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:21:37,828][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:21:38,150][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:21:38,471][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:21:38,791][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:21:39,111][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:21:39,723][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:21:40,044][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:21:40,364][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:21:40,685][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:21:41,004][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:21:41,325][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:21:41,645][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:21:41,967][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:21:42,287][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:21:42,606][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:21:42,927][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:21:43,247][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:21:43,567][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:21:44,220][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:21:44,951][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:21:44,954][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:21:44,956][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:21:45,619][__main__][INFO] - Iteration 355 took 27s (11.87% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 45m 43s. Estimated total time: 7h 34m 1s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 0s. +[2026-03-25 18:21:45,621][__main__][INFO] - Starting iteration 355. 
+[2026-03-25 18:21:45,624][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:21:45,625][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:21:48,906][__main__][INFO] - Number of regex retries in iteration 355: 0 +[2026-03-25 18:21:48,907][__main__][INFO] - agents played in iteration 355 are Bob, Alice +[2026-03-25 18:21:49,478][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:21:50,130][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:21:50,420][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:21:50,741][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:21:51,061][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:21:51,381][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:21:51,700][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:21:52,022][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:21:52,343][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:21:52,665][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:21:52,984][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:21:53,304][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:21:53,625][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:21:53,945][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:21:54,267][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:21:54,588][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:21:54,907][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:21:55,228][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:21:55,547][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:21:55,869][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:21:56,190][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:21:56,511][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:21:56,830][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:21:57,151][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:21:57,472][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:21:57,793][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:21:58,115][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:21:58,436][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:21:58,757][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:21:59,077][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:21:59,400][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:21:59,722][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:22:00,044][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:22:00,367][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:22:00,689][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:22:01,010][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:22:01,330][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:22:01,650][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:22:01,971][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:22:02,292][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:22:02,611][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:22:02,931][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:22:03,252][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:22:03,572][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:22:03,892][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:22:04,211][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:22:04,532][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:22:04,853][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:22:05,174][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:22:05,495][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:22:05,816][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:22:06,136][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:22:06,456][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:22:07,069][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:22:07,388][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:22:07,708][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:22:08,028][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:22:08,348][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:22:08,669][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:22:08,990][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:22:09,310][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:22:09,630][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:22:09,951][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:22:10,272][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:22:10,593][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:22:10,912][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:22:11,565][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:22:12,297][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:22:12,299][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:22:12,301][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:22:12,911][__main__][INFO] - Iteration 356 took 27s (12.03% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 46m 3s. Estimated total time: 7h 34m 48s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 28s, 500 more iterations: 3h 47m 24s. +[2026-03-25 18:22:12,914][__main__][INFO] - Starting iteration 356. +[2026-03-25 18:22:12,917][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:22:12,917][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:22:16,185][__main__][INFO] - Number of regex retries in iteration 356: 0 +[2026-03-25 18:22:16,185][__main__][INFO] - agents played in iteration 356 are Bob, Alice +[2026-03-25 18:22:16,758][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:22:17,407][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:22:17,695][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:22:18,018][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:22:18,339][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:22:18,660][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:22:18,981][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:22:19,302][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:22:19,624][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:22:19,944][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:22:20,266][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:22:20,587][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:22:20,907][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:22:21,228][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:22:21,549][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:22:21,869][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:22:22,191][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:22:22,511][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:22:22,831][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:22:23,151][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:22:23,472][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:22:23,791][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:22:24,111][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:22:24,431][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:22:24,752][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:22:25,073][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:22:25,394][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:22:25,713][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:22:26,032][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:22:26,352][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:22:26,672][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:22:26,992][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:22:27,313][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:22:27,632][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:22:27,951][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:22:28,271][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:22:28,590][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:22:28,910][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:22:29,230][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:22:29,550][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:22:29,870][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:22:30,192][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:22:30,511][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:22:30,831][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:22:31,150][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:22:31,471][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:22:31,792][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:22:32,113][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:22:32,432][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:22:32,752][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:22:33,072][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:22:33,393][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:22:33,713][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:22:34,324][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:22:34,645][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:22:34,965][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:22:35,285][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:22:35,605][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:22:35,925][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:22:36,246][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:22:36,567][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:22:36,887][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:22:37,207][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:22:37,528][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:22:37,848][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:22:38,169][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:22:38,822][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:22:39,550][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:22:39,553][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:22:39,554][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:22:40,148][__main__][INFO] - Iteration 357 took 27s (12.00% Gen, 85.81% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 44m 39s. Estimated total time: 7h 33m 51s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 18:22:40,150][__main__][INFO] - Starting iteration 357. +[2026-03-25 18:22:40,154][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:22:40,154][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:22:43,435][__main__][INFO] - Number of regex retries in iteration 357: 0 +[2026-03-25 18:22:43,435][__main__][INFO] - agents played in iteration 357 are Bob, Alice +[2026-03-25 18:22:43,981][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:22:44,629][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:22:44,920][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:22:45,243][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:22:45,564][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:22:45,886][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:22:46,205][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:22:46,525][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:22:46,845][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:22:47,166][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:22:47,487][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:22:47,809][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:22:48,129][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:22:48,450][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:22:48,770][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:22:49,092][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:22:49,412][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:22:49,734][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:22:50,056][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:22:50,378][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:22:50,700][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:22:51,023][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:22:51,344][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:22:51,666][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:22:51,985][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:22:52,305][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:22:52,625][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:22:52,945][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:22:53,266][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:22:53,585][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:22:53,905][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:22:54,225][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:22:54,544][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:22:54,863][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:22:55,183][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:22:55,503][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:22:55,825][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:22:56,145][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:22:56,466][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:22:56,788][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:22:57,108][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:22:57,427][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:22:57,747][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:22:58,068][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:22:58,389][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:22:58,710][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:22:59,030][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:22:59,351][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:22:59,670][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:22:59,990][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:23:00,311][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:23:00,632][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:23:00,951][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:23:01,561][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:23:01,882][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:23:02,203][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:23:02,523][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:23:02,843][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:23:03,164][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:23:03,485][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:23:03,804][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:23:04,123][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:23:04,444][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:23:04,765][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:23:05,085][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:23:05,405][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:23:06,059][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:23:06,782][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:23:06,784][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:23:06,786][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:23:07,402][__main__][INFO] - Iteration 358 took 27s (12.04% Gen, 85.69% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 44m 29s. Estimated total time: 7h 34m 9s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 24s, 500 more iterations: 3h 47m 4s. +[2026-03-25 18:23:07,404][__main__][INFO] - Starting iteration 358. +[2026-03-25 18:23:07,407][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:23:07,408][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:23:10,679][__main__][INFO] - Number of regex retries in iteration 358: 0 +[2026-03-25 18:23:10,680][__main__][INFO] - agents played in iteration 358 are Bob, Alice +[2026-03-25 18:23:11,234][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:23:11,883][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:23:12,172][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:23:12,492][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:23:12,811][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:23:13,131][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:23:13,452][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:23:13,771][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:23:14,092][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:23:14,411][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:23:14,731][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:23:15,051][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:23:15,371][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:23:15,690][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:23:16,011][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:23:16,331][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:23:16,651][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:23:16,971][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:23:17,291][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:23:17,610][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:23:17,930][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:23:18,250][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:23:18,572][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:23:18,893][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:23:19,212][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:23:19,531][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:23:19,851][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:23:20,172][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:23:20,492][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:23:20,812][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:23:21,132][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:23:21,452][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:23:21,774][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:23:22,094][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:23:22,416][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:23:22,736][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:23:23,057][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:23:23,377][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:23:23,698][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:23:24,018][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:23:24,337][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:23:24,659][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:23:24,979][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:23:25,297][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:23:25,618][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:23:25,938][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:23:26,259][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:23:26,581][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:23:26,902][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:23:27,222][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:23:27,542][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:23:27,863][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:23:28,184][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:23:28,795][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:23:29,117][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:23:29,438][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:23:29,758][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:23:30,079][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:23:30,399][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:23:30,717][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:23:31,037][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:23:31,358][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:23:31,679][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:23:31,999][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:23:32,317][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:23:32,636][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:23:33,291][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:23:34,014][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:23:34,016][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:23:34,017][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:23:34,720][__main__][INFO] - Iteration 359 took 27s (11.98% Gen, 85.44% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 45m 6s. Estimated total time: 7h 35m 13s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 36s. +[2026-03-25 18:23:34,722][__main__][INFO] - Starting iteration 359. 
+[2026-03-25 18:23:34,725][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:23:34,725][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:23:38,046][__main__][INFO] - Number of regex retries in iteration 359: 0 +[2026-03-25 18:23:38,047][__main__][INFO] - agents played in iteration 359 are Bob, Alice +[2026-03-25 18:23:38,621][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:23:39,285][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:23:39,575][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:23:39,896][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:23:40,217][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:23:40,538][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:23:40,860][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:23:41,179][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:23:41,499][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:23:41,818][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:23:42,137][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:23:42,457][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:23:42,776][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:23:43,097][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:23:43,416][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:23:43,737][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:23:44,058][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:23:44,380][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:23:44,699][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:23:45,019][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:23:45,339][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:23:45,659][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:23:45,980][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:23:46,300][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:23:46,619][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:23:46,940][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:23:47,259][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:23:47,580][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:23:47,900][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:23:48,221][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:23:48,542][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:23:48,863][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:23:49,184][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:23:49,504][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:23:49,826][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:23:50,147][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:23:50,468][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:23:50,787][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:23:51,109][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:23:51,428][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:23:51,749][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:23:52,069][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:23:52,391][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:23:52,711][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:23:53,032][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:23:53,351][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:23:53,672][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:23:53,992][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:23:54,313][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:23:54,632][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:23:54,952][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:23:55,271][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:23:55,592][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:23:56,206][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:23:56,526][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:23:56,845][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:23:57,166][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:23:57,486][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:23:57,806][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:23:58,125][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:23:58,444][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:23:58,765][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:23:59,085][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:23:59,406][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:23:59,725][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:24:00,046][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:24:00,705][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:24:01,430][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:24:01,433][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:24:01,434][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:24:02,054][__main__][INFO] - Iteration 360 took 27s (12.15% Gen, 85.57% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 44m 56s. Estimated total time: 7h 35m 30s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 45s. +[2026-03-25 18:24:02,056][__main__][INFO] - Starting iteration 360. +[2026-03-25 18:24:02,060][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:24:02,060][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:24:05,227][__main__][INFO] - Number of regex retries in iteration 360: 0 +[2026-03-25 18:24:05,228][__main__][INFO] - agents played in iteration 360 are Bob, Alice +[2026-03-25 18:24:05,776][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:24:06,431][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:24:06,720][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:24:07,042][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:24:07,362][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:24:07,681][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:24:08,001][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:24:08,321][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:24:08,642][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:24:08,962][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:24:09,282][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:24:09,603][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:24:09,923][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:24:10,242][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:24:10,563][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:24:10,884][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:24:11,205][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:24:11,524][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:24:11,846][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:24:12,166][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:24:12,486][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:24:12,806][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:24:13,127][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:24:13,447][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:24:13,768][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:24:14,089][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:24:14,409][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:24:14,730][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:24:15,050][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:24:15,370][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:24:15,692][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:24:16,012][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:24:16,333][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:24:16,652][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:24:16,972][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:24:17,292][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:24:17,611][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:24:17,931][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:24:18,252][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:24:18,572][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:24:18,893][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:24:19,212][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:24:19,531][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:24:19,851][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:24:20,172][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:24:20,494][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:24:20,815][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:24:21,134][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:24:21,454][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:24:21,773][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:24:22,093][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:24:22,414][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:24:22,733][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:24:23,350][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:24:23,670][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:24:23,991][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:24:24,310][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:24:24,631][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:24:24,951][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:24:25,272][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:24:25,592][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:24:25,914][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:24:26,234][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:24:26,554][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:24:26,875][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:24:27,196][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:24:27,870][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:24:28,599][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:24:28,601][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:24:28,603][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:24:29,237][__main__][INFO] - Iteration 361 took 27s (11.65% Gen, 86.01% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 41m 57s. Estimated total time: 7h 32m 58s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 17s, 500 more iterations: 3h 46m 29s. +[2026-03-25 18:24:29,240][__main__][INFO] - Starting iteration 361. +[2026-03-25 18:24:29,242][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:24:29,243][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:24:32,467][__main__][INFO] - Number of regex retries in iteration 361: 0 +[2026-03-25 18:24:32,468][__main__][INFO] - agents played in iteration 361 are Bob, Alice +[2026-03-25 18:24:33,002][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:24:33,660][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:24:33,950][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:24:34,270][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:24:34,590][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:24:34,912][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:24:35,233][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:24:35,553][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:24:35,873][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:24:36,192][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:24:36,512][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:24:36,831][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:24:37,150][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:24:37,471][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:24:37,791][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:24:38,110][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:24:38,431][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:24:38,751][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:24:39,069][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:24:39,389][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:24:39,711][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:24:40,030][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:24:40,349][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:24:40,669][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:24:40,990][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:24:41,309][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:24:41,629][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:24:41,949][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:24:42,270][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:24:42,590][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:24:42,910][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:24:43,231][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:24:43,551][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:24:43,872][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:24:44,192][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:24:44,512][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:24:44,831][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:24:45,152][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:24:45,473][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:24:45,792][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:24:46,112][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:24:46,432][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:24:46,752][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:24:47,073][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:24:47,393][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:24:47,713][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:24:48,033][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:24:48,352][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:24:48,673][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:24:48,993][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:24:49,312][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:24:49,632][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:24:49,951][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:24:50,567][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:24:50,887][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:24:51,206][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:24:51,527][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:24:51,848][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:24:52,168][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:24:52,487][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:24:52,806][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:24:53,127][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:24:53,449][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:24:53,769][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:24:54,088][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:24:54,409][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:24:55,069][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:24:55,798][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:24:55,801][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:24:55,802][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:24:56,423][__main__][INFO] - Iteration 362 took 27s (11.86% Gen, 85.84% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 41m 32s. Estimated total time: 7h 33m 1s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 18s, 500 more iterations: 3h 46m 30s. +[2026-03-25 18:24:56,425][__main__][INFO] - Starting iteration 362. +[2026-03-25 18:24:56,428][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:24:56,429][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:24:59,611][__main__][INFO] - Number of regex retries in iteration 362: 0 +[2026-03-25 18:24:59,612][__main__][INFO] - agents played in iteration 362 are Bob, Alice +[2026-03-25 18:25:00,192][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:25:00,849][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:25:01,139][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:25:01,460][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:25:01,779][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:25:02,101][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:25:02,421][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:25:02,741][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:25:03,061][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:25:03,383][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:25:03,702][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:25:04,023][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:25:04,343][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:25:04,664][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:25:04,985][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:25:05,306][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:25:05,626][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:25:05,945][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:25:06,265][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:25:06,585][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:25:06,906][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:25:07,227][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:25:07,547][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:25:07,868][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:25:08,188][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:25:08,509][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:25:08,828][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:25:09,147][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:25:09,467][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:25:09,788][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:25:10,109][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:25:10,430][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:25:10,750][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:25:11,070][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:25:11,391][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:25:11,710][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:25:12,028][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:25:12,349][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:25:12,670][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:25:12,990][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:25:13,310][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:25:13,632][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:25:13,952][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:25:14,273][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:25:14,594][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:25:14,915][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:25:15,235][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:25:15,555][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:25:15,876][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:25:16,195][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:25:16,515][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:25:16,834][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:25:17,154][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:25:17,774][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:25:18,096][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:25:18,418][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:25:18,740][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:25:19,063][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:25:19,385][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:25:19,706][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:25:20,027][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:25:20,348][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:25:20,670][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:25:20,991][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:25:21,311][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:25:21,631][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:25:22,294][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:25:23,019][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:25:23,022][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:25:23,024][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:25:23,640][__main__][INFO] - Iteration 363 took 27s (11.70% Gen, 86.03% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 41m 36s. Estimated total time: 7h 33m 32s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 21s, 500 more iterations: 3h 46m 46s. +[2026-03-25 18:25:23,642][__main__][INFO] - Starting iteration 363. 
+[2026-03-25 18:25:23,645][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:25:23,646][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:25:26,869][__main__][INFO] - Number of regex retries in iteration 363: 0 +[2026-03-25 18:25:26,870][__main__][INFO] - agents played in iteration 363 are Bob, Alice +[2026-03-25 18:25:27,424][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:25:28,082][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:25:28,372][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:25:28,693][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:25:29,013][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:25:29,336][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:25:29,657][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:25:29,977][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:25:30,298][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:25:30,618][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:25:30,939][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:25:31,259][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:25:31,580][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:25:31,900][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:25:32,222][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:25:32,544][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:25:32,865][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:25:33,186][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:25:33,506][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:25:33,826][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:25:34,147][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:25:34,468][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:25:34,789][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:25:35,109][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:25:35,429][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:25:35,750][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:25:36,070][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:25:36,390][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:25:36,711][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:25:37,031][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:25:37,352][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:25:37,673][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:25:37,993][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:25:38,313][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:25:38,632][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:25:38,951][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:25:39,271][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:25:39,591][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:25:39,913][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:25:40,233][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:25:40,553][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:25:40,872][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:25:41,192][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:25:41,511][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:25:41,832][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:25:42,152][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:25:42,473][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:25:42,793][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:25:43,113][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:25:43,431][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:25:43,752][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:25:44,072][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:25:44,392][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:25:45,004][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:25:45,324][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:25:45,646][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:25:45,968][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:25:46,288][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:25:46,609][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:25:46,930][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:25:47,250][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:25:47,570][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:25:47,890][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:25:48,209][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:25:48,529][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:25:48,850][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:25:49,504][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:25:50,231][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:25:50,235][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:25:50,237][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:25:50,921][__main__][INFO] - Iteration 364 took 27s (11.82% Gen, 85.67% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 42m 13s. Estimated total time: 7h 34m 36s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 18s. +[2026-03-25 18:25:50,923][__main__][INFO] - Starting iteration 364. +[2026-03-25 18:25:50,926][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:25:50,927][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:25:54,106][__main__][INFO] - Number of regex retries in iteration 364: 0 +[2026-03-25 18:25:54,107][__main__][INFO] - agents played in iteration 364 are Bob, Alice +[2026-03-25 18:25:54,661][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:25:55,309][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:25:55,601][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:25:55,920][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:25:56,240][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:25:56,561][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:25:56,883][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:25:57,204][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:25:57,525][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:25:57,845][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:25:58,166][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:25:58,486][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:25:58,806][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:25:59,127][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:25:59,448][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:25:59,769][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:26:00,090][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:26:00,411][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:26:00,731][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:26:01,052][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:26:01,373][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:26:01,694][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:26:02,015][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:26:02,335][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:26:02,655][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:26:02,976][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:26:03,298][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:26:03,619][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:26:03,939][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:26:04,260][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:26:04,580][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:26:04,900][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:26:05,220][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:26:05,541][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:26:05,863][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:26:06,183][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:26:06,504][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:26:06,825][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:26:07,146][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:26:07,469][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:26:07,791][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:26:08,111][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:26:08,432][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:26:08,754][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:26:09,076][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:26:09,399][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:26:09,719][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:26:10,040][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:26:10,360][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:26:10,680][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:26:11,001][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:26:11,322][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:26:11,642][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:26:12,253][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:26:12,575][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:26:12,895][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:26:13,214][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:26:13,535][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:26:13,855][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:26:14,175][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:26:14,496][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:26:14,817][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:26:15,138][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:26:15,461][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:26:15,783][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:26:16,104][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:26:16,761][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:26:17,495][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:26:17,497][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:26:17,499][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:26:18,118][__main__][INFO] - Iteration 365 took 27s (11.70% Gen, 86.02% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 40m 22s. Estimated total time: 7h 33m 12s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 36s. +[2026-03-25 18:26:18,120][__main__][INFO] - Starting iteration 365. +[2026-03-25 18:26:18,123][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:26:18,124][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:26:21,315][__main__][INFO] - Number of regex retries in iteration 365: 0 +[2026-03-25 18:26:21,316][__main__][INFO] - agents played in iteration 365 are Bob, Alice +[2026-03-25 18:26:21,867][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:26:22,518][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:26:22,807][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:26:23,129][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:26:23,449][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:26:23,770][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:26:24,091][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:26:24,410][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:26:24,732][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:26:25,051][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:26:25,371][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:26:25,691][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:26:26,011][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:26:26,330][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:26:26,650][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:26:26,970][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:26:27,291][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:26:27,611][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:26:27,930][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:26:28,250][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:26:28,570][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:26:28,890][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:26:29,210][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:26:29,528][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:26:29,849][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:26:30,170][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:26:30,491][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:26:30,810][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:26:31,130][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:26:31,450][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:26:31,771][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:26:32,091][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:26:32,410][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:26:32,731][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:26:33,051][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:26:33,371][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:26:33,691][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:26:34,011][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:26:34,331][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:26:34,651][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:26:34,971][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:26:35,292][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:26:35,612][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:26:35,932][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:26:36,252][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:26:36,574][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:26:36,893][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:26:37,215][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:26:37,535][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:26:37,854][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:26:38,175][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:26:38,494][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:26:38,815][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:26:39,426][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:26:39,746][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:26:40,067][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:26:40,386][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:26:40,706][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:26:41,026][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:26:41,347][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:26:41,668][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:26:41,989][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:26:42,308][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:26:42,628][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:26:42,949][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:26:43,270][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:26:43,923][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:26:44,653][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:26:44,655][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:26:44,657][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:26:45,272][__main__][INFO] - Iteration 366 took 27s (11.76% Gen, 85.97% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 39m 12s. Estimated total time: 7h 32m 30s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 15s, 500 more iterations: 3h 46m 15s. +[2026-03-25 18:26:45,275][__main__][INFO] - Starting iteration 366. +[2026-03-25 18:26:45,278][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:26:45,279][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:26:48,484][__main__][INFO] - Number of regex retries in iteration 366: 0 +[2026-03-25 18:26:48,485][__main__][INFO] - agents played in iteration 366 are Bob, Alice +[2026-03-25 18:26:49,027][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:26:49,676][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:26:49,966][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:26:50,288][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:26:50,609][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:26:50,929][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:26:51,250][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:26:51,572][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:26:51,892][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:26:52,213][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:26:52,534][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:26:52,855][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:26:53,174][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:26:53,496][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:26:53,816][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:26:54,136][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:26:54,457][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:26:54,777][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:26:55,095][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:26:55,416][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:26:55,738][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:26:56,058][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:26:56,379][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:26:56,699][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:26:57,018][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:26:57,337][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:26:57,658][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:26:57,979][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:26:58,299][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:26:58,618][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:26:58,939][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:26:59,259][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:26:59,579][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:26:59,900][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:27:00,219][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:27:00,539][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:27:00,859][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:27:01,179][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:27:01,499][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:27:01,819][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:27:02,138][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:27:02,457][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:27:02,777][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:27:03,096][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:27:03,418][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:27:03,738][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:27:04,061][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:27:04,383][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:27:04,704][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:27:05,027][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:27:05,348][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:27:05,669][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:27:05,989][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:27:06,605][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:27:06,927][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:27:07,248][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:27:07,568][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:27:07,887][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:27:08,208][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:27:08,528][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:27:08,849][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:27:09,168][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:27:09,489][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:27:09,809][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:27:10,130][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:27:10,450][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:27:11,111][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:27:11,837][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:27:11,839][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:27:11,841][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:27:12,598][__main__][INFO] - Iteration 367 took 27s (11.74% Gen, 85.49% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 41m 36s. Estimated total time: 7h 35m 21s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 32s, 500 more iterations: 3h 47m 40s. +[2026-03-25 18:27:12,601][__main__][INFO] - Starting iteration 367. 
+[2026-03-25 18:27:12,604][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:27:12,604][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:27:15,782][__main__][INFO] - Number of regex retries in iteration 367: 0 +[2026-03-25 18:27:15,782][__main__][INFO] - agents played in iteration 367 are Bob, Alice +[2026-03-25 18:27:16,326][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:27:16,981][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:27:17,271][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:27:17,593][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:27:17,913][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:27:18,234][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:27:18,554][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:27:18,874][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:27:19,195][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:27:19,516][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:27:19,835][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:27:20,156][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:27:20,477][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:27:20,798][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:27:21,119][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:27:21,438][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:27:21,758][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:27:22,079][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:27:22,399][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:27:22,719][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:27:23,039][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:27:23,359][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:27:23,679][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:27:23,999][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:27:24,318][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:27:24,639][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:27:24,959][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:27:25,280][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:27:25,600][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:27:25,921][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:27:26,241][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:27:26,561][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:27:26,882][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:27:27,204][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:27:27,525][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:27:27,845][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:27:28,166][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:27:28,486][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:27:28,806][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:27:29,126][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:27:29,445][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:27:29,765][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:27:30,085][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:27:30,404][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:27:30,726][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:27:31,046][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:27:31,366][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:27:31,686][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:27:32,005][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:27:32,326][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:27:32,646][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:27:32,965][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:27:33,286][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:27:33,901][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:27:34,222][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:27:34,541][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:27:34,862][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:27:35,183][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:27:35,504][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:27:35,824][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:27:36,144][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:27:36,466][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:27:36,787][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:27:37,108][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:27:37,429][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:27:37,748][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:27:38,409][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:27:39,141][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:27:39,144][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:27:39,145][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:27:39,922][__main__][INFO] - Iteration 368 took 27s (11.63% Gen, 85.52% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 41m 7s. Estimated total time: 7h 35m 19s. 
Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 31s, 500 more iterations: 3h 47m 39s. +[2026-03-25 18:27:39,924][__main__][INFO] - Starting iteration 368. +[2026-03-25 18:27:39,927][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:27:39,928][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:27:43,144][__main__][INFO] - Number of regex retries in iteration 368: 0 +[2026-03-25 18:27:43,145][__main__][INFO] - agents played in iteration 368 are Bob, Alice +[2026-03-25 18:27:43,687][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:27:44,341][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:27:44,631][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:27:44,953][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:27:45,273][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:27:45,592][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:27:45,912][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:27:46,231][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:27:46,552][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:27:46,872][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:27:47,191][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:27:47,511][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:27:47,830][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:27:48,151][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:27:48,472][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:27:48,792][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:27:49,111][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:27:49,432][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:27:49,752][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:27:50,073][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:27:50,395][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:27:50,715][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:27:51,036][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:27:51,356][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:27:51,677][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:27:51,998][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:27:52,318][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:27:52,639][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:27:52,960][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:27:53,282][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:27:53,602][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:27:53,923][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:27:54,244][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:27:54,567][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:27:54,889][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:27:55,211][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:27:55,531][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:27:55,851][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:27:56,172][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:27:56,493][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:27:56,812][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:27:57,133][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:27:57,453][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:27:57,774][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:27:58,095][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:27:58,417][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:27:58,736][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:27:59,057][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:27:59,378][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:27:59,699][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:28:00,018][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:28:00,339][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:28:00,659][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:28:01,286][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:28:01,606][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:28:01,928][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:28:02,249][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:28:02,571][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:28:02,892][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:28:03,213][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:28:03,533][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:28:03,853][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:28:04,175][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:28:04,494][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:28:04,815][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:28:05,137][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:28:05,797][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:28:06,528][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:28:06,530][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:28:06,532][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:28:07,151][__main__][INFO] - Iteration 369 took 27s (11.82% Gen, 85.90% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 39m 5s. Estimated total time: 7h 33m 44s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 52s. +[2026-03-25 18:28:07,153][__main__][INFO] - Starting iteration 369. +[2026-03-25 18:28:07,156][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:28:07,156][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:28:10,432][__main__][INFO] - Number of regex retries in iteration 369: 0 +[2026-03-25 18:28:10,433][__main__][INFO] - agents played in iteration 369 are Bob, Alice +[2026-03-25 18:28:10,989][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:28:11,645][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:28:11,935][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:28:12,257][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:28:12,578][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:28:12,898][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:28:13,218][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:28:13,539][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:28:13,859][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:28:14,180][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:28:14,502][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:28:14,824][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:28:15,144][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:28:15,465][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:28:15,786][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:28:16,107][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:28:16,429][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:28:16,750][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:28:17,071][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:28:17,391][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:28:17,710][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:28:18,031][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:28:18,351][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:28:18,672][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:28:18,993][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:28:19,313][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:28:19,633][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:28:19,953][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:28:20,275][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:28:20,595][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:28:20,915][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:28:21,233][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:28:21,553][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:28:21,875][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:28:22,196][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:28:22,517][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:28:22,838][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:28:23,157][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:28:23,477][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:28:23,798][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:28:24,118][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:28:24,439][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:28:24,758][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:28:25,079][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:28:25,400][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:28:25,720][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:28:26,040][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:28:26,359][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:28:26,679][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:28:26,998][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:28:27,319][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:28:27,639][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:28:27,959][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:28:28,573][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:28:28,894][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:28:29,216][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:28:29,537][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:28:29,858][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:28:30,178][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:28:30,500][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:28:30,820][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:28:31,141][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:28:31,463][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:28:31,784][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:28:32,105][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:28:32,426][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:28:33,085][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:28:33,813][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:28:33,815][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:28:33,817][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:28:34,503][__main__][INFO] - Iteration 370 took 27s (11.98% Gen, 85.51% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 40m 41s. Estimated total time: 7h 35m 47s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 34s, 500 more iterations: 3h 47m 53s. +[2026-03-25 18:28:34,505][__main__][INFO] - Starting iteration 370. +[2026-03-25 18:28:34,508][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:28:34,508][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:28:37,731][__main__][INFO] - Number of regex retries in iteration 370: 0 +[2026-03-25 18:28:37,732][__main__][INFO] - agents played in iteration 370 are Bob, Alice +[2026-03-25 18:28:38,265][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:28:38,920][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:28:39,210][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:28:39,530][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:28:39,850][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:28:40,170][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:28:40,491][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:28:40,810][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:28:41,130][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:28:41,450][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:28:41,771][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:28:42,092][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:28:42,413][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:28:42,733][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:28:43,052][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:28:43,374][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:28:43,693][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:28:44,012][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:28:44,333][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:28:44,652][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:28:44,973][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:28:45,294][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:28:45,614][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:28:45,936][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:28:46,255][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:28:46,576][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:28:46,897][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:28:47,217][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:28:47,538][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:28:47,859][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:28:48,178][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:28:48,498][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:28:48,818][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:28:49,139][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:28:49,459][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:28:49,781][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:28:50,102][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:28:50,423][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:28:50,744][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:28:51,065][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:28:51,387][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:28:51,707][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:28:52,028][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:28:52,347][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:28:52,670][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:28:52,992][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:28:53,315][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:28:53,637][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:28:53,959][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:28:54,280][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:28:54,601][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:28:54,924][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:28:55,246][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:28:55,865][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:28:56,187][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:28:56,508][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:28:56,827][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:28:57,148][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:28:57,469][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:28:57,789][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:28:58,110][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:28:58,430][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:28:58,750][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:28:59,070][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:28:59,391][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:28:59,711][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:29:00,371][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:29:01,079][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:29:01,081][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:29:01,083][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:29:01,711][__main__][INFO] - Iteration 371 took 27s (11.85% Gen, 85.83% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 37m 50s. Estimated total time: 7h 33m 24s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 42s. +[2026-03-25 18:29:01,713][__main__][INFO] - Starting iteration 371. 
+[2026-03-25 18:29:01,716][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:29:01,717][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:29:04,945][__main__][INFO] - Number of regex retries in iteration 371: 0 +[2026-03-25 18:29:04,946][__main__][INFO] - agents played in iteration 371 are Bob, Alice +[2026-03-25 18:29:05,499][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:29:06,154][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:29:06,444][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:29:06,766][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:29:07,087][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:29:07,407][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:29:07,728][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:29:08,050][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:29:08,371][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:29:08,691][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:29:09,013][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:29:09,333][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:29:09,654][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:29:09,976][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:29:10,297][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:29:10,617][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:29:10,939][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:29:11,258][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:29:11,578][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:29:11,898][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:29:12,219][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:29:12,539][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:29:12,860][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:29:13,180][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:29:13,501][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:29:13,820][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:29:14,139][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:29:14,460][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:29:14,779][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:29:15,099][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:29:15,419][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:29:15,740][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:29:16,060][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:29:16,380][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:29:16,700][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:29:17,020][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:29:17,341][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:29:17,661][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:29:17,983][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:29:18,304][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:29:18,626][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:29:18,946][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:29:19,267][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:29:19,587][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:29:19,908][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:29:20,231][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:29:20,550][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:29:20,872][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:29:21,193][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:29:21,513][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:29:21,833][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:29:22,154][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:29:22,475][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:29:23,088][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:29:23,408][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:29:23,728][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:29:24,049][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:29:24,370][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:29:24,691][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:29:25,012][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:29:25,333][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:29:25,653][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:29:25,974][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:29:26,295][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:29:26,615][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:29:26,936][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:29:27,594][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:29:28,321][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:29:28,323][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:29:28,325][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:29:28,945][__main__][INFO] - Iteration 372 took 27s (11.86% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 37m 49s. Estimated total time: 7h 33m 50s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 55s. +[2026-03-25 18:29:28,948][__main__][INFO] - Starting iteration 372. +[2026-03-25 18:29:28,951][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:29:28,951][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:29:32,196][__main__][INFO] - Number of regex retries in iteration 372: 0 +[2026-03-25 18:29:32,197][__main__][INFO] - agents played in iteration 372 are Bob, Alice +[2026-03-25 18:29:32,738][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:29:33,392][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:29:33,684][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:29:34,007][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:29:34,327][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:29:34,648][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:29:34,969][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:29:35,289][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:29:35,608][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:29:35,930][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:29:36,250][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:29:36,570][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:29:36,891][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:29:37,211][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:29:37,531][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:29:37,850][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:29:38,170][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:29:38,491][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:29:38,811][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:29:39,131][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:29:39,451][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:29:39,773][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:29:40,092][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:29:40,414][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:29:40,735][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:29:41,056][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:29:41,377][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:29:41,699][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:29:42,020][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:29:42,341][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:29:42,662][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:29:42,983][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:29:43,306][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:29:43,629][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:29:43,951][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:29:44,273][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:29:44,595][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:29:44,917][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:29:45,239][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:29:45,561][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:29:45,882][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:29:46,204][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:29:46,525][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:29:46,847][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:29:47,168][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:29:47,490][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:29:47,811][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:29:48,130][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:29:48,450][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:29:48,771][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:29:49,091][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:29:49,413][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:29:49,733][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:29:50,352][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:29:50,673][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:29:50,994][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:29:51,316][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:29:51,636][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:29:51,956][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:29:52,278][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:29:52,600][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:29:52,920][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:29:53,240][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:29:53,561][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:29:53,882][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:29:54,202][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:29:54,862][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:29:55,599][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:29:55,601][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:29:55,603][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:29:56,288][__main__][INFO] - Iteration 373 took 27s (11.87% Gen, 85.61% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 39m 9s. Estimated total time: 7h 35m 38s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 33s, 500 more iterations: 3h 47m 49s. +[2026-03-25 18:29:56,291][__main__][INFO] - Starting iteration 373. +[2026-03-25 18:29:56,294][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:29:56,294][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:29:59,521][__main__][INFO] - Number of regex retries in iteration 373: 0 +[2026-03-25 18:29:59,522][__main__][INFO] - agents played in iteration 373 are Bob, Alice +[2026-03-25 18:30:00,054][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:30:00,702][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:30:00,992][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:30:01,314][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:30:01,634][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:30:01,956][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:30:02,276][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:30:02,597][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:30:02,917][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:30:03,237][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:30:03,558][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:30:03,878][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:30:04,199][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:30:04,520][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:30:04,841][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:30:05,161][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:30:05,481][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:30:05,802][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:30:06,121][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:30:06,443][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:30:06,763][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:30:07,084][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:30:07,406][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:30:07,727][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:30:08,047][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:30:08,368][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:30:08,687][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:30:09,009][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:30:09,329][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:30:09,650][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:30:09,971][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:30:10,292][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:30:10,612][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:30:10,933][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:30:11,254][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:30:11,576][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:30:11,895][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:30:12,214][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:30:12,536][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:30:12,857][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:30:13,177][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:30:13,497][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:30:13,817][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:30:14,137][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:30:14,457][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:30:14,777][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:30:15,096][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:30:15,417][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:30:15,738][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:30:16,061][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:30:16,381][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:30:16,702][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:30:17,024][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:30:17,635][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:30:17,955][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:30:18,276][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:30:18,597][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:30:18,916][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:30:19,237][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:30:19,558][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:30:19,878][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:30:20,200][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:30:20,521][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:30:20,842][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:30:21,162][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:30:21,484][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:30:22,138][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:30:22,867][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:30:22,870][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:30:22,871][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:30:23,488][__main__][INFO] - Iteration 374 took 27s (11.87% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 36m 19s. Estimated total time: 7h 33m 15s. Time estimates for 10 more iterations: 4m 31s, 100 more iterations: 45m 19s, 500 more iterations: 3h 46m 37s. +[2026-03-25 18:30:23,490][__main__][INFO] - Starting iteration 374. +[2026-03-25 18:30:23,493][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:30:23,494][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:30:26,707][__main__][INFO] - Number of regex retries in iteration 374: 0 +[2026-03-25 18:30:26,708][__main__][INFO] - agents played in iteration 374 are Bob, Alice +[2026-03-25 18:30:27,241][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:30:27,889][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:30:28,180][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:30:28,503][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:30:28,824][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:30:29,145][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:30:29,467][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:30:29,788][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:30:30,109][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:30:30,430][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:30:30,751][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:30:31,072][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:30:31,393][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:30:31,712][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:30:32,033][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:30:32,354][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:30:32,674][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:30:32,995][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:30:33,317][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:30:33,637][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:30:33,958][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:30:34,280][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:30:34,603][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:30:34,924][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:30:35,247][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:30:35,568][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:30:35,890][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:30:36,211][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:30:36,534][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:30:36,855][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:30:37,176][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:30:37,495][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:30:37,816][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:30:38,137][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:30:38,458][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:30:38,779][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:30:39,098][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:30:39,418][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:30:39,739][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:30:40,059][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:30:40,380][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:30:40,699][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:30:41,019][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:30:41,338][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:30:41,658][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:30:41,978][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:30:42,297][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:30:42,617][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:30:42,937][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:30:43,257][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:30:43,577][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:30:43,898][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:30:44,219][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:30:44,836][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:30:45,156][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:30:45,477][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:30:45,799][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:30:46,121][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:30:46,442][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:30:46,764][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:30:47,085][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:30:47,405][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:30:47,727][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:30:48,048][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:30:48,369][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:30:48,690][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:30:49,343][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:30:50,076][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:30:50,078][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:30:50,079][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:30:50,700][__main__][INFO] - Iteration 375 took 27s (11.81% Gen, 85.90% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 36m 5s. Estimated total time: 7h 33m 28s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 20s, 500 more iterations: 3h 46m 44s. +[2026-03-25 18:30:50,703][__main__][INFO] - Starting iteration 375. 
+[2026-03-25 18:30:50,706][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:30:50,706][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:30:53,945][__main__][INFO] - Number of regex retries in iteration 375: 0 +[2026-03-25 18:30:53,945][__main__][INFO] - agents played in iteration 375 are Bob, Alice +[2026-03-25 18:30:54,480][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:30:55,129][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:30:55,421][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:30:55,742][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:30:56,062][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:30:56,385][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:30:56,705][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:30:57,026][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:30:57,348][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:30:57,670][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:30:57,991][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:30:58,312][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:30:58,634][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:30:58,953][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:30:59,273][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:30:59,593][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:30:59,914][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:31:00,233][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:31:00,553][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:31:00,874][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:31:01,195][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:31:01,516][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:31:01,835][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:31:02,156][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:31:02,477][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:31:02,797][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:31:03,117][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:31:03,438][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:31:03,759][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:31:04,080][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:31:04,400][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:31:04,721][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:31:05,042][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:31:05,364][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:31:05,686][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:31:06,006][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:31:06,327][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:31:06,646][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:31:06,967][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:31:07,287][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:31:07,608][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:31:07,929][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:31:08,249][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:31:08,570][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:31:08,891][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:31:09,211][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:31:09,532][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:31:09,852][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:31:10,173][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:31:10,493][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:31:10,813][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:31:11,134][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:31:11,454][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:31:12,066][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:31:12,387][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:31:12,708][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:31:13,030][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:31:13,351][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:31:13,672][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:31:13,991][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:31:14,311][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:31:14,632][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:31:14,953][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:31:15,274][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:31:15,595][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:31:15,916][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:31:16,569][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:31:17,298][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:31:17,300][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:31:17,301][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:31:17,926][__main__][INFO] - Iteration 376 took 27s (11.90% Gen, 85.80% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 35m 51s. Estimated total time: 7h 33m 41s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 22s, 500 more iterations: 3h 46m 50s. +[2026-03-25 18:31:17,929][__main__][INFO] - Starting iteration 376. +[2026-03-25 18:31:17,931][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:31:17,932][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:31:21,162][__main__][INFO] - Number of regex retries in iteration 376: 0 +[2026-03-25 18:31:21,163][__main__][INFO] - agents played in iteration 376 are Bob, Alice +[2026-03-25 18:31:21,710][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:31:22,359][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:31:22,649][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:31:22,971][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:31:23,293][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:31:23,614][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:31:23,935][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:31:24,258][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:31:24,578][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:31:24,898][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:31:25,219][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:31:25,541][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:31:25,862][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:31:26,185][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:31:26,507][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:31:26,828][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:31:27,149][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:31:27,471][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:31:27,790][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:31:28,111][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:31:28,432][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:31:28,752][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:31:29,073][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:31:29,394][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:31:29,716][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:31:30,036][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:31:30,356][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:31:30,676][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:31:30,997][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:31:31,318][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:31:31,640][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:31:31,961][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:31:32,284][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:31:32,604][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:31:32,925][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:31:33,247][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:31:33,568][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:31:33,889][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:31:34,209][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:31:34,531][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:31:34,851][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:31:35,173][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:31:35,494][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:31:35,815][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:31:36,136][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:31:36,457][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:31:36,777][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:31:37,097][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:31:37,418][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:31:37,740][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:31:38,061][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:31:38,381][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:31:38,701][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:31:39,313][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:31:39,633][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:31:39,955][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:31:40,276][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:31:40,595][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:31:40,917][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:31:41,237][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:31:41,558][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:31:41,878][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:31:42,199][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:31:42,520][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:31:42,841][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:31:43,161][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:31:43,814][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:31:44,542][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:31:44,544][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:31:44,546][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:31:45,163][__main__][INFO] - Iteration 377 took 27s (11.86% Gen, 85.86% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 35m 35s. Estimated total time: 7h 33m 52s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 23s, 500 more iterations: 3h 46m 56s. +[2026-03-25 18:31:45,165][__main__][INFO] - Starting iteration 377. +[2026-03-25 18:31:45,168][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:31:45,169][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:31:48,458][__main__][INFO] - Number of regex retries in iteration 377: 0 +[2026-03-25 18:31:48,459][__main__][INFO] - agents played in iteration 377 are Bob, Alice +[2026-03-25 18:31:49,026][mllm.training.trainer_independent][INFO] - Sharing advantage data. 
+[2026-03-25 18:31:49,675][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:31:49,964][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:31:50,287][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:31:50,608][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:31:50,929][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:31:51,249][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:31:51,570][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:31:51,890][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:31:52,209][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:31:52,530][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:31:52,850][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:31:53,171][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:31:53,492][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:31:53,812][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:31:54,131][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:31:54,451][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:31:54,771][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:31:55,092][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:31:55,412][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:31:55,732][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:31:56,052][mllm.training.trainer_common][INFO] 
- Processing mini-batch 80 of 256 +[2026-03-25 18:31:56,373][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:31:56,692][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:31:57,012][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:31:57,332][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:31:57,652][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:31:57,973][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:31:58,294][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:31:58,614][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:31:58,934][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:31:59,254][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:31:59,575][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:31:59,895][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:32:00,215][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:32:00,536][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:32:00,856][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:32:01,177][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:32:01,497][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:32:01,816][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:32:02,136][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:32:02,455][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 
+[2026-03-25 18:32:02,776][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:32:03,097][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:32:03,417][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:32:03,738][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:32:04,058][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:32:04,377][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:32:04,697][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:32:05,019][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:32:05,339][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:32:05,658][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:32:05,980][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:32:06,591][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:32:06,912][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:32:07,232][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:32:07,551][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:32:07,871][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:32:08,191][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:32:08,512][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:32:08,831][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:32:09,152][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 
18:32:09,472][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:32:09,793][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:32:10,113][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:32:10,434][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:32:11,085][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:32:11,827][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:32:11,829][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:32:11,831][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:32:12,458][__main__][INFO] - Iteration 378 took 27s (12.06% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 36m 5s. Estimated total time: 7h 34m 50s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 25s. +[2026-03-25 18:32:12,460][__main__][INFO] - Starting iteration 378. +[2026-03-25 18:32:12,463][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:32:12,464][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:32:15,759][__main__][INFO] - Number of regex retries in iteration 378: 0 +[2026-03-25 18:32:15,760][__main__][INFO] - agents played in iteration 378 are Bob, Alice +[2026-03-25 18:32:16,314][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:32:16,969][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:32:17,259][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:32:17,581][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:32:17,900][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:32:18,221][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:32:18,541][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:32:18,861][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:32:19,183][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:32:19,506][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:32:19,827][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:32:20,148][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:32:20,470][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:32:20,790][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:32:21,111][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:32:21,431][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:32:21,752][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:32:22,073][mllm.training.trainer_common][INFO] - Processing 
mini-batch 64 of 256 +[2026-03-25 18:32:22,395][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:32:22,716][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:32:23,037][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:32:23,357][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:32:23,677][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:32:23,997][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:32:24,318][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:32:24,638][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:32:24,959][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:32:25,279][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:32:25,601][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:32:25,924][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:32:26,244][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:32:26,566][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:32:26,887][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:32:27,206][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:32:27,528][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:32:27,849][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:32:28,170][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:32:28,490][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 
18:32:28,810][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:32:29,129][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:32:29,449][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:32:29,770][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:32:30,090][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:32:30,410][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:32:30,730][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:32:31,052][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:32:31,372][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:32:31,691][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:32:32,012][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:32:32,333][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:32:32,653][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:32:32,973][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:32:33,293][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:32:33,906][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:32:34,227][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:32:34,547][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:32:34,868][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:32:35,188][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 
18:32:35,507][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:32:35,829][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:32:36,150][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:32:36,470][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:32:36,791][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:32:37,112][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:32:37,431][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:32:37,752][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:32:38,406][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:32:39,132][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:32:39,134][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:32:39,136][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:32:39,759][__main__][INFO] - Iteration 379 took 27s (12.08% Gen, 85.64% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 35m 44s. Estimated total time: 7h 34m 56s. Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 29s, 500 more iterations: 3h 47m 28s. +[2026-03-25 18:32:39,761][__main__][INFO] - Starting iteration 379. 
+[2026-03-25 18:32:39,764][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:32:39,765][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:32:42,992][__main__][INFO] - Number of regex retries in iteration 379: 0 +[2026-03-25 18:32:42,992][__main__][INFO] - agents played in iteration 379 are Bob, Alice +[2026-03-25 18:32:43,555][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:32:44,204][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:32:44,494][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:32:44,816][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:32:45,135][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:32:45,456][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:32:45,778][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:32:46,098][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:32:46,418][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:32:46,738][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:32:47,056][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:32:47,378][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:32:47,697][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:32:48,016][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 18:32:48,337][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:32:48,658][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 
18:32:48,977][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:32:49,297][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:32:49,617][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:32:49,937][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:32:50,257][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:32:50,578][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:32:50,900][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:32:51,219][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:32:51,540][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:32:51,861][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:32:52,181][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:32:52,500][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:32:52,820][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:32:53,140][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:32:53,461][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:32:53,782][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:32:54,102][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:32:54,421][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:32:54,742][mllm.training.trainer_common][INFO] - Processing mini-batch 132 of 256 +[2026-03-25 18:32:55,062][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:32:55,382][mllm.training.trainer_common][INFO] 
- Processing mini-batch 140 of 256 +[2026-03-25 18:32:55,704][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:32:56,024][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:32:56,345][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:32:56,668][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:32:56,987][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:32:57,306][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:32:57,626][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:32:57,946][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:32:58,267][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:32:58,587][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:32:58,906][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:32:59,227][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:32:59,546][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:32:59,867][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:33:00,187][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:33:00,507][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:33:01,120][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:33:01,441][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 +[2026-03-25 18:33:01,762][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:33:02,084][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 
+[2026-03-25 18:33:02,405][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:33:02,727][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:33:03,047][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:33:03,368][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:33:03,689][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:33:04,010][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:33:04,331][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:33:04,653][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:33:04,975][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. +[2026-03-25 18:33:05,643][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:33:06,372][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:33:06,374][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:33:06,376][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:33:07,040][__main__][INFO] - Iteration 380 took 27s (11.83% Gen, 85.73% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 34m 57s. Estimated total time: 7h 34m 36s. 
Time estimates for 10 more iterations: 4m 32s, 100 more iterations: 45m 27s, 500 more iterations: 3h 47m 18s. +[2026-03-25 18:33:07,042][__main__][INFO] - Starting iteration 380. +[2026-03-25 18:33:07,045][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. +[2026-03-25 18:33:07,046][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 +[2026-03-25 18:33:10,288][__main__][INFO] - Number of regex retries in iteration 380: 0 +[2026-03-25 18:33:10,289][__main__][INFO] - agents played in iteration 380 are Bob, Alice +[2026-03-25 18:33:10,855][mllm.training.trainer_independent][INFO] - Sharing advantage data. +[2026-03-25 18:33:11,505][mllm.training.trainer_common][INFO] - Processing mini-batch 0 of 256 +[2026-03-25 18:33:11,796][mllm.training.trainer_common][INFO] - Processing mini-batch 4 of 256 +[2026-03-25 18:33:12,118][mllm.training.trainer_common][INFO] - Processing mini-batch 8 of 256 +[2026-03-25 18:33:12,437][mllm.training.trainer_common][INFO] - Processing mini-batch 12 of 256 +[2026-03-25 18:33:12,757][mllm.training.trainer_common][INFO] - Processing mini-batch 16 of 256 +[2026-03-25 18:33:13,077][mllm.training.trainer_common][INFO] - Processing mini-batch 20 of 256 +[2026-03-25 18:33:13,397][mllm.training.trainer_common][INFO] - Processing mini-batch 24 of 256 +[2026-03-25 18:33:13,717][mllm.training.trainer_common][INFO] - Processing mini-batch 28 of 256 +[2026-03-25 18:33:14,037][mllm.training.trainer_common][INFO] - Processing mini-batch 32 of 256 +[2026-03-25 18:33:14,358][mllm.training.trainer_common][INFO] - Processing mini-batch 36 of 256 +[2026-03-25 18:33:14,678][mllm.training.trainer_common][INFO] - Processing mini-batch 40 of 256 +[2026-03-25 18:33:14,998][mllm.training.trainer_common][INFO] - Processing mini-batch 44 of 256 +[2026-03-25 18:33:15,318][mllm.training.trainer_common][INFO] - Processing mini-batch 48 of 256 +[2026-03-25 
18:33:15,638][mllm.training.trainer_common][INFO] - Processing mini-batch 52 of 256 +[2026-03-25 18:33:15,961][mllm.training.trainer_common][INFO] - Processing mini-batch 56 of 256 +[2026-03-25 18:33:16,283][mllm.training.trainer_common][INFO] - Processing mini-batch 60 of 256 +[2026-03-25 18:33:16,603][mllm.training.trainer_common][INFO] - Processing mini-batch 64 of 256 +[2026-03-25 18:33:16,925][mllm.training.trainer_common][INFO] - Processing mini-batch 68 of 256 +[2026-03-25 18:33:17,246][mllm.training.trainer_common][INFO] - Processing mini-batch 72 of 256 +[2026-03-25 18:33:17,567][mllm.training.trainer_common][INFO] - Processing mini-batch 76 of 256 +[2026-03-25 18:33:17,888][mllm.training.trainer_common][INFO] - Processing mini-batch 80 of 256 +[2026-03-25 18:33:18,207][mllm.training.trainer_common][INFO] - Processing mini-batch 84 of 256 +[2026-03-25 18:33:18,528][mllm.training.trainer_common][INFO] - Processing mini-batch 88 of 256 +[2026-03-25 18:33:18,847][mllm.training.trainer_common][INFO] - Processing mini-batch 92 of 256 +[2026-03-25 18:33:19,167][mllm.training.trainer_common][INFO] - Processing mini-batch 96 of 256 +[2026-03-25 18:33:19,487][mllm.training.trainer_common][INFO] - Processing mini-batch 100 of 256 +[2026-03-25 18:33:19,807][mllm.training.trainer_common][INFO] - Processing mini-batch 104 of 256 +[2026-03-25 18:33:20,128][mllm.training.trainer_common][INFO] - Processing mini-batch 108 of 256 +[2026-03-25 18:33:20,450][mllm.training.trainer_common][INFO] - Processing mini-batch 112 of 256 +[2026-03-25 18:33:20,771][mllm.training.trainer_common][INFO] - Processing mini-batch 116 of 256 +[2026-03-25 18:33:21,090][mllm.training.trainer_common][INFO] - Processing mini-batch 120 of 256 +[2026-03-25 18:33:21,412][mllm.training.trainer_common][INFO] - Processing mini-batch 124 of 256 +[2026-03-25 18:33:21,732][mllm.training.trainer_common][INFO] - Processing mini-batch 128 of 256 +[2026-03-25 18:33:22,052][mllm.training.trainer_common][INFO] - 
Processing mini-batch 132 of 256 +[2026-03-25 18:33:22,374][mllm.training.trainer_common][INFO] - Processing mini-batch 136 of 256 +[2026-03-25 18:33:22,693][mllm.training.trainer_common][INFO] - Processing mini-batch 140 of 256 +[2026-03-25 18:33:23,012][mllm.training.trainer_common][INFO] - Processing mini-batch 144 of 256 +[2026-03-25 18:33:23,333][mllm.training.trainer_common][INFO] - Processing mini-batch 148 of 256 +[2026-03-25 18:33:23,653][mllm.training.trainer_common][INFO] - Processing mini-batch 152 of 256 +[2026-03-25 18:33:23,974][mllm.training.trainer_common][INFO] - Processing mini-batch 156 of 256 +[2026-03-25 18:33:24,295][mllm.training.trainer_common][INFO] - Processing mini-batch 160 of 256 +[2026-03-25 18:33:24,614][mllm.training.trainer_common][INFO] - Processing mini-batch 164 of 256 +[2026-03-25 18:33:24,935][mllm.training.trainer_common][INFO] - Processing mini-batch 168 of 256 +[2026-03-25 18:33:25,255][mllm.training.trainer_common][INFO] - Processing mini-batch 172 of 256 +[2026-03-25 18:33:25,576][mllm.training.trainer_common][INFO] - Processing mini-batch 176 of 256 +[2026-03-25 18:33:25,897][mllm.training.trainer_common][INFO] - Processing mini-batch 180 of 256 +[2026-03-25 18:33:26,218][mllm.training.trainer_common][INFO] - Processing mini-batch 184 of 256 +[2026-03-25 18:33:26,538][mllm.training.trainer_common][INFO] - Processing mini-batch 188 of 256 +[2026-03-25 18:33:26,859][mllm.training.trainer_common][INFO] - Processing mini-batch 192 of 256 +[2026-03-25 18:33:27,181][mllm.training.trainer_common][INFO] - Processing mini-batch 196 of 256 +[2026-03-25 18:33:27,500][mllm.training.trainer_common][INFO] - Processing mini-batch 200 of 256 +[2026-03-25 18:33:27,820][mllm.training.trainer_common][INFO] - Processing mini-batch 204 of 256 +[2026-03-25 18:33:28,431][mllm.training.trainer_common][INFO] - Processing mini-batch 208 of 256 +[2026-03-25 18:33:28,751][mllm.training.trainer_common][INFO] - Processing mini-batch 212 of 256 
+[2026-03-25 18:33:29,070][mllm.training.trainer_common][INFO] - Processing mini-batch 216 of 256 +[2026-03-25 18:33:29,391][mllm.training.trainer_common][INFO] - Processing mini-batch 220 of 256 +[2026-03-25 18:33:29,711][mllm.training.trainer_common][INFO] - Processing mini-batch 224 of 256 +[2026-03-25 18:33:30,032][mllm.training.trainer_common][INFO] - Processing mini-batch 228 of 256 +[2026-03-25 18:33:30,352][mllm.training.trainer_common][INFO] - Processing mini-batch 232 of 256 +[2026-03-25 18:33:30,672][mllm.training.trainer_common][INFO] - Processing mini-batch 236 of 256 +[2026-03-25 18:33:30,993][mllm.training.trainer_common][INFO] - Processing mini-batch 240 of 256 +[2026-03-25 18:33:31,312][mllm.training.trainer_common][INFO] - Processing mini-batch 244 of 256 +[2026-03-25 18:33:31,632][mllm.training.trainer_common][INFO] - Processing mini-batch 248 of 256 +[2026-03-25 18:33:31,952][mllm.training.trainer_common][INFO] - Processing mini-batch 252 of 256 +[2026-03-25 18:33:32,273][mllm.training.trainer_common][INFO] - Accumulated the policy gradient loss for 7680 tokens. 
+[2026-03-25 18:33:32,926][mllm.training.trainer_common][INFO] - For task: Apply reinforce step, ΔVRAM % (total): 6.26%, Current % of VRAM taken: 45.79%, Block Peak % of device VRAM: 26.60%, ΔTime: 00:00:21 +[2026-03-25 18:33:33,660][mllm.training.trainer_common][INFO] - Saved main optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/policy_optimizer_state.pt +[2026-03-25 18:33:33,663][mllm.training.trainer_common][INFO] - Saved critic optimizer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/critic_optimizer_state.pt +[2026-03-25 18:33:33,664][mllm.training.trainer_common][INFO] - Saved trainer state to /scratch/muqeeth/llm_negotiation/2026_03/ipd_naive_seed42/seed_42/agent_trainer/trainer_annealing_state.pkl +[2026-03-25 18:33:34,427][__main__][INFO] - Iteration 381 took 27s (11.84% Gen, 85.37% Train). Generation: 3s, Training: 23s. Estimated remaining time: 4h 36m 16s. Estimated total time: 7h 36m 23s. Time estimates for 10 more iterations: 4m 33s, 100 more iterations: 45m 38s, 500 more iterations: 3h 48m 11s. +[2026-03-25 18:33:34,429][__main__][INFO] - Starting iteration 381. +[2026-03-25 18:33:34,432][__main__][INFO] - Inference policies count is regular policies 2 and buffer policies 7 and human policies 1. 
+[2026-03-25 18:33:34,433][__main__][INFO] - Hard coded buffer agents are set to False with prob 0 diff --git a/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/README.md b/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/README.md new file mode 100644 index 0000000000000000000000000000000000000000..feabdd34815ab11ff7efffe88d68bef225068132 --- /dev/null +++ b/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/README.md @@ -0,0 +1,207 @@ +--- +base_model: Qwen/Qwen2.5-7B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:Qwen/Qwen2.5-7B-Instruct +- lora +- transformers +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json 
b/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6a221a5e4bea1d17fa4c06fad23a1fda10272798 --- /dev/null +++ b/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "down_proj", + "k_proj", + "o_proj", + "up_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json b/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6a221a5e4bea1d17fa4c06fad23a1fda10272798 --- /dev/null +++ b/seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": 
false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "down_proj", + "k_proj", + "o_proj", + "up_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc b/src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e0ce601325acab2a3935c9b22e10e27fcccc961 Binary files /dev/null and b/src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/chat_utils/apply_template.py b/src_code_for_reproducibility/chat_utils/apply_template.py new file mode 100644 index 0000000000000000000000000000000000000000..6bbdc32dbb1df0407ff24ae90395dba0d162bf7d --- /dev/null +++ b/src_code_for_reproducibility/chat_utils/apply_template.py @@ -0,0 +1,89 @@ +""" +File: mllm/chat_utils/apply_template.py +Summary: Applies tokenizer-specific chat templates and stitches chat token IDs. 
+""" + +import torch + +from mllm.chat_utils.chat_turn import ChatTurn +from mllm.chat_utils.template_specific import ( + custom_gemma3_template, + custom_llama3_template, + custom_qwen2_template, + custom_qwen3_template, + gemma3_assistant_postfix, + qwen2_assistant_postfix, + qwen3_assistant_postfix, +) + + +def get_custom_chat_template(tokenizer) -> str: + """ + Get the chat template for the tokenizer. + """ + if "qwen2" in tokenizer.name_or_path.lower(): + return custom_qwen2_template + elif "llama" in tokenizer.name_or_path.lower(): + return custom_llama3_template + elif "qwen3" in tokenizer.name_or_path.lower(): + return custom_qwen3_template + elif "gemma" in tokenizer.name_or_path.lower(): + return custom_gemma3_template + else: + raise ValueError(f"Tokenizer {tokenizer.name_or_path} not supported") + + +def get_custom_assistant_postfix(tokenizer) -> torch.Tensor: + """ + Get the custom assistant postfix for the tokenizer. + """ + if "qwen2" in tokenizer.name_or_path.lower(): + return qwen2_assistant_postfix + elif "qwen3" in tokenizer.name_or_path.lower(): + return qwen3_assistant_postfix + elif "gemma" in tokenizer.name_or_path.lower(): + return gemma3_assistant_postfix + return torch.tensor([], dtype=torch.long) + + +def tokenize_chats(chats: list[ChatTurn], tokenizer, enable_thinking) -> None: + """ + Set the chat_template_token_ids for each chat turn. + We rely on tokenizer-side templates because engine-provided cached tokens are not exposed yet. 
+ """ + custom_template = get_custom_chat_template(tokenizer) + custom_assistant_postfix: torch.Tensor = get_custom_assistant_postfix(tokenizer) + for i, chat in enumerate(chats): + if chat.chat_template_token_ids is None: + if chat.role == "user": + next_chat = chats[i + 1] if i + 1 < len(chats) else None + add_generation_prompt = True + if next_chat and next_chat.role == "user": + add_generation_prompt = False + encoded_chat = tokenizer.apply_chat_template( + [chat], + return_tensors="pt", + chat_template=custom_template, + add_generation_prompt=add_generation_prompt, + add_system_prompt=True if i == 0 else False, + enable_thinking=enable_thinking, + ).flatten() + previous_chat = chats[i - 1] if i > 0 else None + if previous_chat and previous_chat.role == "assistant": + encoded_chat = torch.cat([custom_assistant_postfix, encoded_chat]) + elif chat.role == "assistant": + encoded_chat = chat.out_token_ids + chat.chat_template_token_ids = encoded_chat + + +def chat_turns_to_token_ids( + chats: list[ChatTurn], tokenizer, enable_thinking +) -> list[int]: + """ + Tokenize the chat turns and set the chat_template_token_ids for each chat turn. + """ + tokenize_chats(chats=chats, tokenizer=tokenizer, enable_thinking=enable_thinking) + token_ids = [] + for chat in chats: + token_ids.append(chat.chat_template_token_ids) + return torch.cat(token_ids) diff --git a/src_code_for_reproducibility/chat_utils/chat_turn.py b/src_code_for_reproducibility/chat_utils/chat_turn.py new file mode 100644 index 0000000000000000000000000000000000000000..cfc0d9422a6070c86b1da8abce17ad28816fb2eb --- /dev/null +++ b/src_code_for_reproducibility/chat_utils/chat_turn.py @@ -0,0 +1,32 @@ +""" +File: mllm/chat_utils/chat_turn.py +Summary: Defines the ChatTurn schema plus helpers for serialization and validation. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, List, Literal, Optional, Tuple + +import jsonschema +import torch +from pydantic import BaseModel, ConfigDict, Field, model_validator + +AgentId = str + + +class ChatTurn(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) # needed for torch tensors + + role: str = Field(pattern="^(user|assistant)$") + agent_id: AgentId # ID of the agent with which the chat occured + content: str + reasoning_content: str | None = None + chat_template_token_ids: torch.LongTensor | None = None # Token ids of chat template format. For example, token ids of "{content}"" + out_token_ids: torch.LongTensor | None = ( + None # tokens generated from inference engine + ) + log_probs: torch.FloatTensor | None = None + is_state_end: bool = False # indicates whether this chat turn marks the end of a state in the trajectory diff --git a/src_code_for_reproducibility/chat_utils/template_specific.py b/src_code_for_reproducibility/chat_utils/template_specific.py new file mode 100644 index 0000000000000000000000000000000000000000..c22328455c55f0b0a02439efdacf6b09234d7981 --- /dev/null +++ b/src_code_for_reproducibility/chat_utils/template_specific.py @@ -0,0 +1,114 @@ +""" +File: mllm/chat_utils/template_specific.py +Summary: Stores chat template variants and assistant postfix tensors per tokenizer. 
+""" + +import huggingface_hub +import torch +from transformers import AutoTokenizer + +custom_llama3_template = """ +{%- if add_system_prompt %} + {{- '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|>' }} +{%- endif %} +{%- for message in messages %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }} +{%- endfor %} + +{%- if add_generation_prompt %} + {{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }} +{%- endif %} +""" + +qwen2_assistant_postfix = ( + AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct") + .encode("\n", return_tensors="pt") + .flatten() +) +qwen3_assistant_postfix = ( + AutoTokenizer.from_pretrained("Qwen/Qwen3-8B") + .encode("\n", return_tensors="pt") + .flatten() +) +gemma3_assistant_postfix = ( + AutoTokenizer.from_pretrained("google/gemma-3-4b-it") + .encode("\n", return_tensors="pt") + .flatten() +) +custom_qwen2_template = """ +{%- if add_system_prompt %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n' }} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if reasoning_content %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} +""" + +custom_qwen3_template = """ +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} +""" + 
+custom_gemma3_template = """ +{%- if add_system_prompt %} +{{- bos_token -}} +{%- endif %} +{%- for message in messages -%} +{%- if message['role'] == 'assistant' -%} +{%- set role = 'model' -%} +{%- else -%} +{%- set role = message['role'] -%} +{%- endif -%} +{{ '' + role + '\n' + message['content'] | trim + '\n' }} +{%- endfor -%} +{%- if add_generation_prompt -%} +{{ 'model\n' }} +{%- endif -%} +""" diff --git a/src_code_for_reproducibility/markov_games/__init__.py b/src_code_for_reproducibility/markov_games/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7015344d8ac9b53f2660d4e837f709908213db --- /dev/null +++ b/src_code_for_reproducibility/markov_games/__init__.py @@ -0,0 +1,4 @@ +""" +File: mllm/markov_games/__init__.py +Summary: Makes Markov-game subpackages importable from the top-level namespace. +""" diff --git a/src_code_for_reproducibility/markov_games/agent.py b/src_code_for_reproducibility/markov_games/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..56406ae2695ce97ad7fa4fc436908904ee11be9f --- /dev/null +++ b/src_code_for_reproducibility/markov_games/agent.py @@ -0,0 +1,72 @@ +""" +File: mllm/markov_games/agent.py +Summary: Declares the base Agent interface connecting simulations to policy calls. +""" + +from abc import ABC, abstractmethod +from collections.abc import Callable +from typing import Any, Tuple + +from numpy.random import default_rng + +from mllm.markov_games.rollout_tree import AgentActLog + + +class Agent(ABC): + """Abstract policy wrapper that bridges simulations with arbitrary backends.""" + + @abstractmethod + def __init__( + self, + seed: int, + agent_id: str, + agent_name: str, + agent_policy: Callable[[list[dict]], str], + *args, + **kwargs, + ): + """ + Initialize the agent state and seed its RNG. + + Subclasses typically store extra handles (tokenizers, inference clients, etc.) + but they should always call ``super().__init__`` so sampling remains reproducible. 
+ """ + self.seed = seed + self.agent_id = agent_id + self.agent_name = agent_name + self.policy = policy + self.rng = default_rng(self.seed) + raise NotImplementedError + + async def act(self, observation) -> Tuple[Any, AgentActLog]: + """ + Produce the next action (and associated chat log) given an environment observation. + + Implementations can iterate with rejection sampling, multi-call deliberation, etc. + Returns both the chosen action and an `AgentActLog` describing how it was produced. + """ + raise NotImplementedError + + def get_safe_copy(self): + """ + Return a deep copy whose future calls do not mutate the original agent. + + Needed for branch exploration/reruns with alternative actions. + """ + raise NotImplementedError + + def reset(self): + """Reset any internal state between rollouts.""" + raise NotImplementedError + + def render(self): + """Optional human-readable visualization of the agent (CLI/UI).""" + raise NotImplementedError + + def close(self): + """Release any external resources (network sockets, subprocesses, etc.).""" + raise NotImplementedError + + def get_agent_info(self): + """Return diagnostic metadata to embed inside rollout logs.""" + raise NotImplementedError diff --git a/src_code_for_reproducibility/markov_games/alternative_actions_runner.py b/src_code_for_reproducibility/markov_games/alternative_actions_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..d5165a2552019aefdf281c2bd41e50d204713921 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/alternative_actions_runner.py @@ -0,0 +1,146 @@ +""" +File: mllm/markov_games/alternative_actions_runner.py +Summary: Generates rollout branches by replaying trajectories with unilateral action changes. 
+""" + +import asyncio +import copy +import json +import os.path +from typing import Any, Tuple + +from mllm.markov_games.markov_game import AgentAndActionSafeCopy, MarkovGame +from mllm.markov_games.rollout_tree import ( + AgentActLog, + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, + StepLog, +) + +AgentId = str + + +async def run_with_unilateral_alt_action( + markov_game: MarkovGame, + agent_id: AgentId, + time_step: int, + branch_node: RolloutTreeBranchNode, + max_depth: int, +): + """ + Roll out a counterfactual branch where ``agent_id`` deviates unilaterally. + + Starting from ``branch_node`` (which already contains the main trajectory), + we replay the simulation with the deviating agent's action while freezing + all other agents/actions, then continue for ``max_depth`` steps. + """ + + # Generate alternative action and take a step + await markov_game.set_action_of_agent(agent_id) + terminated: bool = markov_game.take_simulation_step() + step_log = markov_game.get_step_log() + first_alternative_node = RolloutTreeNode( + step_log=step_log, + time_step=time_step, + ) + + # Generate rest of trajectory up to max depth + time_step += 1 + counter = 1 + previous_node = first_alternative_node + while not terminated and counter <= max_depth: + terminated, step_log = await markov_game.step() + current_node = RolloutTreeNode(step_log=step_log, time_step=time_step) + previous_node.child = current_node + previous_node = current_node + counter += 1 + time_step += 1 + + if branch_node.branches == None: + branch_node.branches = {agent_id: [first_alternative_node]} + else: + agent_branches = branch_node.branches.get(agent_id, []) + agent_branches.append(first_alternative_node) + branch_node.branches[agent_id] = agent_branches + + +async def AlternativeActionsRunner( + markov_game: MarkovGame, + output_folder: str, + nb_alternative_actions: int, + max_depth: int, + branch_only_on_new_round: bool = False, +): + """ + Generate a rollout tree containing the 
main path plus unilateral deviation branches. + + For each timestep we: + 1. Cache agent actions without side effects. + 2. Advance the main trajectory. + 3. Spawn ``nb_alternative_actions`` asynchronous deviations per agent, + each replaying up to ``max_depth`` steps from the cached pre-action state. + The resulting branches feed advantage-alignment estimators. + """ + + tasks = [] + time_step = 0 + terminated = False + root = RolloutTreeRootNode(id=markov_game.get_id(), crn_id=markov_game.get_crn_id()) + previous_node = root + + while not terminated: + mg_before_action = markov_game.get_safe_copy() + + # Get safe copies for main branch + agent_action_safe_copies: dict[ + AgentId, AgentAndActionSafeCopy + ] = await markov_game.get_actions_of_agents_without_side_effects() + + markov_game.set_actions_of_agents_manually(agent_action_safe_copies) + terminated = markov_game.take_simulation_step() + main_node = RolloutTreeNode( + step_log=markov_game.get_step_log(), time_step=time_step + ) + branch_node = RolloutTreeBranchNode(main_child=main_node) + previous_node.child = branch_node + previous_node = main_node + + # Get alternative branches by generating new unilateral actions + for agent_id in markov_game.agent_ids: + for _ in range(nb_alternative_actions): + # Get safe copies for branches + branch_agent_action_safe_copies: dict[ + AgentId, AgentAndActionSafeCopy + ] = { + agent_id: AgentAndActionSafeCopy( + action=copy.deepcopy(agent_action_safe_copy.action), + action_info=copy.deepcopy(agent_action_safe_copy.action_info), + agent_after_action=agent_action_safe_copy.agent_after_action.get_safe_copy(), + ) + for agent_id, agent_action_safe_copy in agent_action_safe_copies.items() + } + mg_branch: MarkovGame = mg_before_action.get_safe_copy() + other_agent_id = [id for id in mg_branch.agent_ids if id != agent_id][0] + mg_branch.set_action_and_agent_after_action_manually( + agent_id=other_agent_id, + agent_action_safe_copy=branch_agent_action_safe_copies[ + 
other_agent_id + ], + ) + task = asyncio.create_task( + run_with_unilateral_alt_action( + markov_game=mg_branch, + time_step=time_step, + agent_id=agent_id, + branch_node=branch_node, + max_depth=max_depth, + ) + ) + tasks.append(task) + time_step += 1 + + # wait for all branches to complete + await asyncio.gather(*tasks) + + return root diff --git a/src_code_for_reproducibility/markov_games/group_timesteps.py b/src_code_for_reproducibility/markov_games/group_timesteps.py new file mode 100644 index 0000000000000000000000000000000000000000..48b5882a632ba858787befaac306195af959b376 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/group_timesteps.py @@ -0,0 +1,133 @@ +""" +File: mllm/markov_games/group_timesteps.py +Summary: Provides timestep-grouping utilities for rollout trees and training. +""" + +import copy +from typing import Callable + +from mllm.markov_games.markov_game import MarkovGame +from mllm.markov_games.rollout_tree import ( + AgentActLog, + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, + StepLog, +) +from mllm.markov_games.simulation import SimulationStepLog + +AgentId = str + + +def group_time_steps( + rollout_tree: RolloutTreeRootNode, + accumulation_stop_condition: Callable[[StepLog], bool], +) -> RolloutTreeRootNode: + """ + During generation, we create rollout trees according to the real time steps. + However, during training, we might want to treat groups of time steps as a single time step. + As a concrete example, take Trust-and-Split. At each round, say we have X time steps of communication and then one time step for the split. + Then the communication actions will not get any reward, and the split action will get the reward. During REINFORCE training, with discounting, this + can cause training instability. We could instead treat every action in the round as being part of a single action, and give it the reward of the split action. + This method helps to do this sort of grouping. 
+ It accumulates actions until the accumulation_stop_condition is met, and then creates a new node with the accumulated actions. + It then recursively calls itself on the child node. + Details: + - The reward for the group is the reward of the last time step in the group. + - The simulation log for the group is the simulation log of the last time step in the group. + - The state end for the group becomes the first state end in the group. + - The agent info for the group is the agent info of the last time step in the group. + """ + + def group_step_logs(step_logs: list[StepLog]) -> StepLog: + """ + Concatenate per-agent chat turns across steps; keep only the first is_state_end. + """ + last_sim_log = step_logs[-1].simulation_step_log + agent_ids = {aid for s in step_logs for aid in s.action_logs.keys()} + grouped_logs: dict[AgentId, AgentActLog] = {} + for aid in agent_ids: + turns = [] + for s in step_logs: + act = s.action_logs.get(aid) + if act and act.chat_turns: + turns.extend(copy.deepcopy(act.chat_turns)) + disable_is_state_end = False + # Only the first state_end should be True, the rest should be False + for t in turns: + if t.is_state_end: + if disable_is_state_end: + t.is_state_end = False + else: + disable_is_state_end = True + continue + grouped_logs[aid] = AgentActLog( + chat_turns=turns, info=step_logs[-1].action_logs[aid].info + ) + return StepLog(action_logs=grouped_logs, simulation_step_log=last_sim_log) + + def group_time_steps_rec( + current_node: RolloutTreeNode | RolloutTreeBranchNode, + group_time_step: int, + accumulation_step_logs: list[StepLog], + ) -> RolloutTreeNode | RolloutTreeBranchNode: + """ + Groups time steps. Recursion is used to handle branches. + """ + assert isinstance(current_node, RolloutTreeNode) or isinstance( + current_node, RolloutTreeBranchNode + ), "Current node must be a tree node or a branch node. 
Is of type: " + str( + type(current_node) + ) + first_group_node = None + current_group_node = None + while current_node is not None: + if isinstance(current_node, RolloutTreeBranchNode): + raise Exception( + "Grouping timesteps by round is not supported for branching trajectories yet." + ) + + # Accumulate + accumulation_step_logs.append(current_node.step_log) + if accumulation_stop_condition(current_node.step_log): + grouped_step_logs = group_step_logs(accumulation_step_logs) + accumulation_step_logs = [] + new_group_node = RolloutTreeNode( + step_log=grouped_step_logs, time_step=group_time_step, child=None + ) + if first_group_node == None: + first_group_node = new_group_node + group_time_step += 1 + if current_group_node is not None: + current_group_node.child = new_group_node + current_group_node = new_group_node + current_node = current_node.child + return first_group_node + + node = group_time_steps_rec( + current_node=rollout_tree.child, group_time_step=0, accumulation_step_logs=[] + ) + return RolloutTreeRootNode( + id=rollout_tree.id, + crn_id=rollout_tree.crn_id, + child=node, + agent_ids=rollout_tree.agent_ids, + ) + + +def stop_when_round_ends(step_log: StepLog) -> bool: + """ + Simplest stop condition. Will return True if step log is the last time step of a round. + This will throw an error if this information is not available in the simulation info. + """ + assert ( + "is_last_timestep_in_round" in step_log.simulation_step_log.info.keys() + ), "To group by round, is_last_timestep_in_round must be set in the info of your simulation step log at each time step." + return step_log.simulation_step_log.info["is_last_timestep_in_round"] + + +def group_by_round(rollout_tree: RolloutTreeRootNode) -> RolloutTreeRootNode: + """ + Groups time steps by round. 
+ """ + return group_time_steps(rollout_tree, stop_when_round_ends) diff --git a/src_code_for_reproducibility/markov_games/linear_runner.py b/src_code_for_reproducibility/markov_games/linear_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e14a3eda72cf4e620db5ab8ed0d3f7d552e9fe --- /dev/null +++ b/src_code_for_reproducibility/markov_games/linear_runner.py @@ -0,0 +1,42 @@ +""" +File: mllm/markov_games/linear_runner.py +Summary: Simulates a single unbranched Markov-game rollout and records it. +""" + +import asyncio +import json +import os.path + +from mllm.markov_games.markov_game import MarkovGame +from mllm.markov_games.rollout_tree import RolloutTreeNode, RolloutTreeRootNode + + +async def LinearRunner( + markov_game: MarkovGame, output_folder: str +) -> RolloutTreeRootNode: + """ + Generate a single main-path rollout (no branching) for the provided Markov game. + + Parameters + ---------- + markov_game: + Initialized ``MarkovGame`` with agents + simulation ready to step. + output_folder: + Unused placeholder in the legacy API (kept for compatibility). 
+ """ + time_step = 0 + terminated = False + root = RolloutTreeRootNode( + id=markov_game.get_id(), + crn_id=markov_game.get_crn_id(), + agent_ids=markov_game.get_agent_ids(), + ) + previous_node = root + while not terminated: + terminated, step_log = await markov_game.step() + current_node = RolloutTreeNode(step_log=step_log, time_step=time_step) + previous_node.child = current_node + previous_node = current_node + time_step += 1 + + return root diff --git a/src_code_for_reproducibility/markov_games/markov_game.py b/src_code_for_reproducibility/markov_games/markov_game.py new file mode 100644 index 0000000000000000000000000000000000000000..7964fd69d24f617c76e36f852491b1e6141f6c48 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/markov_game.py @@ -0,0 +1,217 @@ +""" +File: mllm/markov_games/markov_game.py +Summary: Defines the MarkovGame base class plus shared simulation interfaces. +""" + +import asyncio +import copy +import json +import os +from dataclasses import dataclass +from typing import Any, List, Literal, Optional, Tuple + +from transformers.models.idefics2 import Idefics2Config + +from mllm.markov_games.agent import Agent +from mllm.markov_games.rollout_tree import AgentActLog, StepLog +from mllm.markov_games.simulation import Simulation + +AgentId = str + + +@dataclass +class AgentAndActionSafeCopy: + """Snapshot of an agent, its action, and metadata used for branch replay.""" + + action: Any + action_info: AgentActLog + agent_after_action: type[Agent] + + +class MarkovGame(object): + def __init__( + self, + id: int, + agents: dict[AgentId, type[Agent]], + simulation: type[Simulation], + crn_id: int, + ): + """ + Initialize the Markov game wrapper. + + Parameters + ---------- + id: + Unique rollout identifier (logged into rollout trees). + agents: + Mapping of agent_id -> Agent instance. + simulation: + Environment implementing the ``Simulation`` interface (IPD, TAS, etc.). 
+ crn_id: + Identifier for the common random number stream used by this rollout. + """ + self.agents = agents + self.agent_ids = self.agents.keys() + self.simulation = simulation + self.simulation_step_log = None + self.agent_step_logs = {agent_id: None for agent_id in self.agent_ids} + self.actions = {} + self.id = id + self.crn_id = crn_id + + def get_id(self) -> str: + return self.id + + def get_crn_id(self) -> int: + return self.crn_id + + def get_agent_ids(self) -> List[AgentId]: + return list(self.agent_ids) + + async def get_action_of_agent_without_side_effects( + self, agent_id: AgentId + ) -> Tuple[Any, AgentActLog]: + """ + Safe function to get an action of an agent without modifying the agent or the simulation. + """ + agent = self.agents[agent_id] + agent_before_action = agent.get_safe_copy() + obs = self.simulation.get_obs_agent(agent_id) + action, action_info = await agent.act(observation=obs) + self.agents[agent_id] = agent_before_action + agent_after_action = agent.get_safe_copy() + return AgentAndActionSafeCopy(action, action_info, agent_after_action) + + async def get_actions_of_agents_without_side_effects( + self, + ) -> dict[AgentId, AgentAndActionSafeCopy]: + """ + Safe function to get an action of an agent without modifying the agent or the simulation. + """ + tasks = [] + for agent_id in self.agent_ids: + task = asyncio.create_task( + self.get_action_of_agent_without_side_effects(agent_id) + ) + tasks.append(task) + agent_and_action_safe_copies: list[ + AgentAndActionSafeCopy + ] = await asyncio.gather(*tasks) + return { + agent_id: agent_and_action_safe_copy + for agent_id, agent_and_action_safe_copy in zip( + self.agent_ids, agent_and_action_safe_copies + ) + } + + def set_action_and_agent_after_action_manually( + self, + agent_id: AgentId, + agent_action_safe_copy: AgentAndActionSafeCopy, + ): + """ + Set the action and the agent after action manually. 
+ """ + self.actions[agent_id] = agent_action_safe_copy.action + self.agent_step_logs[agent_id] = agent_action_safe_copy.action_info + self.agents[agent_id] = agent_action_safe_copy.agent_after_action + + def set_actions_of_agents_manually( + self, actions: dict[AgentId, AgentAndActionSafeCopy] + ): + """ + Set the actions of agents manually. + """ + for agent_id, agent_action_safe_copy in actions.items(): + self.set_action_and_agent_after_action_manually( + agent_id, agent_action_safe_copy + ) + + async def set_action_of_agent(self, agent_id: AgentId): + """ + Query a single agent for its next action and store the result locally. + """ + agent = self.agents[agent_id] + obs = self.simulation.get_obs_agent(agent_id) + action, action_info = await agent.act(observation=obs) + self.actions[agent_id] = action + self.agent_step_logs[agent_id] = action_info + + async def set_actions(self): + """ + Query every agent concurrently and populate the cached actions/logs. + """ + # background_tasks = set() + tasks = [] + for agent_id in self.agent_ids: + task = asyncio.create_task(self.set_action_of_agent(agent_id)) + tasks.append(task) + await asyncio.gather(*tasks) + + def take_simulation_step(self): + """ + Advance the simulation by one step using the cached actions. + """ + terminated, self.simulation_step_log = self.simulation.step(self.actions) + return terminated + + def get_step_log(self) -> StepLog: + """ + Package the most recent simulation step and agent logs into a StepLog. + """ + if self.simulation_step_log is None: + raise RuntimeError( + "Simulation step log is empty; call take_simulation_step() first." + ) + missing_logs = [ + agent_id for agent_id, log in self.agent_step_logs.items() if log is None + ] + if missing_logs: + raise RuntimeError( + f"Agent action logs missing for: {', '.join(missing_logs)}. " + "Ensure set_actions() ran before requesting the step log." 
+ ) + step_log = StepLog( + simulation_step_log=self.simulation_step_log, + action_logs=self.agent_step_logs, + ) + return step_log + + async def step(self) -> Tuple[bool, StepLog]: + """ + Convenience step that collects actions, advances the simulation, and returns the log. + """ + await self.set_actions() + terminated = self.take_simulation_step() + step_log = self.get_step_log() + return terminated, step_log + + def get_safe_copy(self): + """ + Create a shallow copy of the game with deep-copied agents/simulation for branching. + """ + + new_markov_game = copy.copy(self) + new_simulation = self.simulation.get_safe_copy() + new_agents = { + agent_id: agent.get_safe_copy() for agent_id, agent in self.agents.items() + } + + # Reassign copied components + new_markov_game.simulation = new_simulation + new_markov_game.agents = new_agents + + # IMPORTANT: ensure agent_ids references the new agents dict, not the original + new_markov_game.agent_ids = new_markov_game.agents.keys() + + # Deep-copy step data to avoid correlation + new_markov_game.simulation_step_log = copy.deepcopy(self.simulation_step_log) + new_markov_game.actions = copy.deepcopy(self.actions) + # Rebuild logs to align exactly with new agent ids + old_agent_step_logs = copy.deepcopy(self.agent_step_logs) + new_markov_game.agent_step_logs = { + agent_id: old_agent_step_logs.get(agent_id) + for agent_id in new_markov_game.agent_ids + } + + return new_markov_game diff --git a/src_code_for_reproducibility/markov_games/mg_utils.py b/src_code_for_reproducibility/markov_games/mg_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4fc406cd1f0cba593daad1108de2746b6a1d7678 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/mg_utils.py @@ -0,0 +1,97 @@ +""" +File: mllm/markov_games/mg_utils.py +Summary: Holds miscellaneous helpers shared across Markov-game modules. 
+""" + +import asyncio +import copy +from collections.abc import Callable +from dataclasses import dataclass + +from mllm.markov_games.ipd.ipd_agent import IPDAgent +from mllm.markov_games.ipd.Ipd_hard_coded_agents import ( + AlwaysCooperateIPDAgent, + AlwaysDefectIPDAgent, +) +from mllm.markov_games.ipd.ipd_simulation import IPD +from mllm.markov_games.markov_game import MarkovGame +from mllm.markov_games.negotiation.dond_agent import DealNoDealAgent +from mllm.markov_games.negotiation.dond_simulation import DealNoDealSimulation +from mllm.markov_games.negotiation.nego_hard_coded_policies import ( + HardCodedNegoGreedyPolicy, + HardCodedNegoWelfareMaximizingPolicy, +) +from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent +from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressSimulation +from mllm.markov_games.negotiation.tas_rps_agent import TrustAndSplitRPSAgent +from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSSimulation +from mllm.markov_games.rollout_tree import ( + AgentActLog, + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, + StepLog, +) +from mllm.markov_games.simulation import SimulationStepLog + +AgentId = str + + +@dataclass +class AgentConfig: + """Configuration blob describing one agent in a Markov game spec.""" + + agent_id: str + agent_name: str + agent_class_name: str + policy_id: str + init_kwargs: dict + + +@dataclass +class MarkovGameConfig: + """Top-level config that ties together simulation settings and agent configs.""" + + id: int + seed: int + simulation_class_name: str + simulation_init_args: dict + agent_configs: list[AgentConfig] + + +def init_markov_game_components( + config: MarkovGameConfig, policies: dict[str, Callable[[list[dict]], str]] +): + """ + Materialize Agents and the Simulation described by ``config`` and return a MarkovGame. + + `policies` is a mapping of policy_id -> callable retrieved from the hosting trainer. 
+ """ + agents = {} + agent_names = [] + for agent_config in config.agent_configs: + agent_id = agent_config.agent_id + agent_name = agent_config.agent_name + agent_class = eval(agent_config.agent_class_name) + agent = agent_class( + seed=config.seed, + agent_id=agent_id, + agent_name=agent_name, + policy=policies[agent_config.policy_id], + **agent_config.init_kwargs, + ) + agents[agent_id] = agent + agent_names.append(agent_name) + simulation = eval(config.simulation_class_name)( + seed=config.seed, + agent_ids=list(agents.keys()), + agent_names=agent_names, + **config.simulation_init_args, + ) + markov_game = MarkovGame( + id=config.id, + crn_id=config.seed, + agents=agents, + simulation=simulation, + ) + return markov_game diff --git a/src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py b/src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py new file mode 100644 index 0000000000000000000000000000000000000000..2172aa345aba163ed8d84b477f54c3fbde5ce249 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py @@ -0,0 +1,252 @@ +""" +File: mllm/markov_games/negotiation/nego_simulation.py +Summary: Simulation harness for general negotiation environments. 
+""" + +import copy +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple + +from numpy.random import default_rng + +from mllm.markov_games.rollout_tree import SimulationStepLog +from mllm.markov_games.simulation import Simulation +from mllm.utils.get_coagent_id import get_coagent_id + +AgentId = str + + +@dataclass +class Split: + """Structured proposal describing how many units of each item an agent keeps.""" + + items_given_to_self: Dict[str, int] + + +@dataclass +class Message: + """Single chat utterance exchanged during the negotiation phase.""" + + message: str + + +@dataclass # gets extended by variants +class NegotiationState: + """Full simulator state snapshot shared by all negotiation variants.""" + + round_nb: int + last_message: str + current_agent: AgentId + quantities: Dict[str, int] + values: Dict[AgentId, Dict[str, float]] + splits: Dict[AgentId, Split | None] + nb_messages_sent: Dict[AgentId, int] + previous_values: Dict[AgentId, Dict[str, float]] | None + previous_splits: Dict[AgentId, Dict[str, int] | None] | None + previous_points: Dict[AgentId, float] | None + previous_quantities: Dict[str, int] | None + split_phase: bool + + +@dataclass # gets extended by variants +class NegotiationObs: + """Observation presented to agents each turn (base fields; variants extend).""" + + round_nb: int + last_message: str + quota_messages_per_agent_per_round: int + current_agent: AgentId + other_agent: str + quantities: Dict[str, int] + item_types: List[str] + value: Dict[str, int] + split_phase: bool + last_split_agent: Dict[str, int] | None + last_value_agent: Dict[str, int] | None + last_points_agent: float | None + last_split_coagent: Dict[str, int] | None + last_value_coagent: Dict[str, int] | None + last_points_coagent: float | None + last_quantities: Dict[str, int] | None + + +def compute_tas_style_rewards( + agent_ids: List[AgentId], + values: Dict[AgentId, float], + splits: Dict[AgentId, Split], + 
quantities: Dict[str, int], +) -> Dict[AgentId, float]: + """ + TAS-like reward computation: if sum of proposed coins exceeds max_coins, + allocate proportionally. Otherwise, use proposed amounts directly. + Rewards are quantity_kept * per-coin value for each agent. + """ + a0, a1 = agent_ids[0], agent_ids[1] + r0, r1 = 0.0, 0.0 + + for item in quantities: + max_item = quantities[item] + item_to_self_0 = int( + (splits[a0].items_given_to_self.get(item, 0)) + if splits[a0] is not None + else 0 + ) + item_to_self_1 = int( + (splits[a1].items_given_to_self.get(item, 0)) + if splits[a1] is not None + else 0 + ) + denom = max(int(max_item), item_to_self_0 + item_to_self_1) + q0 = float(max_item) * float(item_to_self_0) / float(denom) + q1 = float(max_item) * float(item_to_self_1) / float(denom) + if type(values[a0]) is not dict: + r0 += q0 * float(values[a0]) + r1 += q1 * float(values[a1]) + else: + r0 += q0 * float(values[a0][item]) + r1 += q1 * float(values[a1][item]) + return {a0: r0, a1: r1} + + +class NegotiationSimulation(Simulation): + def __init__( + self, + agent_ids: List[AgentId], + agent_names: List[str], + seed: int, + nb_of_rounds: int, + quota_messages_per_agent_per_round: int, + item_types: List[str] | None = None, + ): + self.seed = seed + self.rng = default_rng(self.seed) + self.agent_ids = list(agent_ids) + self.agent_names = agent_names + self.agent_id_to_name = { + agent_id: agent_name for agent_id, agent_name in zip(agent_ids, agent_names) + } + self.nb_of_rounds = int(nb_of_rounds) + self.quota_messages_per_agent_per_round = int( + quota_messages_per_agent_per_round + ) + if item_types is not None: + self.item_types = [item.lower() for item in item_types] + else: + self.item_types = ["coins"] + self.state: NegotiationState | None = None + self._starting_agent_index = self.rng.choice([0, 1]) + self.reset() + + def _other(self, agent_id: AgentId) -> AgentId: + return get_coagent_id(self.agent_ids, agent_id) + + @abstractmethod + def 
set_new_round_of_variant(self): + """Variant hook: sample new private values / stock before each round.""" + pass + + @abstractmethod + def get_info_of_variant( + self, state: NegotiationState, actions: Dict[AgentId, Any] + ) -> Dict[str, Any]: + """Variant hook: populate SimulationStepLog.info with custom diagnostics.""" + pass + + def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]: + """ + Returns terminated, step_log + """ + assert self.state is not None + current_agent = self.state.current_agent + a0, a1 = self.agent_ids[0], self.agent_ids[1] + action = actions.get(current_agent) + + # Split phase: require both splits in the same timestep + if self.state.split_phase: + action_a0 = actions.get(a0) + action_a1 = actions.get(a1) + have_both_splits = isinstance(action_a0, Split) and isinstance( + action_a1, Split + ) + if not have_both_splits: + rewards = {agent_id: 0.0 for agent_id in self.agent_ids} + return False, SimulationStepLog( + rewards=rewards, info={"type": "waiting_for_splits"} + ) + + # Record splits + self.state.splits[a0] = action_a0 + self.state.splits[a1] = action_a1 + + # Compute rewards and end round + rewards = self.get_rewards(self.state.splits) + + # Info + info = self.get_info_of_variant(self.state, actions) + + # Prepare next round + # Alternate starting agent + self.state.round_nb += 1 + self._starting_agent_index = 1 - self._starting_agent_index + self.state.current_agent = self.agent_ids[self._starting_agent_index] + self.state.previous_values = copy.deepcopy(self.state.values) + self.state.previous_splits = copy.deepcopy(self.state.splits) + self.state.previous_quantities = copy.deepcopy(self.state.quantities) + self.state.previous_points = copy.deepcopy(rewards) + self.state.last_message = "" + self.set_new_round_of_variant() # variant specific + self.state.splits = {agent_id: None for agent_id in self.agent_ids} + self.state.nb_messages_sent = {agent_id: 0 for agent_id in self.agent_ids} + is_last_timestep_in_round = True + 
done = self.state.round_nb >= self.nb_of_rounds + + # Message phase: roll the conversation forward a single turn. + elif isinstance(action, Message): + self.state.last_message = action.message + self.state.nb_messages_sent[current_agent] += 1 + + # Move turn to other agent + self.state.current_agent = self._other(current_agent) + + # If both agents have reached their message quota, enter split phase + if all( + self.state.nb_messages_sent[agent_id] + >= self.quota_messages_per_agent_per_round + for agent_id in self.agent_ids + ): + self.state.split_phase = True + is_last_timestep_in_round = False + done = False + rewards = {agent_id: 0.0 for agent_id in self.agent_ids} + info = {"type": "message"} + + info[ + "is_last_timestep_in_round" + ] = is_last_timestep_in_round # Used later to group round timesteps if needed + return done, SimulationStepLog(rewards=rewards, info=info) + + def get_obs(self): + """Returns all agent observations in dict""" + return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids} + + @abstractmethod + def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]: + pass + + @abstractmethod + def get_obs_agent(self, agent_id): + pass + + def get_state(self): + return self.state + + def get_safe_copy(self): + """Return a safe copy of the simulation.""" + simulation_copy = copy.copy(self) + simulation_copy.state = copy.deepcopy(self.state) + return simulation_copy + + @abstractmethod + def reset(self) -> dict[AgentId, NegotiationObs]: + pass diff --git a/src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py b/src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py new file mode 100644 index 0000000000000000000000000000000000000000..f0c49b6cd995c05fe7ff4ad2efc7d7082035ac2e --- /dev/null +++ b/src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py @@ -0,0 +1,249 @@ +""" +File: mllm/markov_games/negotiation/negotiation_statistics.py 
+Summary: Aggregates and reports statistics for negotiation experiments. +""" + +from __future__ import annotations + +from typing import Callable, Dict, List, Tuple + +from mllm.markov_games.negotiation.nego_simulation import Split +from mllm.markov_games.rollout_tree import SimulationStepLog + + +def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]]: + """Average (per-step) reward for each agent and overall. + + What it computes: + - Returns the raw reward for every (non-buffer) agent at the current + simulation step. + - Adds an aggregate key ``all_agents`` which is the simple arithmetic + mean across the agents present in ``sl.rewards``. + + Rationale / motivation: + Monitoring the reward stream at each step helps: + * Diagnose reward shaping issues (e.g., unintended negative drift). + * Provide a fairness snapshot (are rewards systematically skewed?). + * Supply a ubiquitous baseline metric used by other higher‑level + summaries (efficiency, surplus allocation, etc.). + + Return shape: + { agent_id: float, ..., "all_agents": float } + If any agent id contains the substring "buffer" we treat this step as + an implementation artifact (e.g., rollout buffer) and return ``None`` + to avoid polluting aggregates. + """ + for aid in sl.rewards.keys(): + if "buffer" in str(aid) and "live" not in str(aid): + return None + # One value per agent at each step + rewards_dict = {f"reward-{aid}": float(v) for aid, v in (sl.rewards or {}).items()} + return [(key, value) for key, value in rewards_dict.items() if value is not None] + + +def split_efficiency(sl: SimulationStepLog) -> List[Tuple[str, float]] | None: + """Final‑round allocation efficiency relative to an upper bound. 
+ + What it computes (only on the last timestep of a negotiation round): + - Uses ``info['values']`` (per‑agent per‑item valuations) and + ``info['quantities']`` (available item counts) to form a greedy + *upper bound* on achievable total reward: allocate each unit of an + item to the single agent who values that item most. + - Compares the actually realized sum of rewards at that final + timestep to this constructed maximum. + - Emits a single scalar under key ``"all_agents"`` equal to + achieved / theoretical_max. + + Motivation: + Efficiency (a core welfare notion) distinguishes between coordination + failures (low efficiency) versus strategic distributional disputes + (high efficiency but uneven splits). Tracking this per round helps + evaluate whether models learn to identify and realize joint surplus. + + Notes / caveats: + - Only defined for 2+ non‑buffer agents; if a buffer agent is present + returns ``None`` to exclude spurious steps. + - Requires the environment to have populated ``values`` and + ``quantities``; otherwise returns ``None``. + - This is an optimistic bound (not necessarily reachable under + protocol constraints) but is simple, fast, and comparable across + runs. 
+ """ + info = sl.info or {} + if not info or not info.get("is_last_timestep_in_round"): + return None + quantities = info.get("quantities") or {} + values = info.get("values") or {} + if not values or not quantities: + return None + agent_ids = list(sl.rewards.keys()) + if type(values[agent_ids[0]]) is dict: + item_keys = list(values.values())[0].keys() + max_vals, max_quantities = [], [] + for item in item_keys: + max_val = max(float(agent_vals[item]) for agent_vals in values.values()) + max_vals.append(max_val) + max_quantities.append(quantities[item]) + else: + max_vals = [max(float(v) for v in values.values())] + max_quantities = [quantities[item] for item in quantities.keys()] + for aid in sl.rewards.keys(): + if "buffer" in str(aid) and "live" not in str(aid): + return None + achieved = sum(float(v) for v in sl.rewards.values()) + max_reward = sum(d * v for d, v in zip(max_quantities, max_vals)) + # Efficiency is a global metric; emit same value for a special key "all" + return [("split_efficiency", achieved / max_reward)] + + +def _extract_items_from_split(raw_split: Dict) -> Dict[str, float] | None: + """Return a mapping item->proposal amount from a split structure. + + Supports both generic negotiation splits with nested structure + { 'items_given_to_self': {item: qty, ...}} + and TAS coin-only variants which may already be a flat mapping {'coins': qty}. 
+ """ + + if raw_split is None: + return {} + elif isinstance(raw_split, Split): + return {k: float(v) for k, v in raw_split.items_given_to_self.items()} + elif isinstance(raw_split, dict): + if "items_given_to_self" in raw_split and isinstance( + raw_split["items_given_to_self"], dict + ): + return {k: float(v) for k, v in raw_split["items_given_to_self"].items()} + # Fallback: assume already flat mapping of items + elif hasattr(raw_split, "items_given_to_self"): + return {k: float(v) for k, v in raw_split["items_given_to_self"].items()} + return { + k: float(v) for k, v in raw_split.items() if isinstance(v, (int, float)) + } + return {} + + +def _average_proposal_relative_value( + sl: SimulationStepLog, + metric_name: str, + comparator: Callable[[float, float], bool], + opposite_comparator: Callable[[float, float], bool], +) -> Dict[str, float | None] | None: + """Shared implementation for proposal size conditioned on relative value. + + Parameters: + comparator: returns True when agent_0's value relation (e.g. < or >) + to agent_1 holds for an item and we should collect agent_0's + proposed quantity for that item. + opposite_comparator: inverse relation used to collect agent_1's items. + + Behavior: + - Executes only on final timestep of a round (where the definitive + proposal / allocation is known via ``info['splits']``). + - For each item, classifies which agent's value satisfies the chosen + relation and records that agent's proposed quantity from the split. + - Averages (mean) across all qualifying items per agent; if no items + qualify for an agent returns ``None`` for that agent id. + - Adds ``all_agents`` mean across the numeric (non-None) agent values. + + Why this matters: + Distinguishing how much an agent *asks for* when it subjectively + values items more (or less) than its counterpart reveals patterns of + opportunism vs. concession. This is especially useful when raw reward + differences are subtle but allocation *intent* differs. 
+ """ + info = sl.info or {} + if not info or not info.get("is_last_timestep_in_round"): + return None + quantities = info.get("quantities") or {} + splits = info.get("splits") or {} + values = info.get("values") or {} + agent_ids: List[str] = list(sl.rewards.keys()) + if len(agent_ids) != 2: + return None # Only defined for 2-agent case. + for aid in agent_ids: + if "buffer" in str(aid) and "live" not in str(aid): + return None + # Extract per-agent item proposals robustly + split_items = {aid: _extract_items_from_split(splits.get(aid)) for aid in agent_ids} + agent_0_vals: List[float] = [] + agent_1_vals: List[float] = [] + for item in quantities.keys(): + # Values may be either a float (same for all items) or dict per item + v0_raw = values[agent_ids[0]] + v1_raw = values[agent_ids[1]] + v0 = float(v0_raw[item]) if isinstance(v0_raw, dict) else float(v0_raw) + v1 = float(v1_raw[item]) if isinstance(v1_raw, dict) else float(v1_raw) + if comparator(v0, v1): + agent_0_vals.append(split_items[agent_ids[0]].get(item, 0.0)) + elif opposite_comparator(v0, v1): + agent_1_vals.append(split_items[agent_ids[1]].get(item, 0.0)) + out: Dict[str, float | None] = {} + out[f"{metric_name}-{agent_ids[0]}"] = ( + sum(agent_0_vals) / len(agent_0_vals) if agent_0_vals else None + ) + out[f"{metric_name}-{agent_ids[1]}"] = ( + sum(agent_1_vals) / len(agent_1_vals) if agent_1_vals else None + ) + + return [(key, value) for key, value in out.items() if value is not None] + + +def average_proposal_when_agent_values_item_lower( + sl: SimulationStepLog, +) -> List[Tuple[str, float | None]] | None: + """Mean quantity an agent proposes for items it values *less* than opponent. + + Interpretation: + A higher value implies the agent still claims (or is allocated) a + notable share of items where it has a comparative *disadvantage* in + valuation, signaling either strategic over-claiming or protocol-driven + egalitarian splits. 
Conversely, very low numbers can indicate + efficient specialization or excessive concession. + + Returns: + Mapping { agent_id: float | None, "all_agents": float | None } where + None indicates no qualifying items for that agent in the round. + """ + return _average_proposal_relative_value( + sl, + "average_proposal_when_agent_values_item_lower", + lambda a, b: a < b, + lambda a, b: a > b, + ) + + +def average_proposal_when_agent_values_item_higher( + sl: SimulationStepLog, +) -> List[Tuple[str, float | None]] | None: + """Mean quantity an agent proposes for items it values *more* than opponent. + + Interpretation: + Captures how aggressively an agent claims items where it holds a + comparative *advantage*. Elevated values can reflect rational + specialization (efficient exploitation of comparative advantage) or + potentially unfair grabs if paired with low concession in the lower + valuation metric. Comparing this with the 'lower' counterpart helps + profile negotiation style (cooperative vs. exploitative). + + Returns: + Mapping { agent_id: float | None, "all_agents": float | None } where + None indicates no qualifying items. + """ + return _average_proposal_relative_value( + sl, + "average_proposal_when_agent_values_item_higher", + lambda a, b: a > b, + lambda a, b: a < b, + ) + + +# Explicit list of metric functions exported for rendering. Helper functions +# starting with '_' are intentionally excluded. Update this list when adding +# new public statistics so render.py can rely on it instead of introspecting +# every callable in the module. 
+stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]]]] = [ + avg_reward, + average_proposal_when_agent_values_item_lower, + average_proposal_when_agent_values_item_higher, + split_efficiency, +] diff --git a/src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py b/src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..82152f7f0b4353e4b119be6179eb309ead518e7f --- /dev/null +++ b/src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py @@ -0,0 +1,108 @@ +""" +File: mllm/markov_games/negotiation/no_press_nego_agent.py +Summary: Agent variant for no-press negotiations without explicit messaging. +""" + +from typing import Any, Dict, List, Tuple + +from mllm.markov_games.negotiation.nego_agent import ( + NegotiationAgent, + NegotiationAgentState, +) +from mllm.markov_games.negotiation.nego_simulation import Split +from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs +from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn + + +class NoPressAgent(NegotiationAgent): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # No communication in this variant + self.intro_prompt = ( + "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n" + "Setup:\n" + "1. The game consists of multiple independent rounds.\n" + "2. In each round, there are multiple items to split between the two agents.\n" + "3. Both agents are assigned a per-item value between 1 and 20 (inclusive) in each round.\n" + "4. You can observe per-item values of both agents.\n" + "5. Because assignments are random, both agents are equally likely to have same expected per-item value.\n" + "\n" + "Protocol:\n" + "1. Both agents simultaneously propose the amount of each item they will keep.\n" + "2. 
If the total sum of proposals is less than or equal to the item quantity, both agents receive their proposed amounts.\n" + "3. If the total sum of proposals exceeds the item quantity, they are allocated proportionally.\n" + "4. Your points for the round = (amount you receive per item) x (your per-item value for that round), added across all items.\n" + "5. Points are accumulated across rounds.\n" + "Your goal: {goal}\n" + ) + self.new_round_prompt = ( + "A New Round Begins\n" + "The items to split are {quantities}.\n" + "Your per-item values are {value} and {other_agent}'s per-item values are {other_value}." + ) + self.last_round_prompt = ( + "Last Round Summary:\n" + " - Items to split: {last_quantities}\n" + " - Your per-item values: {last_value_agent}\n" + " - {other_agent}'s per-item values: {last_value_coagent}\n" + " - You proposed: {last_split_agent}\n" + " - You earned: {last_points_agent} points\n" + " - {other_agent} proposed: {last_split_coagent}\n" + " - {other_agent} earned: {last_points_coagent} points\n" + " - Round Complete.\n" + ) + self.send_split_prompt = "Submit Your Proposal\n" "Respond as {proposal_style}" + + def get_message_regex(self, observation: NoPressObs) -> str: + """Return an empty pattern because the no-press variant forbids chat.""" + return r"^$" # No messages allowed + + def get_split_regex(self, observation: NoPressObs) -> str: + """Match proposals like ``Proposal: 4 coins, 6 apples`` case-insensitively.""" + items = list(observation.quantities.keys()) + # Accept both singular and plural forms + item_pattern = "|".join( + [f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?" for item in items] + ) + regex = rf"(?i)Proposal:\s*((?:\s*(?P(10|[0-9]))\s*(?P{item_pattern})\s*,?)+)" + return regex + + def get_split_action(self, policy_output: str, observation: NoPressObs) -> Split: + """ + Parse the LLM proposal into a normalized ``Split`` structure. 
+ + The regex-based parser is lenient (accepts pluralization variants) so that + prompt tweaks do not require re-training the extraction logic. + """ + items = list(observation.quantities.keys()) + import re as _re + + split_regex = self.get_split_regex(observation) + items_given_to_self = {item: 0 for item in items} + m = _re.match(split_regex, policy_output.strip()) + if m: + # Find all (number, item) pairs + item_pattern = "|".join( + [ + f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?" + for item in items + ] + ) + inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})" + + def normalize_item_name(item_str): + """Canonicalize plural/singular user text back to the config item id.""" + for orig in items: + if item_str.lower() == orig.lower(): + return orig + if orig.endswith("s") and item_str.lower() == orig[:-1].lower(): + return orig + if ( + not orig.endswith("s") + and item_str.lower() == orig.lower() + "s" + ): + return orig + + for num, item in _re.findall(inner_regex, m.group(1)): + items_given_to_self[normalize_item_name(item)] = int(num) + return Split(items_given_to_self=items_given_to_self) diff --git a/src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py b/src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py new file mode 100644 index 0000000000000000000000000000000000000000..e4b6cfeed0821a3ebdebdaa5abf4a8f09a69723c --- /dev/null +++ b/src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py @@ -0,0 +1,182 @@ +""" +File: mllm/markov_games/negotiation/no_press_nego_simulation.py +Summary: Simulation driver for no-press negotiation scenarios. 
+""" + +import copy +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Tuple + +from mllm.markov_games.negotiation.nego_simulation import ( + NegotiationObs, + NegotiationSimulation, + NegotiationState, + Split, + compute_tas_style_rewards, +) + +AgentId = str + + +@dataclass +class NoPressState(NegotiationState): + """NegotiationState alias used to clarify we run in always-split phase.""" + + pass + + +@dataclass +class NoPressObs(NegotiationObs): + """Observation that includes both agents' values (since there is no messaging).""" + + other_value: Dict[str, float] + + +class NoPressSimulation(NegotiationSimulation): + def __init__( + self, + game_type: Literal["10-1-exclusive", "10-1-ties", "1-to-20"] = "1-to-20", + same_round_value: bool = True, + atleast_one_conflict: bool = False, + *args, + **kwargs, + ): + self.game_type = game_type + self.same_round_value = same_round_value + self.atleast_one_conflict = atleast_one_conflict + super().__init__(*args, **kwargs) + + def _sample_values(self) -> Dict[AgentId, dict]: + """Sample per-item valuations according to the configured template.""" + values = defaultdict(dict) + if self.state is None: + item_types = self.item_types + else: + item_types = list(self.state.quantities.keys()) + while True: + for item in item_types: + if self.game_type == "10-1-exclusive": + v = int(self.rng.choice([1, 10])) + values[self.agent_ids[0]][item] = v + values[self.agent_ids[1]][item] = 10 if v == 1 else 1 + elif self.game_type == "10-1-ties": + for aid in self.agent_ids: + values[aid][item] = int(self.rng.choice([1, 10])) + elif self.game_type == "1-to-20": + for aid in self.agent_ids: + values[aid][item] = int(self.rng.integers(1, 21)) + if self.atleast_one_conflict: + has_conflict = False + for item in item_types: + agent_values_for_item = [ + values[aid][item] for aid in self.agent_ids + ] + if len(set(agent_values_for_item)) > 1: + has_conflict = True + break + if 
not has_conflict: + continue + agent_values = [sum(v.values()) for v in values.values()] + if len(set(agent_values)) == 1 or not self.same_round_value: + break + return values + + def _sample_quantities(self) -> Dict[str, int]: + """No-press setups use symmetric 10-unit stocks for every item.""" + return {item.lower(): 10 for item in self.item_types} + + def set_new_round_of_variant(self): + """Refresh quantities/values and jump directly into the simultaneous split.""" + self.state.quantities = self._sample_quantities() + self.state.values = self._sample_values() + self.state.split_phase = True + + def get_info_of_variant( + self, state: NegotiationState, actions: Dict[AgentId, Any] + ) -> Dict[str, Any]: + """Surface quantities/values/splits so statistics modules can read them.""" + return { + "quantities": copy.deepcopy(state.quantities), + "values": copy.deepcopy(state.values), + "splits": copy.deepcopy(state.splits), + } + + def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]: + """Reuse TAS reward logic because the split arbitration is identical.""" + return compute_tas_style_rewards( + self.agent_ids, self.state.values, splits, self.state.quantities + ) + + def get_obs(self): + return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids} + + def get_obs_agent(self, agent_id): + other_id = self._other(agent_id) + last_value_coagent = ( + None + if self.state.previous_values is None + else self.state.previous_values.get(other_id) + ) + last_points_coagent = ( + None + if self.state.previous_points is None + else round(self.state.previous_points.get(other_id), 1) + ) + last_value_agent = ( + None + if self.state.previous_values is None + else self.state.previous_values.get(agent_id) + ) + last_points_agent = ( + None + if self.state.previous_points is None + else round(self.state.previous_points.get(agent_id), 1) + ) + last_split_coagent = None + last_split_agent = None + if self.state.previous_splits is not None: + 
last_split_coagent = self.state.previous_splits[ + other_id + ].items_given_to_self + last_split_agent = self.state.previous_splits[agent_id].items_given_to_self + obs = NoPressObs( + round_nb=self.state.round_nb, + last_message="", + quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round, + current_agent=self.state.current_agent, + other_agent=self.agent_id_to_name[other_id], + quantities=self.state.quantities, + item_types=self.item_types, + value=self.state.values[agent_id], + split_phase=self.state.split_phase, + last_split_agent=last_split_agent, + last_value_agent=last_value_agent, + last_points_agent=last_points_agent, + last_split_coagent=last_split_coagent, + last_value_coagent=last_value_coagent, + last_points_coagent=last_points_coagent, + other_value=self.state.values[other_id], + last_quantities=self.state.previous_quantities, + ) + return obs + + def reset(self): + start_agent = self.agent_ids[self._starting_agent_index] + quantities = self._sample_quantities() + values = self._sample_values() + self.state = NoPressState( + round_nb=0, + last_message="", + current_agent=start_agent, + quantities=quantities, + values=values, + previous_values=None, + splits={aid: None for aid in self.agent_ids}, + nb_messages_sent={aid: 0 for aid in self.agent_ids}, + split_phase=True, + previous_splits=None, + previous_points=None, + previous_quantities=None, + ) + return self.get_obs() diff --git a/src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py b/src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py new file mode 100644 index 0000000000000000000000000000000000000000..ac4b67f5a04a916af94768031983555fbb33cb36 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py @@ -0,0 +1,257 @@ +""" +File: mllm/markov_games/negotiation/tas_rps_simulation.py +Summary: Simulation for TAS Rock-Paper-Scissors blended scenarios. 
+""" + +import copy +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Tuple + +from mllm.markov_games.negotiation.nego_simulation import ( + Message, + NegotiationObs, + NegotiationSimulation, + NegotiationState, + Split, + compute_tas_style_rewards, +) +from mllm.markov_games.rollout_tree import SimulationStepLog + +AgentId = str + + +def _get_rps_winner( + hand1: Literal["rock", "paper", "scissors"], + hand2: Literal["rock", "paper", "scissors"], +) -> Literal["rock", "paper", "scissors"]: + """Determine winner of rock-paper-scissors between two hands.""" + if hand1 == hand2: + raise ValueError("Hands should be different") + if ( + (hand1 == "rock" and hand2 == "scissors") + or (hand1 == "paper" and hand2 == "rock") + or (hand1 == "scissors" and hand2 == "paper") + ): + return hand1 + else: + return hand2 + + +@dataclass +class TrustAndSplitRPSState(NegotiationState): + """Negotiation state augmented with the current and previous RPS hands.""" + + hands: Dict[ + AgentId, Literal["rock", "paper", "scissors"] + ] # rock, paper, or scissors + previous_hands: Dict[AgentId, Literal["rock", "paper", "scissors"]] | None + + +@dataclass +class TrustAndSplitRPSObs(NegotiationObs): + """Agent-facing observation enriched with last-hand metadata.""" + + hand: Literal["rock", "paper", "scissors"] + last_hand_agent: Literal["rock", "paper", "scissors"] | None + last_hand_coagent: Literal["rock", "paper", "scissors"] | None + last_hand_value_coagent: Literal["upper", "lower"] | None + + +class TrustAndSplitRPSSimulation(NegotiationSimulation): + """Negotiation variant that splices TAS splitting with RPS-determined stakes.""" + + def __init__( + self, + alternating_hands: bool = False, + alternating_mix_ratio: float = None, + *args, + **kwargs, + ): + self.alternating_hands = alternating_hands + self.alternating_mix_ratio = alternating_mix_ratio + super().__init__(*args, **kwargs) + if self.alternating_mix_ratio is not None: + if self.rng.random() 
< self.alternating_mix_ratio: + self.alternating_hands = True + else: + self.alternating_hands = False + + def _sample_hands_and_values( + self, + alternate_hands: bool = False, + ) -> Tuple[Dict[AgentId, str], Dict[AgentId, float]]: + """ + Sample a rock-paper-scissors hand for each agent plus the per-hand value. + + When ``alternate_hands`` is True we deliberately flip the previous round's + winner/loser roles to create nonstationary payoffs; otherwise we draw + uniformly without replacement. + """ + hands = ["rock", "paper", "scissors"] + if alternate_hands: + previous_hands = list(self.state.previous_hands.values()) + hand1, hand2 = self.rng.choice(hands, size=2, replace=False) + winner = _get_rps_winner(hand1, hand2) + loser = hand1 if winner == hand2 else hand2 + previous_winner = _get_rps_winner(previous_hands[0], previous_hands[1]) + agent_hands, values = {}, {} + for agent_id in self.agent_ids: + if self.state.previous_hands[agent_id] == previous_winner: + agent_hands[agent_id] = loser + values[agent_id] = 1.0 + else: + agent_hands[agent_id] = winner + values[agent_id] = 10.0 + return agent_hands, values + else: + # Assign different hands to each agent + hand1, hand2 = self.rng.choice(hands, size=2, replace=False) + + agent_hands = {self.agent_ids[0]: hand1, self.agent_ids[1]: hand2} + + # Determine winner and assign values + winner = _get_rps_winner(hand1, hand2) + values = {} + for agent_id in self.agent_ids: + if agent_hands[agent_id] == winner: + values[agent_id] = 10.0 # Winner gets value 10 + else: + values[agent_id] = 1.0 # Loser gets value 1 + + return agent_hands, values + + def set_new_round_of_variant(self): + """Refresh hands/values and reset round-specific state.""" + self.state.previous_hands = copy.deepcopy(self.state.hands) + new_hands, new_values = self._sample_hands_and_values( + alternate_hands=self.alternating_hands + ) + self.state.hands = new_hands + self.state.values = new_values + # Quantities are constant in TAS + 
self.state.quantities = {"coins": 10} + self.state.split_phase = False + + def get_info_of_variant( + self, state: NegotiationState, actions: Dict[AgentId, Any] + ) -> Dict[str, Any]: + """Expose variant-specific tensors for downstream logging/analysis.""" + return { + "quantities": copy.deepcopy(state.quantities), + "hands": copy.deepcopy(state.hands), + "values": copy.deepcopy(state.values), + "previous_hands": copy.deepcopy(state.previous_hands), + "previous_values": copy.deepcopy(state.previous_values), + "splits": copy.deepcopy(state.splits), + } + + def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]: + """Delegates to TAS reward helper because the payout rule is identical.""" + return compute_tas_style_rewards( + self.agent_ids, self.state.values, splits, self.state.quantities + ) + + def get_obs_agent(self, agent_id): + """Return a full Trust-and-Split observation for ``agent_id``.""" + other_id = self._other(agent_id) + last_value_coagent = ( + None + if self.state.previous_values is None + else self.state.previous_values.get(other_id) + ) + last_hand_coagent = ( + None + if self.state.previous_hands is None + else self.state.previous_hands.get(other_id) + ) + last_points_coagent = ( + None + if self.state.previous_points is None + else round(self.state.previous_points.get(other_id), 1) + ) + last_value_agent = ( + None + if self.state.previous_values is None + else self.state.previous_values.get(agent_id) + ) + last_hand_agent = ( + None + if self.state.previous_hands is None + else self.state.previous_hands.get(agent_id) + ) + last_points_agent = ( + None + if self.state.previous_points is None + else round(self.state.previous_points.get(agent_id), 1) + ) + last_split_coagent = None + last_split_agent = None + if self.state.previous_splits is not None: + last_split_coagent = self.state.previous_splits[ + other_id + ].items_given_to_self["coins"] + last_split_agent = self.state.previous_splits[agent_id].items_given_to_self[ + 
"coins" + ] + if last_hand_agent is None or last_hand_coagent is None: + last_hand_value_coagent = None + else: + winner = _get_rps_winner(last_hand_agent, last_hand_coagent) + last_hand_value_coagent = ( + "upper" if winner == last_hand_coagent else "lower" + ) + obs = TrustAndSplitRPSObs( + round_nb=self.state.round_nb, + last_message=self.state.last_message, + quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round, + current_agent=self.state.current_agent, + other_agent=self.agent_id_to_name[other_id], + quantities={"coins": 10}, + item_types=self.item_types, + value=self.state.values[agent_id], + split_phase=self.state.split_phase, + last_split_agent=last_split_agent, + last_value_agent=last_value_agent, + last_points_agent=last_points_agent, + last_split_coagent=last_split_coagent, + last_value_coagent=last_value_coagent, + last_points_coagent=last_points_coagent, + hand=self.state.hands[agent_id], + last_hand_coagent=last_hand_coagent, + last_hand_agent=last_hand_agent, + last_quantities=self.state.previous_quantities, + last_hand_value_coagent=last_hand_value_coagent, + ) + return obs + + def get_state(self): + return self.state + + def get_safe_copy(self): + """Return a safe copy of the simulation.""" + simulation_copy = copy.copy(self) + simulation_copy.state = copy.deepcopy(self.state) + return simulation_copy + + def reset(self): + """Initialize and return initial observations""" + # Decide starting agent alternating across resets for determinism + start_agent = self.agent_ids[self._starting_agent_index] + hands, values = self._sample_hands_and_values() + self.state = TrustAndSplitRPSState( + round_nb=0, + last_message="", + current_agent=start_agent, + quantities={"coins": 10}, + values=values, + splits={aid: None for aid in self.agent_ids}, + nb_messages_sent={aid: 0 for aid in self.agent_ids}, + previous_values=None, + previous_splits=None, + previous_points=None, + split_phase=False, + hands=hands, + previous_hands=None, + 
previous_quantities=None, + ) + return self.get_obs() diff --git a/src_code_for_reproducibility/markov_games/rollout_tree.py b/src_code_for_reproducibility/markov_games/rollout_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..c9feb0e92f3bcf19255d80c6ff2dcd9a045c6c21 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/rollout_tree.py @@ -0,0 +1,95 @@ +""" +File: mllm/markov_games/rollout_tree.py +Summary: Defines rollout tree data structures and serialization helpers. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, List, Literal, Optional, Tuple + +import jsonschema +from pydantic import BaseModel, Field, model_validator + +from mllm.chat_utils.chat_turn import ChatTurn + +AgentId = str + + +class SimulationStepLog(BaseModel): + """Minimal snapshot of environment-side rewards and auxiliary info.""" + + rewards: dict[AgentId, float] + info: Any = None + + +class AgentActLog(BaseModel): + """LLM-side provenance for an action (chat turns + metadata).""" + + chat_turns: list[ChatTurn] | None + info: Any = None + + @model_validator(mode="after") + def _exactly_one_state_end(self): + """ + This method is used to enforce that for each AgentActLog, there is exactly one ChatTurn which is a state end. + """ + if self.chat_turns != []: + n = sum(1 for t in self.chat_turns if t.is_state_end) + if n != 1: + raise ValueError( + f"AgentActLog must have exactly one ChatTurn with is_state_end=True; got {self.chat_turns}." 
+ ) + return self + else: + return self + + +class StepLog(BaseModel): + action_logs: dict[AgentId, AgentActLog] + simulation_step_log: SimulationStepLog + + +# BranchType = Literal["unilateral_deviation", "common_deviation"] # might not be necessary +# class BranchNodeInfo(BaseModel): +# branch_id: str +# branch_for: AgentId +# branch_type: BranchType + + +class RolloutTreeNode(BaseModel): + """Single timestep of the main trajectory (or a branch) plus linkage.""" + + step_log: StepLog + time_step: int + child: RolloutTreeNode | RolloutTreeBranchNode | None = None + + +class RolloutTreeBranchNode(BaseModel): + """ + Each key of ``branches`` indicates which agent "called" for the alternative branches stored under it. + """ + + main_child: RolloutTreeNode + branches: dict[AgentId, list[RolloutTreeNode]] | None = None + + +class RolloutTreeRootNode(BaseModel): + """Entry point for serialized rollouts (main path plus optional branches).""" + + id: int + crn_id: int # ID of the rng used to generate this rollout tree + child: RolloutTreeNode | RolloutTreeBranchNode | None = None + agent_ids: List[AgentId] = Field(min_length=1) + + +# class RolloutTreeLeafNode(BaseModel): +# step_log: StepLog +# time_step: int + + +# Necessary for self-referential stuff in pydantic +RolloutTreeBranchNode.model_rebuild() +RolloutTreeNode.model_rebuild() diff --git a/src_code_for_reproducibility/markov_games/run_markov_games.py b/src_code_for_reproducibility/markov_games/run_markov_games.py new file mode 100644 index 0000000000000000000000000000000000000000..86b7f2ef0ac2e19afdd61a8a32342aaf8120e7de --- /dev/null +++ b/src_code_for_reproducibility/markov_games/run_markov_games.py @@ -0,0 +1,35 @@ +""" +File: mllm/markov_games/run_markov_games.py +Summary: Async helper that launches configured Markov-game rollouts concurrently. 
+""" + +import asyncio +from collections.abc import Callable +from dataclasses import dataclass + +from torch._C import ClassType + +from mllm.markov_games.markov_game import MarkovGame +from mllm.markov_games.rollout_tree import RolloutTreeRootNode + + +async def run_markov_games( + runner: Callable[[MarkovGame], RolloutTreeRootNode], + runner_kwargs: dict, + output_folder: str, + markov_games: list[MarkovGame], +) -> list[RolloutTreeRootNode]: + """ + Kick off multiple Markov game rollouts concurrently and return their trees. + + Parameters mirror the Hydra configs (runner callable + kwargs) so callers can + choose ``LinearRunner``, ``AlternativeActionsRunner`` or future variants. + """ + tasks = [] + for mg in markov_games: + tasks.append( + asyncio.create_task( + runner(markov_game=mg, output_folder=output_folder, **runner_kwargs) + ) + ) + return await asyncio.gather(*tasks) diff --git a/src_code_for_reproducibility/markov_games/simulation.py b/src_code_for_reproducibility/markov_games/simulation.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0a2e61924f9a79aee3229ed8d7aa20827ae859 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/simulation.py @@ -0,0 +1,94 @@ +""" +File: mllm/markov_games/simulation.py +Summary: Core simulation loop utilities and step logging for Markov games. +""" + +from abc import ABC, abstractmethod +from typing import Any, Tuple + +from numpy.random import default_rng + +from mllm.markov_games.rollout_tree import SimulationStepLog + + +class Simulation(ABC): + @abstractmethod + def __init__(self, seed: int, *args, **kwargs): + self.seed = seed + self.rng = default_rng(self.seed) + + @abstractmethod + def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]: + """ + Advance the environment by one logical tick using ``actions``. + + Returns + ------- + terminated: bool + Whether the episode has finished. + SimulationStepLog + Reward/info bundle describing this transition. 
+ """ + raise NotImplementedError + + def get_obs(self): + """Return a dict mapping agent_id -> observation for *all* agents.""" + raise NotImplementedError + + def get_obs_agent(self, agent_id): + """Return the observation for a single agent.""" + raise NotImplementedError + + def get_obs_size(self): + """Describe the observation tensor shape (useful for critic heads).""" + raise NotImplementedError + + def get_state(self): + """Return the privileged simulator state if available.""" + raise NotImplementedError + + def get_state_size(self): + """Describe the state tensor shape.""" + raise NotImplementedError + + def get_avail_actions(self): + """Return the global action mask/tensor if the space is discrete.""" + raise NotImplementedError + + def get_avail_agent_actions(self, agent_id): + """Return the available action mask for a given agent.""" + raise NotImplementedError + + def get_total_actions(self): + """Returns the total number of actions an agent could ever take. + + Implementations currently assume a discrete, one-dimensional action space per agent. + """ + raise NotImplementedError + + def get_safe_copy(self): + """ + Return copy of the simulator that shares no mutable state with the original. 
+ """ + raise NotImplementedError + + def reset(self): + """Reset to the initial state and return the starting observations.""" + raise NotImplementedError + + def render(self): + """Optional human-facing visualization.""" + raise NotImplementedError + + def close(self): + """Release any owned resources (files, processes, etc.).""" + raise NotImplementedError + + # def seed(self): + # raise NotImplementedError + + def save_replay(self): + raise NotImplementedError + + def get_simulation_info(self): + raise NotImplementedError diff --git a/src_code_for_reproducibility/markov_games/statistics_runner.py b/src_code_for_reproducibility/markov_games/statistics_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..e58131fc505806a758936978a46e4f8faefacad3 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/statistics_runner.py @@ -0,0 +1,415 @@ +""" +File: mllm/markov_games/statistics_runner.py +Summary: Executes multiple rollouts to compute experiment statistics. +""" + +from __future__ import annotations + +import gc +import json +import pickle +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional + +from basic_render import find_iteration_folders + +from mllm.markov_games.rollout_tree import ( + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, + SimulationStepLog, +) + + +def _iterate_main_nodes(root: RolloutTreeRootNode) -> Iterator[RolloutTreeNode]: + """ + Iterate the main path nodes without materializing full path lists. 
+ """ + current = root.child + while current is not None: + if isinstance(current, RolloutTreeNode): + yield current + current = current.child + elif isinstance(current, RolloutTreeBranchNode): + # Follow only the main child on the main trajectory + current = current.main_child + else: + break + + +def iterate_main_simulation_logs( + root: RolloutTreeRootNode, +) -> Iterator[SimulationStepLog]: + """Yield ``SimulationStepLog`` objects along the main (non-branch) path.""" + for node in _iterate_main_nodes(root): + yield node.step_log.simulation_step_log + + +def stream_rollout_files(iteration_folder: Path) -> Iterator[Path]: + """Iterate over every ``*.rt.pkl`` file under an iteration directory.""" + for p in iteration_folder.rglob("*.rt.pkl"): + if p.is_file(): + yield p + + +def load_root(path: Path) -> RolloutTreeRootNode: + """Load and validate a rollout tree from disk.""" + with open(path, "rb") as f: + data = pickle.load(f) + return RolloutTreeRootNode.model_validate(data) + + +@dataclass +class StatRecord: + """Convenience container for serialized stat rows.""" + + mgid: int + crn_id: Optional[int] + iteration: str + values: Dict[str, Any] + + +class StatComputer: + """ + Stateful stat computer that consumes SimulationStepLog instances + and produces final aggregated values for one rollout (mgid). + """ + + def update(self, sl: SimulationStepLog) -> None: # pragma: no cover - interface + raise NotImplementedError + + def finalize(self) -> Dict[str, Any]: # pragma: no cover - interface + raise NotImplementedError + + +def run_stats( + data_root: Path, + game_name: str, + make_computers: Callable[[], List[StatComputer]], + output_filename: Optional[str] = None, + output_format: str = "json", # "json" (dict of lists) or "jsonl" +) -> Path: + """ + Compute stats across all iteration_* folders under data_root. + Writes JSON or JSONL (per output_format) to data_root/statistics/. 
+ """ + data_root = Path(data_root) + outdir = data_root / "statistics" + outdir.mkdir(parents=True, exist_ok=True) + # Choose extension by format + default_name = ( + f"{game_name}.stats.json" + if output_format == "json" + else f"{game_name}.stats.jsonl" + ) + outfile = outdir / ( + output_filename if output_filename is not None else default_name + ) + + # Rewrite file each run to keep it clean and small + if outfile.exists(): + outfile.unlink() + + iteration_folders = find_iteration_folders(str(data_root)) + + # If writing JSONL, stream directly; otherwise accumulate minimal records + if output_format == "jsonl": + with open(outfile, "w", encoding="utf-8") as w: + for iteration_folder in iteration_folders: + iteration_name = Path(iteration_folder).name + for pkl_path in stream_rollout_files(Path(iteration_folder)): + root = load_root(pkl_path) + + computers = make_computers() + for sl in iterate_main_simulation_logs(root): + for comp in computers: + try: + comp.update(sl) + except Exception: + continue + + values: Dict[str, Any] = {} + for comp in computers: + try: + values.update(comp.finalize()) + except Exception: + continue + + rec = { + "mgid": getattr(root, "id", None), + "crn_id": getattr(root, "crn_id", None), + "iteration": iteration_name, + "stats": values, + } + w.write(json.dumps(rec, ensure_ascii=False) + "\n") + + del root + del computers + gc.collect() + else: + # Aggregate to dict-of-lists for easier plotting + records: List[Dict[str, Any]] = [] + # Process in deterministic order + for iteration_folder in iteration_folders: + iteration_name = Path(iteration_folder).name + for pkl_path in stream_rollout_files(Path(iteration_folder)): + root = load_root(pkl_path) + + computers = make_computers() + for sl in iterate_main_simulation_logs(root): + for comp in computers: + try: + comp.update(sl) + except Exception: + continue + + values: Dict[str, Any] = {} + for comp in computers: + try: + values.update(comp.finalize()) + except Exception: + continue 
def _collect_rollout_metrics(
    root: Any,
    metrics: Dict[str, Callable[[SimulationStepLog], Optional[Dict[str, float]]]],
) -> Dict[str, Dict[str, List[float]]]:
    """Evaluate every metric on every step log of one rollout.

    Returns ``metric name -> agent id (as str) -> list of float samples``.
    Evaluation is deliberately best-effort: a metric that raises on a step, or
    a value that cannot be converted to ``float``, is skipped instead of
    aborting the whole statistics run.
    """
    samples: Dict[str, Dict[str, List[float]]] = {name: {} for name in metrics}
    for step_log in iterate_main_simulation_logs(root):
        for name, fn in metrics.items():
            try:
                per_agent = fn(step_log)
            except Exception:
                per_agent = None
            if not per_agent:
                continue
            for agent_id, value in per_agent.items():
                if value is None:
                    continue
                # Register the agent before converting, so an agent whose
                # values all fail conversion still appears (as None) later.
                bucket = samples[name].setdefault(str(agent_id), [])
                try:
                    bucket.append(float(value))
                except Exception:
                    continue
    return samples


def _average_rollout_metrics(
    samples: Dict[str, Dict[str, List[float]]]
) -> Dict[str, Dict[str, Optional[float]]]:
    """Reduce per-step samples to one mean per metric and agent (None if empty)."""
    return {
        name: {
            agent_id: (sum(vals) / len(vals) if vals else None)
            for agent_id, vals in per_agent.items()
        }
        for name, per_agent in samples.items()
    }


def _iter_functional_records(iteration_folders, metrics):
    """Yield one ``{mgid, crn_id, iteration, stats}`` record per rollout file."""
    for iteration_folder in iteration_folders:
        iteration_name = Path(iteration_folder).name
        for pkl_path in stream_rollout_files(Path(iteration_folder)):
            root = load_root(pkl_path)
            record = {
                "mgid": getattr(root, "id", None),
                "crn_id": getattr(root, "crn_id", None),
                "iteration": iteration_name,
                "stats": _average_rollout_metrics(
                    _collect_rollout_metrics(root, metrics)
                ),
            }
            # Drop the loaded rollout before the next one is read to keep
            # peak memory down.
            del root
            gc.collect()
            yield record


def _records_to_columns(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Pivot a list of records into a dict-of-lists, keeping nested stats.

    Stats whose value is a dict in at least one record become
    ``{agent key: [value per record]}``; other stats become a flat list.
    Missing entries are filled with ``None`` so all lists stay aligned with
    the ``mgid`` / ``crn_id`` / ``iteration`` columns.
    """
    stat_keys: set[str] = set()
    nested_agent_keys: Dict[str, set[str]] = {}
    for rec in records:
        for key, value in (rec.get("stats", {}) or {}).items():
            stat_keys.add(key)
            if isinstance(value, dict):
                nested_agent_keys.setdefault(key, set()).update(
                    str(agent_key) for agent_key in value
                )

    # Sorted so the key order of the written file is stable across runs
    # (iterating the raw set depends on string hashing).
    ordered_keys = sorted(stat_keys)
    stats_out: Dict[str, Any] = {
        key: (
            {agent_key: [] for agent_key in sorted(nested_agent_keys[key])}
            if key in nested_agent_keys
            else []
        )
        for key in ordered_keys
    }

    mgids: List[Any] = []
    crn_ids: List[Any] = []
    iterations_out: List[str] = []
    for rec in records:
        mgids.append(rec.get("mgid"))
        crn_ids.append(rec.get("crn_id"))
        iterations_out.append(rec.get("iteration"))
        stats = rec.get("stats", {}) or {}
        for key in ordered_keys:
            value = stats.get(key)
            column = stats_out[key]
            if isinstance(column, dict):
                agent_values = value if isinstance(value, dict) else {}
                for agent_key, series in column.items():
                    series.append(agent_values.get(agent_key))
            else:
                column.append(value)

    return {
        "mgid": mgids,
        "crn_id": crn_ids,
        "iteration": iterations_out,
        "stats": stats_out,
    }


def run_stats_functional(
    data_root: Path,
    game_name: str,
    metrics: Dict[str, Callable[[SimulationStepLog], Optional[Dict[str, float]]]],
    output_filename: Optional[str] = None,
    output_format: str = "json",
) -> Path:
    """
    Functional variant where metrics is a dict of name -> f(SimulationStepLog) ->
    {agent_id: value}.

    Aggregates per rollout by averaging over the steps where a metric produced
    a value, and writes a single consolidated file in data_root/statistics/.

    Args:
        data_root: Directory searched for iteration folders; also the parent
            of the ``statistics`` output directory.
        game_name: Used to build the default output file name.
        metrics: Mapping of metric name to a per-step function.
        output_filename: Overrides the default file name when given.
        output_format: ``"jsonl"`` streams one record per rollout; any other
            value writes one consolidated dict-of-lists JSON document.

    Returns:
        Path of the file that was written.
    """
    data_root = Path(data_root)
    outdir = data_root / "statistics"
    outdir.mkdir(parents=True, exist_ok=True)
    default_name = (
        f"{game_name}.stats.json"
        if output_format == "json"
        else f"{game_name}.stats.jsonl"
    )
    outfile = outdir / (
        output_filename if output_filename is not None else default_name
    )

    # Rewrite the file on each run.
    outfile.unlink(missing_ok=True)

    iteration_folders = find_iteration_folders(str(data_root))

    if output_format == "jsonl":
        # Stream records so only one rollout is held in memory at a time.
        with open(outfile, "w", encoding="utf-8") as w:
            for rec in _iter_functional_records(iteration_folders, metrics):
                w.write(json.dumps(rec, ensure_ascii=False) + "\n")
    else:
        records = list(_iter_functional_records(iteration_folders, metrics))
        with open(outfile, "w", encoding="utf-8") as w:
            json.dump(_records_to_columns(records), w, ensure_ascii=False)

    return outfile
class AdapterWrapper(nn.Module):
    """
    Per-adapter view onto a PEFT-wrapped model that is shared between adapters.

    The wrapper activates its own adapter (`set_adapter(adapter_id)`) before
    every forward pass and before listing parameters, so that only the
    parameters with `requires_grad` set for that adapter are exposed.
    """

    def __init__(
        self,
        shared_llm: nn.Module,
        adapter_id: str,
        lora_config: dict,
        path: Union[str, None] = None,
    ):
        """
        Attach a LoRA adapter named `adapter_id` to `shared_llm`.

        When `path` is given (local directory or HF Hub repo id), initial
        weights are loaded from it; a failed load is logged as a warning and
        the adapter keeps its fresh weights.
        """
        super().__init__()
        self.adapter_id = adapter_id
        peft_config = LoraConfig(**lora_config)
        # get_peft_model modifies the shared llm in place, adding a lora
        # adapter inside it.
        self.shared_llm = get_peft_model(
            model=shared_llm,
            peft_config=peft_config,
            adapter_name=adapter_id,
        )
        self.shared_llm.train()

        weights_source: str | None = None
        if path:
            try:
                self.shared_llm.load_adapter(
                    is_trainable=True,
                    model_id=path,
                    adapter_name=adapter_id,
                )
            except Exception as exc:  # noqa: BLE001 - log any load failure
                logger.warning(
                    f"Adapter '{adapter_id}': failed to load from '{path}': {exc}"
                )
            else:
                weights_source = path

        if not weights_source:
            logger.info(
                f"Adapter '{adapter_id}': initialized with fresh weights (no initial weights found)."
            )
        else:
            logger.info(
                f"Adapter '{adapter_id}': loaded initial weights from '{weights_source}'."
            )

    def parameters(self, recurse: bool = True):
        """
        Return the list of trainable parameters once this adapter is active.

        `recurse` is accepted only for PyTorch signature compatibility.
        """
        self.shared_llm.set_adapter(self.adapter_id)
        return [
            param for param in self.shared_llm.parameters() if param.requires_grad
        ]

    def get_base_model_logits(self, contexts):
        """
        Run the shared model with adapters disabled and gradients off.

        Returns element 0 of the model output; useful as reference logits for
        a KL-divergence term.
        """
        with torch.no_grad(), self.shared_llm.disable_adapter():
            outputs = self.shared_llm(input_ids=contexts)
            return outputs[0]

    def forward(self, *args, **kwargs):
        """Activate this adapter, then delegate to the shared model."""
        self.shared_llm.set_adapter(self.adapter_id)
        return self.shared_llm(*args, **kwargs)

    def save_pretrained(self, save_path):
        """Delegate saving to the shared PEFT model."""
        self.shared_llm.save_pretrained(save_path)

    def gradient_checkpointing_enable(self, *args, **kwargs):
        """Delegate to the shared model's gradient checkpointing switch."""
        self.shared_llm.gradient_checkpointing_enable(*args, **kwargs)

    @property
    def dtype(self):
        """dtype reported by the shared model."""
        return self.shared_llm.dtype

    @property
    def device(self):
        """Device reported by the shared model."""
        return self.shared_llm.device
+ """ + if sys.stdout.isatty(): + os.system("cls" if os.name == "nt" else "clear") + + +def _terminal_width(default: int = 100) -> int: + try: + return shutil.get_terminal_size().columns + except Exception: + return default + + +def _horizontal_rule(char: str = "─") -> str: + width = max(20, _terminal_width() - 2) + return char * width + + +class _Style: + # ANSI colors (bright, readable) + RESET = "\033[0m" + BOLD = "\033[1m" + DIM = "\033[2m" + # Foreground colors + FG_BLUE = "\033[94m" # user/system headers + FG_GREEN = "\033[92m" # human response header + FG_YELLOW = "\033[93m" # notices + FG_RED = "\033[91m" # errors + FG_MAGENTA = "\033[95m" # regex + FG_CYAN = "\033[96m" # tips + + +def _render_chat(state) -> str: + """ + Render prior messages in a compact, readable terminal format. + + Expected message dict keys: {"role": str, "content": str, ...} + """ + lines: List[str] = [] + lines.append(_horizontal_rule()) + lines.append(f"{_Style.FG_BLUE}{_Style.BOLD} Conversation so far {_Style.RESET}") + lines.append(_horizontal_rule()) + for chat in state: + role = chat.role + content = str(chat.content).strip() + # Map roles to display names and colors/emojis + if role == "assistant": + header = f"{_Style.FG_GREEN}{_Style.BOLD}HUMAN--🧑‍💻{_Style.RESET}" + elif role == "user": + header = f"{_Style.FG_BLUE}{_Style.BOLD}USER--⚙️{_Style.RESET}" + else: + header = f"[{_Style.DIM}{role.upper()}{_Style.RESET}]" + lines.append(header) + # Indent content for readability + for line in content.splitlines() or [""]: + lines.append(f" {line}") + lines.append("") + lines.append(_horizontal_rule()) + return "\n".join(lines) + + +async def _async_input(prompt_text: str) -> str: + """Non-blocking input using a background thread.""" + return await asyncio.to_thread(input, prompt_text) + + +def _short_regex_example(regex: str, max_len: int = 30) -> Optional[str]: + """ + Try to produce a short example string that matches the regex. 
+ We attempt multiple times and pick the first <= max_len. + """ + if rstr is None: + return None + try: + for _ in range(20): + candidate = rstr.xeger(regex) + if len(candidate) <= max_len: + return candidate + # Fallback to truncation (may break match, so don't return) + return None + except Exception: + return None + + +def _detect_input_type(regex: str | None) -> tuple[str, str, str]: + """ + Detect what type of input is expected based on the regex pattern. + Returns (input_type, start_tag, end_tag) + """ + if regex is None: + return "text", "", "" + + if "message_start" in regex and "message_end" in regex: + return "message", "<>", "<>" + elif "proposal_start" in regex and "proposal_end" in regex: + return "proposal", "<>", "<>" + else: + return "text", "", "" + + +async def human_policy(state, agent_id, regex: str | None = None) -> str: + """ + Async human-in-the-loop policy. + + - Displays prior conversation context in the terminal. + - Prompts the user for a response. + - If a regex is provided, validates and re-prompts until it matches. + - Automatically adds formatting tags based on expected input type. + + Args: + prompt: Chat history as a list of {role, content} dicts. + regex: Optional fullmatch validation pattern. + + Returns: + The user's validated response string. 
+ """ + # Detect input type and formatting + input_type, start_tag, end_tag = _detect_input_type(regex) + + while True: + _clear_terminal() + print(_render_chat(state)) + + if regex: + example = _short_regex_example(regex, max_len=30) + print( + f"{_Style.FG_MAGENTA}{_Style.BOLD}Expected format (regex fullmatch):{_Style.RESET}" + ) + print(f" {_Style.FG_MAGENTA}{regex}{_Style.RESET}") + if example: + print( + f"{_Style.FG_CYAN}Example (random, <=30 chars):{_Style.RESET} {example}" + ) + print(_horizontal_rule(".")) + + # Custom prompt based on input type + if input_type == "message": + print( + f"{_Style.FG_YELLOW}Type your message content (formatting will be added automatically):{_Style.RESET}" + ) + elif input_type == "proposal": + print( + f"{_Style.FG_YELLOW}Type your proposal (number only, formatting will be added automatically):{_Style.RESET}" + ) + else: + print( + f"{_Style.FG_YELLOW}Type your response and press Enter.{_Style.RESET}" + ) + + print( + f"{_Style.DIM}Commands: /help to view commands, /refresh to re-render, /quit to abort{_Style.RESET}" + ) + else: + print( + f"{_Style.FG_YELLOW}Type your response and press Enter.{_Style.RESET} {_Style.DIM}(/help for commands){_Style.RESET}" + ) + + user_in = (await _async_input("> ")).rstrip("\n") + + # Commands + if user_in.strip().lower() in {"/help", "/h"}: + print(f"\n{_Style.FG_CYAN}{_Style.BOLD}Available commands:{_Style.RESET}") + print( + f" {_Style.FG_CYAN}/help{_Style.RESET} or {_Style.FG_CYAN}/h{_Style.RESET} Show this help" + ) + print( + f" {_Style.FG_CYAN}/refresh{_Style.RESET} or {_Style.FG_CYAN}/r{_Style.RESET} Re-render the conversation and prompt" + ) + print( + f" {_Style.FG_CYAN}/quit{_Style.RESET} or {_Style.FG_CYAN}/q{_Style.RESET} Abort the run (raises KeyboardInterrupt)" + ) + await asyncio.sleep(1.0) + continue + if user_in.strip().lower() in {"/refresh", "/r"}: + continue + if user_in.strip().lower() in {"/quit", "/q"}: + raise KeyboardInterrupt("Human aborted run from human_policy") 
+ + # Add formatting tags if needed + if start_tag and end_tag: + formatted_input = f"{start_tag}{user_in}{end_tag}" + else: + formatted_input = user_in + + if regex is None: + return ChatTurn( + role="assistant", agent_id=agent_id, content=formatted_input + ) + + # Validate against regex (fullmatch) + try: + pattern = re.compile(regex) + except re.error as e: + # If regex is invalid, fall back to accepting any input + print( + f"{_Style.FG_RED}Warning:{_Style.RESET} Provided regex is invalid: {e}. Accepting input without validation." + ) + await asyncio.sleep(0.5) + return ChatTurn( + role="assistant", agent_id=agent_id, content=formatted_input + ) + + if pattern.fullmatch(formatted_input): + return ChatTurn( + role="assistant", agent_id=agent_id, content=formatted_input + ) + + # Show validation error and re-prompt + print("") + print( + f"{_Style.FG_RED}{_Style.BOLD}Input did not match the required format.{_Style.RESET} Please try again." + ) + + if input_type == "message": + print( + f"You entered: {_Style.FG_CYAN}{start_tag}{user_in}{end_tag}{_Style.RESET}" + ) + print(f"Just type the message content without tags.") + elif input_type == "proposal": + print( + f"You entered: {_Style.FG_CYAN}{start_tag}{user_in}{end_tag}{_Style.RESET}" + ) + print(f"Just type the number without tags.") + else: + print(f"Expected (regex):") + print(f" {_Style.FG_MAGENTA}{regex}{_Style.RESET}") + + print(_horizontal_rule(".")) + print(f"{_Style.FG_YELLOW}Press Enter to retry...{_Style.RESET}") + await _async_input("") + + +def get_human_policies() -> Dict[str, Callable[[List[Dict]], str]]: + """ + Expose the human policy in the same map shape used elsewhere. + """ + # Type hint says Callable[[List[Dict]], str] but we intentionally return the async callable. 
+ return {"human_policy": human_policy} # type: ignore[return-value] diff --git a/src_code_for_reproducibility/models/inference_backend.py b/src_code_for_reproducibility/models/inference_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..c204482170d5a4418870805b620295cab294fab6 --- /dev/null +++ b/src_code_for_reproducibility/models/inference_backend.py @@ -0,0 +1,44 @@ +""" +File: mllm/models/inference_backend.py +Summary: Declares the inference backend interface and shared dataclasses. +""" + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Optional + + +@dataclass +class LLMInferenceOutput: + content: str + reasoning_content: str | None = None + log_probs: list[float] | None = None + out_token_ids: list[int] | None = None + + +class LLMInferenceBackend(ABC): + @abstractmethod + def __init__(self, **kwargs): + ... + + @abstractmethod + def prepare_adapter( + self, adapter_id: str, weights_got_updated: bool = False + ) -> None: + """Ensure adapter is ready/loaded for next generation call.""" + + @abstractmethod + async def generate(self, prompt: list[dict], regex: Optional[str] = None) -> str: + ... + + @abstractmethod + def toggle_training_mode(self) -> None: + ... + + @abstractmethod + def toggle_eval_mode(self) -> None: + ... + + @abstractmethod + def shutdown(self) -> None: + ... diff --git a/src_code_for_reproducibility/models/inference_backend_dummy.py b/src_code_for_reproducibility/models/inference_backend_dummy.py new file mode 100644 index 0000000000000000000000000000000000000000..22dd123f5fbcf9a976282b0657097edc680c6ac3 --- /dev/null +++ b/src_code_for_reproducibility/models/inference_backend_dummy.py @@ -0,0 +1,59 @@ +""" +File: mllm/models/inference_backend_dummy.py +Summary: Stub inference backend that returns synthetic completions for tests. 
class DummyInferenceBackend(LLMInferenceBackend):
    """Backend stub: no model is loaded and completions are synthetic."""

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """Accept and ignore any construction arguments."""

    def prepare_adapter(
        self,
        adapter_id: Optional[str],
        weights_got_updated: bool,
        adapter_path: Optional[str] = None,
    ) -> None:
        """No-op: there are no adapters to load."""

    async def toggle_training_mode(self) -> None:
        """No-op apart from yielding control to the event loop once."""
        await asyncio.sleep(0)

    async def toggle_eval_mode(self) -> None:
        """No-op apart from yielding control to the event loop once."""
        await asyncio.sleep(0)

    def shutdown(self) -> None:
        """No-op: nothing to release."""

    async def generate(
        self,
        prompt_text: str,
        regex: Optional[str] = None,
        extract_thinking: bool = False,
    ) -> LLMInferenceOutput:
        """
        Return a synthetic completion.

        With a regex, the content is a random string generated from it;
        without one, a fixed sentence. The prompt is not used.
        """
        if regex:
            # Create random string that respects the regex
            text = rstr.xeger(regex)
        else:
            text = "I am a dummy backend without a regex."
        return LLMInferenceOutput(
            content=text,
            reasoning_content="I don't think, I am a dummy backend.",
        )
class VLLMAsyncBackend(LLMInferenceBackend):
    """Inference backend running generation on an in-process vLLM async engine."""

    def __init__(
        self,
        model_name: str,
        tokenizer: AutoTokenizer,
        engine_init_kwargs: Optional[dict] = None,
        sampling_params: Optional[dict] = None,
    ):
        """
        Build the vLLM engine.

        Args:
            model_name: Model passed to the engine arguments.
            tokenizer: Stored on the backend for callers.
            engine_init_kwargs: Extra `AsyncEngineArgs` keyword arguments.
            sampling_params: Keyword arguments applied to every
                `SamplingParams` built by `generate`.
        """
        self.model_name = model_name
        self.vllm_adapter_ids = {}
        # No adapter selected until prepare_adapter is called; generate then
        # passes lora_request=None instead of raising AttributeError.
        self.current_lora_request = None
        engine_args = dict(model=model_name, **(engine_init_kwargs or {}))
        self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))

        self.sampling_params = sampling_params if sampling_params is not None else {}
        self.tokenizer = tokenizer

    def prepare_adapter(
        self,
        adapter_id: Optional[str],
        adapter_path: Optional[str],
        weights_got_updated: bool,
    ) -> None:
        """
        Select the LoRA adapter used by subsequent `generate` calls.

        A fresh engine-side id is assigned when the weights changed, and also
        the first time an adapter is seen (previously a KeyError).
        """
        if weights_got_updated or adapter_id not in self.vllm_adapter_ids:
            self.vllm_adapter_ids[adapter_id] = generate_short_id()
        self.current_lora_request = LoRARequest(
            adapter_id,
            self.vllm_adapter_ids[adapter_id],
            adapter_path,
        )

    async def toggle_training_mode(self) -> None:
        """Put the engine to sleep (level 1) while training runs."""
        await self.engine.sleep(level=1)

    async def toggle_eval_mode(self) -> None:
        """Wake the engine up for generation."""
        await self.engine.wake_up()

    def shutdown(self) -> None:
        """No explicit close call; engine stops when process exits."""

    async def generate(
        self,
        input_token_ids: list[int],
        regex: Optional[str] = None,
        extract_thinking: bool = False,
    ) -> LLMInferenceOutput:
        """
        Generate one completion for already-tokenized input.

        Args:
            input_token_ids: Prompt token ids.
            regex: Optional pattern enforced through guided decoding.
            extract_thinking: When true, try to split the raw text into
                reasoning and final content.

        Returns:
            `LLMInferenceOutput` whose `out_token_ids` is a long tensor and
            whose `log_probs` is a tensor, or None when the engine returned
            no log-probabilities.

        Raises:
            RuntimeError: if the engine yields no output for the request.
        """
        import uuid  # local import: only needed for request ids

        guided = GuidedDecodingParams(regex=regex) if regex else None
        sp = SamplingParams(
            **self.sampling_params,
            guided_decoding=guided,
            output_kind=RequestOutputKind.FINAL_ONLY,
        )

        prompt = TokensPrompt(prompt_token_ids=input_token_ids)
        # A random id cannot collide between concurrent requests, unlike one
        # derived from the event-loop clock.
        request_id = f"req-{uuid.uuid4().hex}"
        result_generator = self.engine.generate(
            prompt,
            sp,
            request_id,
            lora_request=self.current_lora_request,
        )

        res = None
        async for out in result_generator:  # with FINAL_ONLY this runs once
            res = out
        if res is None:
            raise RuntimeError(f"vLLM returned no output for request {request_id}")

        completion = res.outputs[0]
        raw_text = completion.text
        out_token_ids = completion.token_ids
        log_probs = None
        if completion.logprobs is not None:
            log_probs = torch.tensor(
                [
                    logprob_dict[token_id].logprob
                    for token_id, logprob_dict in zip(
                        out_token_ids, completion.logprobs
                    )
                ]
            )
        out_token_ids = torch.tensor(out_token_ids, dtype=torch.long)
        content = raw_text
        reasoning_content = None

        if extract_thinking:
            # NOTE(review): pattern kept exactly as in the reviewed text;
            # confirm it still carries the intended reasoning delimiters.
            m = re.match(
                r"^\n\n([\s\S]*?)\n\n(.*)$", raw_text, flags=re.DOTALL
            )
            if m:
                reasoning_content = m.group(1)
                content = m.group(2)
        return LLMInferenceOutput(
            content=content,
            reasoning_content=reasoning_content,
            log_probs=log_probs,
            out_token_ids=out_token_ids,
        )
# Static list copied from the public OpenAI docs until a discovery endpoint is exposed.
reasoning_models = [
    "gpt-5-nano",
    "gpt-5-mini",
    "gpt-5",
    "o1-mini",
    "o1",
    "o1-pro",
    "o3-mini",
    "o3",
    "o3-pro",
    "o4-mini",
    "o4",
    "o4-pro",
]


class LargeLanguageModelOpenAI:
    """Tiny async wrapper around the OpenAI Responses API."""

    def __init__(
        self,
        llm_id: str = "",
        model: str = "gpt-4.1-mini",
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout_s: float = 300.0,
        regex_max_attempts: int = 10,
        sampling_params: Optional[Dict[str, Any]] = None,
        init_kwargs: Optional[Dict[str, Any]] = None,
        output_directory: Optional[str] = None,
    ) -> None:
        """
        Create the async client.

        Args:
            llm_id: Key under which the policy is exposed.
            model: Model name sent with each request.
            api_key: API key; falls back to the OPENAI_API_KEY env variable.
            base_url: Optional alternative API endpoint.
            timeout_s: Client timeout in seconds.
            regex_max_attempts: Attempts made to obtain a regex-matching
                response (at least 1).
            sampling_params: Extra keyword arguments for every request.
            init_kwargs: Accepted but not used by this class.
            output_directory: Accepted but not used by this class.

        Raises:
            RuntimeError: if no API key is available.
        """
        self.llm_id = llm_id
        self.model = model
        key = api_key or os.getenv("OPENAI_API_KEY")
        if not key:
            raise RuntimeError(
                "Set OPENAI_API_KEY as global environment variable or pass api_key."
            )
        client_kwargs: Dict[str, Any] = {"api_key": key, "timeout": timeout_s}
        if base_url:
            client_kwargs["base_url"] = base_url
        self.client = AsyncOpenAI(**client_kwargs)

        # Copy so that the default None works and the caller's dict is not
        # mutated when the reasoning settings are added below.
        self.sampling_params: Dict[str, Any] = dict(sampling_params or {})
        self.use_reasoning = model in reasoning_models
        if self.use_reasoning:
            self.sampling_params["reasoning"] = {
                "effort": "low",
                "summary": "detailed",
            }
        self.regex_max_attempts = max(1, int(regex_max_attempts))

    def get_inference_policies(self) -> Dict[str, Callable]:
        """Return `{llm_id: get_action}`."""
        return {
            self.llm_id: self.get_action,
        }

    async def prepare_adapter_for_inference(self, *args: Any, **kwargs: Any) -> None:
        """No-op kept for interface parity with adapter-based models."""
        await asyncio.sleep(0)

    async def toggle_eval_mode(self, *args: Any, **kwargs: Any) -> None:
        """No-op kept for interface parity with adapter-based models."""
        await asyncio.sleep(0)

    async def toggle_training_mode(self, *args: Any, **kwargs: Any) -> None:
        """No-op kept for interface parity with adapter-based models."""
        await asyncio.sleep(0)

    async def export_adapters(self, *args: Any, **kwargs: Any) -> None:
        """No-op kept for interface parity with adapter-based models."""
        await asyncio.sleep(0)

    async def checkpoint_all_adapters(self, *args: Any, **kwargs: Any) -> None:
        """No-op kept for interface parity with adapter-based models."""
        await asyncio.sleep(0)

    def extract_output_from_response(self, resp: "Response") -> LLMInferenceOutput:
        """
        Convert an API response into an `LLMInferenceOutput`.

        When the response has more than one output item, item 0 is read as
        the reasoning summary and item 1 as the message; otherwise item 0 is
        the message and there is no reasoning content.
        """
        if len(resp.output) > 1:
            summary = resp.output[0].summary
            if summary != []:
                reasoning_content = summary[0].text
                reasoning_content = f"OpenAI Reasoning Summary: {reasoning_content}"
            else:
                reasoning_content = None
            content = resp.output[1].content[0].text
        else:
            reasoning_content = None
            content = resp.output[0].content[0].text

        return LLMInferenceOutput(
            content=content,
            reasoning_content=reasoning_content,
        )

    # Retries on any exception with exponential backoff and effectively no
    # limit on time or number of tries.
    @backoff.on_exception(
        backoff.expo, Exception, max_time=10**10, max_tries=10**10
    )
    async def get_action(
        self,
        state: list[ChatTurn],
        agent_id: str,
        regex: Optional[str] = None,
    ) -> LLMInferenceOutput:
        """
        Query the model with the chat history and return its output.

        With `regex`, a constraint message is prepended and the response is
        validated client-side with `fullmatch`. After each mismatch a
        correction message is appended and the request is retried, up to
        `regex_max_attempts` times. If no attempt matches, the last
        (non-matching) output is returned.
        """
        # Remove any non-role/content keys from the prompt else openai will error.
        prompt = [{"role": p.role, "content": p.content} for p in state]

        if not regex:
            # Simple, unconstrained generation
            resp = await self.client.responses.create(
                model=self.model,
                input=prompt,
                **self.sampling_params,
            )
            return self.extract_output_from_response(resp)

        # Prime the model with the constraint and validate client-side.
        constraint_msg = {
            "role": "user",
            "content": (
                f"Output must match this regex exactly: {regex} \n"
                "Return only the matching string, with no quotes or extra text."
            ),
        }
        prompt = [constraint_msg, *prompt]
        pattern = re.compile(regex)
        for _ in range(self.regex_max_attempts):
            resp = await self.client.responses.create(
                model=self.model,
                input=prompt,
                **self.sampling_params,
            )
            policy_output = self.extract_output_from_response(resp)
            if pattern.fullmatch(policy_output.content):
                return policy_output
            prompt = [
                *prompt,
                {
                    "role": "user",
                    "content": (
                        f"Invalid response format. Expected format (regex): {regex}\n Please try again and provide ONLY a response that matches this regex."
                    ),
                },
            ]
        return policy_output

    def shutdown(self) -> None:
        """Drop the reference to the API client."""
        self.client = None
+""" + +import logging +import os +import re +import sys +import uuid +from collections.abc import Callable +from copy import deepcopy +from datetime import datetime +from typing import Literal + +import httpx +import requests +import torch +import torch.nn as nn +from torch.optim import SGD, Adam, AdamW, RMSprop +from transformers import AutoModelForCausalLM, AutoTokenizer + +from mllm.chat_utils.apply_template import chat_turns_to_token_ids +from mllm.markov_games.rollout_tree import ChatTurn +from mllm.models.adapter_training_wrapper import AdapterWrapper +from mllm.models.inference_backend import LLMInferenceOutput +from mllm.models.inference_backend_dummy import DummyInferenceBackend +from mllm.models.inference_backend_vllm import VLLMAsyncBackend + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler(sys.stdout)) + +AdapterID = str +PolicyID = str + + +class LeanLocalLLM: + """ + Wrapper that manages local HuggingFace models, adapters, and inference backends. 
+ """ + + def __init__( + self, + llm_id: str = "base_llm", + model_name: str = "Qwen/Qwen3-4B-Instruct-2507", + device: str = "cuda", + hf_kwargs: dict = {}, + adapter_configs: dict = {}, + output_directory: str = "./models/", + inference_backend: Literal["vllm", "dummy"] = "vllm", + inference_backend_sampling_params: dict = {}, + inference_backend_init_kwargs: dict = {}, + initial_adapter_paths: dict[str, str] | None = None, + initial_buffer_paths: list[str] | None = None, + enable_thinking: bool = None, + regex_max_attempts: int = -1, + max_thinking_characters: int = 0, + ): + self.inference_backend_name = inference_backend + self.output_directory = output_directory + self.llm_id = llm_id + self.device = torch.device(device) if device else torch.device("cuda") + self.model_name = model_name + self.adapter_configs = adapter_configs + self.adapter_ids = list(adapter_configs.keys()) + self.enable_thinking = enable_thinking + self.regex_max_attempts = regex_max_attempts + self.initial_buffer_paths = initial_buffer_paths + self.max_thinking_characters = max_thinking_characters + self.regex_retries_count = 0 + + # Optional user-specified initial adapter weight locations (local or HF Hub) + # Format: {adapter_id: path_or_repo_id} + self.initial_adapter_paths: dict[str, str] | None = initial_adapter_paths + + # Path management / imports + self.save_path = str(os.path.join(output_directory, model_name, "adapters")) + self.adapter_paths = { + adapter_id: os.path.join(self.save_path, adapter_id) + for adapter_id in self.adapter_ids + } + checkpoints_dir = os.path.join(self.output_directory, "checkpoints") + self.past_agent_adapter_paths = {} + if os.path.isdir(checkpoints_dir): + for dirname in os.listdir(checkpoints_dir): + dirpath = os.path.join(checkpoints_dir, dirname) + if os.path.isdir(dirpath): + self.past_agent_adapter_paths[f"{dirname}_buffer"] = os.path.join( + dirpath, "agent_adapter" + ) + logger.info( + f"Loaded {len(self.past_agent_adapter_paths)} past agent 
adapters from checkpoints directory." + ) + if self.initial_buffer_paths is not None: + previous_count = len(self.past_agent_adapter_paths) + for path in self.initial_buffer_paths: + if os.path.isdir(path): + for dirname in os.listdir(path): + dirpath = os.path.join(path, dirname) + if os.path.isdir(dirpath): + self.past_agent_adapter_paths[ + f"{dirname}_buffer" + ] = os.path.join(dirpath, "agent_adapter") + else: + logger.warning( + f"Initial buffer path {path} does not exist or is not a directory." + ) + logger.info( + f"Loaded {len(self.past_agent_adapter_paths) - previous_count} past agent adapters from user-specified initial buffer paths." + ) + self.past_agent_adapter_ids = list(self.past_agent_adapter_paths.keys()) + + # ID management for tracking adapter versions + self.adapter_train_ids = { + adapter_id: self.short_id_generator() for adapter_id in self.adapter_ids + } + # Initialize tokenizer + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + # Setup padding token to be same as EOS token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.weights_got_updated: dict[AdapterID, bool] = { + adapter_id: False for adapter_id in self.adapter_ids + } + self.weights_got_updated.update( + {adapter_id: False for adapter_id in self.past_agent_adapter_ids} + ) + self.current_lora_request = None + self.currently_loaded_adapter_id = None + + # --------------------------------------------------------- + # Init HF model, peft adapters + # --------------------------------------------------------- + self.shared_hf_llm = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_name, + **hf_kwargs, + ) + self.hf_adapters = {} + self.optimizers = {} + for adapter_id in self.adapter_ids: + # Prefer output-folder path if it exists; else fall back to user-specified initial path if provided + output_path = os.path.join(self.save_path, adapter_id) + chosen_path: str | None = 
def get_inference_policies(self) -> dict[PolicyID, Callable]:
    """
    Build the async inference policies exposed by this model.

    One policy is created for every id in ``self.adapter_ids`` followed by
    every id in ``self.past_agent_adapter_ids``. Each policy is stored under
    the key ``"<llm_id>/<adapter_id>"``; when awaited it first calls
    ``prepare_adapter_for_inference`` for its adapter and then returns the
    result of ``get_action``.

    Returns:
        Mapping from policy key to the async policy callable.
    """

    def build_policy(bound_adapter_id):
        # The adapter id is bound as a default argument so that each policy
        # keeps its own id instead of the last value of the loop variable.
        async def policy(
            state: list[ChatTurn],
            agent_id: str,
            regex: str | None = None,
            _adapter_id=bound_adapter_id,
        ):
            self.prepare_adapter_for_inference(adapter_id=_adapter_id)
            response = await self.get_action(state, agent_id, regex)
            return response

        return policy

    # Trainable adapters first, then past-agent adapters: same insertion
    # order (and same overwrite-on-duplicate behaviour) as two separate loops.
    every_adapter_id = [*self.adapter_ids, *self.past_agent_adapter_ids]
    return {
        self.llm_id + "/" + adapter_id: build_policy(adapter_id)
        for adapter_id in every_adapter_id
    }
def prepare_adapter_for_inference(self, adapter_id: AdapterID) -> None:
    """
    Ask the inference backend to make ``adapter_id`` ready for generation.

    The adapter path is looked up in ``self.adapter_paths`` first and falls
    back to ``self.past_agent_adapter_paths`` (``None`` when the id is in
    neither). After the backend call the adapter is recorded as the
    currently loaded one and its "weights got updated" flag is cleared.

    Args:
        adapter_id: Id of the adapter to prepare; must be a key of
            ``self.weights_got_updated``.
    """
    past_agent_path = self.past_agent_adapter_paths.get(adapter_id, None)
    resolved_path = self.adapter_paths.get(adapter_id, past_agent_path)
    needs_reload = self.weights_got_updated[adapter_id]

    self.inference_backend.prepare_adapter(
        adapter_id,
        adapter_path=resolved_path,
        weights_got_updated=needs_reload,
    )

    self.currently_loaded_adapter_id = adapter_id
    self.weights_got_updated[adapter_id] = False
def export_adapters(self) -> None:
    """
    Flag every adapter as updated and write the adapters to ``self.save_path``.

    All ids in ``self.adapter_ids`` and ``self.past_agent_adapter_ids`` get
    ``weights_got_updated[...] = True`` so the next
    ``prepare_adapter_for_inference`` call reports fresh weights to the
    inference backend.

    Only the first adapter wrapper is asked to save: per the original note,
    any peft wrapper saves all adapters by default, not just the one
    currently loaded.

    When no adapters are configured there is nothing to save and the method
    returns after updating the flags (previously this raised ``IndexError``).
    """
    # New version of the adapters available
    for adapter_id in (*self.adapter_ids, *self.past_agent_adapter_ids):
        self.weights_got_updated[adapter_id] = True

    if not self.adapter_ids:
        # Nothing to export; avoids indexing into an empty list.
        return

    first_adapter_id = self.adapter_ids[0]
    self.hf_adapters[first_adapter_id].save_pretrained(self.save_path)
def short_id_generator(self) -> str:
    """
    Generate a short random ID for tracking adapter versions.

    Returns:
        str: The first 8 decimal digits of a random UUID4's integer value.
    """
    uuid_digits = str(uuid.uuid4().int)
    short_id = uuid_digits[:8]
    return short_id
def forward(self, input_ids, attention_mask=None, **kwargs):
    """
    Compute one scalar value per input position.

    The critic adapter is called with ``output_hidden_states=True``; the
    last entry of the returned ``hidden_states`` is passed through
    ``self.value_head`` and the trailing size-1 dimension is squeezed away.

    Args:
        input_ids: Token ids forwarded to the critic adapter.
        attention_mask: Optional mask forwarded to the critic adapter.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        Value estimates, annotated as (B, S) in the original code.
    """
    # AdapterWrapper activates its own adapter internally
    adapter_inputs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=True,
    )
    adapter_outputs = self.critic_adapter(**adapter_inputs, **kwargs)

    final_layer_states = adapter_outputs.hidden_states[-1]  # (B, S, H)
    per_token_values = self.value_head(final_layer_states)  # (B, S, 1)
    return per_token_values.squeeze(-1)  # (B, S)
+""" diff --git a/src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc b/src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb3d8d4bee48323b7b756ccf91dc2a191e067e3b Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc b/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46eae35640116412e52dc6fbbc7f7038c07dd49d Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc b/src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0bfb113d53be2b81539815445c95d5e83d1dcc6 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc b/src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..854620cf63cea8ec419bcb2a5f63788abcb01bf6 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/training/annealing_methods.py b/src_code_for_reproducibility/training/annealing_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..591d91f7720880fc202b116b27b15b996c256dc4 --- /dev/null +++ b/src_code_for_reproducibility/training/annealing_methods.py @@ -0,0 +1,20 @@ +""" +File: 
def whiten_advantages(advantages: torch.Tensor) -> torch.Tensor:
    """
    Normalize advantages to zero mean / unit variance over all elements.

    Useful for variance reduction before computing gradients.

    Args:
        advantages: Tensor of advantages (any shape); statistics are taken
            over every element.

    Returns:
        Tensor of the same shape as ``advantages``. Inputs with fewer than
        two elements return zeros: the sample standard deviation is
        undefined there (``torch.std`` yields NaN), and zero is what a
        constant multi-element batch whitens to.
    """
    if advantages.numel() < 2:
        # torch.std of a single element is NaN, which would poison the loss.
        return torch.zeros_like(advantages)

    centered = advantages - torch.mean(advantages)
    # The epsilon keeps the division finite when all advantages are equal.
    return centered / (torch.std(advantages) + 1e-9)
def get_discounted_state_visitation_credits(
    credits: torch.Tensor, discount_factor: float  # (B, T)
) -> torch.Tensor:
    """
    Scale the credit at timestep ``t`` by ``discount_factor ** t``.

    With a discount factor below 1, later timesteps are therefore weighted
    less than earlier ones; timestep 0 is left unchanged.

    Args:
        credits: Credits indexed as (batch, timestep).
        discount_factor: Base of the geometric discount.

    Returns:
        Tensor with the same shape as ``credits``.
    """
    num_steps = credits.shape[1]
    step_index = torch.arange(num_steps, device=credits.device)
    per_step_discount = discount_factor**step_index
    return credits * per_step_discount
def get_advantage_alignment_weights(
    advantages: torch.Tensor,  # (B, T)
    exclude_k_equals_t: bool,
    gamma: float,
    discount_t: bool = False,
) -> torch.Tensor:
    r"""
    Compute the discounted running sum of advantages used as alignment weights.

    For every timestep :math:`t` this returns

    \[
        w_t = \sum_{k \leq t} \gamma^{t-k} A(s_k, a_k, b_k)
    \]

    (the sum runs over :math:`k < t` when ``exclude_k_equals_t`` is True).
    The :math:`\beta` factor and the other agent's advantage are NOT applied
    here; the caller multiplies them in.

    Args:
        advantages: Advantages indexed as (batch, timestep).
        exclude_k_equals_t: Drop the ``k == t`` term from the sum.
        gamma: Discount factor :math:`\gamma`.
        discount_t: When True, additionally add
            :math:`(\gamma^{t} - 1) A_t` to each weight. Defaults to False
            so callers that omit the argument keep working.

    Returns:
        Tensor of weights with the same (batch, timestep) shape.
    """
    T = advantages.shape[1]
    device = advantages.device
    steps = torch.arange(0, T, 1, device=device)
    gamma_row = gamma * torch.ones((1, T), device=device)

    # A_k * gamma^{-k}; multiplied by gamma^{t} below to obtain gamma^{t-k}.
    discounted_advantages = advantages * gamma_row ** (-steps)

    # Upper-triangular ones select k <= t; removing the diagonal gives k < t.
    causal_mask = torch.triu(torch.ones((T, T), device=device))
    if exclude_k_equals_t:
        causal_mask = causal_mask - torch.eye(T, device=device)

    ad_align_weights = discounted_advantages @ causal_mask
    t_discounts = gamma_row**steps
    ad_align_weights = t_discounts * ad_align_weights

    if discount_t:
        time_discounted_advantages = advantages * t_discounts
        ad_align_weights = ad_align_weights - advantages + time_discounted_advantages
    return ad_align_weights
+ + Returns: + torch.Tensor: The advantage alignment credits. + """ + + assert a1.dim() == a2.dim() == 2, "Advantages must be of shape (B, S)" + if a1_alternative is not None: + assert ( + a1_alternative.dim() == 3 + ), "Alternative advantages must be of shape (B, S, A)" + B, T, A = a1_alternative.shape + else: + B, T = a1.shape + assert a1.shape == a2.shape, "Not the same shape" + + sub_tensors = {} + + if use_old_ad_align: + ad_align_weights = get_advantage_alignment_weights( + advantages=a1, + exclude_k_equals_t=exclude_k_equals_t, + gamma=gamma, + discount_t=discount_t, + ) + sub_tensors["ad_align_weights_prev"] = ad_align_weights + if exclude_k_equals_t: + ad_align_weights = gamma * ad_align_weights + else: + assert a1_alternative is not None, "Alternative advantages must be provided" + if rloo_branch: + a1_alternative = torch.cat([a1.unsqueeze(2), a1_alternative], dim=2) + a1_alternative = a1_alternative.mean(dim=2) + a1, baseline = get_rloo_credits(a1) + if reuse_baseline: + a1_alternative = a1_alternative - baseline + else: + a1_alternative, _ = get_rloo_credits(a1_alternative) + assert a1.shape == a1_alternative.shape, "Not the same shape" + ad_align_weights = get_advantage_alignment_weights( + advantages=a1_alternative, + exclude_k_equals_t=exclude_k_equals_t, + gamma=gamma, + ) + sub_tensors["ad_align_weights"] = ad_align_weights + + # Use sign + if use_sign: + assert beta == 1.0, "beta should be 1.0 when using sign" + positive_signs = ad_align_weights > 0 + negative_signs = ad_align_weights < 0 + ad_align_weights[positive_signs] = 1 + ad_align_weights[negative_signs] = -1 + sub_tensors["ad_align_weights_sign"] = ad_align_weights + # (rest are 0) + + ################### + # Process weights + ################### + + # Use clipping + if clipping not in [0.0, None]: + upper_mask = ad_align_weights > 1 + lower_mask = ad_align_weights < -1 + + ad_align_weights = torch.clip( + ad_align_weights, + -clipping, + clipping, + ) + clipping_ratio = ( + 
class Tally:
    """
    Minimal scalar-first tally.

    - Keys are strings.
    - First add stores a scalar; subsequent adds upgrade to a list of scalars.
    """

    def __init__(self):
        # Maps metric path -> scalar (single add) or list of scalars.
        self.stats = {}

    def reset(self):
        """Reset all recorded metrics back to an empty dictionary."""
        self.stats = {}

    def _coerce_scalar(self, value: Union[int, float]) -> Union[int, float]:
        """
        Return ``value`` as a plain number.

        Objects exposing a callable ``item()`` are unwrapped through it
        (best effort: a failing ``item()`` leaves the value untouched).

        Raises:
            AssertionError: If the result is not a ``numbers.Number``.
        """
        if hasattr(value, "item") and callable(getattr(value, "item")):
            try:
                value = value.item()
            except Exception:
                # Best effort only; the Number check below rejects bad values.
                pass
        if isinstance(value, Number):
            return value
        raise AssertionError("Metric must be a scalar number")

    def add_metric(self, path: str, metric: Union[int, float]):
        """
        Accumulate a metric under ``path``.

        The metric is converted with ``float()``. The first value for a path
        is stored as a scalar; the second turns the entry into a two-element
        list and later values are appended to it.

        Args:
            path: Metric key; must be a string.
            metric: Value convertible by ``float()``.
        """
        metric = float(metric)
        assert isinstance(path, str), "Path must be a string."

        scalar = self._coerce_scalar(metric)
        existing = self.stats.get(path)
        if existing is None:
            self.stats[path] = scalar
        elif isinstance(existing, list):
            existing.append(scalar)
        else:
            self.stats[path] = [existing, scalar]

    def save(self, identifier: str, folder: str):
        """
        Persist the tally as ``<folder>/<identifier>.tally.pkl``.

        Saving stays best effort: a failure while writing the pickle is
        logged as a warning instead of being raised, so it cannot abort the
        caller, but it is no longer silent.
        """
        import logging
        import pickle

        os.makedirs(name=folder, exist_ok=True)
        pkl_path = os.path.join(folder, f"{identifier}.tally.pkl")
        payload = self.stats
        try:
            with open(pkl_path, "wb") as f:
                pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not save tally to %s", pkl_path, exc_info=True
            )
class RolloutTallyItem:
    def __init__(
        self,
        crn_ids: list[str],
        rollout_ids: list[str],
        agent_ids: list[str],
        metric_matrix: torch.Tensor,
    ):
        """
        Container pairing rollout identifiers with a metric matrix.

        Identifier arguments given as tensors are converted to numpy arrays;
        anything else is stored as passed. ``metric_matrix`` is detached,
        moved to CPU, reshaped to ``(1, N)`` when one-dimensional, and stored
        as a numpy array.
        """

        def as_host_value(ids):
            # Tensors become numpy arrays; other containers are kept as-is.
            if isinstance(ids, torch.Tensor):
                return ids.detach().cpu().numpy()
            return ids

        host_crn_ids = as_host_value(crn_ids)
        host_rollout_ids = as_host_value(rollout_ids)
        host_agent_ids = as_host_value(agent_ids)
        self.crn_ids = host_crn_ids
        self.rollout_ids = host_rollout_ids
        self.agent_ids = host_agent_ids

        matrix = metric_matrix.detach().cpu()
        assert (
            0 < matrix.ndim <= 2
        ), "Metric matrix must have less than or equal to 2 dimensions"
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        # Convert to float32 if tensor is in BFloat16 format (not supported by numpy)
        if matrix.dtype == torch.bfloat16:
            matrix = matrix.float()
        self.metric_matrix = matrix.numpy()
def get_from_nested_dict(self, dictio: dict, path: list[str]):
    """
    Return the value stored at ``path`` inside the nested dict ``dictio``.

    Note that this lookup has a side effect: every missing intermediate key
    (all keys but the last) is created as an empty dict via ``setdefault``.

    Args:
        dictio: Root of the nested dictionary.
        path: List of keys leading to the entry.

    Returns:
        The stored value, or ``None`` when the final key is absent.
    """
    assert isinstance(path, list), "Path must be list."
    node = dictio
    for key in path[:-1]:
        node = node.setdefault(key, {})
    return node.get(path[-1], None)
def save(self, identifier: str, folder: str):
    """
    Persist the tally as ``<folder>/<identifier>.rt_tally.pkl``.

    The pickle holds ``{"metrics": self.metrics}``. Saving stays best
    effort: a failure while writing is logged as a warning instead of being
    raised, so it cannot abort the caller, but it is no longer silent.
    """
    import logging
    import pickle

    os.makedirs(name=folder, exist_ok=True)

    # Pickle only (fastest, exact structure with numpy/scalars at leaves)
    pkl_path = os.path.join(folder, f"{identifier}.rt_tally.pkl")
    payload = {"metrics": self.metrics}
    try:
        with open(pkl_path, "wb") as f:
            pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not save rollout tally to %s", pkl_path, exc_info=True
        )
+ + - One DataFrame per rollout_id in `paths` + - Index = timestep (int) + - Columns are added incrementally via `add_contexts()` and `add_data()` + - Cells may contain scalars, strings, or lists (dtype=object) + """ + + def __init__( + self, + tokenizer: AutoTokenizer, + paths: List[str], + max_context_length: int = 30, + ): + """ + Args: + tokenizer: HuggingFace tokenizer used to convert tids -> tokens + paths: rollout identifiers (parallel to batch dimension) + max_context_length: truncate context token lists to this length + """ + self.tokenizer = tokenizer + self.paths = paths + self.max_context_length = max_context_length + self.tally: Dict[str, pd.DataFrame] = {path: pd.DataFrame() for path in paths} + + # set later by setters + self.contexts: torch.Tensor | None = None + self.action_mask: torch.Tensor | None = None + self.range: Tuple[int, int] | None = None + + # --------- Utilities --------- + + def tids_to_str(self, tids: List[int]) -> List[str]: + """Convert a list of token IDs to a list of token strings.""" + return self.tokenizer.convert_ids_to_tokens(tids) + + def _ensure_ready(self): + """Validate that action mask and range are configured prior to writes.""" + assert self.action_mask is not None, "call set_action_mask(mask) first" + assert self.range is not None, "call set_range((start, end)) first" + + @staticmethod + def _sanitize_filename(name: Any) -> str: + """Make a safe filename from any rollout_id.""" + s = str(name) + bad = {os.sep, " ", ":", "|", "<", ">", '"', "'"} + if os.altsep is not None: + bad.add(os.altsep) + for ch in bad: + s = s.replace(ch, "_") + return s + + @staticmethod + def _pad_left(seq: List[Any], length: int, pad_val: Any = "") -> List[Any]: + """Left-pad a sequence to `length` with `pad_val`.""" + if len(seq) >= length: + return seq[-length:] + return [pad_val] * (length - len(seq)) + list(seq) + + # --------- Setters --------- + + def set_action_mask(self, action_mask: torch.Tensor): + """Register the (B, S) mask 
def add_contexts(self, contexts: torch.Tensor):
    """
    Add a single 'context' column (list[str]) for valid steps.

    For every rollout `i` of the current mini-batch and every timestep `t`
    where `self.action_mask[i, t]` is truthy, the stored value is the token
    strings of `contexts[i, max(0, t - N + 1) : t + 1]` with
    `N = self.max_context_length`, left-padded with "" to exactly N entries.

    Args:
        contexts: (B, S) tensor of token ids. Row `i` is written to the frame
            of `self.paths[self.range[0] + i]`.

    Raises:
        AssertionError: if the action mask or the range has not been set.
    """
    self._ensure_ready()

    current_paths = self.paths[self.range[0] : self.range[1]]
    B, _ = contexts.shape
    N = self.max_context_length

    # One device transfer up front; the per-step slicing below stays on CPU.
    contexts_cpu = contexts.detach().to("cpu")

    for i in range(B):
        rollout_id = current_paths[i]
        df = self.tally.get(rollout_id, pd.DataFrame())

        valid_idx = torch.nonzero(
            self.action_mask[i].bool(), as_tuple=False
        ).squeeze(-1)
        if valid_idx.numel() == 0:
            # Nothing to write, but keep the rollout registered in the tally.
            self.tally[rollout_id] = df
            continue

        idx_list = valid_idx.tolist()

        # Make sure every valid step has a row in the frame.
        if df.empty:
            df = pd.DataFrame(index=idx_list)
        else:
            new_index = sorted(set(df.index.tolist()) | set(idx_list))
            if list(df.index) != new_index:
                df = df.reindex(new_index)

        # Build one fixed-length token window per valid step, using the
        # class's shared padding helper instead of a second copy of it.
        ctx_token_lists = []
        for t in idx_list:
            start = max(0, t - N + 1)
            window_ids = contexts_cpu[i, start : t + 1].tolist()
            window_toks = self.tids_to_str([int(x) for x in window_ids])
            ctx_token_lists.append(self._pad_left(window_toks, N))

        # Object dtype so each cell can hold a list of strings.
        if "context" not in df.columns:
            df["context"] = pd.Series(index=df.index, dtype=object)
        df.loc[idx_list, "context"] = pd.Series(
            ctx_token_lists, index=idx_list, dtype=object
        )

        self.tally[rollout_id] = df
def save(self, path: str):
    """
    Write one CSV per rollout that has data, plus a timestamped manifest JSON.

    - Nothing is written when the tally is empty or every frame is empty.
    - Each CSV is named `<sanitized rollout id>_tokenwise.csv`, uses the index
      label 'timestep', and puts the 'context' column first when it exists.
    - Frames without any column are skipped.
    - A rollout whose CSV cannot be written is logged and left out of the
      manifest; the remaining rollouts are still saved.
    - The manifest holds metadata only (creation time, max context length,
      number of tallied rollouts, and one entry per written CSV).

    Args:
        path: output directory; created if it does not exist.
    """
    import logging
    from datetime import datetime

    if not self.tally or all(df.empty for df in self.tally.values()):
        return

    log = logging.getLogger(__name__)
    os.makedirs(path, exist_ok=True)
    now = datetime.now()

    manifest = {
        "created_at": f"{now:%Y-%m-%d %H:%M:%S}",
        "max_context_length": self.max_context_length,
        "num_rollouts": len(self.tally),
        "rollouts": [],
    }

    for rid, df in self.tally.items():
        rid_str = str(rid)
        safe_name = self._sanitize_filename(rid_str)
        csv_path = os.path.join(path, f"{safe_name}_tokenwise.csv")

        # Put 'context' first when present. A frame that only has metric
        # columns is still written instead of being dropped by a KeyError.
        cols = [c for c in df.columns if c != "context"]
        if "context" in df.columns:
            cols.insert(0, "context")
        if not cols:
            continue

        try:
            df[cols].to_csv(csv_path, index=True, index_label="timestep")
        except Exception:
            # Best effort: one bad rollout must not lose the others, but the
            # failure has to be visible.
            log.warning(
                "Could not write tokenwise CSV for rollout %r to %s",
                rid_str,
                csv_path,
                exc_info=True,
            )
            continue

        manifest["rollouts"].append(
            {
                "rollout_id": rid_str,
                "csv": csv_path,
                "num_rows": int(df.shape[0]),
                "columns": cols,
            }
        )

    manifest_path = os.path.join(
        path, f"tokenwise_manifest_{now:%Y-%m-%d___%H-%M-%S}.json"
    )
    with open(manifest_path, "w", encoding="utf-8") as fp:
        json.dump(manifest, fp, indent=2)
def process_training_chat(
    tokenizer: AutoTokenizer,
    chat_history: list[TrainingChatTurn],
    entropy_mask_regex: str | None = None,
    exploration_prompts_to_remove: list[str] | None = None,
    use_engine_out_token_ids: bool = False,
) -> tuple[
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
]:
    """Concatenate the token ids of a chat and build aligned per-token tensors.

    No tokenization happens here: the ids of each turn are read from the turn
    itself (`out_token_ids` for assistant turns, `chat_template_token_ids`
    for every other role) and concatenated in order. No truncation or padding
    is performed.

    Side effect: every string in `exploration_prompts_to_remove` is removed
    from `turn.content` in place before the entropy regex is evaluated.

    Args:
        tokenizer: currently unused; kept for interface compatibility.
        chat_history: ordered turns forming one dialogue.
        entropy_mask_regex: when given, a turn's tokens get `entropy_mask`
            True only if the pattern is found in the turn's content; when
            None, the entropy mask is True everywhere.
        exploration_prompts_to_remove: substrings to strip from turn contents;
            None (the default) removes nothing.
        use_engine_out_token_ids: currently unused; kept for interface
            compatibility.

    Returns:
        Six 1D tensors of equal length (total number of tokens), in order:
        - input_ids: concatenated token ids.
        - action_mask (bool): True on tokens of assistant turns.
        - entropy_mask (bool): see `entropy_mask_regex`.
        - timesteps (int64): the originating turn's `time_step` per token.
        - state_ends_mask (bool): True on the last token of every turn whose
          `is_state_end` is set.
        - engine_log_probs: the turn's `log_probs` on assistant tokens,
          zeros elsewhere.
    """
    prompts_to_remove = exploration_prompts_to_remove or ()

    input_ids = []
    action_mask = []
    entropy_mask = []
    timesteps = []
    state_ends_mask = []
    engine_log_probs = []

    for turn in chat_history:
        # Remove exploration prompts from training data (mutates the turn).
        for exploration_prompt in prompts_to_remove:
            if exploration_prompt in turn.content:
                turn.content = turn.content.replace(exploration_prompt, "")

        keep_entropy = (
            entropy_mask_regex is None
            or regex.search(entropy_mask_regex, turn.content) is not None
        )

        if turn.role == "assistant":
            turn_ids = turn.out_token_ids
            nb_ids = turn_ids.numel()
            action_mask.append(torch.ones(nb_ids, dtype=torch.bool))
            engine_log_probs.append(turn.log_probs)
        else:
            turn_ids = turn.chat_template_token_ids
            nb_ids = turn_ids.numel()
            action_mask.append(torch.zeros(nb_ids, dtype=torch.bool))
            engine_log_probs.append(torch.zeros(nb_ids, dtype=torch.float))

        turn_state_ends = torch.zeros(nb_ids, dtype=torch.bool)
        if turn.is_state_end:
            turn_state_ends[-1] = True  # last token is state end
        state_ends_mask.append(turn_state_ends)

        input_ids.append(turn_ids)
        entropy_mask.append(torch.full((nb_ids,), keep_entropy, dtype=torch.bool))
        timesteps.append(torch.ones(nb_ids) * turn.time_step)

    return (
        torch.cat(input_ids),
        torch.cat(action_mask),
        torch.cat(entropy_mask),
        torch.cat(timesteps).to(torch.long),
        torch.cat(state_ends_mask),
        torch.cat(engine_log_probs),
    )
def get_alternative_chat_histories(
    agent_id: str, root: RolloutTreeRootNode
) -> tuple[list[list[TrainingChatTurn]], list[torch.FloatTensor]]:
    """
    Traverse every unilateral branch under ``root`` and collect chat/reward histories.

    The main trajectory is walked node by node. At each branch node, every
    alternative child listed under ``branches[agent_id]`` yields one history:
    the main-trajectory turns/rewards accumulated *before* that node, followed
    by the alternative's own chat/rewards as returned by
    ``get_main_chat_list_and_rewards``. Histories are therefore ordered by time
    step first and by alternative second.

    A root without children yields two empty lists.

    Returns
    -------
    alternative_chats:
        One flattened list of chat turns per alternative branch.
    alternative_rewards:
        Matching list of reward tensors aligned with the chat histories.
    """
    alternative_chats = []
    alternative_rewards = []
    pre_branch_chat = []
    pre_branch_rewards = []

    current_node = root.child
    while current_node is not None:
        assert isinstance(
            current_node, RolloutTreeBranchNode
        ), "Current node should be a branch node."
        main_node = current_node.main_child
        branches = current_node.branches
        current_node = main_node.child

        # Get the `A` alternative trajectories branching off at this step.
        for alt_node in branches[agent_id]:
            post_branch_chat, post_branch_rewards = get_main_chat_list_and_rewards(
                agent_id=agent_id, root=alt_node
            )
            alternative_chats.append(pre_branch_chat + post_branch_chat)
            alternative_rewards.append(
                torch.cat([torch.tensor(pre_branch_rewards), post_branch_rewards])
            )

        # Only after branching does the main step become part of the shared
        # prefix used by later alternatives.
        pre_branch_chat.extend(
            TrainingChatTurn(time_step=main_node.time_step, **turn.model_dump())
            for turn in main_node.step_log.action_logs[agent_id].chat_turns
        )
        pre_branch_rewards.append(
            main_node.step_log.simulation_step_log.rewards[agent_id]
        )

    return alternative_chats, alternative_rewards
+ """ + + def __init__( + self, + ad_align_beta: float, + ad_align_gamma: float, + ad_align_exclude_k_equals_t: bool, + ad_align_use_sign: bool, + ad_align_clipping: float, + ad_align_force_coop_first_step: bool, + use_old_ad_align: bool, + use_time_regularization: bool, + rloo_branch: bool, + reuse_baseline: bool, + ad_align_beta_anneal_step: int = -1, + ad_align_beta_anneal_rate: float = 0.5, + min_ad_align_beta: float = 0.1, + mean_normalize_ad_align: bool = False, + whiten_adalign_advantages: bool = False, + whiten_adalign_advantages_time_step_wise: bool = False, + ad_align_discount_t: bool = False, + *args, + **kwargs, + ): + """ + Initialize the advantage alignment trainer. + Args: + ad_align_beta: Beta parameter for the advantage alignment. + ad_align_gamma: Gamma parameter for the advantage alignment. + ad_align_exclude_k_equals_t: Whether to include k = t in the advantage alignment. + ad_align_use_sign: Whether to use sign in the advantage alignment. + ad_align_clipping: Clipping value for the advantage alignment. + ad_align_force_coop_first_step: Whether to force coop on the first step of the advantage alignment. 
def set_agent_trajectory_data(
    self, agent_id: str, roots: list[RolloutTreeRootNode]
):
    """
    Materialize main and alternative trajectory tensors used by the advantage-alignment trainer.

    For every rollout tree in ``roots`` the main trajectory of ``agent_id`` is
    converted with ``process_training_chat`` and collected into a
    ``TrajectoryBatch``; its advantages come from
    ``get_advantages_with_critic_gradient_accumulation``. If the first tree
    exposes ``A > 0`` alternative branches for the agent, the ``A``
    alternatives of every main time step are processed the same way, and for
    each alternative only the advantage at its branching time step is kept,
    giving one ``(jT_i, A)`` matrix per rollout (``jT_i`` = number of rewards
    of rollout ``i``).

    Side effects:
        - appends one debug path per rollout to ``self.debug_path_list``;
        - sets ``self.batch_advantages``;
        - may anneal ``self.ad_align_beta`` (see ``ad_align_beta_anneal_step``);
        - stores an ``AdAlignTrainingData`` in ``self.training_data[agent_id]``.

    Args:
        agent_id: agent whose trajectories are extracted.
        roots: one rollout tree per rollout.
    """

    def tokenize(chat):
        return process_training_chat(
            tokenizer=self.tokenizer,
            chat_history=chat,
            entropy_mask_regex=self.entropy_mask_regex,
            exploration_prompts_to_remove=self.exploration_prompts_to_remove,
        )

    # Per-trajectory tensors, in the order returned by process_training_chat:
    # input_ids, action_mask, entropy_mask, timesteps, state_ends_mask,
    # engine_log_probs.
    main_fields = tuple([] for _ in range(6))
    alternative_fields = tuple([] for _ in range(6))

    batch_rollout_ids = []
    batch_crn_ids = []
    batch_rewards = []
    alternative_batch_rewards = []
    batch_branching_time_steps = []
    jT_list = []

    # Number of alternative actions per time step; 0 when the first tree has
    # no branches for this agent (or there is no tree at all).
    try:
        A = len(roots[0].child.branches[agent_id])
    except (AttributeError, IndexError, KeyError, TypeError):
        A = 0

    for root in roots:
        rollout_id = root.id
        self.debug_path_list.append(
            "mgid:" + str(rollout_id) + "_agent_id:" + agent_id
        )
        # Get main trajectory
        batch_rollout_ids.append(rollout_id)
        batch_crn_ids.append(root.crn_id)
        main_chat, main_rewards = get_main_chat_list_and_rewards(
            agent_id=agent_id, root=root
        )
        for sink, tensor in zip(main_fields, tokenize(main_chat)):
            sink.append(tensor)
        batch_rewards.append(main_rewards)
        # Number of timesteps inferred from reward tensor length.
        jT = main_rewards.numel()
        jT_list.append(jT)

        if A > 0:
            # Branching time step of each alternative: every main time step
            # is repeated A times (time step major, alternative minor).
            batch_branching_time_steps.extend(
                t for t in range(jT) for _ in range(A)
            )

            # Get all of the (jT*A) alternative trajectories in the tree
            alternative_chats, alternative_rewards = get_alternative_chat_histories(
                agent_id=agent_id, root=root
            )
            assert (
                len(alternative_chats) == A * jT
            ), "Incorrect number of alternative trajectories."

            for chat, rewards in zip(alternative_chats, alternative_rewards):
                for sink, tensor in zip(alternative_fields, tokenize(chat)):
                    sink.append(tensor)
                alternative_batch_rewards.append(rewards)

    (
        batch_input_ids,
        batch_action_mask,
        batch_entropy_mask,
        batch_timesteps,
        batch_state_ends_mask,
        batch_engine_log_probs,
    ) = main_fields

    trajectory_batch = TrajectoryBatch(
        rollout_ids=torch.tensor(batch_rollout_ids, dtype=torch.int32),  # (B,)
        crn_ids=torch.tensor(batch_crn_ids, dtype=torch.int32),
        agent_ids=[agent_id] * len(batch_rollout_ids),
        batch_input_ids=batch_input_ids,
        batch_action_mask=batch_action_mask,
        batch_entropy_mask=batch_entropy_mask,
        batch_timesteps=batch_timesteps,
        batch_state_ends_mask=batch_state_ends_mask,
        batch_engine_log_probs=batch_engine_log_probs,
        batch_rewards=batch_rewards,
    )

    # Get Advantages & Train Critic
    with resource_logger_context(
        logger, "Get advantages with critic gradient accumulation"
    ):
        self.batch_advantages: torch.FloatTensor = (
            self.get_advantages_with_critic_gradient_accumulation(trajectory_batch)
        )  # (B, jT)

    BAAs = None
    if A > 0:
        # For each of the B rollouts, at each of its jT steps (jagged: each
        # main rollout may have a different length), A alternative
        # trajectories were taken, i.e. sum(jT) * A trajectories in total.
        with resource_logger_context(logger, "Create alternative trajectory batch"):
            sum_jT = sum(jT_list)
            (
                alternative_batch_input_ids,
                alternative_batch_action_mask,
                alternative_batch_entropy_mask,
                alternative_batch_timesteps,
                alternative_batch_state_ends_mask,
                alternative_batch_engine_log_probs,
            ) = alternative_fields
            alternative_trajectory_batch = TrajectoryBatch(
                rollout_ids=torch.zeros(A * sum_jT, dtype=torch.int32),
                crn_ids=torch.zeros(A * sum_jT, dtype=torch.int32),
                agent_ids=[agent_id] * (A * sum_jT),
                batch_input_ids=alternative_batch_input_ids,
                batch_action_mask=alternative_batch_action_mask,
                batch_entropy_mask=alternative_batch_entropy_mask,
                batch_timesteps=alternative_batch_timesteps,
                batch_state_ends_mask=alternative_batch_state_ends_mask,
                batch_engine_log_probs=alternative_batch_engine_log_probs,
                batch_rewards=alternative_batch_rewards,
            )

        # BAAs stands for batch alternative advantages
        with resource_logger_context(
            logger, "Compute alternative advantage estimates"
        ):
            BAAs_list = self.get_advantages_with_critic_gradient_accumulation(
                alternative_trajectory_batch
            )  # list of length (sum(jT) * A), one jagged tensor per trajectory

            # Pad to (sum(jT) * A, P) so a single gather can pick, for each
            # alternative, the advantage at the step where it branched out.
            BAAs_padded = pad_sequence(
                BAAs_list, batch_first=True, padding_value=0.0
            )
            branch_idx = torch.tensor(
                batch_branching_time_steps,
                device=BAAs_padded.device,
                dtype=torch.long,
            )
            gathered = BAAs_padded.gather(
                dim=1, index=branch_idx.unsqueeze(1)
            ).squeeze(1)

            # Trajectories were appended rollout by rollout, time step by time
            # step, alternative by alternative, so the flat vector is laid out
            # as (sum(jT), A). Viewing it as (A, sum(jT)) would mix time steps
            # and alternatives whenever A > 1.
            gathered = gathered.view(sum_jT, A)
            BAAs = [
                blk.contiguous() for blk in torch.split(gathered, jT_list, dim=0)
            ]  # list of (jT_i, A)

    if self.ad_align_beta_anneal_step > 0:
        max_rollout_id = int(torch.max(trajectory_batch.rollout_ids).item()) + 1
        if (
            max_rollout_id % self.ad_align_beta_anneal_step == 0
            and self.past_ad_align_step != max_rollout_id
        ):
            self.ad_align_beta = max(
                self.ad_align_beta * self.ad_align_beta_anneal_rate,
                self.min_ad_align_beta,
            )
            logger.info(f"Annealing ad_align_beta to {self.ad_align_beta}")
            # Remember the step so the same batch is not annealed twice.
            self.past_ad_align_step = max_rollout_id

    self.training_data[agent_id] = AdAlignTrainingData(
        agent_id=agent_id,
        main_data=trajectory_batch,
        main_advantages=self.batch_advantages,
        alternative_advantages=BAAs,
    )
+ """ + logger.info(f"Receiving advantage packets.") + + assert ( + len(advantage_packets) > 0 + ), "At least one advantage packet must be provided." + + for agent_id, agent_data in self.training_data.items(): + coagent_advantage_packets = [ + packet for packet in advantage_packets if packet.agent_id != agent_id + ] + agent_rollout_ids = agent_data.main_data.rollout_ids + agent_advantages = agent_data.main_advantages + co_agent_advantages = [] + for rollout_id in agent_rollout_ids: + for co_agent_packet in coagent_advantage_packets: + if rollout_id in co_agent_packet.rollout_ids: + index = torch.where(rollout_id == co_agent_packet.rollout_ids)[ + 0 + ].item() + co_agent_advantages.append( + co_agent_packet.main_advantages[index] + ) + # assumes that its two player game, with one co-agent + break + assert len(co_agent_advantages) == len(agent_advantages) + B = len(agent_advantages) + assert all( + a.shape[0] == b.shape[0] + for a, b in zip(co_agent_advantages, agent_advantages) + ), "Number of advantages must match for advantage alignment." 
+ + # Get padded tensors (advantage alignment is invariant to padding) + lengths = torch.tensor( + [len(t) for t in agent_advantages], + device=self.device, + dtype=torch.long, + ) + padded_main_advantages = pad_sequence( + agent_advantages, batch_first=True, padding_value=0.0 + ) + if agent_data.alternative_advantages: + padded_alternative_advantages = pad_sequence( + agent_data.alternative_advantages, + batch_first=True, + padding_value=0.0, + ) # (B, P, A) + else: + padded_alternative_advantages = None + padded_co_agent_advantages = pad_sequence( + co_agent_advantages, batch_first=True, padding_value=0.0 + ) + + # Create training batch data + credits, sub_tensors = get_advantage_alignment_credits( + a1=padded_main_advantages, + a1_alternative=padded_alternative_advantages, + a2=padded_co_agent_advantages, + beta=self.ad_align_beta, + gamma=self.ad_align_gamma, + exclude_k_equals_t=self.ad_align_exclude_k_equals_t, + use_sign=self.ad_align_use_sign, + clipping=self.ad_align_clipping, + force_coop_first_step=self.ad_align_force_coop_first_step, + use_old_ad_align=self.use_old_ad_align, + use_time_regularization=self.use_time_regularization, + rloo_branch=self.rloo_branch, + reuse_baseline=self.reuse_baseline, + mean_normalize_ad_align=self.mean_normalize_ad_align, + whiten_adalign_advantages=self.whiten_adalign_advantages, + whiten_adalign_advantages_time_step_wise=self.whiten_adalign_advantages_time_step_wise, + discount_t=self.ad_align_discount_t, + ) + for key, value in sub_tensors.items(): + self.rollout_tally.add_metric( + path=[key], + rollout_tally_item=RolloutTallyItem( + crn_ids=agent_data.main_data.crn_ids, + rollout_ids=agent_data.main_data.rollout_ids, + agent_ids=agent_data.main_data.agent_ids, + metric_matrix=value, + ), + ) + + if not self.skip_discounted_state_visitation: + credits = get_discounted_state_visitation_credits( + credits, + self.discount_factor, + ) + self.rollout_tally.add_metric( + path=["discounted_state_visitation_credits"], + 
rollout_tally_item=RolloutTallyItem( + crn_ids=agent_data.main_data.crn_ids, + rollout_ids=agent_data.main_data.rollout_ids, + agent_ids=agent_data.main_data.agent_ids, + metric_matrix=sub_tensors[ + "discounted_state_visitation_credits" + ], + ), + ) + + # Slice back to jagged + advantage_alignment_credits = [credits[i, : lengths[i]] for i in range(B)] + # Replace stored training data for this agent by the concrete trajectory batch + # and attach the computed credits for policy gradient. + self.training_data[agent_id] = agent_data.main_data + self.training_data[agent_id].batch_credits = advantage_alignment_credits diff --git a/src_code_for_reproducibility/training/trainer_common.py b/src_code_for_reproducibility/training/trainer_common.py new file mode 100644 index 0000000000000000000000000000000000000000..0a0fb6f64707df0314ca4ed53611c2c592cc4377 --- /dev/null +++ b/src_code_for_reproducibility/training/trainer_common.py @@ -0,0 +1,1032 @@ +""" +File: mllm/training/trainer_common.py +Summary: Shared trainer utilities, base classes, and gradient helpers. 
+""" + +import logging +import os +import pickle +import sys +from abc import ABC, abstractmethod +from typing import Callable, Literal, Union + +import numpy as np +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from pandas._libs.tslibs.offsets import CBMonthBegin +from peft import LoraConfig +from torch.nn.utils.rnn import pad_sequence +from transformers import AutoModelForCausalLM, AutoTokenizer + +from mllm.markov_games.rollout_tree import * +from mllm.markov_games.rollout_tree import RolloutTreeRootNode +from mllm.training.annealing_methods import sigmoid_annealing +from mllm.training.credit_methods import ( + get_discounted_returns, + get_generalized_advantage_estimates, + get_rloo_credits, + whiten_advantages, + whiten_advantages_time_step_wise, +) +from mllm.training.tally_metrics import Tally +from mllm.training.tally_rollout import RolloutTally, RolloutTallyItem +from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally +from mllm.training.tokenize_chats import * +from mllm.training.tokenize_chats import process_training_chat +from mllm.training.training_data_utils import * +from mllm.training.training_data_utils import ( + TrainingBatch, + TrajectoryBatch, + get_tokenwise_credits, +) +from mllm.utils.resource_context import resource_logger_context + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +@dataclass +class TrainerAnnealingState: + annealing_step_counter: int = 0 + + +class BaseTrainer(ABC): + """ + Shared scaffolding for policy-gradient trainers (optimizer wiring, logging, etc.). + + Subclasses implement `set_agent_trajectory_data` / `share_advantage_data` + to plug in algorithm-specific behavior. 
+ """ + + def __init__( + self, + policy: AutoModelForCausalLM, + policy_optimizer: torch.optim.Optimizer, + critic: Union[AutoModelForCausalLM, None], + critic_optimizer: Union[torch.optim.Optimizer, None], + tokenizer: AutoTokenizer, + lr_scheduler: torch.optim.lr_scheduler.LRScheduler, + critic_lr_scheduler: Union[torch.optim.lr_scheduler.LRScheduler, None], + ###################################################################### + entropy_coeff: float, + entropy_topk: int, + entropy_mask_regex: Union[str, None], + kl_coeff: float, + gradient_clipping: Union[float, None], + restrict_tokens: Union[list[str], None], + mini_batch_size: int, + use_gradient_checkpointing: bool, + temperature: float, + device: str, + whiten_advantages: bool, + whiten_advantages_time_step_wise: bool, + use_gae: bool, + use_gae_lambda_annealing: bool, + gae_lambda_annealing_limit: float, + gae_lambda_annealing_method: Literal["sigmoid_annealing"], + gae_lambda_annealing_method_params: dict, + pg_loss_normalization: Literal["batch", "nb_tokens"], + use_rloo: bool, + skip_discounted_state_visitation: bool, + discount_factor: float, + enable_tokenwise_logging: bool, + save_path: str, + reward_normalizing_constant: float = 1.0, + critic_loss_type: Literal["mse", "huber"] = "huber", + exploration_prompts_to_remove: list[str] = [], + filter_higher_refprob_tokens_kl: bool = False, + truncated_importance_sampling_ratio_cap: float = 0.0, + importance_sampling_strategy: Literal[ + "per_token", "per_sequence" + ] = "per_token", + no_rloo_grouping: bool = False, + ): + """ + Initialize the REINFORCE trainer with reward shaping for multi-agent or single-agent training. + + Args: + model (AutoModelForCausalLM): The main policy model. + tokenizer (AutoTokenizer): Tokenizer for the model. + optimizer (torch.optim.Optimizer): Optimizer for the policy model. + lr_scheduler (torch.optim.lr_scheduler.LRScheduler): Learning rate scheduler for the policy model. 
+ critic (AutoModelForCausalLM or None): Critic model for value estimation (optional). + critic_optimizer (torch.optim.Optimizer or None): Optimizer for the critic model (optional). + critic_lr_scheduler (torch.optim.lr_scheduler.LRScheduler or None): LR scheduler for the critic (optional). + config (RtConfig): Configuration object for training. + """ + self.tokenizer = tokenizer + # self.tokenizer.padding_side = "left" # needed for flash attention + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.lr_scheduler = lr_scheduler + self.accelerator = Accelerator() + ( + self.policy, + self.policy_optimizer, + self.critic, + self.critic_optimizer, + ) = self.accelerator.prepare(policy, policy_optimizer, critic, critic_optimizer) + + self.critic_lr_scheduler = critic_lr_scheduler + self.tally = Tally() + + if use_gradient_checkpointing == True: + self.policy.gradient_checkpointing_enable(dict(use_reentrant=False)) + if critic is not None: + self.critic.gradient_checkpointing_enable(dict(use_reentrant=False)) + + self.save_path = save_path + + # Load trainer state if it exists + self.trainer_annealing_state_path = os.path.join( + self.save_path, "trainer_annealing_state.pkl" + ) + if os.path.exists(self.trainer_annealing_state_path): + logger.info( + f"Loading trainer state from {self.trainer_annealing_state_path}" + ) + self.trainer_annealing_state = pickle.load( + open(self.trainer_annealing_state_path, "rb") + ) + else: + self.trainer_annealing_state = TrainerAnnealingState() + + # Load policy optimizer state if it exists + self.policy_optimizer_path = os.path.join( + self.save_path, "policy_optimizer_state.pt" + ) + if os.path.exists(self.policy_optimizer_path): + logger.info( + f"Loading policy optimizer state from {self.policy_optimizer_path}" + ) + self.policy_optimizer.load_state_dict( + torch.load(self.policy_optimizer_path) + ) + + # Load critic optimizer state if it exists + self.critic_optimizer_path = 
os.path.join( + self.save_path, "critic_optimizer_state.pt" + ) + if ( + os.path.exists(self.critic_optimizer_path) + and self.critic_optimizer is not None + ): + logger.info( + f"Loading critic optimizer state from {self.critic_optimizer_path}" + ) + self.critic_optimizer.load_state_dict( + torch.load(self.critic_optimizer_path) + ) + self.device = self.accelerator.device + self.entropy_coeff = entropy_coeff + self.entropy_topk = entropy_topk + self.entropy_mask_regex = entropy_mask_regex + self.kl_coeff = kl_coeff + self.gradient_clipping = gradient_clipping + self.restrict_tokens = restrict_tokens + self.mini_batch_size = mini_batch_size + self.use_gradient_checkpointing = use_gradient_checkpointing + self.temperature = temperature + self.use_gae = use_gae + self.whiten_advantages = whiten_advantages + self.whiten_advantages_time_step_wise = whiten_advantages_time_step_wise + self.use_rloo = use_rloo + self.skip_discounted_state_visitation = skip_discounted_state_visitation + self.use_gae_lambda_annealing = use_gae_lambda_annealing + self.gae_lambda_annealing_limit = gae_lambda_annealing_limit + if use_gae_lambda_annealing: + self.gae_lambda_annealing_method: Callable[ + [int], float + ] = lambda step: eval(gae_lambda_annealing_method)( + step=step, **gae_lambda_annealing_method_params + ) + self.discount_factor = discount_factor + self.enable_tokenwise_logging = enable_tokenwise_logging + self.reward_normalizing_constant = reward_normalizing_constant + self.pg_loss_normalization = pg_loss_normalization + self.critic_loss_type = critic_loss_type + self.exploration_prompts_to_remove = exploration_prompts_to_remove + # Common containers used by all trainers + self.training_data: dict = {} + self.debug_path_list: list[str] = [] + self.policy_gradient_data = None + self.tally = Tally() + self.rollout_tally = RolloutTally() + self.tokenwise_tally: Union[ContextualizedTokenwiseTally, None] = None + self.filter_higher_refprob_tokens_kl = filter_higher_refprob_tokens_kl 
def mask_non_restricted_token_logits(self, logits: torch.Tensor) -> torch.Tensor:
    """
    Restrict logits to the tokens listed in ``self.restrict_tokens`` plus EOS.

    Every vocabulary entry that is neither the first token id of a restricted
    token string nor the tokenizer's EOS id is set to ``-inf``, which removes
    it from any subsequent softmax. When ``self.restrict_tokens`` is ``None``
    the input tensor is returned untouched.

    Args:
        logits (torch.Tensor): Logits whose last dimension indexes the
            vocabulary, e.g. shape (B, S, V).

    Returns:
        torch.Tensor: Logits of the same shape and dtype with disallowed
        vocabulary entries replaced by ``-inf``.

    Raises:
        ValueError: If a restricted token string tokenizes to no ids.
    """
    if self.restrict_tokens is None:
        return logits

    allowed_token_ids = []
    for token in self.restrict_tokens:
        token_ids = self.tokenizer(token, add_special_tokens=False)["input_ids"]
        if not token_ids:
            raise ValueError(
                f"Restricted token {token!r} does not map to any token id."
            )
        # Only the first sub-token of each restricted string is kept active.
        allowed_token_ids.append(token_ids[0])
    # EOS must always stay active so that generation can terminate.
    allowed_token_ids.append(self.tokenizer.eos_token_id)

    # A vocabulary-sized mask broadcasts over all leading dimensions, so no
    # logits-sized temporary has to be allocated.
    keep = torch.zeros(logits.shape[-1], dtype=torch.bool, device=logits.device)
    keep[
        torch.tensor(allowed_token_ids, dtype=torch.long, device=logits.device)
    ] = True

    # masked_fill is out of place: the caller's tensor is not modified and
    # gradients flow only through the kept entries.
    return logits.masked_fill(~keep, float("-inf"))
+ action_masks (list[torch.Tensor]): List of action mask tensors for each rollout. + """ + with resource_logger_context(logger, "Apply reinforce step"): + self.policy.train() + mb_size = self.mini_batch_size + nb_rollouts = len(training_batch) + + # Initialize running mean logs + running_mean_logs = { + "rl_objective": 0.0, + "policy_gradient_loss": 0.0, + "policy_gradient_norm": 0.0, + "log_probs": 0.0, + "credits": 0.0, + "entropy": 0.0, + "engine_log_probs_diff_clampfrac": 0.0, + "tis_imp_ratio": 0.0, + "ref_log_probs_diff_clampfrac": 0.0, + "higher_refprob_frac": 0.0, + "tis_imp_ratio_clampfrac": 0.0, + } + if self.entropy_coeff != 0.0: + running_mean_logs["entropy"] = 0.0 + if self.kl_coeff != 0.0: + running_mean_logs["kl_divergence"] = 0.0 + + # Get total number of tokens generated + total_tokens_generated = 0 + for att_mask in training_batch.batch_action_mask: + total_tokens_generated += att_mask.sum() + + # Obtain loss normalization + if self.pg_loss_normalization == "nb_tokens": + normalization_factor = total_tokens_generated + elif self.pg_loss_normalization == "batch": + normalization_factor = np.ceil(nb_rollouts / mb_size).astype(int) + else: + raise ValueError( + f"Invalid pg_loss_normalization: {self.pg_loss_normalization}" + ) + + # Gradient accumulation for each mini-batch + for mb in range(0, nb_rollouts, mb_size): + logger.info(f"Processing mini-batch {mb} of {nb_rollouts}") + loss = 0.0 + training_mb = training_batch[mb : mb + mb_size] + training_mb = training_mb.get_padded_tensors() + training_mb.to(self.device) + ( + tokens_mb, + action_mask_mb, + entropy_mask_mb, + credits_mb, + engine_log_probs_mb, + timesteps_mb, + ) = ( + training_mb.batch_input_ids, + training_mb.batch_action_mask, + training_mb.batch_entropy_mask, + training_mb.batch_credits, + training_mb.batch_engine_log_probs, + training_mb.batch_timesteps, + ) + + # Next token prediction + contexts_mb = tokens_mb[:, :-1] + shifted_contexts_mb = tokens_mb[:, 1:] + action_mask_mb = 
action_mask_mb[:, 1:] + entropy_mask_mb = entropy_mask_mb[:, 1:] + credits_mb = credits_mb[:, 1:] + engine_log_probs_mb = engine_log_probs_mb[:, 1:] + timesteps_mb = timesteps_mb[:, 1:] + + if self.enable_tokenwise_logging: + self.tokenwise_tally.set_action_mask(action_mask=action_mask_mb) + self.tokenwise_tally.set_range(range=(mb, mb + mb_size)) + self.tokenwise_tally.add_contexts(contexts=contexts_mb) + self.tokenwise_tally.add_data( + metric_id="next_token", + metrics=shifted_contexts_mb, + to_tids=True, + ) + self.tokenwise_tally.add_data( + metric_id="entropy_mask", + metrics=entropy_mask_mb, + ) + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="next_token_credit", metrics=credits_mb + ) + + # Forward pass + cast to FP-32 for higher prec. Causal LM attention masks are implicit; + # wire up a custom mask here only if the policy deviates from standard autoregressive behavior. + logits = self.policy(input_ids=contexts_mb)[0] # (B, S, V) + + # Mask non-restricted tokens + if self.restrict_tokens is not None: + logits = self.mask_non_restricted_token_logits(logits) + + logits /= self.temperature # (B, S, V) + + # Compute new log probabilities + log_probs = F.log_softmax(logits, dim=-1) # (B, S, V) + + # Get log probabilities of actions taken during rollouts + action_log_probs = log_probs.gather( + dim=-1, index=shifted_contexts_mb.unsqueeze(-1) + ).squeeze( + -1 + ) # (B, S) + if self.pg_loss_normalization == "batch": + den_running_mean = action_mask_mb.sum() * normalization_factor + else: + den_running_mean = normalization_factor + running_mean_logs["log_probs"] += ( + action_log_probs * action_mask_mb + ).sum().item() / den_running_mean + running_mean_logs["credits"] += ( + credits_mb * action_mask_mb + ).sum().item() / den_running_mean + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="next_token_log_prob", + metrics=action_log_probs, + ) + self.tokenwise_tally.add_data( + 
metric_id="engine_next_token_log_prob", + metrics=engine_log_probs_mb, + ) + self.tokenwise_tally.add_data( + metric_id="next_token_prob", + metrics=torch.exp(action_log_probs), + ) + top_k_indices = torch.topk(logits, k=5, dim=-1).indices + self.tokenwise_tally.add_data( + metric_id=f"top_{5}_tids", + metrics=top_k_indices, + to_tids=True, + ) + self.tokenwise_tally.add_data( + metric_id=f"top_{5}_probs", + metrics=torch.exp(log_probs).gather( + dim=-1, index=top_k_indices + ), + ) + + rewarded_action_log_probs = ( + action_mask_mb * credits_mb * action_log_probs + ) + # (B, S) + INVALID_LOGPROB = 1.0 + CLAMP_VALUE = 40.0 + masked_action_log_probs = torch.masked_fill( + action_log_probs, ~action_mask_mb, INVALID_LOGPROB + ) + masked_engine_log_probs = torch.masked_fill( + engine_log_probs_mb, ~action_mask_mb, INVALID_LOGPROB + ) + with torch.no_grad(): + action_engine_log_probs_diff = ( + masked_action_log_probs - masked_engine_log_probs + ).clamp(-CLAMP_VALUE, CLAMP_VALUE) + running_mean_logs["engine_log_probs_diff_clampfrac"] += ( + action_engine_log_probs_diff.abs() + .eq(CLAMP_VALUE) + .float() + .sum() + .item() + / den_running_mean + ) + if self.importance_sampling_strategy == "per_sequence": + tis_imp_ratio = torch.zeros_like(action_engine_log_probs_diff) + for mb_idx in range(action_engine_log_probs_diff.shape[0]): + valid_token_mask = action_mask_mb[mb_idx] + timestep_ids = timesteps_mb[mb_idx][valid_token_mask] + timestep_logprob_diffs = action_engine_log_probs_diff[mb_idx][ + valid_token_mask + ] + max_timestep = int(timestep_ids.max().item()) + 1 + timestep_sums = torch.zeros( + max_timestep, + device=action_engine_log_probs_diff.device, + dtype=action_engine_log_probs_diff.dtype, + ) + timestep_sums.scatter_add_( + 0, timestep_ids, timestep_logprob_diffs + ) + timestep_ratios = torch.exp(timestep_sums) + tis_imp_ratio[ + mb_idx, valid_token_mask + ] = timestep_ratios.gather(0, timestep_ids) + else: + tis_imp_ratio = 
torch.exp(action_engine_log_probs_diff) + running_mean_logs["tis_imp_ratio"] += ( + tis_imp_ratio * action_mask_mb + ).sum().item() / den_running_mean + if self.truncated_importance_sampling_ratio_cap > 0.0: + tis_imp_ratio = torch.clamp( + tis_imp_ratio, max=self.truncated_importance_sampling_ratio_cap + ) + running_mean_logs["tis_imp_ratio_clampfrac"] += ( + tis_imp_ratio.eq(self.truncated_importance_sampling_ratio_cap) + .float() + .sum() + .item() + ) / den_running_mean + rewarded_action_log_probs = ( + rewarded_action_log_probs * tis_imp_ratio + ) + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="next_token_clogπ", + metrics=rewarded_action_log_probs, + ) + + # Add value term to loss + if self.pg_loss_normalization == "batch": + nb_act_tokens = action_mask_mb.sum() + mb_value = -rewarded_action_log_probs.sum() / nb_act_tokens + else: + mb_value = -rewarded_action_log_probs.sum() + + loss += mb_value + running_mean_logs["rl_objective"] += mb_value.item() / den_running_mean + + # ------------------------------------------------- + # Entropy Regularization + # ------------------------------------------------- + # Only apply entropy on distribution defined over most probable tokens + if self.entropy_topk is not None: + top_k_indices = torch.topk( + logits, k=self.entropy_topk, dim=-1 + ).indices + entropy_logits = logits.gather(dim=-1, index=top_k_indices) + else: + entropy_logits = logits + + token_entropy_terms = -F.softmax( + entropy_logits, dim=-1 + ) * F.log_softmax( + entropy_logits, dim=-1 + ) # (B, S, T) + token_entropy_terms *= ( + action_mask_mb[:, :, None] * entropy_mask_mb[:, :, None] + ) # only get loss on specific action tokens + + mb_entropy = token_entropy_terms.sum(dim=-1) + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="entropy", + metrics=mb_entropy, + ) + if self.pg_loss_normalization == "batch": + nb_act_tokens = action_mask_mb.sum() + mb_entropy = -mb_entropy.sum() / 
nb_act_tokens + else: + mb_entropy = -mb_entropy.sum() + running_mean_logs["entropy"] += -mb_entropy.item() / den_running_mean + if self.entropy_coeff != 0.0: + mb_entropy *= self.entropy_coeff + loss += mb_entropy + + # ------------------------------------------------- + # KL-DIVERGENCE + # ------------------------------------------------- + if self.kl_coeff != 0.0: + ref_model_logits = self.policy.get_base_model_logits(contexts_mb) + ref_model_logits = ref_model_logits / self.temperature + # (B, S, V) + ref_model_logits = self.mask_non_restricted_token_logits( + logits=ref_model_logits + ) + # (B, S, V) + ref_model_log_probs = F.log_softmax(ref_model_logits, dim=-1) + # (B, S, V) + ref_model_action_log_probs = ref_model_log_probs.gather( + dim=-1, index=shifted_contexts_mb.unsqueeze(-1) + ).squeeze( + -1 + ) # (B,S) + # Approximating KL Divergence (see refs in docstring) + # Ref 1: http://joschu.net/blog/kl-approx.html + # Ref 2: https://github.dev/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py#L1332 + masked_ref_model_action_log_probs = torch.masked_fill( + ref_model_action_log_probs, ~action_mask_mb, INVALID_LOGPROB + ) + action_log_probs_diff = ( + masked_ref_model_action_log_probs - masked_action_log_probs + ).clamp(-CLAMP_VALUE, CLAMP_VALUE) + running_mean_logs["ref_log_probs_diff_clampfrac"] += ( + action_log_probs_diff.abs().eq(CLAMP_VALUE).float().sum().item() + / den_running_mean + ) + if self.filter_higher_refprob_tokens_kl: + higher_refprob_tokens_mask = action_log_probs_diff > 0.0 + running_mean_logs["higher_refprob_frac"] += ( + higher_refprob_tokens_mask.sum().item() / den_running_mean + ) + action_log_probs_diff = action_log_probs_diff * ( + ~higher_refprob_tokens_mask + ) + kl_div = torch.expm1(action_log_probs_diff) - action_log_probs_diff + kl_div *= action_mask_mb # We only care about KLD of action tokens + if self.truncated_importance_sampling_ratio_cap > 0.0: + kl_div = kl_div * tis_imp_ratio + kl_div *= self.kl_coeff + if 
def get_advantages_with_critic_gradient_accumulation(
    self, trajectories: TrajectoryBatch, critic_loss_scaling_factor: float = 2.0
) -> list[torch.FloatTensor]:
    """
    Compute per-rollout advantages, accumulating critic gradients on the way.

    With ``self.use_gae`` the critic is evaluated mini-batch by mini-batch,
    GAE advantages are computed from its detached value estimates and, unless
    the agent id contains ``"buffer"``, the critic loss is back-propagated
    (the optimizer step itself is not taken here). Without GAE, discounted
    Monte Carlo returns are used, optionally turned into RLOO credits.
    Advantages are optionally whitened at the end, and the annealing step
    counter is incremented once per call.

    Args:
        trajectories (TrajectoryBatch): Rollouts of a single agent; only the
            first entry of ``agent_ids`` is inspected.
        critic_loss_scaling_factor (float): Extra divisor applied to the
            critic loss (the original note attributes it to one critic being
            trained on two agents' data).

    Returns:
        list[torch.FloatTensor]: One 1-D advantage tensor per rollout, cut
        back to that rollout's own number of time steps.

    Raises:
        ValueError: If value estimates and rewards disagree on the number of
            time steps, or if ``self.critic_loss_type`` is neither ``"mse"``
            nor ``"huber"``.
    """

    mb_size = self.mini_batch_size
    batch_size = trajectories.rollout_ids.shape[0]
    agent_id = trajectories.agent_ids[0]
    batch_rewards = trajectories.batch_rewards

    ######################################
    # use critic for advantage estimation
    ######################################
    if self.use_gae:
        # Buffer agents only read the critic; every other agent trains it.
        if "buffer" in agent_id:
            self.critic.eval()
            training = False
        else:
            self.critic.train()
            training = True
        advantages = []
        # Average the loss over mini-batches, then apply the extra scaling.
        normalization_factor = (
            np.ceil(batch_size / mb_size).astype(int) * critic_loss_scaling_factor
        )
        # For each minibatch
        for mb in range(0, batch_size, mb_size):
            trajectory_mb = trajectories[mb : mb + mb_size]
            trajectory_mb.to(self.device)
            rewards_mb = trajectory_mb.batch_rewards
            (
                tokens_mb,
                state_ends_mask_mb,
                timestep_counts,
            ) = trajectory_mb.get_padded_tensors_for_critic()
            # Skip autograd bookkeeping when the critic is not being trained.
            if training:
                vals_estimate_full = self.critic(tokens_mb)
            else:
                with torch.no_grad():
                    vals_estimate_full = self.critic(tokens_mb)

            # Select only positions where states end, per sample -> list of (jT,)
            B = tokens_mb.shape[0]
            vals_list = [
                vals_estimate_full[b][state_ends_mask_mb[b]] for b in range(B)
            ]

            # Pad to (B, max_jT) = (B, S)
            vals_estimate_mb = pad_sequence(
                vals_list, batch_first=True, padding_value=0.0
            )
            dtype = vals_estimate_mb.dtype
            rewards_mb = pad_sequence(
                rewards_mb, batch_first=True, padding_value=0.0
            ).to(
                dtype=dtype
            )  # (B, S)
            self.rollout_tally.add_metric(
                path=["batch_rewards"],
                rollout_tally_item=RolloutTallyItem(
                    crn_ids=trajectory_mb.crn_ids,
                    rollout_ids=trajectory_mb.rollout_ids,
                    agent_ids=trajectory_mb.agent_ids,
                    metric_matrix=rewards_mb,
                ),
            )
            if self.reward_normalizing_constant != 1.0:
                # Out of place on purpose: the tensor above was just handed to
                # the tally, which may keep a reference rather than a copy
                # (NOTE(review): confirm against RolloutTallyItem).
                rewards_mb = rewards_mb / self.reward_normalizing_constant

            det_vals_estimate_mb = vals_estimate_mb.detach()  # (B, max_jT)
            self.rollout_tally.add_metric(
                path=["mb_value_estimates_critic"],
                rollout_tally_item=RolloutTallyItem(
                    crn_ids=trajectory_mb.crn_ids,
                    rollout_ids=trajectory_mb.rollout_ids,
                    agent_ids=trajectory_mb.agent_ids,
                    metric_matrix=det_vals_estimate_mb,
                ),
            )

            # Append a 0 value to the end of the value estimates
            if det_vals_estimate_mb.shape[1] != rewards_mb.shape[1]:
                raise ValueError(
                    "Incompatible shapes for value estimates and rewards."
                )
            Bsize = det_vals_estimate_mb.shape[0]
            device = det_vals_estimate_mb.device
            dtype = det_vals_estimate_mb.dtype
            det_vals_estimate_mb = torch.cat(
                [
                    det_vals_estimate_mb,
                    torch.zeros((Bsize, 1), device=device, dtype=dtype),
                ],
                dim=1,
            )  # (B, max_jT+1)

            # Get annealed lambda
            if self.use_gae_lambda_annealing:
                annealing_constant = self.gae_lambda_annealing_method(
                    step=self.trainer_annealing_state.annealing_step_counter
                )
                annealed_lambda = (
                    self.gae_lambda_annealing_limit * annealing_constant
                )
                self.tally.add_metric(
                    path="annealed_lambda", metric=annealed_lambda
                )
            else:
                annealed_lambda = self.gae_lambda_annealing_limit

            # Get GAE advantages
            gae_advantages = get_generalized_advantage_estimates(
                rewards=rewards_mb,
                value_estimates=det_vals_estimate_mb,
                discount_factor=self.discount_factor,
                lambda_coef=annealed_lambda,
            )  # (B, max_jT)
            self.rollout_tally.add_metric(
                path=["mb_gae_advantages"],
                rollout_tally_item=RolloutTallyItem(
                    crn_ids=trajectory_mb.crn_ids,
                    rollout_ids=trajectory_mb.rollout_ids,
                    agent_ids=trajectory_mb.agent_ids,
                    metric_matrix=gae_advantages,
                ),
            )
            if training:
                # Regression target: A(s, a, b) + V(s) = Q(s, a, b), built from
                # detached values so gradients only flow via vals_estimate_mb.
                targets = (
                    gae_advantages.to(dtype=dtype) + det_vals_estimate_mb[:, :-1]
                )  # (B, max_jT)
                self.rollout_tally.add_metric(
                    path=["mb_targets_critic"],
                    rollout_tally_item=RolloutTallyItem(
                        crn_ids=trajectory_mb.crn_ids,
                        rollout_ids=trajectory_mb.rollout_ids,
                        agent_ids=trajectory_mb.agent_ids,
                        metric_matrix=targets,
                    ),
                )
                if self.critic_loss_type == "mse":
                    loss = F.mse_loss(
                        input=vals_estimate_mb,
                        target=targets,
                    )
                elif self.critic_loss_type == "huber":
                    loss = F.huber_loss(
                        input=vals_estimate_mb,
                        target=targets,
                    )
                else:
                    # Previously `loss` stayed unbound here and the next line
                    # failed with an unrelated UnboundLocalError.
                    raise ValueError(
                        f"Invalid critic_loss_type: {self.critic_loss_type}"
                    )
                self.tally.add_metric(path=["mb_critic_loss"], metric=loss.item())
                # Accumulate gradient
                loss /= normalization_factor
                self.accelerator.backward(loss)
                # Drop references early so memory can be reclaimed.
                del loss
                del targets
                del vals_estimate_mb
                del trajectory_mb
                del vals_estimate_full

            # Get jagged back using timestep_counts
            advantages.extend(
                [gae_advantages[i, : timestep_counts[i]] for i in range(B)]
            )

    ######################################
    # use exclusively Monte Carlo returns & rloo for advantage estimation
    ######################################
    else:
        lengths = [len(c) for c in batch_rewards]
        padded_rewards = pad_sequence(
            batch_rewards, batch_first=True, padding_value=0.0
        )
        self.rollout_tally.add_metric(
            path=["mb_rewards"],
            rollout_tally_item=RolloutTallyItem(
                crn_ids=trajectories.crn_ids,
                rollout_ids=trajectories.rollout_ids,
                agent_ids=trajectories.agent_ids,
                metric_matrix=padded_rewards,
            ),
        )
        if self.reward_normalizing_constant != 1.0:
            # Out of place for the same reason as in the GAE branch.
            padded_rewards = padded_rewards / self.reward_normalizing_constant
        padded_advantages = get_discounted_returns(
            rewards=padded_rewards,
            discount_factor=self.discount_factor,
        )  # no baseline for now
        if self.use_rloo:
            # Repeated crn ids mean several rollouts share a common random
            # number id; RLOO is then applied within each such group.
            is_grouped_by_rng = (
                trajectories.crn_ids.unique().shape[0]
                != trajectories.crn_ids.shape[0]
            )
            if is_grouped_by_rng and not self.no_rloo_grouping:
                for crn_id in trajectories.crn_ids.unique():
                    rng_mask = trajectories.crn_ids == crn_id
                    rng_advantages = padded_advantages[rng_mask]
                    rng_advantages, _ = get_rloo_credits(credits=rng_advantages)
                    padded_advantages[rng_mask] = rng_advantages
            else:
                padded_advantages, _ = get_rloo_credits(credits=padded_advantages)
            self.rollout_tally.add_metric(
                path=["mb_rloo_advantages"],
                rollout_tally_item=RolloutTallyItem(
                    crn_ids=trajectories.crn_ids,
                    rollout_ids=trajectories.rollout_ids,
                    agent_ids=trajectories.agent_ids,
                    metric_matrix=padded_advantages,
                ),
            )
        advantages = [
            padded_advantages[i, : lengths[i]]
            for i in range(padded_advantages.shape[0])
        ]

    if self.whiten_advantages_time_step_wise or self.whiten_advantages:
        lengths = [len(c) for c in advantages]
        padded_advantages = pad_sequence(
            advantages, batch_first=True, padding_value=0.0
        )
        # Time-step-wise whitening takes precedence when both flags are set.
        if self.whiten_advantages_time_step_wise:
            whitened_padded_advantages = whiten_advantages_time_step_wise(
                padded_advantages
            )
            path = ["mb_whitened_advantages_time_step_wise"]
        elif self.whiten_advantages:
            whitened_padded_advantages = whiten_advantages(padded_advantages)
            path = ["mb_whitened_advantages"]
        self.rollout_tally.add_metric(
            path=path,
            rollout_tally_item=RolloutTallyItem(
                crn_ids=trajectories.crn_ids,
                rollout_ids=trajectories.rollout_ids,
                agent_ids=trajectories.agent_ids,
                metric_matrix=whitened_padded_advantages,
            ),
        )
        # Cut the padding off again to return jagged per-rollout tensors.
        advantages = [
            whitened_padded_advantages[i, : lengths[i]]
            for i in range(whitened_padded_advantages.shape[0])
        ]

    self.trainer_annealing_state.annealing_step_counter += 1

    return advantages
def set_policy_gradient_data(self, agent_ids: list[str]) -> None:
    """
    Assemble ``self.policy_gradient_data`` from the cached per-agent batches.

    For each id in ``agent_ids`` the trajectory batch stored in
    ``self.training_data`` is converted into a ``TrainingBatch`` whose credits
    come from ``get_tokenwise_credits``; the batches are merged in the given
    order. Afterwards ``self.training_data`` is emptied and a fresh tokenwise
    tally is created from the tokenizer and the collected debug paths.
    """
    # Start from scratch so that batches from an earlier call cannot leak in.
    self.policy_gradient_data = None

    for agent_id in agent_ids:
        assert "buffer" not in agent_id, "Buffer agents do not train policy"
        source_batch = self.training_data[agent_id]
        agent_training_batch = TrainingBatch(
            rollout_ids=source_batch.rollout_ids,
            batch_input_ids=source_batch.batch_input_ids,
            batch_action_mask=source_batch.batch_action_mask,
            batch_entropy_mask=source_batch.batch_entropy_mask,
            batch_credits=get_tokenwise_credits(
                batch_timesteps=source_batch.batch_timesteps,
                batch_credits=source_batch.batch_credits,
            ),
            batch_engine_log_probs=source_batch.batch_engine_log_probs,
            batch_timesteps=source_batch.batch_timesteps,
        )

        # Later agents are appended to the batch created by the first one.
        if self.policy_gradient_data is not None:
            self.policy_gradient_data.append(agent_training_batch)
            continue
        self.policy_gradient_data = agent_training_batch

    # The per-agent cache has been consumed.
    self.training_data = {}
    self.tokenwise_tally = ContextualizedTokenwiseTally(
        tokenizer=self.tokenizer,
        paths=self.debug_path_list,
    )
def export_training_tally(self, identifier: str, folder: str) -> None:
    """
    Save the collected training metrics to ``folder`` and reset the tallies.

    The scalar tally and the rollout tally are saved under ``identifier``.
    The tokenwise tally is written to ``<folder>/<identifier>_tokenwise.csv``
    when one exists. All tallies and the debug path list are cleared
    afterwards.

    Args:
        identifier (str): Name used for the saved metric files.
        folder (str): Destination directory; created if it does not exist.
    """
    os.makedirs(folder, exist_ok=True)
    self.tally.save(identifier=identifier, folder=folder)
    # The tokenwise tally is None until policy-gradient data has been set and
    # again after every export, so it has to be checked before saving.
    if self.tokenwise_tally is not None:
        self.tokenwise_tally.save(
            path=os.path.join(folder, f"{identifier}_tokenwise.csv")
        )
    self.rollout_tally.save(identifier=identifier, folder=folder)

    # Reset everything so the next iteration starts with empty metrics.
    self.tally.reset()
    self.tokenwise_tally = None
    self.rollout_tally.reset()
    self.debug_path_list = []
+ """ + with open(self.trainer_annealing_state_path, "wb") as f: + pickle.dump(self.trainer_annealing_state, f) + logger.info(f"Saved trainer state to {self.trainer_annealing_state_path}") + + def export_trainer_states(self) -> None: + """ + Saves the trainer states. + """ + self.export_optimizer_states() + self.export_trainer_annealing_state() diff --git a/src_code_for_reproducibility/training/trainer_independent.py b/src_code_for_reproducibility/training/trainer_independent.py new file mode 100644 index 0000000000000000000000000000000000000000..807c69b3e903024028d8255ae3e79eb6537e609b --- /dev/null +++ b/src_code_for_reproducibility/training/trainer_independent.py @@ -0,0 +1,159 @@ +""" +File: mllm/training/trainer_independent.py +Summary: Trainer for independently optimizing each agent. +""" + +import logging +import os +import sys +from typing import Union + +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from pandas._libs.tslibs.offsets import CBMonthBegin +from peft import LoraConfig +from torch.nn.utils.rnn import pad_sequence +from transformers import AutoModelForCausalLM, AutoTokenizer + +from mllm.markov_games.rollout_tree import * +from mllm.markov_games.rollout_tree import RolloutTreeRootNode +from mllm.training.credit_methods import ( + get_discounted_returns, + get_discounted_state_visitation_credits, + get_generalized_advantage_estimates, + get_rloo_credits, +) +from mllm.training.tally_metrics import Tally +from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally +from mllm.training.tokenize_chats import * +from mllm.training.tokenize_chats import process_training_chat +from mllm.training.trainer_common import BaseTrainer +from mllm.training.training_data_utils import * +from mllm.training.training_data_utils import ( + TrainingBatch, + TrajectoryBatch, + get_tokenwise_credits, +) +from mllm.utils.resource_context import resource_logger_context + +logger = logging.getLogger(__name__) 
class TrainerNaive(BaseTrainer):
    """Trainer whose agents each learn from their own advantages only."""

    def set_agent_trajectory_data(
        self, agent_id: str, roots: list[RolloutTreeRootNode]
    ) -> None:
        """
        Tokenize the rollouts of ``agent_id`` and cache them with their advantages.

        The result is stored in ``self.training_data[agent_id]`` as a
        ``TrainingData`` holding the trajectory batch and one advantage tensor
        per rollout.
        """
        # A previously assembled policy-gradient batch is dropped whenever
        # new agent data comes in.
        self.policy_gradient_data = None

        rollout_ids = []
        crn_ids = []  # common random number ids
        # One list per TrajectoryBatch field, filled rollout by rollout.
        field_names = (
            "batch_input_ids",
            "batch_action_mask",
            "batch_entropy_mask",
            "batch_timesteps",
            "batch_state_ends_mask",
            "batch_engine_log_probs",
            "batch_rewards",
        )
        per_rollout = {field_name: [] for field_name in field_names}

        for root in roots:
            rollout_id = root.id
            self.debug_path_list.append(
                "mgid:" + str(rollout_id) + "_agent_id:" + agent_id
            )
            rollout_ids.append(rollout_id)
            crn_ids.append(root.crn_id)

            chat, rewards = get_main_chat_list_and_rewards(
                agent_id=agent_id, root=root
            )
            (
                input_ids,
                action_mask,
                entropy_mask,
                timesteps,
                state_ends_mask,
                engine_log_probs,
            ) = process_training_chat(
                tokenizer=self.tokenizer,
                chat_history=chat,
                entropy_mask_regex=self.entropy_mask_regex,
                exploration_prompts_to_remove=self.exploration_prompts_to_remove,
            )
            rollout_values = (
                input_ids,
                action_mask,
                entropy_mask,
                timesteps,
                state_ends_mask,
                engine_log_probs,
                rewards,
            )
            for field_name, value in zip(field_names, rollout_values):
                per_rollout[field_name].append(value)

        trajectory_batch = TrajectoryBatch(
            rollout_ids=torch.tensor(rollout_ids, dtype=torch.int32),
            crn_ids=torch.tensor(crn_ids, dtype=torch.int32),
            agent_ids=[agent_id] * len(rollout_ids),
            **per_rollout,
        )

        # Advantages (this call also accumulates critic gradients).
        batch_advantages = self.get_advantages_with_critic_gradient_accumulation(
            trajectory_batch
        )

        # Discount state visitation unless explicitly skipped; the helper is
        # fed a batch of one, hence the unsqueeze/squeeze pair.
        if not self.skip_discounted_state_visitation:
            for index, rollout_advantages in enumerate(batch_advantages):
                batch_advantages[index] = get_discounted_state_visitation_credits(
                    rollout_advantages.unsqueeze(0),
                    self.discount_factor,
                ).squeeze(0)

        self.training_data[agent_id] = TrainingData(
            agent_id=agent_id,
            main_data=trajectory_batch,
            main_advantages=batch_advantages,
        )

    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]):
        """
        Finalize the cached data using each agent's own advantages.

        ``advantage_packets`` is not read: every cached ``TrainingData`` entry
        is replaced by its trajectory batch, whose ``batch_credits`` is set to
        the agent's own advantages.
        """
        for agent_id, cached in self.training_data.items():
            trajectory_batch = cached.main_data
            self.training_data[agent_id] = trajectory_batch
            trajectory_batch.batch_credits = cached.main_advantages

    def share_advantage_data(self) -> list[AdvantagePacket]:
        """
        Package the cached advantages of every agent for other trainers.

        Returns:
            list[AdvantagePacket]: One packet per cached agent with its id,
            rollout ids and advantages.
        """
        logger.info("Sharing advantage data.")
        return [
            AdvantagePacket(
                agent_id=agent_id,
                rollout_ids=cached.main_data.rollout_ids,
                main_advantages=cached.main_advantages,
            )
            for agent_id, cached in self.training_data.items()
        ]
+ get_tokenwise_credits, +) +from mllm.utils.resource_context import resource_logger_context + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +class TrainerSumRewards(TrainerNaive): + def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]): + """Sum peer advantages onto this agent's advantages to optimize joint reward.""" + logger.info(f"Receiving advantage packets.") + + assert ( + len(advantage_packets) > 0 + ), "At least one advantage packet must be provided." + + for agent_id, agent_data in self.training_data.items(): + coagent_advantage_packets = [ + packet for packet in advantage_packets if packet.agent_id != agent_id + ] + agent_rollout_ids = agent_data.main_data.rollout_ids + agent_advantages = agent_data.main_advantages + co_agent_advantages = [] + for rollout_id in agent_rollout_ids: + for co_agent_packet in coagent_advantage_packets: + if rollout_id in co_agent_packet.rollout_ids: + index = torch.where(rollout_id == co_agent_packet.rollout_ids)[ + 0 + ].item() + co_agent_advantages.append( + co_agent_packet.main_advantages[index] + ) + # assumes that its two player game, with one co-agent + break + assert len(co_agent_advantages) == len(agent_advantages) + B = len(agent_advantages) + assert all( + a.shape[0] == b.shape[0] + for a, b in zip(co_agent_advantages, agent_advantages) + ), "Number of advantages must match in order to sum them up." 
+ + # Get padded tensors (advantage alignment is invariant to padding) + lengths = torch.tensor( + [len(t) for t in agent_advantages], + device=self.device, + dtype=torch.long, + ) + padded_main_advantages = pad_sequence( + agent_advantages, batch_first=True, padding_value=0.0 + ) + + padded_co_agent_advantages = pad_sequence( + co_agent_advantages, batch_first=True, padding_value=0.0 + ) + + # Create training batch data + sum_of_ad_credits = padded_main_advantages + padded_co_agent_advantages + self.rollout_tally.add_metric( + path=["sum_of_ad_credits"], + rollout_tally_item=RolloutTallyItem( + crn_ids=agent_data.main_data.crn_ids, + rollout_ids=agent_data.main_data.rollout_ids, + agent_ids=agent_data.main_data.agent_ids, + metric_matrix=sum_of_ad_credits, + ), + ) + + if not self.skip_discounted_state_visitation: + sum_of_ad_credits = get_discounted_state_visitation_credits( + sum_of_ad_credits, + self.discount_factor, + ) + self.rollout_tally.add_metric( + path=["discounted_state_visitation_credits"], + rollout_tally_item=RolloutTallyItem( + crn_ids=agent_data.main_data.crn_ids, + rollout_ids=agent_data.main_data.rollout_ids, + agent_ids=agent_data.main_data.agent_ids, + metric_matrix=sub_tensors[ + "discounted_state_visitation_credits" + ], + ), + ) + + # Slice back to jagged and convert to tokenwise credits + sum_of_ad_credits = [sum_of_ad_credits[i, : lengths[i]] for i in range(B)] + self.training_data[agent_id] = agent_data.main_data + self.training_data[agent_id].batch_credits = sum_of_ad_credits diff --git a/src_code_for_reproducibility/training/training_data_utils.py b/src_code_for_reproducibility/training/training_data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2685ee93b27a1208c9ee36a3605bd3321f4fe9fc --- /dev/null +++ b/src_code_for_reproducibility/training/training_data_utils.py @@ -0,0 +1,395 @@ +""" +File: mllm/training/training_data_utils.py +Summary: Utilities for loading, filtering, and batching training data. 
+""" + +from dataclasses import dataclass +from typing import Literal, Optional, Tuple + +import torch +from torch.nn.utils.rnn import pad_sequence + +from mllm.markov_games.rollout_tree import ( + ChatTurn, + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, +) + + +@dataclass +class AdvantagePacket: + """Message used by trainers to share per-rollout advantages.""" + + agent_id: str + rollout_ids: torch.IntTensor # (B,) + # list-of-tensors + main_advantages: list[torch.FloatTensor] + + +class TrainingChatTurn: + """ + Lightweight ChatTurn variant that records the timestep index alongside role/content. + """ + + def __init__( + self, + time_step: int, + role: str, + agent_id: str, + content: str, + chat_template_token_ids: list[int], + reasoning_content: str, + is_state_end: bool, + out_token_ids: Optional[list[int]] = None, + log_probs: Optional[list[float]] = None, + ) -> None: + self.time_step = time_step + self.role = role + self.agent_id = agent_id + self.content = content + self.chat_template_token_ids = chat_template_token_ids + self.reasoning_content = reasoning_content + self.is_state_end = is_state_end + self.out_token_ids = out_token_ids + self.log_probs = log_probs + + def dict(self): + return { + "time_step": self.time_step, + "role": self.role, + "agent_id": self.agent_id, + "content": self.content, + "chat_template_token_ids": self.chat_template_token_ids, + "reasoning_content": self.reasoning_content, + "is_state_end": self.is_state_end, + "out_token_ids": self.out_token_ids, + "log_probs": self.log_probs, + } + + +def get_main_chat_list_and_rewards( + agent_id: str, root: RolloutTreeRootNode | RolloutTreeNode +) -> Tuple[list[TrainingChatTurn], torch.FloatTensor]: + """ + This method traverses a rollout tree and returns a the list of ChatTurn + for an agent. If it encounters a branch node, it follows the main path. + """ + # Currently follows only the main branch; extend if side branches must be included. 
+ if isinstance(root, RolloutTreeRootNode): + current_node = root.child + else: + current_node = root + + chat = [] + rewards = [] + while current_node is not None: + if isinstance(current_node, RolloutTreeBranchNode): + current_node = current_node.main_child + reward: float = current_node.step_log.simulation_step_log.rewards[agent_id] + rewards.append(reward) + chat_turns: list[TrainingChatTurn] = current_node.step_log.action_logs[ + agent_id + ].chat_turns + chat_turns = [ + TrainingChatTurn(time_step=current_node.time_step, **turn.model_dump()) + for turn in chat_turns + ] + chat.extend(chat_turns) + current_node = current_node.child + return chat, torch.FloatTensor(rewards) + + +def get_tokenwise_credits( + # B := batch size, S := number of tokens / seq. length, T := number of states. `j` stands for jagged (see pytorch nested tensors.) + batch_timesteps: torch.IntTensor | torch.Tensor, # (B, jS), + batch_credits: torch.FloatTensor | torch.Tensor, # (B, jT) +) -> torch.FloatTensor | torch.Tensor: # (B, jS) + """ + Expand per-state credits so every token at that timestep receives the same value. + """ + # The explicit loops keep jagged tensor semantics simple; optimize later if profiling warrants it. + batch_token_credits = [] + for credits, timesteps in zip(batch_credits, batch_timesteps): + token_credits = torch.zeros_like( + timesteps, + dtype=credits.dtype, + device=timesteps.device, + ) + for idx, credit in enumerate(credits): + token_credits[timesteps == idx] = credit + batch_token_credits.append(token_credits) + return batch_token_credits + + +@dataclass +class TrajectoryBatch: + """ + Tensorized batch of trajectories using list-of-tensors for jagged dimensions. + """ + + # B := batch size, S := number of tokens / seq. length, T := number of states. 
+ rollout_ids: torch.IntTensor # (B,) + crn_ids: torch.IntTensor # (B,) + agent_ids: list[str] # (B,) + batch_input_ids: list[torch.LongTensor] # List[(jS,)] + batch_action_mask: list[torch.BoolTensor] # List[(jS,)] + batch_entropy_mask: list[torch.BoolTensor] # List[(jS,)] + batch_timesteps: list[torch.IntTensor] # List[(jS,)] + batch_state_ends_mask: list[torch.BoolTensor] # List[(jS,)] + batch_engine_log_probs: Optional[list[torch.FloatTensor]] # List[(jS,)] + batch_rewards: list[torch.FloatTensor] # List[(jT,)] + batch_credits: Optional[list[torch.FloatTensor]] = None # List[(jS,)] + + def __post_init__(self): + """ + Validate per-sample consistency. + """ + B = self.rollout_ids.shape[0] + assert ( + self.crn_ids.shape[0] == B + ), "RNG IDs must have length equal to batch size." + assert ( + len(self.agent_ids) == B + ), "agent_ids must have length equal to batch size." + assert ( + len(self.batch_input_ids) + == len(self.batch_action_mask) + == len(self.batch_entropy_mask) + == len(self.batch_timesteps) + == len(self.batch_state_ends_mask) + == len(self.batch_engine_log_probs) + == len(self.batch_rewards) + == B + ), "Jagged lists must all have length equal to batch size." + + for b in range(B): + nb_rewards = int(self.batch_rewards[b].shape[0]) + nb_timesteps = int(torch.max(self.batch_timesteps[b]).item()) + 1 + assert ( + nb_rewards == nb_timesteps + ), "Number of rewards and timesteps mismatch." + assert ( + self.batch_input_ids[b].shape[0] + == self.batch_action_mask[b].shape[0] + == self.batch_entropy_mask[b].shape[0] + == self.batch_engine_log_probs[b].shape[0] + == self.batch_timesteps[b].shape[0] + ), "Tensors must have the same shape along the jagged dimension." + assert ( + int(self.batch_state_ends_mask[b].sum()) + == self.batch_rewards[b].shape[0] + ), "Number of rewards must match number of state ends." + + """ + Entries: + Here, we ignore the batch dimension. + input_ids: + All of the tokens of both the user and the assistant, flattened. 
+ action_mask: + Set to true on the tokens of the assistant (tokens generated by the model). + timesteps: + Therefore, max(timesteps) = Ns - 1. + state_ends_idx: + Indices of the tokens at which state descriptions end. + rewards: + rewards[t] := R_t(s_t, a_t) + Example: + position: "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14" + input_ids: "U U U a a a U a U a a a U U U" (U := User, a := Assistant) + action_mask: "x x x ✓ ✓ ✓ x ✓ x ✓ ✓ ✓ x x x" + timestep: "0 0 0 0 0 0 1 1 1 1 1 1 2 2 2" + state_ends_dx: [2, 6, 14] + rewards: [r0, r1, r2] + """ + + def __getitem__(self, key) -> "TrajectoryBatch": + if isinstance(key, slice): + return TrajectoryBatch( + rollout_ids=self.rollout_ids.__getitem__(key), + crn_ids=self.crn_ids.__getitem__(key), + agent_ids=self.agent_ids[key], + batch_input_ids=self.batch_input_ids[key], + batch_action_mask=self.batch_action_mask[key], + batch_entropy_mask=self.batch_entropy_mask[key], + batch_timesteps=self.batch_timesteps[key], + batch_state_ends_mask=self.batch_state_ends_mask[key], + batch_engine_log_probs=self.batch_engine_log_probs[key], + batch_rewards=self.batch_rewards[key], + batch_credits=self.batch_credits[key] if self.batch_credits else None, + ) + + def __len__(self): + return len(self.batch_input_ids) + + def to(self, device): + self.rollout_ids = self.rollout_ids.to(device) + self.crn_ids = self.crn_ids.to(device) + self.batch_input_ids = [t.to(device) for t in self.batch_input_ids] + self.batch_action_mask = [t.to(device) for t in self.batch_action_mask] + self.batch_entropy_mask = [t.to(device) for t in self.batch_entropy_mask] + self.batch_timesteps = [t.to(device) for t in self.batch_timesteps] + self.batch_state_ends_mask = [t.to(device) for t in self.batch_state_ends_mask] + self.batch_engine_log_probs = [ + t.to(device) for t in self.batch_engine_log_probs + ] + self.batch_rewards = [t.to(device) for t in self.batch_rewards] + self.batch_credits = ( + [t.to(device) for t in self.batch_credits] if self.batch_credits else 
None + ) + + def get_padded_tensors_for_critic(self): + """ + Returns: + padded_batch_input_ids: (B, P) + padded_batch_state_ends_mask: (B, P) + timestep_counts: (B,) tensor of ints indicating number of states per sample + """ + padded_batch_input_ids = pad_sequence( + self.batch_input_ids, batch_first=True, padding_value=0 + ) + padded_batch_state_ends_mask = pad_sequence( + self.batch_state_ends_mask, batch_first=True, padding_value=0 + ).bool() + # number of states equals number of True in state_ends_mask + timestep_counts = torch.tensor( + [int(mask.sum().item()) for mask in self.batch_state_ends_mask], + device=padded_batch_input_ids.device, + dtype=torch.long, + ) + return padded_batch_input_ids, padded_batch_state_ends_mask, timestep_counts + + +timestep = int + + +@dataclass +class PaddedTensorTrainingBatch: + """Helper struct returned by ``TrainingBatch.get_padded_tensors``.""" + + batch_input_ids: torch.LongTensor | torch.Tensor + batch_action_mask: torch.BoolTensor | torch.Tensor + batch_entropy_mask: Optional[torch.BoolTensor | torch.Tensor] + batch_credits: torch.FloatTensor | torch.Tensor + batch_engine_log_probs: torch.FloatTensor | torch.Tensor + batch_timesteps: torch.IntTensor | torch.Tensor + + def __len__(self): + return self.batch_input_ids.shape[0] + + def to(self, device): + self.batch_input_ids = self.batch_input_ids.to(device) + self.batch_action_mask = self.batch_action_mask.to(device) + self.batch_entropy_mask = self.batch_entropy_mask.to(device) + self.batch_credits = self.batch_credits.to(device) + self.batch_engine_log_probs = self.batch_engine_log_probs.to(device) + self.batch_timesteps = self.batch_timesteps.to(device) + + +@dataclass +class TrainingBatch: + rollout_ids: torch.IntTensor | torch.Tensor # (B,) + batch_input_ids: list[torch.LongTensor] # List[(jS,)] + batch_action_mask: list[torch.BoolTensor] # List[(jS,)] + batch_entropy_mask: Optional[list[torch.BoolTensor]] # List[(jS,)] + batch_credits: list[torch.FloatTensor] # 
List[(jS,)] + batch_engine_log_probs: list[torch.FloatTensor] # List[(jS,)] + batch_timesteps: list[torch.IntTensor] # List[(jS,)] + + def __post_init__(self): + # Ensure batch dimension is present + assert ( + len(self.batch_input_ids) + == len(self.batch_action_mask) + == len(self.batch_entropy_mask) + == len(self.batch_credits) + == len(self.batch_engine_log_probs) + == len(self.batch_timesteps) + == self.rollout_ids.shape[0] + ), "Jagged lists must all have length equal to batch size." + for inp, mask, cred, engine_log_prob, timestep in zip( + self.batch_input_ids, + self.batch_action_mask, + self.batch_credits, + self.batch_engine_log_probs, + self.batch_timesteps, + ): + assert ( + inp.shape[0] + == mask.shape[0] + == cred.shape[0] + == engine_log_prob.shape[0] + == timestep.shape[0] + ), "Tensors must have the same shapes along the jagged dimension." + + def __getitem__(self, key) -> "TrainingBatch": + if isinstance(key, slice): + return TrainingBatch( + rollout_ids=self.rollout_ids.__getitem__(key), + batch_input_ids=self.batch_input_ids[key], + batch_action_mask=self.batch_action_mask[key], + batch_entropy_mask=self.batch_entropy_mask[key], + batch_credits=self.batch_credits[key], + batch_engine_log_probs=self.batch_engine_log_probs[key], + batch_timesteps=self.batch_timesteps[key], + ) + + def __len__(self): + return len(self.batch_input_ids) + + def to(self, device): + self.rollout_ids = self.rollout_ids.to(device) + self.batch_input_ids = [t.to(device) for t in self.batch_input_ids] + self.batch_action_mask = [t.to(device) for t in self.batch_action_mask] + self.batch_entropy_mask = [t.to(device) for t in self.batch_entropy_mask] + self.batch_credits = [t.to(device) for t in self.batch_credits] + self.batch_engine_log_probs = [ + t.to(device) for t in self.batch_engine_log_probs + ] + self.batch_timesteps = [t.to(device) for t in self.batch_timesteps] + + def get_padded_tensors(self, padding: float = 0.0): + """ + Materialize right-padded tensors so 
PyTorch ops can run on uniform shapes. + """ + padded_batch_input_ids = pad_sequence( + self.batch_input_ids, batch_first=True, padding_value=int(padding) + ) + padded_batch_action_mask = pad_sequence( + [m.to(dtype=torch.bool) for m in self.batch_action_mask], + batch_first=True, + padding_value=False, + ) + padded_batch_entropy_mask = pad_sequence( + self.batch_entropy_mask, batch_first=True, padding_value=False + ) + padded_batch_credits = pad_sequence( + self.batch_credits, batch_first=True, padding_value=float(padding) + ) + padded_batch_engine_log_probs = pad_sequence( + self.batch_engine_log_probs, batch_first=True, padding_value=float(padding) + ) + padded_batch_timesteps = pad_sequence( + self.batch_timesteps, batch_first=True, padding_value=0 + ) + + return PaddedTensorTrainingBatch( + padded_batch_input_ids, + padded_batch_action_mask, + padded_batch_entropy_mask, + padded_batch_credits, + padded_batch_engine_log_probs, + padded_batch_timesteps, + ) + + def append(self, other: "TrainingBatch"): + self.rollout_ids = torch.cat([self.rollout_ids, other.rollout_ids]) + self.batch_input_ids.extend(other.batch_input_ids) + self.batch_action_mask.extend(other.batch_action_mask) + self.batch_entropy_mask.extend(other.batch_entropy_mask) + self.batch_credits.extend(other.batch_credits) + self.batch_engine_log_probs.extend(other.batch_engine_log_probs) + self.batch_timesteps.extend(other.batch_timesteps) + + +timestep = int diff --git a/src_code_for_reproducibility/utils/__init__.py b/src_code_for_reproducibility/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f8f86a6250af7b404a43dc38a7d58aef50dbeb6d --- /dev/null +++ b/src_code_for_reproducibility/utils/__init__.py @@ -0,0 +1,4 @@ +""" +File: mllm/utils/__init__.py +Summary: Utility package exposing helper modules. 
+""" diff --git a/src_code_for_reproducibility/utils/__pycache__/__init__.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e63b428c00db55d1fd97a390ba95c8e0e2b8f6a7 Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/dict_get_path.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/dict_get_path.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf519a8c2dd0e9288fbeec449932cb32f838bea2 Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/dict_get_path.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/get_coagent_id.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/get_coagent_id.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a954fa00312e4c613fe3feb2fe2248988dc6f00d Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/get_coagent_id.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/resource_context.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/resource_context.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..084c8810b7f0ced9d7ecc96db6933f4bd0a8931d Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/resource_context.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/rollout_tree_gather_utils.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/rollout_tree_gather_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..348e542fbd081961b48ba11f57ac98c99d11e41c Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/rollout_tree_gather_utils.cpython-312.pyc differ 
diff --git a/src_code_for_reproducibility/utils/__pycache__/rollout_tree_stats.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/rollout_tree_stats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34a48d6500c7c7e62e47a7def5f05e8824eaac0c Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/rollout_tree_stats.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/short_id_gen.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/short_id_gen.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f20495b7f4d59902c0ff611c84ba21a2311e77b Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/short_id_gen.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/stat_pack.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/stat_pack.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a454b258bff6fdfbd14a21b1b02cf2e957747f8 Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/stat_pack.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/update_start_epoch.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/update_start_epoch.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f9d3c6bca8b73cbb23dff3263b3a5f12f598599 Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/update_start_epoch.cpython-312.pyc differ diff --git a/src_code_for_reproducibility/utils/__pycache__/wandb_utils.cpython-312.pyc b/src_code_for_reproducibility/utils/__pycache__/wandb_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4da2d0a7c94f30a7b0dcb724c0e4eee16bebd6d Binary files /dev/null and b/src_code_for_reproducibility/utils/__pycache__/wandb_utils.cpython-312.pyc differ diff --git 
a/src_code_for_reproducibility/utils/dict_get_path.py b/src_code_for_reproducibility/utils/dict_get_path.py new file mode 100644 index 0000000000000000000000000000000000000000..16b91ec7ec8ecf4e5ed96af29945f44d27bd0276 --- /dev/null +++ b/src_code_for_reproducibility/utils/dict_get_path.py @@ -0,0 +1,17 @@ +""" +File: mllm/utils/dict_get_path.py +Summary: Retrieves nested dictionary values using dotted key paths. +""" + + +def get_from_nested_dict(a: dict, path) -> any: + # path is string or list of string + try: + if isinstance(path, str): + return a[path] + else: + for p in path: + a = a[p] + return a + except Exception: + return None diff --git a/src_code_for_reproducibility/utils/gather_training_stats.py b/src_code_for_reproducibility/utils/gather_training_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..067fc238c3899ade78a0a4622d002a2c99e337aa --- /dev/null +++ b/src_code_for_reproducibility/utils/gather_training_stats.py @@ -0,0 +1,262 @@ +""" +File: mllm/utils/gather_training_stats.py +Summary: Aggregates training statistics from rollouts and exports artifacts. 
+""" + +import copy +import csv +import gc +import json +import logging +import os +import pickle +import random +import re +import subprocess +import sys +import time +from datetime import datetime +from statistics import mean +from typing import Any, Dict + +import hydra +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import torch +from omegaconf import OmegaConf + +from mllm.training.tally_metrics import Tally +from mllm.utils.stat_pack import StatPack + + +def get_from_nested_dict(dictio: dict, path: list[str]): + for sp in path[:-1]: + dictio = dictio[sp] + return dictio.get(path[-1]) + + +def set_at_path(dictio: dict, path: list[str], value): + for sp in path[:-1]: + if sp not in dictio: + dictio[sp] = {} + dictio = dictio[sp] + dictio[path[-1]] = value + + +def produce_tabular_render(inpath: str, outpath: str = None): + """ + Convert a JSON metrics dump into per-rollout CSV tables for easier inspection. + """ + with open(inpath, "r") as f: + data = json.load(f) + rollout_paths = data.keys() + for rollout_path in rollout_paths: + if outpath is None: + m_path = rollout_path.replace("/", "|") + m_path = m_path.replace(".json", "") + m_path = ( + os.path.split(inpath)[0] + + "/contextualized_tabular_renders/" + + m_path + + "_tabular_render.render.csv" + ) + # import pdb; pdb.set_trace() + os.makedirs(os.path.split(m_path)[0], exist_ok=True) + metrics = data[rollout_path] + d = {k: [] for k in metrics[0].keys()} + for m in metrics: + for k, v in m.items(): + d[k].append(v) + d = pd.DataFrame(d) + d.to_csv(m_path) + + +def get_metric_paths(data: list[dict]): + d = data[0] + paths = [] + + def traverse_dict(d, current_path=[]): + for key, value in d.items(): + new_path = current_path + [key] + if isinstance(value, dict): + traverse_dict(value, new_path) + else: + paths.append(new_path) + + traverse_dict(d) + return paths + + +def print_metric_paths(data: list[dict]): + paths = get_metric_paths(data) + for p in paths: + print(p) + + +def 
get_metric_iteration_list(data: list[dict], metric_path: list[str]): + if isinstance(metric_path, str): + metric_path = [metric_path] + sgl = [] + for d in data: + sgl.append(get_from_nested_dict(d, metric_path)) + return sgl + + +def to_1d_numeric(x): + """Return a 1-D float array (or None if not numeric). Accepts scalars, numpy arrays, or nested list/tuple of them.""" + if x is None: + return None + if isinstance(x, (int, float, np.number)): + return np.array([float(x)], dtype=float) + if isinstance(x, np.ndarray): + try: + return x.astype(float).ravel() + except Exception: + return None + if isinstance(x, (list, tuple)): + parts = [] + for e in x: + arr = to_1d_numeric(e) + if arr is not None and arr.size > 0: + parts.append(arr) + if parts: + return np.concatenate(parts) + return None + return None + + +def get_single_metric_vector(data, metric_path, iterations=None): + if isinstance(metric_path, str): + metric_path = [metric_path] + if iterations == None: + iterations = len(data) + vecs = [] + for d in data: + ar = get_from_nested_dict(d, metric_path) + arr = to_1d_numeric(ar) + if arr is not None: + vecs.append(arr) + + return np.concatenate(vecs) if vecs else np.empty(0, dtype=float) + + +def _load_metrics_file(file_path: str): + if not (file_path.endswith(".tally.pkl") or file_path.endswith(".pkl")): + raise ValueError("Only *.tally.pkl files are supported.") + import pickle + + with open(file_path, "rb") as f: + tree = pickle.load(f) + return tree + + +def get_leaf_items(array_tally: dict, prefix: list[str] = None): + if prefix is None: + prefix = [] + for key, value in array_tally.items(): + next_prefix = prefix + [str(key)] + if isinstance(value, dict): + yield from get_leaf_items(value, next_prefix) + else: + yield next_prefix, value + + +def _sanitize_filename_part(part: str) -> str: + s = part.replace("/", "|") + s = s.replace(" ", "_") + return s + + +def render_rt_tally_pkl_to_csvs(pkl_path: str, outdir: str): + """ + This method takes care of 
tokenwise logging. + """ + with open(pkl_path, "rb") as f: + payload = pickle.load(f) + # Backward compatibility: older tallies stored the dict directly + if isinstance(payload, dict) and "array_tally" in payload: + array_tally = payload.get("array_tally", {}) + else: + array_tally = payload + + os.makedirs(outdir, exist_ok=True) + trainer_id = os.path.basename(pkl_path).replace(".rt_tally.pkl", "") + for path_list, rollout_tally_items in get_leaf_items(array_tally): + # Create file and initiate writer + path_part = ".".join(_sanitize_filename_part(p) for p in path_list) + filename = f"{trainer_id}__{path_part}.render.csv" + out_path = os.path.join(outdir, filename) + + # Write metric rows to CSV + with open(out_path, "w", newline="") as f: + writer = csv.writer(f) + + # Write header row - need to determine metric column count from first rollout_tally_item + first_item = rollout_tally_items[0] + metric_cols = ( + first_item.metric_matrix.shape[1] + if first_item.metric_matrix.ndim > 1 + else 1 + ) + header = ["agent_id", "crn_id", "rollout_id"] + [ + f"t_{i}" for i in range(metric_cols) + ] + writer.writerow(header) + + for rollout_tally_item in rollout_tally_items: + crn_ids = rollout_tally_item.crn_ids + rollout_ids = rollout_tally_item.rollout_ids + agent_ids = rollout_tally_item.agent_ids + metric_matrix = rollout_tally_item.metric_matrix + for i in range(metric_matrix.shape[0]): + row_vals = metric_matrix[i].reshape(-1) + # Convert row_vals to a list to avoid numpy concatenation issues + row_vals = ( + row_vals.tolist() + if hasattr(row_vals, "tolist") + else list(row_vals) + ) + row_prefix = [ + agent_ids[i], + crn_ids[i], + rollout_ids[i], + ] + writer.writerow(row_prefix + row_vals) + + +def tally_to_stat_pack(tally: Dict[str, Any]): + stat_pack = StatPack() + if "array_tally" in tally: + tally = tally["array_tally"] + + # backward compatibility: will remove later, flatten keys in tally + def get_from_nested_dict(dictio: dict, path: list[str]): + for sp in 
path[:-1]: + dictio = dictio[sp] + return dictio.get(path[-1]) + + def get_metric_paths(tally: dict): + paths = [] + + def traverse_dict(tally, current_path=[]): + for key, value in tally.items(): + new_path = current_path + [key] + if isinstance(value, dict): + traverse_dict(value, new_path) + else: + paths.append(new_path) + + traverse_dict(tally) + return paths + + paths = get_metric_paths(tally) + modified_tally = {} + for p in paths: + val = get_from_nested_dict(tally, p) + modified_tally["_".join(p)] = np.mean(val) + del tally + tally = modified_tally + for key, value in tally.items(): + stat_pack.add_stat(key, value) + return stat_pack diff --git a/src_code_for_reproducibility/utils/get_coagent_id.py b/src_code_for_reproducibility/utils/get_coagent_id.py new file mode 100644 index 0000000000000000000000000000000000000000..f51674757ebb4ba1b0c18a36dd4ea9257564f890 --- /dev/null +++ b/src_code_for_reproducibility/utils/get_coagent_id.py @@ -0,0 +1,10 @@ +""" +File: mllm/utils/get_coagent_id.py +Summary: Helper for deriving co-agent identifiers from rollout metadata. +""" + + +def get_coagent_id(ids: list[str], agent_id: str) -> str | None: + for id in ids: + if id != agent_id: + return id diff --git a/src_code_for_reproducibility/utils/get_stochastic_game_lengths.py b/src_code_for_reproducibility/utils/get_stochastic_game_lengths.py new file mode 100644 index 0000000000000000000000000000000000000000..98a01013b063e7d2504f5f85b1c4a4f9d145412b --- /dev/null +++ b/src_code_for_reproducibility/utils/get_stochastic_game_lengths.py @@ -0,0 +1,33 @@ +""" +File: mllm/utils/get_stochastic_game_lengths.py +Summary: Computes distributions over stochastic game lengths. +""" + +import numpy as np + + +def get_stochastic_game_lengths( + max_length, nb_games, continuation_prob, same_length_batch=False +): + """ + Generates stochastic game lengths based on a geometric distribution. + + Args: + max_length (int): The maximum length a game can have. 
@contextmanager
def resource_logger_context(logger: logging.Logger, task_description: str):
    """Log wall-clock time and device-0 VRAM usage for the wrapped block.

    Args:
        logger: Logger that receives one INFO summary line on exit.
        task_description: Label inserted into the summary line.

    Yields:
        None. The summary is emitted when the block exits, whether it
        completes or raises; exceptions raised by the block propagate.
    """
    # The baseline is taken *before* entering ``try`` so that a failure here
    # surfaces as itself rather than as a NameError from the ``finally`` clause.
    initial_time = time.time()
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Only device 0 is measured.
        total_mem_bytes = torch.cuda.get_device_properties(0).total_memory
        # NOTE(review): allocated + reserved is kept as the "total" so logged
        # numbers stay comparable with earlier runs; the caching allocator's
        # reserved pool presumably already contains the allocated bytes, so
        # this may overstate usage -- confirm against the PyTorch docs.
        initial_total_bytes = torch.cuda.memory_allocated(
            0
        ) + torch.cuda.memory_reserved(0)
        torch.cuda.reset_peak_memory_stats(0)

    try:
        yield None
    finally:
        final_time = time.time()
        delta_time_str = time.strftime(
            "%H:%M:%S", time.gmtime(final_time - initial_time)
        )

        if not cuda_available:
            # CPU-only host: timing is still useful, VRAM figures are not
            # obtainable.
            logger.info(
                f"For task: {task_description}, VRAM stats unavailable (no CUDA device), ΔTime: {delta_time_str}"
            )
        else:
            # Ensure kernels launched inside the block are accounted for.
            torch.cuda.synchronize()

            final_allocated_bytes = torch.cuda.memory_allocated(0)
            final_reserved_bytes = torch.cuda.memory_reserved(0)
            final_total_bytes = final_allocated_bytes + final_reserved_bytes

            delta_vram_percent_total = (
                100 * (final_total_bytes - initial_total_bytes) / total_mem_bytes
                if total_mem_bytes
                else 0.0
            )
            current_percent_vram_taken = (
                100 * final_total_bytes / total_mem_bytes if total_mem_bytes else 0.0
            )
            block_peak_percent = (
                100 * torch.cuda.max_memory_allocated(0) / total_mem_bytes
                if total_mem_bytes
                else 0.0
            )

            logger.info(
                f"For task: {task_description}, ΔVRAM % (total): {delta_vram_percent_total:.2f}%, Current % of VRAM taken: {current_percent_vram_taken:.2f}%, Block Peak % of device VRAM: {block_peak_percent:.2f}%, ΔTime: {delta_time_str}"
            )
+""" + +from pathlib import Path +from typing import List + +from mllm.utils.rollout_tree_gather_utils import * + + +def html_from_chat_turns(chat_turns: List[ChatTurnLog]) -> str: + """ + Render chat turns as a single, wrapping sequence of messages in time order. + Keep badge and message bubble styles, include time on every badge and + include rewards on assistant badges. Each message is individually + hide/show by click; when hidden, only the badge remains and "(...)" is + shown inline (not inside a bubble). + """ + import html + import re as _re + + # Prepare ordering: sort by (time_step, original_index) to keep stable order within same step + indexed_turns = list(enumerate(chat_turns)) + indexed_turns.sort(key=lambda t: (t[1].time_step, t[0])) + + # Get unique agent IDs and sort alphabetically for consistent assignment + # Agent with alphabetically lower name gets agent-0 (left, green) + # Agent with alphabetically higher name gets agent-1 (right, orange) + unique_agent_ids = sorted( + set(turn.agent_id for turn in chat_turns if turn.role == "assistant") + ) + agent_id_to_index = {aid: idx for idx, aid in enumerate(unique_agent_ids)} + + # CSS styles (simplified layout; no time-step or agent-column backgrounds) + css = """ + + """ + + # HTML structure + html_parts = [ + "", + "", + "", + "", + "Chat Turns", + css, + "", + "", + "", + '
', + '
', + '
', + '', + '', + '', + '', + '', + '900px', + "", + '', + '', + '", + "", + '', + '', + '', + "px", + "", + '', + '', + f'", + f'', + '|', + f'", + f'', + '', + "", + "
", + "
", + ] + + # Add Chat View + import html as _html_mod + + html_parts.append('
') + + # Helper function to add context annotation areas + def add_context_area(position: str, time_step: int): + context_key = f"round-context-{position}-{time_step}" + placeholder = f"Add context {position} round {time_step}..." + color_buttons = "" + # Add default/reset color button first + color_buttons += ( + f'
' + ) + for color_name, color_value in [ + ("red", "#d32f2f"), + ("orange", "#f57c00"), + ("yellow", "#f9a825"), + ("green", "#388e3c"), + ("blue", "#1976d2"), + ("purple", "#7b1fa2"), + ("gray", "#666666"), + ]: + color_buttons += ( + f'
' + ) + + html_parts.append( + f'
' + f'
' + f'
{color_buttons}
' + f"
" + ) + + # Helper function to add split agent context boxes + def add_split_agent_contexts(position: str, time_step: int): + color_buttons = "" + # Add default/reset color button first + color_buttons += ( + f'
' + ) + for color_name, color_value in [ + ("red", "#d32f2f"), + ("orange", "#f57c00"), + ("yellow", "#f9a825"), + ("green", "#388e3c"), + ("blue", "#1976d2"), + ("purple", "#7b1fa2"), + ("gray", "#666666"), + ]: + color_buttons += ( + f'
' + ) + + html_parts.append('
') + + # Agent 0 box + agent0_key = f"agent-context-0-{position}-{time_step}" + agent0_placeholder = f"..." + html_parts.append( + f'
' + f'
' + f'
{color_buttons}
' + f"
" + ) + + # Agent 1 box + agent1_key = f"agent-context-1-{position}-{time_step}" + agent1_placeholder = f"..." + html_parts.append( + f'
' + f'
' + f'
{color_buttons}
' + f"
" + ) + + html_parts.append("
") # split-agent-context + + last_time_step_chat = None + for original_index, turn in indexed_turns: + # Use agent index for CSS class (agent-0 or agent-1) instead of agent ID + agent_index = agent_id_to_index.get(turn.agent_id, 0) + agent_class = f"agent-{agent_index}" + role_class = f"role-{turn.role}" + + # Add time step divider and beginning context + if last_time_step_chat is None or turn.time_step != last_time_step_chat: + # Add end contexts for previous round (only regular context, not prompt summary) + if last_time_step_chat is not None: + add_context_area("end", last_time_step_chat) + + html_parts.append( + f'
' + f'⏱ Round {turn.time_step + 1}' + f"
" + ) + + # Add beginning contexts for new round (both context and prompt summary) + add_context_area("beginning", turn.time_step) + add_split_agent_contexts("beginning", turn.time_step) + + last_time_step_chat = turn.time_step + + # Build chat message with merge controls + html_parts.append( + f'
' + ) + + # Add merge control button + html_parts.append( + f'' + ) + + html_parts.append('
') + + # Header with agent name and reward (always show reward) + if turn.role == "assistant": + name = _html_mod.escape(turn.agent_id) + raw_val = turn.reward + if isinstance(raw_val, (int, float)): + reward_val = f"{raw_val:.4f}".rstrip("0").rstrip(".") + if len(reward_val) > 8: + reward_val = reward_val[:8] + "…" + else: + reward_val = str(raw_val) + header_html = ( + f'
' + f'🤖 {name}' + f'⚑ {reward_val}' + f"
" + ) + else: + name = _html_mod.escape(turn.agent_id) + header_html = f'
Prompt of {name}
' + + html_parts.append(header_html) + + # Reasoning content if present + if turn.reasoning_content: + _raw_reasoning = turn.reasoning_content.replace("\r\n", "\n") + _raw_reasoning = _re.sub(r"^\s*\n+", "", _raw_reasoning) + esc_reasoning = _html_mod.escape(_raw_reasoning) + html_parts.append( + f'" + ) + + # Message bubble + esc_content = _html_mod.escape(turn.content) + html_parts.append(f'
{esc_content}
') + + html_parts.append("
") # chat-message-content + html_parts.append("
") # chat-message + + # Add end contexts for the last round (only regular context, not prompt summary) + if last_time_step_chat is not None: + add_context_area("end", last_time_step_chat) + + html_parts.append("
def export_html_from_rollout_tree(path: Path, outdir: Path, main_only: bool = False):
    """Render the paths of one rollout tree file to HTML files.

    The main path is written directly into ``outdir``. Unless ``main_only`` is
    set, every branch path is written into a ``..._branches_html_renders``
    subdirectory of ``outdir``.

    Args:
        path: Rollout tree file, read with ``load_rollout_tree``.
        outdir: Output directory; created if missing.
        main_only: If True, only the main trajectory is exported.
    """
    root = load_rollout_tree(path)
    mgid = root.id

    main_path, branch_paths = get_rollout_tree_paths(root)

    outdir.mkdir(parents=True, exist_ok=True)

    # Main trajectory.
    main_html = html_from_chat_turns(gather_all_chat_turns_for_path(main_path))
    main_file = outdir / f"mgid:{mgid}_main_html_render.render.html"
    with open(main_file, "w", encoding="utf-8") as f:
        f.write(main_html)

    # Stop here when branches are not wanted or do not exist. Previously the
    # branch loop ran even with ``main_only=True`` and hit an undefined
    # ``branches_dir``.
    if main_only or not branch_paths:
        return

    branches_dir = outdir / f"mgid:{mgid}_branches_html_renders"
    branches_dir.mkdir(parents=True, exist_ok=True)

    for branch_path in branch_paths:
        branch_html = html_from_chat_turns(gather_all_chat_turns_for_path(branch_path))
        branch_file = branches_dir / f"{branch_path.id}_html_render.render.html"
        with open(branch_file, "w", encoding="utf-8") as f:
            f.write(branch_html)
def get_rollout_tree_paths(
    root: RolloutTreeRootNode, mgid: Optional[str] = None
) -> Tuple[RolloutNodeList, List[RolloutNodeList]]:
    """Split a rollout tree into its main path and its branch paths.

    Args:
        root: Root node of the rollout tree.
        mgid: Identifier used in the generated path ids; defaults to
            ``str(root.id)``.

    Returns:
        main_path: Nodes reached from ``root.child`` by following ``child``
            links and, at branch nodes, ``main_child``.
        branch_paths: One entry per non-empty branch list found on a branch
            node. Each entry holds the main-path nodes that precede the branch
            node, followed by the nodes collected from that branch. Its id
            carries the agent id and the time step of the last main-path node
            before the branch (0 if there is none).
    """
    mgid_str = mgid or str(root.id)

    def collect_path_nodes(start) -> List[RolloutTreeNode]:
        """Follow main children from ``start`` and return the visited nodes."""
        collected: List[RolloutTreeNode] = []
        current = start
        while current is not None:
            if isinstance(current, RolloutTreeBranchNode):
                # Only the main child of a branch node continues the path.
                current = current.main_child
                if not current:
                    break
            elif not isinstance(current, RolloutTreeNode):
                break
            collected.append(current)
            current = current.child
        return collected

    # Single iterative walk along the main path: no recursion-depth limit and
    # no per-step copy of the prefix list.
    main_path_nodes: List[RolloutTreeNode] = []
    branch_paths: List[RolloutNodeList] = []
    last_time_step = 0
    current = root.child
    while current is not None:
        if isinstance(current, RolloutTreeNode):
            main_node = current
        elif isinstance(current, RolloutTreeBranchNode):
            for agent_id, branch_node_list in (current.branches or {}).items():
                if not branch_node_list:
                    continue
                # Snapshot the prefix: later main nodes must not leak into
                # this branch path.
                branch_nodes = list(main_path_nodes)
                for branch_node in branch_node_list:
                    branch_nodes.extend(collect_path_nodes(branch_node))
                branch_path_id = f"mgid:{mgid_str}_type:branch_agent:{agent_id}_time_step:{last_time_step}"
                branch_paths.append(
                    RolloutNodeList(id=branch_path_id, nodes=branch_nodes)
                )
            main_node = current.main_child
            if not main_node:
                break
        else:
            break
        main_path_nodes.append(main_node)
        last_time_step = main_node.time_step
        current = main_node.child

    main_path = RolloutNodeList(id=f"mgid:{mgid_str}_type:main", nodes=main_path_nodes)
    return main_path, branch_paths
action_log and action_log.chat_turns: + for chat_turn in action_log.chat_turns: + turn_log = ChatTurnLog( + time_step=node.time_step, + agent_id=agent_id, + role=chat_turn.role, + content=chat_turn.content, + reasoning_content=getattr(chat_turn, "reasoning_content", None), + is_state_end=chat_turn.is_state_end, + reward=node.step_log.simulation_step_log.rewards.get( + agent_id, 0 + ), + ) + + if chat_turn.role == "user": + # If a previous pair is open, close it and start a new one + if current_pair: + pairs.append(current_pair) + current_pair = [] + current_pair = [turn_log] + else: + # assistant: attach to an open user message if present; otherwise stand alone + if ( + current_pair + and len(current_pair) == 1 + and current_pair[0].role == "user" + ): + current_pair.append(turn_log) + pairs.append(current_pair) + current_pair = [] + else: + # No preceding user or already paired; treat as its own unit + pairs.append([turn_log]) + + if current_pair: + # Unpaired trailing user message + pairs.append(current_pair) + + per_agent_pairs[agent_id] = pairs + + # Interleave pairs across agents: A1, B1, A2, B2, ... 
+ index = 0 + while True: + added_any = False + for agent_id in agent_ids: + agent_pairs = per_agent_pairs.get(agent_id, []) + if index < len(agent_pairs): + for tl in agent_pairs[index]: + turns.append(tl) + added_any = True + if not added_any: + break + index += 1 + + return turns + + +def chat_turns_to_dict(chat_turns: Iterator[ChatTurnLog]) -> Iterator[Dict[str, Any]]: + """Render all chat turns for a path as structured data for JSON.""" + for chat_turn in chat_turns: + yield chat_turn.model_dump() + + +def get_all_agents(root: RolloutTreeRootNode) -> List[str]: + """list of all agent IDs that appear in the tree.""" + if root.child is None: + return [] + + # Get the first node to extract all agent IDs + first_node = root.child + if isinstance(first_node, RolloutTreeBranchNode): + first_node = first_node.main_child + + if first_node is None: + return [] + + # All agents should be present in the first node + agents = set(first_node.step_log.action_logs.keys()) + agents.update(first_node.step_log.simulation_step_log.rewards.keys()) + + return sorted(list(agents)) + + +def gather_agent_main_rewards(agent_id: str, path: RolloutNodeList) -> List[float]: + """Gather main rewards for a specific agent in a path.""" + rewards = [] + for node in path.nodes: + reward = node.step_log.simulation_step_log.rewards[agent_id] + rewards.append(reward) + return rewards + + +def gather_all_rewards(path: RolloutNodeList) -> List[Dict[AgentId, float]]: + """Gather main rewards from main trajectory in a path.""" + rewards = [] + for node in path.nodes: + rewards.append(node.step_log.simulation_step_log.rewards.copy()) + return rewards + + +def gather_simulation_stats( + path: RolloutNodeList, + filter: Callable[[SimulationStepLog], bool], + stat_func: Callable[[SimulationStepLog], Any], +) -> List[Any]: + """Gather stats from main trajectory in a path.""" + stats = [] + for node in path.nodes: + sl = node.step_log.simulation_step_log + if filter(sl): + stats.append(stat_func(sl)) + 
return stats + + +def gather_simulation_step_logs(path: RolloutNodeList) -> List[SimulationStepLog]: + """Gather simulation information from main trajectory in a path.""" + infos = [] + for node in path.nodes: + infos.append(node.step_log.simulation_step_log) + return infos + + +def export_chat_logs(path: Path, outdir: Path): + """Process a rollout tree PKL file and generate a JSONL of chat turns as dicts. + Each line contains an object with path_id and chat_turns for a single path. + """ + import json + + root = load_rollout_tree(path) + mgid = root.id + + main_path, branch_paths = get_rollout_tree_paths(root) + all_paths = [main_path] + branch_paths + + outdir.mkdir(parents=True, exist_ok=True) + output_file = outdir / f"mgid:{mgid}_plucked_chats.render.jsonl" + + with open(output_file, "w", encoding="utf-8") as f: + for path_obj in all_paths: + chat_turns = gather_all_chat_turns_for_path(path_obj) + output_obj = { + "path_id": str(path_obj.id), + "chat_turns": list(chat_turns_to_dict(iter(chat_turns))), + } + f.write(json.dumps(output_obj, ensure_ascii=False) + "\n") diff --git a/src_code_for_reproducibility/utils/rollout_tree_stats.py b/src_code_for_reproducibility/utils/rollout_tree_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..4725160156230d7efb89588c765fb5b63a7bbbe1 --- /dev/null +++ b/src_code_for_reproducibility/utils/rollout_tree_stats.py @@ -0,0 +1,55 @@ +""" +File: mllm/utils/rollout_tree_stats.py +Summary: Computes descriptive statistics from rollout tree collections. 
+""" + +from typing import Any, Callable, List, Tuple + +from mllm.markov_games.rollout_tree import RolloutTreeRootNode +from mllm.markov_games.simulation import SimulationStepLog +from mllm.utils.rollout_tree_gather_utils import ( + gather_simulation_step_logs, + get_rollout_tree_paths, +) +from mllm.utils.stat_pack import StatPack + + +def get_rollout_tree_stat_tally( + rollout_tree: RolloutTreeRootNode, + metrics: List[Callable[[SimulationStepLog], List[Tuple[str, float]]]], +) -> StatPack: + stat_tally = StatPack() + # get simulation step logs + node_list = get_rollout_tree_paths(rollout_tree)[0] + simulation_step_logs = gather_simulation_step_logs(node_list) + for simulation_step_log in simulation_step_logs: + for metric in metrics: + metric_result = metric(simulation_step_log) + if metric_result is not None: + for key, value in metric_result: + stat_tally.add_stat(key, value) + return stat_tally + + +def get_rollout_tree_mean_stats( + rollout_tree: RolloutTreeRootNode, metrics: List[Callable[[SimulationStepLog], Any]] +) -> StatPack: + """Get the mean stats for a rollout tree.""" + stat_tally = get_rollout_tree_stat_tally(rollout_tree, metrics) + return stat_tally.mean() + + +def get_mean_rollout_tree_stats( + rollout_trees: List[RolloutTreeRootNode], + metrics: List[Callable[[SimulationStepLog], Any]], +) -> StatPack: + """Get the mean stats for a list of rollout trees.""" + # Compute per-rollout means first, then aggregate them across the entire batch. 
def generate_short_id() -> int:
    """Generate a random 8-digit identifier for tracking adapter versions.

    Returns:
        int: A value drawn uniformly from 10_000_000..99_999_999. Uniqueness
        is probabilistic, not guaranteed.
    """
    # Function-scope import: only ``uuid`` is imported at module level.
    import secrets

    # Draw directly from the 8-digit range. Truncating the decimal form of a
    # 128-bit integer skews the leading digit toward 1-3 and so raises the
    # collision rate.
    return 10_000_000 + secrets.randbelow(90_000_000)
import wandb_utils + + +class StatPack: + def __init__(self): + self.data = {} + + def add_stat(self, key: str, value: float | int | None): + assert ( + isinstance(value, float) or isinstance(value, int) or value is None + ), f"Value {value} is not a valid type" + if key not in self.data: + self.data[key] = [] + self.data[key].append(value) + + def add_stats(self, other: "StatPack"): + for key in other.keys(): + self.add_stat(key, other[key]) + + def __getitem__(self, key: str): + return self.data[key] + + def __setitem__(self, key: str, value: Any): + self.data[key] = value + + def __contains__(self, key: str): + return key in self.data + + def __len__(self): + return len(self.data) + + def __iter__(self): + return iter(self.data) + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + def mean(self): + mean_st = StatPack() + for key in self.keys(): + if isinstance(self[key], list): + # Ignore None entries so missing measurements do not bias the mean. + non_none_values = [v for v in self[key] if v is not None] + if non_none_values: + mean_st[key] = np.mean(np.array(non_none_values)) + else: + mean_st[key] = None + return mean_st + + def store_plots(self, folder: str): + os.makedirs(folder, exist_ok=True) + for key in self.keys(): + plt.figure(figsize=(10, 5)) + plt.plot(self[key]) + plt.title(key) + plt.savefig(os.path.join(folder, f"{key}.pdf")) + plt.close() + + def store_numpy(self, folder: str): + os.makedirs(folder, exist_ok=True) + for key in self.keys(): + # Sanitize filename components (avoid slashes, spaces, etc.) 
+ safe_key = str(key).replace(os.sep, "_").replace("/", "_").replace(" ", "_") + values = self[key] + # Convert None to NaN for numpy compatibility + arr = np.array( + [(np.nan if (v is None) else v) for v in values], dtype=float + ) + np.save(os.path.join(folder, f"{safe_key}.npy"), arr) + + def store_json(self, folder: str, filename: str = "stats.json"): + os.makedirs(folder, exist_ok=True) + with open(os.path.join(folder, filename), "w") as f: + json.dump(self.data, f, indent=4) + + def store_csv(self, folder: str): + os.makedirs(folder, exist_ok=True) + for key in self.keys(): + with open(os.path.join(folder, f"stats.csv"), "w") as f: + writer = csv.writer(f) + writer.writerow([key] + self[key]) + + def store_pickle(self, folder: str): + os.makedirs(folder, exist_ok=True) + for key in self.keys(): + with open(os.path.join(folder, f"stats.pkl"), "wb") as f: + pickle.dump(self[key], f) diff --git a/src_code_for_reproducibility/utils/update_start_epoch.py b/src_code_for_reproducibility/utils/update_start_epoch.py new file mode 100644 index 0000000000000000000000000000000000000000..24a2ab0ae28dc2f5bccc61a98b920e165bc9d813 --- /dev/null +++ b/src_code_for_reproducibility/utils/update_start_epoch.py @@ -0,0 +1,17 @@ +""" +File: mllm/utils/update_start_epoch.py +Summary: Updates persisted start-epoch metadata when resuming runs. 
# During run, set hydra.run.dir=./outputs/{folder}
def update_start_epoch(cfg, output_directory):
    """Set ``cfg["experiment"]["start_epoch"]`` from existing iteration folders.

    Only acts when ``cfg["experiment"]["resume_experiment"]`` is truthy. The
    start epoch becomes the largest ``N`` among entries of ``output_directory``
    named ``iteration_N...``, or 0 when there is none.

    Args:
        cfg: Nested mapping with an ``"experiment"`` section; updated in place.
        output_directory: Directory that holds the ``iteration_*`` entries.

    Returns:
        None.
    """
    if not cfg["experiment"]["resume_experiment"]:
        return None

    iterations = []
    for name in os.listdir(output_directory):
        if not name.startswith("iteration_"):
            continue
        try:
            iterations.append(int(name.split("_")[1]))
        except ValueError:
            # An entry such as "iteration_final" carries no index; skip it
            # instead of aborting the resume.
            continue

    cfg["experiment"]["start_epoch"] = max(iterations, default=0)
    return None
+ """ + global _WANDB_RUN + if not is_enabled(cfg): + return + if not _try_import_wandb(): + return + + import wandb # type: ignore + + project = _safe_get(cfg, ["logging", "wandb", "project"], "llm-negotiation") + entity = _safe_get(cfg, ["logging", "wandb", "entity"], None) + mode = _safe_get(cfg, ["logging", "wandb", "mode"], "online") + tags = _safe_get(cfg, ["logging", "wandb", "tags"], []) or [] + notes = _safe_get(cfg, ["logging", "wandb", "notes"], None) + group = _safe_get(cfg, ["logging", "wandb", "group"], None) + name = _safe_get(cfg, ["logging", "wandb", "name"], run_name) + + # Ensure files are written into the hydra run directory + os.makedirs(run_dir, exist_ok=True) + os.environ.setdefault("WANDB_DIR", run_dir) + + # Convert cfg to plain types for W&B config; fallback to minimal dictionary + try: + from omegaconf import OmegaConf # type: ignore + + cfg_container = OmegaConf.to_container(cfg, resolve=True) # type: ignore + except Exception: + cfg_container = cfg + + _WANDB_RUN = wandb.init( + project=project, + entity=entity, + mode=mode, + name=name, + group=group, + tags=tags, + notes=notes, + config=cfg_container, + dir=run_dir, + reinit=True, + ) + + +def log(metrics: Dict[str, Any], step: Optional[int] = None) -> None: + """Log a flat dictionary of metrics to W&B if active.""" + if not _WANDB_AVAILABLE or _WANDB_RUN is None: + return + try: + import wandb # type: ignore + + wandb.log(metrics if step is None else dict(metrics, step=step)) + except Exception: + pass + + +def _flatten(prefix: str, data: Dict[str, Any], out: Dict[str, Any]) -> None: + for k, v in data.items(): + key = f"{prefix}.{k}" if prefix else k + if isinstance(v, dict): + _flatten(key, v, out) + else: + out[key] = v + + +def _summarize_value(value: Any) -> Dict[str, Any]: + import numpy as np # local import to avoid hard dependency during disabled mode + + if value is None: + return {"none": 1} + # Scalars + if isinstance(value, (int, float)): + return {"value": float(value)} 
+ # Lists or arrays + try: + arr = np.asarray(value) + if arr.size == 0: + return {"size": 0} + return { + "mean": float(np.nanmean(arr)), + "min": float(np.nanmin(arr)), + "max": float(np.nanmax(arr)), + "last": float(arr.reshape(-1)[-1]), + "size": int(arr.size), + } + except Exception: + # Fallback: string repr + return {"text": str(value)} + + +def log_tally( + array_tally: Dict[str, Any], prefix: str = "", step: Optional[int] = None +) -> None: + """ + Flatten and summarize Tally.array_tally and log to WandB. + Each leaf list/array is summarized with mean/min/max/last/size. + """ + if not _WANDB_AVAILABLE or _WANDB_RUN is None: + return + summarized: Dict[str, Any] = {} + + def walk(node: Any, path: list[str]): + if isinstance(node, dict): + for k, v in node.items(): + walk(v, path + [k]) + return + # node is a list of values accumulated over time + key = ".".join([p for p in ([prefix] if prefix else []) + path]) + try: + summary = _summarize_value(node) + for sk, sv in summary.items(): + summarized[f"{key}.{sk}"] = sv + except Exception: + summarized[f"{key}.error"] = 1 + + walk(array_tally, []) + if summarized: + log(summarized, step=step) + + +def log_flat_stats( + stats: Dict[str, Any], prefix: str = "", step: Optional[int] = None +) -> None: + if not _WANDB_AVAILABLE or _WANDB_RUN is None: + return + flat: Dict[str, Any] = {} + _flatten(prefix, stats, flat) + if flat: + log(flat, step=step)