penfever commited on
Commit
3b8148a
·
verified ·
1 Parent(s): 05ddcd3

Add files using upload-large-folder tool

Browse files
config.json CHANGED
@@ -29,7 +29,7 @@
29
  "rope_theta": 1000000.0,
30
  "sliding_window": 32768,
31
  "tie_word_embeddings": false,
32
- "torch_dtype": "float32",
33
  "transformers_version": "4.51.3",
34
  "use_cache": false,
35
  "use_sliding_window": false,
 
29
  "rope_theta": 1000000.0,
30
  "sliding_window": 32768,
31
  "tie_word_embeddings": false,
32
+ "torch_dtype": "bfloat16",
33
  "transformers_version": "4.51.3",
34
  "use_cache": false,
35
  "use_sliding_window": false,
logs/rank_0000.log CHANGED
@@ -1,4 +1,4 @@
1
- [2025-07-04 10:07:00,006][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:283] TrainingConfig:
2
  TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='hf_vision',
3
  dataset_path=None,
4
  subset=None,
@@ -52,13 +52,13 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
52
  tokenizer_pad_token=None,
53
  tokenizer_kwargs={},
54
  processor_kwargs={},
55
- model_max_length=10000,
56
  load_pretrained_weights=True,
57
  trust_remote_code=True,
58
  torch_dtype_str='bfloat16',
59
  compile=False,
60
  chat_template='qwen2-vl-instruct',
61
- attn_implementation='sdpa',
62
  device_map='auto',
63
  model_kwargs={},
64
  enable_liger_kernel=False,
@@ -73,7 +73,7 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
73
  per_device_train_batch_size=1,
74
  per_device_eval_batch_size=8,
75
  gradient_accumulation_steps=1,
76
- max_steps=3750,
77
  num_train_epochs=5,
78
  save_epoch=False,
79
  save_steps=0,
@@ -93,7 +93,7 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
93
  remove_unused_columns=False,
94
  repetition_penalty=1.0,
95
  use_vllm=False,
96
- vllm_device=None,
97
  vllm_gpu_memory_utilization=0.9,
98
  vllm_dtype=None,
99
  vllm_max_model_len=None,
@@ -128,14 +128,14 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
128
  log_model_summary=False,
129
  resume_from_checkpoint=None,
130
  try_resume_from_last_checkpoint=False,
131
- dataloader_num_workers=2,
132
  dataloader_persistent_workers=False,
133
- dataloader_prefetch_factor=8,
134
- dataloader_main_process_only=False,
135
  ddp_find_unused_parameters=False,
136
  max_grad_norm=1.0,
137
  trainer_kwargs={'dataset_kwargs': {'skip_prepare_dataset': True},
138
- 'max_seq_length': 10000,
139
  'remove_unused_columns': False},
140
  verl_config_overrides={},
141
  profiler=ProfilerParams(save_dir=None,
@@ -176,51 +176,52 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
176
  bnb_4bit_compute_dtype='float32',
177
  peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
178
  fsdp=FSDPParams(enable_fsdp=True,
179
- sharding_strategy=<ShardingStrategy.HYBRID_SHARD: 'HYBRID_SHARD'>,
180
  cpu_offload=False,
181
- mixed_precision='bf16',
182
  backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
183
  forward_prefetch=True,
184
- use_orig_params=None,
185
  state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
186
- auto_wrap_policy=<AutoWrapPolicy.SIZE_BASED_WRAP: 'SIZE_BASED_WRAP'>,
187
  min_num_params=100000,
188
- transformer_layer_cls=None,
189
  sync_module_states=True))
190
- [2025-07-04 10:07:00,227][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
191
- [2025-07-04 10:07:02,023][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
192
- [2025-07-04 10:07:03,883][oumi][rank0][pid:2619050][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
193
- [2025-07-04 10:07:03,883][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
194
- [2025-07-04 10:07:05,192][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
195
  Split: train
196
  Version: 0.0.0
197
- Dataset size: 57058499
198
- Download size: 48789762
199
- Size: 105848261 bytes
200
- Rows: 4286
201
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
202
- [2025-07-04 10:07:07,950][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
203
  problem object
204
  solution object
205
  original_question object
206
  original_answer object
207
  image object
208
  dtype: object
209
- [2025-07-04 10:07:21,068][oumi][rank0][pid:2619050][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0)
210
- [2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
211
- [2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
212
- [2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0))...
213
- [2025-07-04 10:07:21,105][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
214
- [2025-07-04 10:07:23,607][oumi][rank0][pid:2619050][MainThread][INFO]][torch_utils.py:289]
 
215
  Model Parameters Summary:
216
  🔢 Total parameters: 8,292,166,656
217
  🔗 Embedding parameters: 544,997,376
218
  🎯 Trainable parameters: 8,292,166,656
219
  🔒 Frozen parameters: 0 (0.00%)
220
 
221
- [2025-07-04 10:07:24,853][oumi][rank0][pid:2619050][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
222
- [2025-07-04 10:07:24,853][oumi][rank0][pid:2619050][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
223
- [2025-07-04 10:07:25,176][oumi][rank0][pid:2619050][MainThread][INFO]][training.py:62] SFTConfig(output_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
224
  overwrite_output_dir=False,
225
  do_train=False,
226
  do_eval=False,
@@ -242,7 +243,7 @@ Model Parameters Summary:
242
  adam_epsilon=1e-08,
243
  max_grad_norm=1.0,
244
  num_train_epochs=5,
245
- max_steps=3750,
246
  lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>,
247
  lr_scheduler_kwargs={},
248
  warmup_ratio=0.03,
@@ -250,7 +251,7 @@ Model Parameters Summary:
250
  log_level='warning',
251
  log_level_replica='warning',
252
  log_on_each_node=True,
253
- logging_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/runs/Jul04_10-07-24_oumi-compute004',
254
  logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
255
  logging_first_step=False,
256
  logging_steps=50,
@@ -283,8 +284,8 @@ Model Parameters Summary:
283
  debug=[],
284
  dataloader_drop_last=False,
285
  eval_steps=500,
286
- dataloader_num_workers=2,
287
- dataloader_prefetch_factor=8,
288
  past_index=-1,
289
  run_name='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
290
  disable_tqdm=False,
@@ -303,7 +304,7 @@ Model Parameters Summary:
303
  tp_size=0,
304
  fsdp_transformer_layer_cls_to_wrap=None,
305
  accelerator_config=AcceleratorConfig(split_batches=False,
306
- dispatch_batches=False,
307
  even_batches=True,
308
  use_seedable_sampler=True,
309
  non_blocking=False,
@@ -363,34 +364,28 @@ Model Parameters Summary:
363
  dataset_kwargs={'skip_prepare_dataset': True},
364
  dataset_num_proc=None,
365
  pad_token=None,
366
- max_length=10000,
367
  packing=False,
368
  padding_free=False,
369
  eval_packing=None,
370
  dataset_batch_size=None,
371
  num_of_sequences=None,
372
  chars_per_token=None,
373
- max_seq_length=10000,
374
  use_liger=None)
375
- [2025-07-04 10:07:25,232][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.434, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
376
- [2025-07-04 10:07:25,287][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:510] Training init time: 29.326s
377
- [2025-07-04 10:07:25,287][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
378
- [2025-07-04 12:47:15,514][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=193.085, power_limit_watts=700.0, gpu_utilization=11, memory_utilization=2, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
379
- [2025-07-04 15:25:44,400][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=187.86, power_limit_watts=700.0, gpu_utilization=32, memory_utilization=9, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
380
- [2025-07-04 18:04:39,027][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=188.725, power_limit_watts=700.0, gpu_utilization=49, memory_utilization=3, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
381
- [2025-07-04 20:43:20,054][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=193.78, power_limit_watts=700.0, gpu_utilization=35, memory_utilization=2, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
382
- [2025-07-04 23:21:14,672][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=191.45000000000002, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
383
- [2025-07-05 01:59:05,749][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=35, fan_speed=None, fan_speeds=None, power_usage_watts=187.851, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
384
- [2025-07-05 04:37:20,509][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=191.013, power_limit_watts=700.0, gpu_utilization=36, memory_utilization=3, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
385
- [2025-07-05 04:37:20,705][oumi][rank0][pid:2619050][MainThread][INFO]][telemetry_callback.py:242] Saving telemetry callback summary to output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/telemetry/telemetry_callback_rank0000.json...
386
- [2025-07-05 04:37:21,418][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:518] Training is Complete.
387
- [2025-07-05 04:37:21,441][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
388
- [2025-07-05 04:37:21,442][oumi][rank0][pid:2619050][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.37 GB
389
- [2025-07-05 04:37:21,442][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:525] Saving final state...
390
- [2025-07-05 04:37:21,451][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:530] Saving final model...
391
- [2025-07-05 04:37:21,454][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
392
- [2025-07-05 04:43:01,650][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
393
- [2025-07-05 04:43:05,405][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
394
- [2025-07-05 04:43:07,241][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:214]
395
 
396
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:07,302][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:279] TrainingConfig:
2
  TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='hf_vision',
3
  dataset_path=None,
4
  subset=None,
 
52
  tokenizer_pad_token=None,
53
  tokenizer_kwargs={},
54
  processor_kwargs={},
55
+ model_max_length=16384,
56
  load_pretrained_weights=True,
57
  trust_remote_code=True,
58
  torch_dtype_str='bfloat16',
59
  compile=False,
60
  chat_template='qwen2-vl-instruct',
61
+ attn_implementation='flash_attention_2',
62
  device_map='auto',
63
  model_kwargs={},
64
  enable_liger_kernel=False,
 
73
  per_device_train_batch_size=1,
74
  per_device_eval_batch_size=8,
75
  gradient_accumulation_steps=1,
76
+ max_steps=1250,
77
  num_train_epochs=5,
78
  save_epoch=False,
79
  save_steps=0,
 
93
  remove_unused_columns=False,
94
  repetition_penalty=1.0,
95
  use_vllm=False,
96
+ vllm_mode=None,
97
  vllm_gpu_memory_utilization=0.9,
98
  vllm_dtype=None,
99
  vllm_max_model_len=None,
 
128
  log_model_summary=False,
129
  resume_from_checkpoint=None,
130
  try_resume_from_last_checkpoint=False,
131
+ dataloader_num_workers=64,
132
  dataloader_persistent_workers=False,
133
+ dataloader_prefetch_factor=32,
134
+ dataloader_main_process_only=None,
135
  ddp_find_unused_parameters=False,
136
  max_grad_norm=1.0,
137
  trainer_kwargs={'dataset_kwargs': {'skip_prepare_dataset': True},
138
+ 'max_seq_length': 16384,
139
  'remove_unused_columns': False},
140
  verl_config_overrides={},
141
  profiler=ProfilerParams(save_dir=None,
 
176
  bnb_4bit_compute_dtype='float32',
177
  peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
178
  fsdp=FSDPParams(enable_fsdp=True,
179
+ sharding_strategy=<ShardingStrategy.FULL_SHARD: 'FULL_SHARD'>,
180
  cpu_offload=False,
181
+ mixed_precision=None,
182
  backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
183
  forward_prefetch=True,
184
+ use_orig_params=True,
185
  state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
186
+ auto_wrap_policy=<AutoWrapPolicy.TRANSFORMER_BASED_WRAP: 'TRANSFORMER_BASED_WRAP'>,
187
  min_num_params=100000,
188
+ transformer_layer_cls='transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer',
189
  sync_module_states=True))
190
+ [2025-07-09 18:42:07,490][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
191
+ [2025-07-09 18:42:08,862][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
192
+ [2025-07-09 18:42:09,767][oumi][rank0][pid:1196049][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
193
+ [2025-07-09 18:42:09,767][oumi][rank0][pid:1196049][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
194
+ [2025-07-09 18:42:11,870][oumi][rank0][pid:1196049][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
195
  Split: train
196
  Version: 0.0.0
197
+ Dataset size: 109368497
198
+ Download size: 96255659
199
+ Size: 205624156 bytes
200
+ Rows: 10000
201
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
202
+ [2025-07-09 18:42:14,502][oumi][rank0][pid:1196049][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
203
  problem object
204
  solution object
205
  original_question object
206
  original_answer object
207
  image object
208
  dtype: object
209
+ [2025-07-09 18:42:26,192][oumi][rank0][pid:1196049][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0)
210
+ [2025-07-09 18:42:26,193][oumi][rank0][pid:1196049][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
211
+ [2025-07-09 18:42:26,195][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
212
+ [2025-07-09 18:42:26,195][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
213
+ [2025-07-09 18:42:26,195][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0))...
214
+ [2025-07-09 18:42:26,237][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
215
+ [2025-07-09 18:42:27,434][oumi][rank0][pid:1196049][MainThread][INFO]][torch_utils.py:289]
216
  Model Parameters Summary:
217
  🔢 Total parameters: 8,292,166,656
218
  🔗 Embedding parameters: 544,997,376
219
  🎯 Trainable parameters: 8,292,166,656
220
  🔒 Frozen parameters: 0 (0.00%)
221
 
222
+ [2025-07-09 18:42:29,502][oumi][rank0][pid:1196049][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
223
+ [2025-07-09 18:42:29,502][oumi][rank0][pid:1196049][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
224
+ [2025-07-09 18:42:29,751][oumi][rank0][pid:1196049][MainThread][INFO]][training.py:62] SFTConfig(output_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
225
  overwrite_output_dir=False,
226
  do_train=False,
227
  do_eval=False,
 
243
  adam_epsilon=1e-08,
244
  max_grad_norm=1.0,
245
  num_train_epochs=5,
246
+ max_steps=1250,
247
  lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>,
248
  lr_scheduler_kwargs={},
249
  warmup_ratio=0.03,
 
251
  log_level='warning',
252
  log_level_replica='warning',
253
  log_on_each_node=True,
254
+ logging_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/runs/Jul09_18-42-29_oumi-compute002',
255
  logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
256
  logging_first_step=False,
257
  logging_steps=50,
 
284
  debug=[],
285
  dataloader_drop_last=False,
286
  eval_steps=500,
287
+ dataloader_num_workers=64,
288
+ dataloader_prefetch_factor=32,
289
  past_index=-1,
290
  run_name='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
291
  disable_tqdm=False,
 
304
  tp_size=0,
305
  fsdp_transformer_layer_cls_to_wrap=None,
306
  accelerator_config=AcceleratorConfig(split_batches=False,
307
+ dispatch_batches=None,
308
  even_batches=True,
309
  use_seedable_sampler=True,
310
  non_blocking=False,
 
364
  dataset_kwargs={'skip_prepare_dataset': True},
365
  dataset_num_proc=None,
366
  pad_token=None,
367
+ max_length=16384,
368
  packing=False,
369
  padding_free=False,
370
  eval_packing=None,
371
  dataset_batch_size=None,
372
  num_of_sequences=None,
373
  chars_per_token=None,
374
+ max_seq_length=16384,
375
  use_liger=None)
376
+ [2025-07-09 18:42:29,820][oumi][rank0][pid:1196049][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
377
+ [2025-07-09 18:42:29,911][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:506] Training init time: 27.219s
378
+ [2025-07-09 18:42:29,911][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
379
+ [2025-07-09 21:45:52,946][oumi][rank0][pid:1196049][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=156.229, power_limit_watts=700.0, gpu_utilization=71, memory_utilization=7, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
380
+ [2025-07-09 21:45:55,125][oumi][rank0][pid:1196049][MainThread][INFO]][telemetry_callback.py:242] Saving telemetry callback summary to output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/telemetry/telemetry_callback_rank0000.json...
381
+ [2025-07-09 21:46:14,567][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:514] Training is Complete.
382
+ [2025-07-09 21:46:14,567][oumi][rank0][pid:1196049][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
383
+ [2025-07-09 21:46:14,568][oumi][rank0][pid:1196049][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
384
+ [2025-07-09 21:46:14,572][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:521] Saving final state...
385
+ [2025-07-09 21:46:14,590][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:526] Saving final model...
386
+ [2025-07-09 21:46:14,591][oumi][rank0][pid:1196049][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
387
+ [2025-07-09 21:48:54,605][oumi][rank0][pid:1196049][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
388
+ [2025-07-09 21:48:57,509][oumi][rank0][pid:1196049][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
389
+ [2025-07-09 21:48:58,046][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:214]
 
 
 
 
 
 
390
 
391
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0001.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,067][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,882][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:03,033][oumi][rank1][pid:2619051][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:03,033][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,285][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,699][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,069][oumi][rank1][pid:2619051][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1)
21
- [2025-07-04 10:07:21,070][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,070][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,071][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1))...
24
- [2025-07-04 10:07:21,100][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,479][oumi][rank1][pid:2619051][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,479][oumi][rank1][pid:2619051][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,827][oumi][rank1][pid:2619051][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,288][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:510] Training init time: 29.332s
29
- [2025-07-04 10:07:25,288][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,400][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,406][oumi][rank1][pid:2619051][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,407][oumi][rank1][pid:2619051][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 26.58 GB
33
- [2025-07-05 04:37:21,407][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,453][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,453][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,622][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,790][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:06,891][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:02,781][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,286][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,705][oumi][rank1][pid:1196050][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,706][oumi][rank1][pid:1196050][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,867][oumi][rank1][pid:1196050][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,493][oumi][rank1][pid:1196050][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,186][oumi][rank1][pid:1196050][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1)
21
+ [2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1))...
25
+ [2025-07-09 18:42:26,219][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,441][oumi][rank1][pid:1196050][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,442][oumi][rank1][pid:1196050][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,825][oumi][rank1][pid:1196050][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,895][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:506] Training init time: 27.203s
30
+ [2025-07-09 18:42:29,895][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,533][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,538][oumi][rank1][pid:1196050][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,538][oumi][rank1][pid:1196050][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.34 GB
34
+ [2025-07-09 21:46:14,538][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,591][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,591][oumi][rank1][pid:1196050][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,950][oumi][rank1][pid:1196050][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,606][oumi][rank1][pid:1196050][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:57,834][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0002.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,062][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,926][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:02,953][oumi][rank2][pid:2619052][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:02,953][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,190][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,667][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,065][oumi][rank2][pid:2619052][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2)
21
- [2025-07-04 10:07:21,065][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,066][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,066][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2))...
24
- [2025-07-04 10:07:21,094][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,380][oumi][rank2][pid:2619052][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,381][oumi][rank2][pid:2619052][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,794][oumi][rank2][pid:2619052][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,289][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:510] Training init time: 29.331s
29
- [2025-07-04 10:07:25,289][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,430][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,437][oumi][rank2][pid:2619052][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=127.816, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,438][oumi][rank2][pid:2619052][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 26.67 GB
33
- [2025-07-05 04:37:21,438][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,453][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,453][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,620][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,693][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:06,319][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:02,789][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,333][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,778][oumi][rank2][pid:1196051][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,779][oumi][rank2][pid:1196051][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,873][oumi][rank2][pid:1196051][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,489][oumi][rank2][pid:1196051][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,181][oumi][rank2][pid:1196051][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2)
21
+ [2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2))...
25
+ [2025-07-09 18:42:26,222][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,442][oumi][rank2][pid:1196051][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,442][oumi][rank2][pid:1196051][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,813][oumi][rank2][pid:1196051][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,900][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:506] Training init time: 27.207s
30
+ [2025-07-09 18:42:29,900][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,551][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,564][oumi][rank2][pid:1196051][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,564][oumi][rank2][pid:1196051][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
34
+ [2025-07-09 21:46:14,564][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,591][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,591][oumi][rank2][pid:1196051][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,955][oumi][rank2][pid:1196051][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,430][oumi][rank2][pid:1196051][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:57,844][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0003.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,066][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,878][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:02,977][oumi][rank3][pid:2619053][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:02,978][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,195][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,990][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,052][oumi][rank3][pid:2619053][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3)
21
- [2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3))...
24
- [2025-07-04 10:07:21,084][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,294][oumi][rank3][pid:2619053][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,294][oumi][rank3][pid:2619053][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,748][oumi][rank3][pid:2619053][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,301][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:510] Training init time: 29.344s
29
- [2025-07-04 10:07:25,301][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,335][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 31.98 GB
33
- [2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,453][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,453][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,620][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,801][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:06,312][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:02,797][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,341][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,788][oumi][rank3][pid:1196052][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,789][oumi][rank3][pid:1196052][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,883][oumi][rank3][pid:1196052][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,500][oumi][rank3][pid:1196052][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,193][oumi][rank3][pid:1196052][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3)
21
+ [2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3))...
25
+ [2025-07-09 18:42:26,231][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,356][oumi][rank3][pid:1196052][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,361][oumi][rank3][pid:1196052][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,829][oumi][rank3][pid:1196052][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,898][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:506] Training init time: 27.206s
30
+ [2025-07-09 18:42:29,898][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,513][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,532][oumi][rank3][pid:1196052][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,532][oumi][rank3][pid:1196052][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.32 GB
34
+ [2025-07-09 21:46:14,532][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,592][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,595][oumi][rank3][pid:1196052][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,950][oumi][rank3][pid:1196052][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,582][oumi][rank3][pid:1196052][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:57,854][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0004.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,052][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,909][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:03,045][oumi][rank4][pid:2619054][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:03,045][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,190][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,804][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,061][oumi][rank4][pid:2619054][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4)
21
- [2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4))...
24
- [2025-07-04 10:07:21,090][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,417][oumi][rank4][pid:2619054][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,417][oumi][rank4][pid:2619054][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,786][oumi][rank4][pid:2619054][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,287][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:510] Training init time: 29.328s
29
- [2025-07-04 10:07:25,287][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,286][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=135.197, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.58 GB
33
- [2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,451][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,452][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,617][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,756][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:06,046][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:02,786][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,223][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,742][oumi][rank4][pid:1196053][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,742][oumi][rank4][pid:1196053][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,870][oumi][rank4][pid:1196053][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,519][oumi][rank4][pid:1196053][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4)
21
+ [2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,189][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4))...
25
+ [2025-07-09 18:42:26,212][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,329][oumi][rank4][pid:1196053][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,329][oumi][rank4][pid:1196053][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,748][oumi][rank4][pid:1196053][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,897][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:506] Training init time: 27.205s
30
+ [2025-07-09 18:42:29,897][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,458][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,479][oumi][rank4][pid:1196053][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.102, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,479][oumi][rank4][pid:1196053][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
34
+ [2025-07-09 21:46:14,480][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,591][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,591][oumi][rank4][pid:1196053][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,949][oumi][rank4][pid:1196053][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,380][oumi][rank4][pid:1196053][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:57,872][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0005.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,055][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,908][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:03,051][oumi][rank5][pid:2619055][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:03,051][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,197][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,706][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,046][oumi][rank5][pid:2619055][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5)
21
- [2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5))...
24
- [2025-07-04 10:07:21,083][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,308][oumi][rank5][pid:2619055][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,308][oumi][rank5][pid:2619055][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,772][oumi][rank5][pid:2619055][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,292][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:510] Training init time: 29.335s
29
- [2025-07-04 10:07:25,292][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,368][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 28.73 GB
33
- [2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,454][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,454][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,620][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,536][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:05,901][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:02,768][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,332][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,810][oumi][rank5][pid:1196054][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,811][oumi][rank5][pid:1196054][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,868][oumi][rank5][pid:1196054][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,512][oumi][rank5][pid:1196054][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,178][oumi][rank5][pid:1196054][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5)
21
+ [2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5))...
25
+ [2025-07-09 18:42:26,207][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,286][oumi][rank5][pid:1196054][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,286][oumi][rank5][pid:1196054][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,704][oumi][rank5][pid:1196054][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.01, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,901][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:506] Training init time: 27.209s
30
+ [2025-07-09 18:42:29,901][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,497][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,504][oumi][rank5][pid:1196054][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,505][oumi][rank5][pid:1196054][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
34
+ [2025-07-09 21:46:14,505][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,591][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,592][oumi][rank5][pid:1196054][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,949][oumi][rank5][pid:1196054][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,371][oumi][rank5][pid:1196054][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:58,279][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0006.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,070][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,902][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:03,025][oumi][rank6][pid:2619056][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:03,026][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,222][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,724][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,068][oumi][rank6][pid:2619056][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6)
21
- [2025-07-04 10:07:21,070][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,070][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,071][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6))...
24
- [2025-07-04 10:07:21,106][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,500][oumi][rank6][pid:2619056][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,500][oumi][rank6][pid:2619056][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,822][oumi][rank6][pid:2619056][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,293][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:510] Training init time: 29.334s
29
- [2025-07-04 10:07:25,293][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,401][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,402][oumi][rank6][pid:2619056][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,406][oumi][rank6][pid:2619056][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.56 GB
33
- [2025-07-05 04:37:21,406][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,452][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,452][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,619][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,620][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:06,598][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:02,772][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,162][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,508][oumi][rank6][pid:1196055][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,508][oumi][rank6][pid:1196055][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,880][oumi][rank6][pid:1196055][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,525][oumi][rank6][pid:1196055][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,183][oumi][rank6][pid:1196055][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6)
21
+ [2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6))...
25
+ [2025-07-09 18:42:26,217][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,227][oumi][rank6][pid:1196055][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,228][oumi][rank6][pid:1196055][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,672][oumi][rank6][pid:1196055][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.01, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,895][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:506] Training init time: 27.202s
30
+ [2025-07-09 18:42:29,895][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,530][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,539][oumi][rank6][pid:1196055][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,540][oumi][rank6][pid:1196055][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.32 GB
34
+ [2025-07-09 21:46:14,540][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,592][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,592][oumi][rank6][pid:1196055][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,951][oumi][rank6][pid:1196055][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,573][oumi][rank6][pid:1196055][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:57,874][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0007.log CHANGED
@@ -1,40 +1,41 @@
1
- [2025-07-04 10:06:56,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
- [2025-07-04 10:06:57,900][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
- [2025-07-04 10:07:03,049][oumi][rank7][pid:2619057][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
- [2025-07-04 10:07:03,049][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
- [2025-07-04 10:07:05,217][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
- Dataset size: 57058499
9
- Download size: 48789762
10
- Size: 105848261 bytes
11
- Rows: 4286
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
- [2025-07-04 10:07:07,979][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
- [2025-07-04 10:07:21,057][oumi][rank7][pid:2619057][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7)
21
- [2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
- [2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
- [2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7))...
24
- [2025-07-04 10:07:21,086][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
- [2025-07-04 10:07:24,262][oumi][rank7][pid:2619057][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
- [2025-07-04 10:07:24,262][oumi][rank7][pid:2619057][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
- [2025-07-04 10:07:24,740][oumi][rank7][pid:2619057][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
- [2025-07-04 10:07:25,299][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:510] Training init time: 29.341s
29
- [2025-07-04 10:07:25,299][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
- [2025-07-05 04:37:21,399][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:518] Training is Complete.
31
- [2025-07-05 04:37:21,401][oumi][rank7][pid:2619057][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
- [2025-07-05 04:37:21,402][oumi][rank7][pid:2619057][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 30.10 GB
33
- [2025-07-05 04:37:21,402][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:525] Saving final state...
34
- [2025-07-05 04:37:21,454][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:530] Saving final model...
35
- [2025-07-05 04:37:21,454][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
- [2025-07-05 04:40:16,621][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
- [2025-07-05 04:40:20,698][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
- [2025-07-05 04:43:06,529][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:214]
 
39
 
40
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
 
1
+ [2025-07-09 18:42:03,107][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-09 18:42:04,432][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-09 18:42:08,894][oumi][rank7][pid:1196056][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-09 18:42:08,895][oumi][rank7][pid:1196056][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-09 18:42:11,909][oumi][rank7][pid:1196056][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
  Split: train
7
  Version: 0.0.0
8
+ Dataset size: 109368497
9
+ Download size: 96255659
10
+ Size: 205624156 bytes
11
+ Rows: 10000
12
  Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-09 18:42:14,543][oumi][rank7][pid:1196056][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
14
  problem object
15
  solution object
16
  original_question object
17
  original_answer object
18
  image object
19
  dtype: object
20
+ [2025-07-09 18:42:26,194][oumi][rank7][pid:1196056][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7)
21
+ [2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
22
+ [2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
23
+ [2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
24
+ [2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7))...
25
+ [2025-07-09 18:42:26,224][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
26
+ [2025-07-09 18:42:29,448][oumi][rank7][pid:1196056][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
27
+ [2025-07-09 18:42:29,448][oumi][rank7][pid:1196056][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
28
+ [2025-07-09 18:42:29,796][oumi][rank7][pid:1196056][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
29
+ [2025-07-09 18:42:29,904][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:506] Training init time: 27.211s
30
+ [2025-07-09 18:42:29,904][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
31
+ [2025-07-09 21:46:14,494][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:514] Training is Complete.
32
+ [2025-07-09 21:46:14,503][oumi][rank7][pid:1196056][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
33
+ [2025-07-09 21:46:14,503][oumi][rank7][pid:1196056][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.32 GB
34
+ [2025-07-09 21:46:14,503][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:521] Saving final state...
35
+ [2025-07-09 21:46:14,590][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:526] Saving final model...
36
+ [2025-07-09 21:46:14,591][oumi][rank7][pid:1196056][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
37
+ [2025-07-09 21:47:28,953][oumi][rank7][pid:1196056][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-09 21:47:32,602][oumi][rank7][pid:1196056][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
39
+ [2025-07-09 21:48:58,154][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:214]
40
 
41
  » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0ace3109f45472b8e455e628311825ac55b941a759bcfb3b8f60af343cc4b8f
3
+ size 4968243304
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b938b06fe81e84b3431ccc418f2c977e9cde81a76923429b7a0addcab0e830e3
3
+ size 4991495816
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f63c2b1eb4774528c039a90d9f6cb97ce1e2ccb5dbb078d32e6b64e6df39cb7
3
+ size 4932751040
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8b4dbc975b9c0281451a3a61daed283465565d20f3343f2a786c44c6631cf8f
3
+ size 1691924384
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
runs/Jul09_18-42-29_oumi-compute002/events.out.tfevents.1752086716.oumi-compute002.1196049.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e69fbc3d95d79cdeeeaceaf06ab1f6e42eb2ec84533cf923f22899e8351d14
3
+ size 16776
telemetry/telemetry_callback_metrics_rank0000.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "train_runtime": 66532.5206,
3
- "train_samples_per_second": 0.451,
4
- "train_steps_per_second": 0.056,
5
- "train_tokens_per_second": 70.398,
6
- "total_flos": 1.4113858770069094e+17,
7
- "train_loss": 0.21003443336486816,
8
- "epoch": 6.996268656716418,
9
- "num_input_tokens_seen": 24290780
10
  }
 
1
  {
2
+ "train_runtime": 10837.9895,
3
+ "train_samples_per_second": 0.923,
4
+ "train_steps_per_second": 0.115,
5
+ "train_tokens_per_second": 41.751,
6
+ "total_flos": 3.3689960431222784e+16,
7
+ "train_loss": 0.5481613777160644,
8
+ "epoch": 1.0,
9
+ "num_input_tokens_seen": 5798240
10
  }
telemetry/telemetry_callback_rank0000.json CHANGED
@@ -1,36 +1,36 @@
1
  {
2
- "hostname": "oumi-compute004",
3
- "total_time": 66595.79496112792,
4
  "timers": {
5
  "epochs": {
6
- "count": 7.0,
7
- "mean": 9503.914948322012,
8
- "median": 9508.863333210349,
9
- "std_dev": 24.585813027685713,
10
- "min": 9471.0654975418,
11
- "max": 9534.607692892198,
12
- "total": 66527.40463825408,
13
- "percentage": 99.89730534350741
14
  },
15
  "microsteps": {
16
- "count": 3748.0,
17
- "mean": 17.662540777144716,
18
- "median": 17.44136324687861,
19
- "std_dev": 1.5278314346422193,
20
- "min": 15.953362683299929,
21
- "max": 25.46622396213934,
22
- "total": 66199.2028327384,
23
- "percentage": 99.40447872328726
24
  },
25
  "steps": {
26
- "count": 3748.0,
27
- "mean": 17.662547419246746,
28
- "median": 17.441364587983117,
29
- "std_dev": 1.5278260400852177,
30
- "min": 15.953363878186792,
31
- "max": 25.46622501220554,
32
- "total": 66199.22772733681,
33
- "percentage": 99.40451610492437
34
  }
35
  },
36
  "cuda_timers": {},
 
1
  {
2
+ "hostname": "oumi-compute002",
3
+ "total_time": 11004.431441733148,
4
  "timers": {
5
  "epochs": {
6
+ "count": 1.0,
7
+ "mean": 10836.051646457054,
8
+ "median": 10836.051646457054,
9
+ "std_dev": 0,
10
+ "min": 10836.051646457054,
11
+ "max": 10836.051646457054,
12
+ "total": 10836.051646457054,
13
+ "percentage": 98.46989100556772
14
  },
15
  "microsteps": {
16
+ "count": 1248.0,
17
+ "mean": 8.3604591369908,
18
+ "median": 8.76854655193165,
19
+ "std_dev": 1.778935564684271,
20
+ "min": 3.7527454742230475,
21
+ "max": 17.169754883274436,
22
+ "total": 10433.853002964519,
23
+ "percentage": 94.81501209953682
24
  },
25
  "steps": {
26
+ "count": 1248.0,
27
+ "mean": 8.360481434502454,
28
+ "median": 8.768546879524365,
29
+ "std_dev": 1.778942205649326,
30
+ "min": 3.7527463790029287,
31
+ "max": 17.169755581766367,
32
+ "total": 10433.880830259062,
33
+ "percentage": 94.81526497306956
34
  }
35
  },
36
  "cuda_timers": {},
telemetry/telemetry_callback_wandb_rank0000.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "id": "p2r4ua3y",
3
  "name": "output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered",
4
- "url": "https://wandb.ai/nyu-dice-lab/huggingface/runs/p2r4ua3y"
5
  }
 
1
  {
2
+ "id": "kb5s58m1",
3
  "name": "output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered",
4
+ "url": "https://wandb.ai/nyu-dice-lab/huggingface/runs/kb5s58m1"
5
  }
telemetry/training_config.yaml CHANGED
@@ -59,13 +59,13 @@ model:
59
  tokenizer_pad_token: null
60
  tokenizer_kwargs: {}
61
  processor_kwargs: {}
62
- model_max_length: 10000
63
  load_pretrained_weights: true
64
  trust_remote_code: true
65
  torch_dtype_str: bfloat16
66
  compile: false
67
  chat_template: qwen2-vl-instruct
68
- attn_implementation: sdpa
69
  device_map: auto
70
  model_kwargs: {}
71
  enable_liger_kernel: false
@@ -82,7 +82,7 @@ training:
82
  per_device_train_batch_size: 1
83
  per_device_eval_batch_size: 8
84
  gradient_accumulation_steps: 1
85
- max_steps: 3750
86
  num_train_epochs: 5
87
  save_epoch: false
88
  save_steps: 0
@@ -103,7 +103,7 @@ training:
103
  remove_unused_columns: false
104
  repetition_penalty: 1.0
105
  use_vllm: false
106
- vllm_device: null
107
  vllm_gpu_memory_utilization: 0.9
108
  vllm_dtype: null
109
  vllm_max_model_len: null
@@ -138,14 +138,14 @@ training:
138
  log_model_summary: false
139
  resume_from_checkpoint: null
140
  try_resume_from_last_checkpoint: false
141
- dataloader_num_workers: 2
142
  dataloader_persistent_workers: false
143
- dataloader_prefetch_factor: 8
144
- dataloader_main_process_only: false
145
  ddp_find_unused_parameters: false
146
  max_grad_norm: 1.0
147
  trainer_kwargs:
148
- max_seq_length: 10000
149
  remove_unused_columns: false
150
  dataset_kwargs:
151
  skip_prepare_dataset: true
@@ -193,14 +193,14 @@ peft:
193
  peft_save_mode: ADAPTER_ONLY
194
  fsdp:
195
  enable_fsdp: true
196
- sharding_strategy: HYBRID_SHARD
197
  cpu_offload: false
198
- mixed_precision: bf16
199
  backward_prefetch: BACKWARD_PRE
200
  forward_prefetch: true
201
- use_orig_params: null
202
  state_dict_type: FULL_STATE_DICT
203
- auto_wrap_policy: SIZE_BASED_WRAP
204
  min_num_params: 100000
205
- transformer_layer_cls: null
206
  sync_module_states: true
 
59
  tokenizer_pad_token: null
60
  tokenizer_kwargs: {}
61
  processor_kwargs: {}
62
+ model_max_length: 16384
63
  load_pretrained_weights: true
64
  trust_remote_code: true
65
  torch_dtype_str: bfloat16
66
  compile: false
67
  chat_template: qwen2-vl-instruct
68
+ attn_implementation: flash_attention_2
69
  device_map: auto
70
  model_kwargs: {}
71
  enable_liger_kernel: false
 
82
  per_device_train_batch_size: 1
83
  per_device_eval_batch_size: 8
84
  gradient_accumulation_steps: 1
85
+ max_steps: 1250
86
  num_train_epochs: 5
87
  save_epoch: false
88
  save_steps: 0
 
103
  remove_unused_columns: false
104
  repetition_penalty: 1.0
105
  use_vllm: false
106
+ vllm_mode: null
107
  vllm_gpu_memory_utilization: 0.9
108
  vllm_dtype: null
109
  vllm_max_model_len: null
 
138
  log_model_summary: false
139
  resume_from_checkpoint: null
140
  try_resume_from_last_checkpoint: false
141
+ dataloader_num_workers: 64
142
  dataloader_persistent_workers: false
143
+ dataloader_prefetch_factor: 32
144
+ dataloader_main_process_only: null
145
  ddp_find_unused_parameters: false
146
  max_grad_norm: 1.0
147
  trainer_kwargs:
148
+ max_seq_length: 16384
149
  remove_unused_columns: false
150
  dataset_kwargs:
151
  skip_prepare_dataset: true
 
193
  peft_save_mode: ADAPTER_ONLY
194
  fsdp:
195
  enable_fsdp: true
196
+ sharding_strategy: FULL_SHARD
197
  cpu_offload: false
198
+ mixed_precision: null
199
  backward_prefetch: BACKWARD_PRE
200
  forward_prefetch: true
201
+ use_orig_params: true
202
  state_dict_type: FULL_STATE_DICT
203
+ auto_wrap_policy: TRANSFORMER_BASED_WRAP
204
  min_num_params: 100000
205
+ transformer_layer_cls: transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer
206
  sync_module_states: true
tokenizer_config.json CHANGED
@@ -200,7 +200,7 @@
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
- "model_max_length": 10000,
204
  "pad_token": "<|endoftext|>",
205
  "padding_side": "right",
206
  "processor_class": "Qwen2_5_VLProcessor",
 
200
  "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
+ "model_max_length": 16384,
204
  "pad_token": "<|endoftext|>",
205
  "padding_side": "right",
206
  "processor_class": "Qwen2_5_VLProcessor",
trainer_state.json CHANGED
@@ -2,779 +2,279 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 6.996268656716418,
6
  "eval_steps": 500,
7
- "global_step": 3750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.09328358208955224,
14
- "grad_norm": 2.4583044052124023,
15
- "learning_rate": 8.672566371681418e-06,
16
- "loss": 0.8247,
17
- "mean_token_accuracy": 0.8019651556015015,
18
- "num_input_tokens_seen": 323038,
19
- "num_tokens": 323038.0,
20
  "step": 50
21
  },
22
  {
23
- "epoch": 0.1865671641791045,
24
- "grad_norm": 2.1223530769348145,
25
- "learning_rate": 1.7522123893805313e-05,
26
- "loss": 0.6247,
27
- "mean_token_accuracy": 0.8224787962436676,
28
- "num_input_tokens_seen": 642936,
29
- "num_tokens": 642936.0,
30
  "step": 100
31
  },
32
  {
33
- "epoch": 0.2798507462686567,
34
- "grad_norm": 5.335413932800293,
35
- "learning_rate": 1.9995165482321775e-05,
36
- "loss": 0.6278,
37
- "mean_token_accuracy": 0.8260715854167938,
38
- "num_input_tokens_seen": 976988,
39
- "num_tokens": 976988.0,
40
  "step": 150
41
  },
42
  {
43
- "epoch": 0.373134328358209,
44
- "grad_norm": 5.00244140625,
45
- "learning_rate": 1.9972420885061576e-05,
46
- "loss": 0.6247,
47
- "mean_token_accuracy": 0.8278178679943085,
48
- "num_input_tokens_seen": 1303490,
49
- "num_tokens": 1303490.0,
50
  "step": 200
51
  },
52
  {
53
- "epoch": 0.4664179104477612,
54
- "grad_norm": 5.494905948638916,
55
- "learning_rate": 1.9931077431357095e-05,
56
- "loss": 0.649,
57
- "mean_token_accuracy": 0.8199629688262939,
58
- "num_input_tokens_seen": 1633598,
59
- "num_tokens": 1633598.0,
60
  "step": 250
61
  },
62
  {
63
- "epoch": 0.5597014925373134,
64
- "grad_norm": 5.7639923095703125,
65
- "learning_rate": 1.9871212227957962e-05,
66
- "loss": 0.6276,
67
- "mean_token_accuracy": 0.8245489084720612,
68
- "num_input_tokens_seen": 1959940,
69
- "num_tokens": 1959940.0,
70
  "step": 300
71
  },
72
  {
73
- "epoch": 0.6529850746268657,
74
- "grad_norm": 5.357442378997803,
75
- "learning_rate": 1.979293692521837e-05,
76
- "loss": 0.6308,
77
- "mean_token_accuracy": 0.8240770590305329,
78
- "num_input_tokens_seen": 2273744,
79
- "num_tokens": 2273744.0,
80
  "step": 350
81
  },
82
  {
83
- "epoch": 0.746268656716418,
84
- "grad_norm": 7.147578239440918,
85
- "learning_rate": 1.9696397508865917e-05,
86
- "loss": 0.6753,
87
- "mean_token_accuracy": 0.8150083267688751,
88
- "num_input_tokens_seen": 2594534,
89
- "num_tokens": 2594534.0,
90
  "step": 400
91
  },
92
  {
93
- "epoch": 0.8395522388059702,
94
- "grad_norm": 5.7040605545043945,
95
- "learning_rate": 1.9581774027733947e-05,
96
- "loss": 0.6221,
97
- "mean_token_accuracy": 0.8259347748756408,
98
- "num_input_tokens_seen": 2913982,
99
- "num_tokens": 2913982.0,
100
  "step": 450
101
  },
102
  {
103
- "epoch": 0.9328358208955224,
104
- "grad_norm": 5.201114654541016,
105
- "learning_rate": 1.944928025796521e-05,
106
- "loss": 0.6178,
107
- "mean_token_accuracy": 0.8253031682968139,
108
- "num_input_tokens_seen": 3233948,
109
- "num_tokens": 3233948.0,
110
  "step": 500
111
  },
112
  {
113
- "epoch": 1.0261194029850746,
114
- "grad_norm": 5.712339878082275,
115
- "learning_rate": 1.929916330431312e-05,
116
- "loss": 0.6321,
117
- "mean_token_accuracy": 0.8240388989448547,
118
- "num_input_tokens_seen": 3568746,
119
- "num_tokens": 3568746.0,
120
  "step": 550
121
  },
122
  {
123
- "epoch": 1.1194029850746268,
124
- "grad_norm": 3.397759199142456,
125
- "learning_rate": 1.9131703139284143e-05,
126
- "loss": 0.4505,
127
- "mean_token_accuracy": 0.8675171172618866,
128
- "num_input_tokens_seen": 3886940,
129
- "num_tokens": 3886940.0,
130
  "step": 600
131
  },
132
  {
133
- "epoch": 1.212686567164179,
134
- "grad_norm": 3.1189541816711426,
135
- "learning_rate": 1.894721208098092e-05,
136
- "loss": 0.3317,
137
- "mean_token_accuracy": 0.9006148743629455,
138
- "num_input_tokens_seen": 4206696,
139
- "num_tokens": 4206696.0,
140
  "step": 650
141
  },
142
  {
143
- "epoch": 1.3059701492537314,
144
- "grad_norm": 2.917012929916382,
145
- "learning_rate": 1.874603421061986e-05,
146
- "loss": 0.3288,
147
- "mean_token_accuracy": 0.900334278345108,
148
- "num_input_tokens_seen": 4538022,
149
- "num_tokens": 4538022.0,
150
  "step": 700
151
  },
152
  {
153
- "epoch": 1.3992537313432836,
154
- "grad_norm": 3.822690963745117,
155
- "learning_rate": 1.852854473080961e-05,
156
- "loss": 0.4135,
157
- "mean_token_accuracy": 0.8779956555366516,
158
- "num_input_tokens_seen": 4868192,
159
- "num_tokens": 4868192.0,
160
  "step": 750
161
  },
162
  {
163
- "epoch": 1.4925373134328357,
164
- "grad_norm": 3.944916009902954,
165
- "learning_rate": 1.8295149265787224e-05,
166
- "loss": 0.4413,
167
- "mean_token_accuracy": 0.8670336186885834,
168
- "num_input_tokens_seen": 5201704,
169
- "num_tokens": 5201704.0,
170
  "step": 800
171
  },
172
  {
173
- "epoch": 1.585820895522388,
174
- "grad_norm": 4.3602423667907715,
175
- "learning_rate": 1.8046283104917116e-05,
176
- "loss": 0.4167,
177
- "mean_token_accuracy": 0.87647913813591,
178
- "num_input_tokens_seen": 5521682,
179
- "num_tokens": 5521682.0,
180
  "step": 850
181
  },
182
  {
183
- "epoch": 1.6791044776119404,
184
- "grad_norm": 4.468040943145752,
185
- "learning_rate": 1.7782410390863664e-05,
186
- "loss": 0.4282,
187
- "mean_token_accuracy": 0.8733036065101624,
188
- "num_input_tokens_seen": 5840636,
189
- "num_tokens": 5840636.0,
190
  "step": 900
191
  },
192
  {
193
- "epoch": 1.7723880597014925,
194
- "grad_norm": 5.0916666984558105,
195
- "learning_rate": 1.750402325395156e-05,
196
- "loss": 0.422,
197
- "mean_token_accuracy": 0.8751583611965179,
198
- "num_input_tokens_seen": 6152288,
199
- "num_tokens": 6152288.0,
200
  "step": 950
201
  },
202
  {
203
- "epoch": 1.8656716417910446,
204
- "grad_norm": 3.583630084991455,
205
- "learning_rate": 1.7211640894328413e-05,
206
- "loss": 0.4014,
207
- "mean_token_accuracy": 0.8789325177669525,
208
- "num_input_tokens_seen": 6475382,
209
- "num_tokens": 6475382.0,
210
  "step": 1000
211
  },
212
  {
213
- "epoch": 1.9589552238805972,
214
- "grad_norm": 4.270807266235352,
215
- "learning_rate": 1.6905808613641233e-05,
216
- "loss": 0.4074,
217
- "mean_token_accuracy": 0.8787497889995575,
218
- "num_input_tokens_seen": 6799610,
219
- "num_tokens": 6799610.0,
220
  "step": 1050
221
  },
222
  {
223
- "epoch": 2.0522388059701493,
224
- "grad_norm": 3.991750717163086,
225
- "learning_rate": 1.6587096798032984e-05,
226
- "loss": 0.3727,
227
- "mean_token_accuracy": 0.8879767000675202,
228
- "num_input_tokens_seen": 7129112,
229
- "num_tokens": 7129112.0,
230
  "step": 1100
231
  },
232
  {
233
- "epoch": 2.1455223880597014,
234
- "grad_norm": 2.423408269882202,
235
- "learning_rate": 1.625609985435571e-05,
236
- "loss": 0.2241,
237
- "mean_token_accuracy": 0.9320108902454376,
238
- "num_input_tokens_seen": 7449332,
239
- "num_tokens": 7449332.0,
240
  "step": 1150
241
  },
242
  {
243
- "epoch": 2.2388059701492535,
244
- "grad_norm": 1.7369468212127686,
245
- "learning_rate": 1.59134351015844e-05,
246
- "loss": 0.1621,
247
- "mean_token_accuracy": 0.9507821369171142,
248
- "num_input_tokens_seen": 7773514,
249
- "num_tokens": 7773514.0,
250
  "step": 1200
251
  },
252
  {
253
- "epoch": 2.332089552238806,
254
- "grad_norm": 1.9339622259140015,
255
- "learning_rate": 1.555974161949906e-05,
256
- "loss": 0.174,
257
- "mean_token_accuracy": 0.9477302300930023,
258
- "num_input_tokens_seen": 8102664,
259
- "num_tokens": 8102664.0,
260
  "step": 1250
261
  },
262
  {
263
- "epoch": 2.425373134328358,
264
- "grad_norm": 2.86017107963562,
265
- "learning_rate": 1.519567905678223e-05,
266
- "loss": 0.2275,
267
- "mean_token_accuracy": 0.9304351592063904,
268
- "num_input_tokens_seen": 8432668,
269
- "num_tokens": 8432668.0,
270
- "step": 1300
271
- },
272
- {
273
- "epoch": 2.5186567164179103,
274
- "grad_norm": 2.300647735595703,
275
- "learning_rate": 1.4821926400754915e-05,
276
- "loss": 0.2328,
277
- "mean_token_accuracy": 0.9280073237419129,
278
- "num_input_tokens_seen": 8759670,
279
- "num_tokens": 8759670.0,
280
- "step": 1350
281
- },
282
- {
283
- "epoch": 2.611940298507463,
284
- "grad_norm": 2.8485522270202637,
285
- "learning_rate": 1.4439180711045395e-05,
286
- "loss": 0.2274,
287
- "mean_token_accuracy": 0.9305808675289154,
288
- "num_input_tokens_seen": 9083690,
289
- "num_tokens": 9083690.0,
290
- "step": 1400
291
- },
292
- {
293
- "epoch": 2.705223880597015,
294
- "grad_norm": 2.8772568702697754,
295
- "learning_rate": 1.4048155819552617e-05,
296
- "loss": 0.2385,
297
- "mean_token_accuracy": 0.9277240431308746,
298
- "num_input_tokens_seen": 9400762,
299
- "num_tokens": 9400762.0,
300
- "step": 1450
301
- },
302
- {
303
- "epoch": 2.798507462686567,
304
- "grad_norm": 3.3837602138519287,
305
- "learning_rate": 1.3649580999128871e-05,
306
- "loss": 0.2225,
307
- "mean_token_accuracy": 0.9319508814811707,
308
- "num_input_tokens_seen": 9719416,
309
- "num_tokens": 9719416.0,
310
- "step": 1500
311
- },
312
- {
313
- "epoch": 2.8917910447761193,
314
- "grad_norm": 3.256094217300415,
315
- "learning_rate": 1.3244199603464581e-05,
316
- "loss": 0.2307,
317
- "mean_token_accuracy": 0.9294045794010163,
318
- "num_input_tokens_seen": 10038208,
319
- "num_tokens": 10038208.0,
320
- "step": 1550
321
- },
322
- {
323
- "epoch": 2.9850746268656714,
324
- "grad_norm": 2.5858240127563477,
325
- "learning_rate": 1.2832767680711941e-05,
326
- "loss": 0.2196,
327
- "mean_token_accuracy": 0.9315053272247314,
328
- "num_input_tokens_seen": 10366444,
329
- "num_tokens": 10366444.0,
330
- "step": 1600
331
- },
332
- {
333
- "epoch": 3.078358208955224,
334
- "grad_norm": 1.8756778240203857,
335
- "learning_rate": 1.2416052563433043e-05,
336
- "loss": 0.19,
337
- "mean_token_accuracy": 0.9421073424816132,
338
- "num_input_tokens_seen": 10687314,
339
- "num_tokens": 10687314.0,
340
- "step": 1650
341
- },
342
- {
343
- "epoch": 3.171641791044776,
344
- "grad_norm": 0.8857208490371704,
345
- "learning_rate": 1.1994831437502172e-05,
346
- "loss": 0.1102,
347
- "mean_token_accuracy": 0.9673730087280273,
348
- "num_input_tokens_seen": 11009694,
349
- "num_tokens": 11009694.0,
350
- "step": 1700
351
- },
352
- {
353
- "epoch": 3.264925373134328,
354
- "grad_norm": 1.2341127395629883,
355
- "learning_rate": 1.1569889892631488e-05,
356
- "loss": 0.0797,
357
- "mean_token_accuracy": 0.9754443645477295,
358
- "num_input_tokens_seen": 11337744,
359
- "num_tokens": 11337744.0,
360
- "step": 1750
361
- },
362
- {
363
- "epoch": 3.3582089552238807,
364
- "grad_norm": 1.431805968284607,
365
- "learning_rate": 1.1142020457223195e-05,
366
- "loss": 0.0912,
367
- "mean_token_accuracy": 0.9724087870121002,
368
- "num_input_tokens_seen": 11665956,
369
- "num_tokens": 11665956.0,
370
- "step": 1800
371
- },
372
- {
373
- "epoch": 3.451492537313433,
374
- "grad_norm": 1.9885361194610596,
375
- "learning_rate": 1.0712021120280951e-05,
376
- "loss": 0.1156,
377
- "mean_token_accuracy": 0.9630080580711364,
378
- "num_input_tokens_seen": 12000080,
379
- "num_tokens": 12000080.0,
380
- "step": 1850
381
- },
382
- {
383
- "epoch": 3.544776119402985,
384
- "grad_norm": 1.1434038877487183,
385
- "learning_rate": 1.028069384313702e-05,
386
- "loss": 0.1109,
387
- "mean_token_accuracy": 0.9657398784160613,
388
- "num_input_tokens_seen": 12322044,
389
- "num_tokens": 12322044.0,
390
- "step": 1900
391
- },
392
- {
393
- "epoch": 3.638059701492537,
394
- "grad_norm": 1.1466773748397827,
395
- "learning_rate": 9.848843063770963e-06,
396
- "loss": 0.1098,
397
- "mean_token_accuracy": 0.966309015750885,
398
- "num_input_tokens_seen": 12642090,
399
- "num_tokens": 12642090.0,
400
- "step": 1950
401
- },
402
- {
403
- "epoch": 3.7313432835820897,
404
- "grad_norm": 1.5840574502944946,
405
- "learning_rate": 9.41727419650929e-06,
406
- "loss": 0.121,
407
- "mean_token_accuracy": 0.961807359457016,
408
- "num_input_tokens_seen": 12961610,
409
- "num_tokens": 12961610.0,
410
- "step": 2000
411
- },
412
- {
413
- "epoch": 3.824626865671642,
414
- "grad_norm": 1.5495262145996094,
415
- "learning_rate": 8.986792129904186e-06,
416
- "loss": 0.1102,
417
- "mean_token_accuracy": 0.9657205975055695,
418
- "num_input_tokens_seen": 13279862,
419
- "num_tokens": 13279862.0,
420
- "step": 2050
421
- },
422
- {
423
- "epoch": 3.917910447761194,
424
- "grad_norm": 1.7342835664749146,
425
- "learning_rate": 8.558199725592856e-06,
426
- "loss": 0.1156,
427
- "mean_token_accuracy": 0.9632673525810241,
428
- "num_input_tokens_seen": 13597816,
429
- "num_tokens": 13597816.0,
430
- "step": 2100
431
- },
432
- {
433
- "epoch": 4.0111940298507465,
434
- "grad_norm": 2.0673277378082275,
435
- "learning_rate": 8.132296320937085e-06,
436
- "loss": 0.118,
437
- "mean_token_accuracy": 0.9625415456295013,
438
- "num_input_tokens_seen": 13928436,
439
- "num_tokens": 13928436.0,
440
- "step": 2150
441
- },
442
- {
443
- "epoch": 4.104477611940299,
444
- "grad_norm": 1.0974746942520142,
445
- "learning_rate": 7.709876238235702e-06,
446
- "loss": 0.0839,
447
- "mean_token_accuracy": 0.9738617813587189,
448
- "num_input_tokens_seen": 14249072,
449
- "num_tokens": 14249072.0,
450
- "step": 2200
451
- },
452
- {
453
- "epoch": 4.197761194029851,
454
- "grad_norm": 1.0228750705718994,
455
- "learning_rate": 7.29172730329028e-06,
456
- "loss": 0.0498,
457
- "mean_token_accuracy": 0.9851219677925109,
458
- "num_input_tokens_seen": 14571152,
459
- "num_tokens": 14571152.0,
460
- "step": 2250
461
- },
462
- {
463
- "epoch": 4.291044776119403,
464
- "grad_norm": 0.6385033130645752,
465
- "learning_rate": 6.8786293760869695e-06,
466
- "loss": 0.0388,
467
- "mean_token_accuracy": 0.9884346830844879,
468
- "num_input_tokens_seen": 14903168,
469
- "num_tokens": 14903168.0,
470
- "step": 2300
471
- },
472
- {
473
- "epoch": 4.384328358208955,
474
- "grad_norm": 0.8694852590560913,
475
- "learning_rate": 6.4713528963348506e-06,
476
- "loss": 0.0411,
477
- "mean_token_accuracy": 0.9874970281124115,
478
- "num_input_tokens_seen": 15233224,
479
- "num_tokens": 15233224.0,
480
- "step": 2350
481
- },
482
- {
483
- "epoch": 4.477611940298507,
484
- "grad_norm": 0.929122805595398,
485
- "learning_rate": 6.070657446573347e-06,
486
- "loss": 0.0476,
487
- "mean_token_accuracy": 0.985052285194397,
488
- "num_input_tokens_seen": 15565618,
489
- "num_tokens": 15565618.0,
490
- "step": 2400
491
- },
492
- {
493
- "epoch": 4.57089552238806,
494
- "grad_norm": 0.6402145624160767,
495
- "learning_rate": 5.677290335528576e-06,
496
- "loss": 0.0694,
497
- "mean_token_accuracy": 0.9864287662506104,
498
- "num_input_tokens_seen": 15886728,
499
- "num_tokens": 15886728.0,
500
- "step": 2450
501
- },
502
- {
503
- "epoch": 4.664179104477612,
504
- "grad_norm": 0.9010692238807678,
505
- "learning_rate": 5.291985204360754e-06,
506
- "loss": 0.0424,
507
- "mean_token_accuracy": 0.9870220470428467,
508
- "num_input_tokens_seen": 16200844,
509
- "num_tokens": 16200844.0,
510
- "step": 2500
511
- },
512
- {
513
- "epoch": 4.757462686567164,
514
- "grad_norm": 0.8838453888893127,
515
- "learning_rate": 4.9154606584019646e-06,
516
- "loss": 0.0433,
517
- "mean_token_accuracy": 0.9861760056018829,
518
- "num_input_tokens_seen": 16518546,
519
- "num_tokens": 16518546.0,
520
- "step": 2550
521
- },
522
- {
523
- "epoch": 4.850746268656716,
524
- "grad_norm": 1.0508785247802734,
525
- "learning_rate": 4.548418926936235e-06,
526
- "loss": 0.0413,
527
- "mean_token_accuracy": 0.9870886874198913,
528
- "num_input_tokens_seen": 16837904,
529
- "num_tokens": 16837904.0,
530
- "step": 2600
531
- },
532
- {
533
- "epoch": 4.9440298507462686,
534
- "grad_norm": 1.0464740991592407,
535
- "learning_rate": 4.191544553521355e-06,
536
- "loss": 0.0428,
537
- "mean_token_accuracy": 0.9860042917728424,
538
- "num_input_tokens_seen": 17162930,
539
- "num_tokens": 17162930.0,
540
- "step": 2650
541
- },
542
- {
543
- "epoch": 5.037313432835821,
544
- "grad_norm": 1.167823076248169,
545
- "learning_rate": 3.845503119295182e-06,
546
- "loss": 0.0407,
547
- "mean_token_accuracy": 0.9872146189212799,
548
- "num_input_tokens_seen": 17492580,
549
- "num_tokens": 17492580.0,
550
- "step": 2700
551
- },
552
- {
553
- "epoch": 5.130597014925373,
554
- "grad_norm": 0.6037238836288452,
555
- "learning_rate": 3.5109400016473338e-06,
556
- "loss": 0.0259,
557
- "mean_token_accuracy": 0.9919403278827668,
558
- "num_input_tokens_seen": 17814314,
559
- "num_tokens": 17814314.0,
560
- "step": 2750
561
- },
562
- {
563
- "epoch": 5.223880597014926,
564
- "grad_norm": 0.4187396168708801,
565
- "learning_rate": 3.1884791705714936e-06,
566
- "loss": 0.0157,
567
- "mean_token_accuracy": 0.9955093479156494,
568
- "num_input_tokens_seen": 18139326,
569
- "num_tokens": 18139326.0,
570
- "step": 2800
571
- },
572
- {
573
- "epoch": 5.317164179104478,
574
- "grad_norm": 0.46902546286582947,
575
- "learning_rate": 2.878722024943139e-06,
576
- "loss": 0.0139,
577
- "mean_token_accuracy": 0.995677514076233,
578
- "num_input_tokens_seen": 18466486,
579
- "num_tokens": 18466486.0,
580
- "step": 2850
581
- },
582
- {
583
- "epoch": 5.41044776119403,
584
- "grad_norm": 0.3824739456176758,
585
- "learning_rate": 2.5822462708930607e-06,
586
- "loss": 0.0146,
587
- "mean_token_accuracy": 0.9957832169532775,
588
- "num_input_tokens_seen": 18796326,
589
- "num_tokens": 18796326.0,
590
- "step": 2900
591
- },
592
- {
593
- "epoch": 5.503731343283582,
594
- "grad_norm": 0.5814462304115295,
595
- "learning_rate": 2.299604844368547e-06,
596
- "loss": 0.0156,
597
- "mean_token_accuracy": 0.9954328262805938,
598
- "num_input_tokens_seen": 19129954,
599
- "num_tokens": 19129954.0,
600
- "step": 2950
601
- },
602
- {
603
- "epoch": 5.597014925373134,
604
- "grad_norm": 0.45167264342308044,
605
- "learning_rate": 2.031324879891664e-06,
606
- "loss": 0.013,
607
- "mean_token_accuracy": 0.9963365316390991,
608
- "num_input_tokens_seen": 19448322,
609
- "num_tokens": 19448322.0,
610
- "step": 3000
611
- },
612
- {
613
- "epoch": 5.690298507462686,
614
- "grad_norm": 0.5698373913764954,
615
- "learning_rate": 1.777906727437979e-06,
616
- "loss": 0.0127,
617
- "mean_token_accuracy": 0.9959760665893554,
618
- "num_input_tokens_seen": 19765536,
619
- "num_tokens": 19765536.0,
620
- "step": 3050
621
- },
622
- {
623
- "epoch": 5.7835820895522385,
624
- "grad_norm": 0.5558890104293823,
625
- "learning_rate": 1.5398230192692275e-06,
626
- "loss": 0.0117,
627
- "mean_token_accuracy": 0.9963926291465759,
628
- "num_input_tokens_seen": 20082350,
629
- "num_tokens": 20082350.0,
630
- "step": 3100
631
- },
632
- {
633
- "epoch": 5.8768656716417915,
634
- "grad_norm": 0.46126776933670044,
635
- "learning_rate": 1.3175177884603252e-06,
636
- "loss": 0.0112,
637
- "mean_token_accuracy": 0.9968423485755921,
638
- "num_input_tokens_seen": 20401204,
639
- "num_tokens": 20401204.0,
640
- "step": 3150
641
- },
642
- {
643
- "epoch": 5.970149253731344,
644
- "grad_norm": 0.4649079144001007,
645
- "learning_rate": 1.1114056407647045e-06,
646
- "loss": 0.0111,
647
- "mean_token_accuracy": 0.9965043890476227,
648
- "num_input_tokens_seen": 20730370,
649
- "num_tokens": 20730370.0,
650
- "step": 3200
651
- },
652
- {
653
- "epoch": 6.063432835820896,
654
- "grad_norm": 0.49266737699508667,
655
- "learning_rate": 9.218709813624749e-07,
656
- "loss": 0.012,
657
- "mean_token_accuracy": 0.9967284095287323,
658
- "num_input_tokens_seen": 21052760,
659
- "num_tokens": 21052760.0,
660
- "step": 3250
661
- },
662
- {
663
- "epoch": 6.156716417910448,
664
- "grad_norm": 0.38067445158958435,
665
- "learning_rate": 7.492672979335147e-07,
666
- "loss": 0.0069,
667
- "mean_token_accuracy": 0.998069132566452,
668
- "num_input_tokens_seen": 21373624,
669
- "num_tokens": 21373624.0,
670
- "step": 3300
671
- },
672
- {
673
- "epoch": 6.25,
674
- "grad_norm": 0.43708378076553345,
675
- "learning_rate": 5.939165013926195e-07,
676
- "loss": 0.0054,
677
- "mean_token_accuracy": 0.9985744786262513,
678
- "num_input_tokens_seen": 21701456,
679
- "num_tokens": 21701456.0,
680
- "step": 3350
681
- },
682
- {
683
- "epoch": 6.343283582089552,
684
- "grad_norm": 0.5964256525039673,
685
- "learning_rate": 4.56108325516238e-07,
686
- "loss": 0.0066,
687
- "mean_token_accuracy": 0.9981687092781066,
688
- "num_input_tokens_seen": 22028186,
689
- "num_tokens": 22028186.0,
690
- "step": 3400
691
- },
692
- {
693
- "epoch": 6.436567164179104,
694
- "grad_norm": 0.5558670163154602,
695
- "learning_rate": 3.3609978658051045e-07,
696
- "loss": 0.0065,
697
- "mean_token_accuracy": 0.9981926000118255,
698
- "num_input_tokens_seen": 22357840,
699
- "num_tokens": 22357840.0,
700
- "step": 3450
701
- },
702
- {
703
- "epoch": 6.529850746268656,
704
- "grad_norm": 0.5026367902755737,
705
- "learning_rate": 2.341147040184011e-07,
706
- "loss": 0.0052,
707
- "mean_token_accuracy": 0.9985788524150848,
708
- "num_input_tokens_seen": 22688738,
709
- "num_tokens": 22688738.0,
710
- "step": 3500
711
- },
712
- {
713
- "epoch": 6.6231343283582085,
714
- "grad_norm": 0.2897884249687195,
715
- "learning_rate": 1.5034328298990652e-07,
716
- "loss": 0.0046,
717
- "mean_token_accuracy": 0.9987975347042084,
718
- "num_input_tokens_seen": 23010742,
719
- "num_tokens": 23010742.0,
720
- "step": 3550
721
- },
722
- {
723
- "epoch": 6.7164179104477615,
724
- "grad_norm": 0.43537065386772156,
725
- "learning_rate": 8.494175964388285e-08,
726
- "loss": 0.0039,
727
- "mean_token_accuracy": 0.9988950288295746,
728
- "num_input_tokens_seen": 23329282,
729
- "num_tokens": 23329282.0,
730
- "step": 3600
731
- },
732
- {
733
- "epoch": 6.809701492537314,
734
- "grad_norm": 0.36725738644599915,
735
- "learning_rate": 3.803210973305715e-08,
736
- "loss": 0.003,
737
- "mean_token_accuracy": 0.9991644871234894,
738
- "num_input_tokens_seen": 23647522,
739
- "num_tokens": 23647522.0,
740
- "step": 3650
741
- },
742
- {
743
- "epoch": 6.902985074626866,
744
- "grad_norm": 0.531343400478363,
745
- "learning_rate": 9.7018211256783e-09,
746
- "loss": 0.0046,
747
- "mean_token_accuracy": 0.9987628531455993,
748
- "num_input_tokens_seen": 23963558,
749
- "num_tokens": 23963558.0,
750
- "step": 3700
751
- },
752
- {
753
- "epoch": 6.996268656716418,
754
- "grad_norm": 0.6237491369247437,
755
- "learning_rate": 3.7306380940016486e-12,
756
- "loss": 0.0051,
757
- "mean_token_accuracy": 0.9986482226848602,
758
- "num_input_tokens_seen": 24290780,
759
- "num_tokens": 24290780.0,
760
- "step": 3750
761
- },
762
- {
763
- "epoch": 6.996268656716418,
764
- "num_input_tokens_seen": 24290780,
765
- "step": 3750,
766
- "total_flos": 1.4113858770069094e+17,
767
- "train_loss": 0.21003443336486816,
768
- "train_runtime": 66532.5206,
769
- "train_samples_per_second": 0.451,
770
- "train_steps_per_second": 0.056,
771
- "train_tokens_per_second": 70.398
772
  }
773
  ],
774
  "logging_steps": 50,
775
- "max_steps": 3750,
776
- "num_input_tokens_seen": 24290780,
777
- "num_train_epochs": 7,
778
  "save_steps": 0,
779
  "stateful_callbacks": {
780
  "TrainerControl": {
@@ -788,7 +288,7 @@
788
  "attributes": {}
789
  }
790
  },
791
- "total_flos": 1.4113858770069094e+17,
792
  "train_batch_size": 1,
793
  "trial_name": null,
794
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 1250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.04,
14
+ "grad_norm": 16.125,
15
+ "learning_rate": 1.9995935375248608e-05,
16
+ "loss": 0.6783,
17
+ "mean_token_accuracy": 0.861860990524292,
18
+ "num_input_tokens_seen": 216880,
19
+ "num_tokens": 216880.0,
20
  "step": 50
21
  },
22
  {
23
+ "epoch": 0.08,
24
+ "grad_norm": 9.0625,
25
+ "learning_rate": 1.98752561390399e-05,
26
+ "loss": 0.3562,
27
+ "mean_token_accuracy": 0.9200676095485687,
28
+ "num_input_tokens_seen": 462464,
29
+ "num_tokens": 462464.0,
30
  "step": 100
31
  },
32
  {
33
+ "epoch": 0.12,
34
+ "grad_norm": 0.1494140625,
35
+ "learning_rate": 1.9588933215113926e-05,
36
+ "loss": 0.2534,
37
+ "mean_token_accuracy": 0.9416316163539886,
38
+ "num_input_tokens_seen": 724752,
39
+ "num_tokens": 724752.0,
40
  "step": 150
41
  },
42
  {
43
+ "epoch": 0.16,
44
+ "grad_norm": 0.921875,
45
+ "learning_rate": 1.9141769272315857e-05,
46
+ "loss": 0.203,
47
+ "mean_token_accuracy": 0.9502561473846436,
48
+ "num_input_tokens_seen": 895104,
49
+ "num_tokens": 895104.0,
50
  "step": 200
51
  },
52
  {
53
+ "epoch": 0.2,
54
+ "grad_norm": 22.875,
55
+ "learning_rate": 1.8541264863892755e-05,
56
+ "loss": 0.258,
57
+ "mean_token_accuracy": 0.939533520936966,
58
+ "num_input_tokens_seen": 1160016,
59
+ "num_tokens": 1160016.0,
60
  "step": 250
61
  },
62
  {
63
+ "epoch": 0.24,
64
+ "grad_norm": 7.90625,
65
+ "learning_rate": 1.7797492616144256e-05,
66
+ "loss": 0.277,
67
+ "mean_token_accuracy": 0.932598946094513,
68
+ "num_input_tokens_seen": 1447552,
69
+ "num_tokens": 1447552.0,
70
  "step": 300
71
  },
72
  {
73
+ "epoch": 0.28,
74
+ "grad_norm": 0.2470703125,
75
+ "learning_rate": 1.6922928274124887e-05,
76
+ "loss": 0.2399,
77
+ "mean_token_accuracy": 0.943250241279602,
78
+ "num_input_tokens_seen": 1751760,
79
+ "num_tokens": 1751760.0,
80
  "step": 350
81
  },
82
  {
83
+ "epoch": 0.32,
84
+ "grad_norm": 0.060791015625,
85
+ "learning_rate": 1.593224143837142e-05,
86
+ "loss": 0.2545,
87
+ "mean_token_accuracy": 0.9372260522842407,
88
+ "num_input_tokens_seen": 2040064,
89
+ "num_tokens": 2040064.0,
90
  "step": 400
91
  },
92
  {
93
+ "epoch": 0.36,
94
+ "grad_norm": 28.625,
95
+ "learning_rate": 1.484204950275565e-05,
96
+ "loss": 0.2971,
97
+ "mean_token_accuracy": 0.9284321248531342,
98
+ "num_input_tokens_seen": 2226672,
99
+ "num_tokens": 2226672.0,
100
  "step": 450
101
  },
102
  {
103
+ "epoch": 0.4,
104
+ "grad_norm": 14.5,
105
+ "learning_rate": 1.36706389208128e-05,
106
+ "loss": 0.3084,
107
+ "mean_token_accuracy": 0.9283360493183136,
108
+ "num_input_tokens_seen": 2481312,
109
+ "num_tokens": 2481312.0,
110
  "step": 500
111
  },
112
  {
113
+ "epoch": 0.44,
114
+ "grad_norm": 4.625,
115
+ "learning_rate": 1.2437658475915378e-05,
116
+ "loss": 0.285,
117
+ "mean_token_accuracy": 0.9289000463485718,
118
+ "num_input_tokens_seen": 2681408,
119
+ "num_tokens": 2681408.0,
120
  "step": 550
121
  },
122
  {
123
+ "epoch": 0.48,
124
+ "grad_norm": 0.2138671875,
125
+ "learning_rate": 1.1163789700258656e-05,
126
+ "loss": 0.3252,
127
+ "mean_token_accuracy": 0.9217388617992401,
128
+ "num_input_tokens_seen": 2908288,
129
+ "num_tokens": 2908288.0,
130
  "step": 600
131
  },
132
  {
133
+ "epoch": 0.52,
134
+ "grad_norm": 37.25,
135
+ "learning_rate": 9.870399970920932e-06,
136
+ "loss": 0.4363,
137
+ "mean_token_accuracy": 0.8960465312004089,
138
+ "num_input_tokens_seen": 3073056,
139
+ "num_tokens": 3073056.0,
140
  "step": 650
141
  },
142
  {
143
+ "epoch": 0.56,
144
+ "grad_norm": 26.375,
145
+ "learning_rate": 8.579184101829734e-06,
146
+ "loss": 0.3453,
147
+ "mean_token_accuracy": 0.9128145408630371,
148
+ "num_input_tokens_seen": 3346848,
149
+ "num_tokens": 3346848.0,
150
  "step": 700
151
  },
152
  {
153
+ "epoch": 0.6,
154
+ "grad_norm": 29.25,
155
+ "learning_rate": 7.311800443430251e-06,
156
+ "loss": 0.5762,
157
+ "mean_token_accuracy": 0.8654252612590789,
158
+ "num_input_tokens_seen": 3611920,
159
+ "num_tokens": 3611920.0,
160
  "step": 750
161
  },
162
  {
163
+ "epoch": 0.64,
164
+ "grad_norm": 7.96875,
165
+ "learning_rate": 6.0895075939779705e-06,
166
+ "loss": 0.4856,
167
+ "mean_token_accuracy": 0.8799186503887176,
168
+ "num_input_tokens_seen": 3815936,
169
+ "num_tokens": 3815936.0,
170
  "step": 800
171
  },
172
  {
173
+ "epoch": 0.68,
174
+ "grad_norm": 77.5,
175
+ "learning_rate": 4.932807816118347e-06,
176
+ "loss": 0.536,
177
+ "mean_token_accuracy": 0.8713834238052368,
178
+ "num_input_tokens_seen": 4003536,
179
+ "num_tokens": 4003536.0,
180
  "step": 850
181
  },
182
  {
183
+ "epoch": 0.72,
184
+ "grad_norm": 36.5,
185
+ "learning_rate": 3.861103139944448e-06,
186
+ "loss": 0.6726,
187
+ "mean_token_accuracy": 0.8307892644405365,
188
+ "num_input_tokens_seen": 4219808,
189
+ "num_tokens": 4219808.0,
190
  "step": 900
191
  },
192
  {
193
+ "epoch": 0.76,
194
+ "grad_norm": 38.0,
195
+ "learning_rate": 2.8923699209255285e-06,
196
+ "loss": 0.7238,
197
+ "mean_token_accuracy": 0.8186173605918884,
198
+ "num_input_tokens_seen": 4497936,
199
+ "num_tokens": 4497936.0,
200
  "step": 950
201
  },
202
  {
203
+ "epoch": 0.8,
204
+ "grad_norm": 41.0,
205
+ "learning_rate": 2.0428573115446394e-06,
206
+ "loss": 0.8584,
207
+ "mean_token_accuracy": 0.7786743235588074,
208
+ "num_input_tokens_seen": 4729216,
209
+ "num_tokens": 4729216.0,
210
  "step": 1000
211
  },
212
  {
213
+ "epoch": 0.84,
214
+ "grad_norm": 47.5,
215
+ "learning_rate": 1.326814704364262e-06,
216
+ "loss": 1.0424,
217
+ "mean_token_accuracy": 0.7482965195178986,
218
+ "num_input_tokens_seen": 4953984,
219
+ "num_tokens": 4953984.0,
220
  "step": 1050
221
  },
222
  {
223
+ "epoch": 0.88,
224
+ "grad_norm": 57.0,
225
+ "learning_rate": 7.562527182833978e-07,
226
+ "loss": 1.0274,
227
+ "mean_token_accuracy": 0.7586082279682159,
228
+ "num_input_tokens_seen": 5136224,
229
+ "num_tokens": 5136224.0,
230
  "step": 1100
231
  },
232
  {
233
+ "epoch": 0.92,
234
+ "grad_norm": 64.0,
235
+ "learning_rate": 3.4074173710931804e-07,
236
+ "loss": 1.1236,
237
+ "mean_token_accuracy": 0.7324086999893189,
238
+ "num_input_tokens_seen": 5339136,
239
+ "num_tokens": 5339136.0,
240
  "step": 1150
241
  },
242
  {
243
+ "epoch": 0.96,
244
+ "grad_norm": 103.0,
245
+ "learning_rate": 8.725137967920739e-08,
246
+ "loss": 1.1653,
247
+ "mean_token_accuracy": 0.7305689966678619,
248
+ "num_input_tokens_seen": 5558144,
249
+ "num_tokens": 5558144.0,
250
  "step": 1200
251
  },
252
  {
253
+ "epoch": 1.0,
254
+ "grad_norm": 53.0,
255
+ "learning_rate": 3.3594197175190743e-11,
256
+ "loss": 0.9751,
257
+ "mean_token_accuracy": 0.7792558777332306,
258
+ "num_input_tokens_seen": 5798240,
259
+ "num_tokens": 5798240.0,
260
  "step": 1250
261
  },
262
  {
263
+ "epoch": 1.0,
264
+ "num_input_tokens_seen": 5798240,
265
+ "step": 1250,
266
+ "total_flos": 3.3689960431222784e+16,
267
+ "train_loss": 0.5481613777160644,
268
+ "train_runtime": 10837.9895,
269
+ "train_samples_per_second": 0.923,
270
+ "train_steps_per_second": 0.115,
271
+ "train_tokens_per_second": 41.751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  }
273
  ],
274
  "logging_steps": 50,
275
+ "max_steps": 1250,
276
+ "num_input_tokens_seen": 5798240,
277
+ "num_train_epochs": 1,
278
  "save_steps": 0,
279
  "stateful_callbacks": {
280
  "TrainerControl": {
 
288
  "attributes": {}
289
  }
290
  },
291
+ "total_flos": 3.3689960431222784e+16,
292
  "train_batch_size": 1,
293
  "trial_name": null,
294
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ceac200286f0cf175e77ab1763672204503403d9d5009bfa55db268c97ee492
3
  size 6161
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf29339948365a32eba15c3574185bfe9655ed282514047a80622f117445ebf1
3
  size 6161