penfever committed on Jul 10, 2025

Commit

3b8148a

verified ·

1 Parent(s): 05ddcd3

Add files using upload-large-folder tool

Browse files

Files changed (22) hide show

config.json +1 -1
logs/rank_0000.log +55 -60
logs/rank_0001.log +30 -29
logs/rank_0002.log +30 -29
logs/rank_0003.log +30 -29
logs/rank_0004.log +30 -29
logs/rank_0005.log +30 -29
logs/rank_0006.log +30 -29
logs/rank_0007.log +30 -29
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +0 -0
runs/Jul09_18-42-29_oumi-compute002/events.out.tfevents.1752086716.oumi-compute002.1196049.0 +3 -0
telemetry/telemetry_callback_metrics_rank0000.json +8 -8
telemetry/telemetry_callback_rank0000.json +26 -26
telemetry/telemetry_callback_wandb_rank0000.json +2 -2
telemetry/training_config.yaml +13 -13
tokenizer_config.json +1 -1
trainer_state.json +190 -690
training_args.bin +1 -1

config.json CHANGED Viewed

@@ -29,7 +29,7 @@
   "rope_theta": 1000000.0,
   "sliding_window": 32768,
   "tie_word_embeddings": false,
-  "torch_dtype": "float32",
   "transformers_version": "4.51.3",
   "use_cache": false,
   "use_sliding_window": false,

   "rope_theta": 1000000.0,
   "sliding_window": 32768,
   "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.51.3",
   "use_cache": false,
   "use_sliding_window": false,

logs/rank_0000.log CHANGED Viewed

@@ -1,4 +1,4 @@
-[2025-07-04 10:07:00,006][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:283] TrainingConfig:
 TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='hf_vision',
                                                                                 dataset_path=None,
                                                                                 subset=None,
@@ -52,13 +52,13 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
                                  tokenizer_pad_token=None,
                                  tokenizer_kwargs={},
                                  processor_kwargs={},
-                                 model_max_length=10000,
                                  load_pretrained_weights=True,
                                  trust_remote_code=True,
                                  torch_dtype_str='bfloat16',
                                  compile=False,
                                  chat_template='qwen2-vl-instruct',
-                                 attn_implementation='sdpa',
                                  device_map='auto',
                                  model_kwargs={},
                                  enable_liger_kernel=False,
@@ -73,7 +73,7 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
                                        per_device_train_batch_size=1,
                                        per_device_eval_batch_size=8,
                                        gradient_accumulation_steps=1,
-                                       max_steps=3750,
                                        num_train_epochs=5,
                                        save_epoch=False,
                                        save_steps=0,
@@ -93,7 +93,7 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
                                                        remove_unused_columns=False,
                                                        repetition_penalty=1.0,
                                                        use_vllm=False,
-                                                       vllm_device=None,
                                                        vllm_gpu_memory_utilization=0.9,
                                                        vllm_dtype=None,
                                                        vllm_max_model_len=None,
@@ -128,14 +128,14 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
                                        log_model_summary=False,
                                        resume_from_checkpoint=None,
                                        try_resume_from_last_checkpoint=False,
-                                       dataloader_num_workers=2,
                                        dataloader_persistent_workers=False,
-                                       dataloader_prefetch_factor=8,
-                                       dataloader_main_process_only=False,
                                        ddp_find_unused_parameters=False,
                                        max_grad_norm=1.0,
                                        trainer_kwargs={'dataset_kwargs': {'skip_prepare_dataset': True},
-                                                       'max_seq_length': 10000,
                                                        'remove_unused_columns': False},
                                        verl_config_overrides={},
                                        profiler=ProfilerParams(save_dir=None,
@@ -176,51 +176,52 @@ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(
                                bnb_4bit_compute_dtype='float32',
                                peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
                fsdp=FSDPParams(enable_fsdp=True,
-                               sharding_strategy=<ShardingStrategy.HYBRID_SHARD: 'HYBRID_SHARD'>,
                                cpu_offload=False,
-                               mixed_precision='bf16',
                                backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
                                forward_prefetch=True,
-                               use_orig_params=None,
                                state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
-                               auto_wrap_policy=<AutoWrapPolicy.SIZE_BASED_WRAP: 'SIZE_BASED_WRAP'>,
                                min_num_params=100000,
-                               transformer_layer_cls=None,
                                sync_module_states=True))
-[2025-07-04 10:07:00,227][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:07:02,023][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:03,883][oumi][rank0][pid:2619050][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:03,883][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,192][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,950][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,068][oumi][rank0][pid:2619050][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0)
-[2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0))...
-[2025-07-04 10:07:21,105][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:23,607][oumi][rank0][pid:2619050][MainThread][INFO]][torch_utils.py:289]
 Model Parameters Summary:
 🔢 Total     parameters: 8,292,166,656
 🔗 Embedding parameters: 544,997,376
 🎯 Trainable parameters: 8,292,166,656
 🔒 Frozen    parameters: 0 (0.00%)
-[2025-07-04 10:07:24,853][oumi][rank0][pid:2619050][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,853][oumi][rank0][pid:2619050][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:25,176][oumi][rank0][pid:2619050][MainThread][INFO]][training.py:62] SFTConfig(output_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
           overwrite_output_dir=False,
           do_train=False,
           do_eval=False,
@@ -242,7 +243,7 @@ Model Parameters Summary:
           adam_epsilon=1e-08,
           max_grad_norm=1.0,
           num_train_epochs=5,
-          max_steps=3750,
           lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>,
           lr_scheduler_kwargs={},
           warmup_ratio=0.03,
@@ -250,7 +251,7 @@ Model Parameters Summary:
           log_level='warning',
           log_level_replica='warning',
           log_on_each_node=True,
-          logging_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/runs/Jul04_10-07-24_oumi-compute004',
           logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
           logging_first_step=False,
           logging_steps=50,
@@ -283,8 +284,8 @@ Model Parameters Summary:
           debug=[],
           dataloader_drop_last=False,
           eval_steps=500,
-          dataloader_num_workers=2,
-          dataloader_prefetch_factor=8,
           past_index=-1,
           run_name='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
           disable_tqdm=False,
@@ -303,7 +304,7 @@ Model Parameters Summary:
           tp_size=0,
           fsdp_transformer_layer_cls_to_wrap=None,
           accelerator_config=AcceleratorConfig(split_batches=False,
-                                               dispatch_batches=False,
                                                even_batches=True,
                                                use_seedable_sampler=True,
                                                non_blocking=False,
@@ -363,34 +364,28 @@ Model Parameters Summary:
           dataset_kwargs={'skip_prepare_dataset': True},
           dataset_num_proc=None,
           pad_token=None,
-          max_length=10000,
           packing=False,
           padding_free=False,
           eval_packing=None,
           dataset_batch_size=None,
           num_of_sequences=None,
           chars_per_token=None,
-          max_seq_length=10000,
           use_liger=None)
-[2025-07-04 10:07:25,232][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.434, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,287][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:510] Training init time: 29.326s
-[2025-07-04 10:07:25,287][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-04 12:47:15,514][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=193.085, power_limit_watts=700.0, gpu_utilization=11, memory_utilization=2, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 15:25:44,400][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=187.86, power_limit_watts=700.0, gpu_utilization=32, memory_utilization=9, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 18:04:39,027][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=188.725, power_limit_watts=700.0, gpu_utilization=49, memory_utilization=3, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 20:43:20,054][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=193.78, power_limit_watts=700.0, gpu_utilization=35, memory_utilization=2, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 23:21:14,672][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=191.45000000000002, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 01:59:05,749][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=35, fan_speed=None, fan_speeds=None, power_usage_watts=187.851, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:20,509][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=191.013, power_limit_watts=700.0, gpu_utilization=36, memory_utilization=3, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:20,705][oumi][rank0][pid:2619050][MainThread][INFO]][telemetry_callback.py:242] Saving telemetry callback summary to output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/telemetry/telemetry_callback_rank0000.json...
-[2025-07-05 04:37:21,418][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,441][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,442][oumi][rank0][pid:2619050][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.37 GB
-[2025-07-05 04:37:21,442][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,451][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,454][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:43:01,650][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:05,405][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:07,241][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:07,302][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:279] TrainingConfig:
 TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='hf_vision',
                                                                                 dataset_path=None,
                                                                                 subset=None,
                                  tokenizer_pad_token=None,
                                  tokenizer_kwargs={},
                                  processor_kwargs={},
+                                 model_max_length=16384,
                                  load_pretrained_weights=True,
                                  trust_remote_code=True,
                                  torch_dtype_str='bfloat16',
                                  compile=False,
                                  chat_template='qwen2-vl-instruct',
+                                 attn_implementation='flash_attention_2',
                                  device_map='auto',
                                  model_kwargs={},
                                  enable_liger_kernel=False,
                                        per_device_train_batch_size=1,
                                        per_device_eval_batch_size=8,
                                        gradient_accumulation_steps=1,
+                                       max_steps=1250,
                                        num_train_epochs=5,
                                        save_epoch=False,
                                        save_steps=0,
                                                        remove_unused_columns=False,
                                                        repetition_penalty=1.0,
                                                        use_vllm=False,
+                                                       vllm_mode=None,
                                                        vllm_gpu_memory_utilization=0.9,
                                                        vllm_dtype=None,
                                                        vllm_max_model_len=None,
                                        log_model_summary=False,
                                        resume_from_checkpoint=None,
                                        try_resume_from_last_checkpoint=False,
+                                       dataloader_num_workers=64,
                                        dataloader_persistent_workers=False,
+                                       dataloader_prefetch_factor=32,
+                                       dataloader_main_process_only=None,
                                        ddp_find_unused_parameters=False,
                                        max_grad_norm=1.0,
                                        trainer_kwargs={'dataset_kwargs': {'skip_prepare_dataset': True},
+                                                       'max_seq_length': 16384,
                                                        'remove_unused_columns': False},
                                        verl_config_overrides={},
                                        profiler=ProfilerParams(save_dir=None,
                                bnb_4bit_compute_dtype='float32',
                                peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
                fsdp=FSDPParams(enable_fsdp=True,
+                               sharding_strategy=<ShardingStrategy.FULL_SHARD: 'FULL_SHARD'>,
                                cpu_offload=False,
+                               mixed_precision=None,
                                backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
                                forward_prefetch=True,
+                               use_orig_params=True,
                                state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
+                               auto_wrap_policy=<AutoWrapPolicy.TRANSFORMER_BASED_WRAP: 'TRANSFORMER_BASED_WRAP'>,
                                min_num_params=100000,
+                               transformer_layer_cls='transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer',
                                sync_module_states=True))
+[2025-07-09 18:42:07,490][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:08,862][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:09,767][oumi][rank0][pid:1196049][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:09,767][oumi][rank0][pid:1196049][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,870][oumi][rank0][pid:1196049][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,502][oumi][rank0][pid:1196049][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,192][oumi][rank0][pid:1196049][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0)
+[2025-07-09 18:42:26,193][oumi][rank0][pid:1196049][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,195][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,195][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,195][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0))...
+[2025-07-09 18:42:26,237][oumi][rank0][pid:1196049][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:27,434][oumi][rank0][pid:1196049][MainThread][INFO]][torch_utils.py:289]
 Model Parameters Summary:
 🔢 Total     parameters: 8,292,166,656
 🔗 Embedding parameters: 544,997,376
 🎯 Trainable parameters: 8,292,166,656
 🔒 Frozen    parameters: 0 (0.00%)
+[2025-07-09 18:42:29,502][oumi][rank0][pid:1196049][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,502][oumi][rank0][pid:1196049][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,751][oumi][rank0][pid:1196049][MainThread][INFO]][training.py:62] SFTConfig(output_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
           overwrite_output_dir=False,
           do_train=False,
           do_eval=False,
           adam_epsilon=1e-08,
           max_grad_norm=1.0,
           num_train_epochs=5,
+          max_steps=1250,
           lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>,
           lr_scheduler_kwargs={},
           warmup_ratio=0.03,
           log_level='warning',
           log_level_replica='warning',
           log_on_each_node=True,
+          logging_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/runs/Jul09_18-42-29_oumi-compute002',
           logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
           logging_first_step=False,
           logging_steps=50,
           debug=[],
           dataloader_drop_last=False,
           eval_steps=500,
+          dataloader_num_workers=64,
+          dataloader_prefetch_factor=32,
           past_index=-1,
           run_name='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
           disable_tqdm=False,
           tp_size=0,
           fsdp_transformer_layer_cls_to_wrap=None,
           accelerator_config=AcceleratorConfig(split_batches=False,
+                                               dispatch_batches=None,
                                                even_batches=True,
                                                use_seedable_sampler=True,
                                                non_blocking=False,
           dataset_kwargs={'skip_prepare_dataset': True},
           dataset_num_proc=None,
           pad_token=None,
+          max_length=16384,
           packing=False,
           padding_free=False,
           eval_packing=None,
           dataset_batch_size=None,
           num_of_sequences=None,
           chars_per_token=None,
+          max_seq_length=16384,
           use_liger=None)
+[2025-07-09 18:42:29,820][oumi][rank0][pid:1196049][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,911][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:506] Training init time: 27.219s
+[2025-07-09 18:42:29,911][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:45:52,946][oumi][rank0][pid:1196049][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=156.229, power_limit_watts=700.0, gpu_utilization=71, memory_utilization=7, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:45:55,125][oumi][rank0][pid:1196049][MainThread][INFO]][telemetry_callback.py:242] Saving telemetry callback summary to output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/telemetry/telemetry_callback_rank0000.json...
+[2025-07-09 21:46:14,567][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,567][oumi][rank0][pid:1196049][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,568][oumi][rank0][pid:1196049][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
+[2025-07-09 21:46:14,572][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,590][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,591][oumi][rank0][pid:1196049][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:48:54,605][oumi][rank0][pid:1196049][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:57,509][oumi][rank0][pid:1196049][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:58,046][oumi][rank0][pid:1196049][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0001.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,067][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,882][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:03,033][oumi][rank1][pid:2619051][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:03,033][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,285][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,699][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,069][oumi][rank1][pid:2619051][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1)
-[2025-07-04 10:07:21,070][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,070][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,071][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1))...
-[2025-07-04 10:07:21,100][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,479][oumi][rank1][pid:2619051][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,479][oumi][rank1][pid:2619051][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,827][oumi][rank1][pid:2619051][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,288][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:510] Training init time: 29.332s
-[2025-07-04 10:07:25,288][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,400][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,406][oumi][rank1][pid:2619051][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,407][oumi][rank1][pid:2619051][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 26.58 GB
-[2025-07-05 04:37:21,407][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,453][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,453][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,622][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,790][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:06,891][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:02,781][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,286][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,705][oumi][rank1][pid:1196050][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,706][oumi][rank1][pid:1196050][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,867][oumi][rank1][pid:1196050][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,493][oumi][rank1][pid:1196050][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,186][oumi][rank1][pid:1196050][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1)
+[2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,187][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1))...
+[2025-07-09 18:42:26,219][oumi][rank1][pid:1196050][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,441][oumi][rank1][pid:1196050][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,442][oumi][rank1][pid:1196050][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,825][oumi][rank1][pid:1196050][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,895][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:506] Training init time: 27.203s
+[2025-07-09 18:42:29,895][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,533][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,538][oumi][rank1][pid:1196050][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,538][oumi][rank1][pid:1196050][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.34 GB
+[2025-07-09 21:46:14,538][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,591][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,591][oumi][rank1][pid:1196050][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,950][oumi][rank1][pid:1196050][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,606][oumi][rank1][pid:1196050][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:57,834][oumi][rank1][pid:1196050][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0002.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,062][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,926][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:02,953][oumi][rank2][pid:2619052][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:02,953][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,190][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,667][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,065][oumi][rank2][pid:2619052][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2)
-[2025-07-04 10:07:21,065][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,066][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,066][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2))...
-[2025-07-04 10:07:21,094][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,380][oumi][rank2][pid:2619052][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,381][oumi][rank2][pid:2619052][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,794][oumi][rank2][pid:2619052][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,289][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:510] Training init time: 29.331s
-[2025-07-04 10:07:25,289][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,430][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,437][oumi][rank2][pid:2619052][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=127.816, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,438][oumi][rank2][pid:2619052][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 26.67 GB
-[2025-07-05 04:37:21,438][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,453][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,453][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,620][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,693][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:06,319][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:02,789][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,333][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,778][oumi][rank2][pid:1196051][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,779][oumi][rank2][pid:1196051][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,873][oumi][rank2][pid:1196051][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,489][oumi][rank2][pid:1196051][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,181][oumi][rank2][pid:1196051][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2)
+[2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,183][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2))...
+[2025-07-09 18:42:26,222][oumi][rank2][pid:1196051][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,442][oumi][rank2][pid:1196051][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,442][oumi][rank2][pid:1196051][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,813][oumi][rank2][pid:1196051][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,900][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:506] Training init time: 27.207s
+[2025-07-09 18:42:29,900][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,551][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,564][oumi][rank2][pid:1196051][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,564][oumi][rank2][pid:1196051][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
+[2025-07-09 21:46:14,564][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,591][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,591][oumi][rank2][pid:1196051][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,955][oumi][rank2][pid:1196051][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,430][oumi][rank2][pid:1196051][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:57,844][oumi][rank2][pid:1196051][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0003.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,066][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,878][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:02,977][oumi][rank3][pid:2619053][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:02,978][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,195][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,990][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,052][oumi][rank3][pid:2619053][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3)
-[2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3))...
-[2025-07-04 10:07:21,084][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,294][oumi][rank3][pid:2619053][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,294][oumi][rank3][pid:2619053][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,748][oumi][rank3][pid:2619053][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,301][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:510] Training init time: 29.344s
-[2025-07-04 10:07:25,301][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,335][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 31.98 GB
-[2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,453][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,453][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,620][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,801][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:06,312][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:02,797][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,341][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,788][oumi][rank3][pid:1196052][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,789][oumi][rank3][pid:1196052][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,883][oumi][rank3][pid:1196052][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,500][oumi][rank3][pid:1196052][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,193][oumi][rank3][pid:1196052][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3)
+[2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,194][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3))...
+[2025-07-09 18:42:26,231][oumi][rank3][pid:1196052][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,356][oumi][rank3][pid:1196052][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,361][oumi][rank3][pid:1196052][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,829][oumi][rank3][pid:1196052][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,898][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:506] Training init time: 27.206s
+[2025-07-09 18:42:29,898][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,513][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,532][oumi][rank3][pid:1196052][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,532][oumi][rank3][pid:1196052][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.32 GB
+[2025-07-09 21:46:14,532][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,592][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,595][oumi][rank3][pid:1196052][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,950][oumi][rank3][pid:1196052][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,582][oumi][rank3][pid:1196052][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:57,854][oumi][rank3][pid:1196052][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0004.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,052][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,909][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:03,045][oumi][rank4][pid:2619054][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:03,045][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,190][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,804][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,061][oumi][rank4][pid:2619054][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4)
-[2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4))...
-[2025-07-04 10:07:21,090][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,417][oumi][rank4][pid:2619054][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,417][oumi][rank4][pid:2619054][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,786][oumi][rank4][pid:2619054][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,287][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:510] Training init time: 29.328s
-[2025-07-04 10:07:25,287][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,286][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=135.197, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.58 GB
-[2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,451][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,452][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,617][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,756][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:06,046][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:02,786][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,223][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,742][oumi][rank4][pid:1196053][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,742][oumi][rank4][pid:1196053][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,870][oumi][rank4][pid:1196053][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,519][oumi][rank4][pid:1196053][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4)
+[2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,188][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,189][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4))...
+[2025-07-09 18:42:26,212][oumi][rank4][pid:1196053][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,329][oumi][rank4][pid:1196053][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,329][oumi][rank4][pid:1196053][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,748][oumi][rank4][pid:1196053][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,897][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:506] Training init time: 27.205s
+[2025-07-09 18:42:29,897][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,458][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,479][oumi][rank4][pid:1196053][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.102, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,479][oumi][rank4][pid:1196053][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
+[2025-07-09 21:46:14,480][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,591][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,591][oumi][rank4][pid:1196053][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,949][oumi][rank4][pid:1196053][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,380][oumi][rank4][pid:1196053][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:57,872][oumi][rank4][pid:1196053][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0005.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,055][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,908][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:03,051][oumi][rank5][pid:2619055][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:03,051][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,197][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,706][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,046][oumi][rank5][pid:2619055][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5)
-[2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5))...
-[2025-07-04 10:07:21,083][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,308][oumi][rank5][pid:2619055][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,308][oumi][rank5][pid:2619055][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,772][oumi][rank5][pid:2619055][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,292][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:510] Training init time: 29.335s
-[2025-07-04 10:07:25,292][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,368][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 28.73 GB
-[2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,454][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,454][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,620][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,536][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:05,901][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:02,768][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,332][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,810][oumi][rank5][pid:1196054][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,811][oumi][rank5][pid:1196054][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,868][oumi][rank5][pid:1196054][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,512][oumi][rank5][pid:1196054][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,178][oumi][rank5][pid:1196054][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5)
+[2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,181][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5))...
+[2025-07-09 18:42:26,207][oumi][rank5][pid:1196054][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,286][oumi][rank5][pid:1196054][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,286][oumi][rank5][pid:1196054][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,704][oumi][rank5][pid:1196054][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.01, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,901][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:506] Training init time: 27.209s
+[2025-07-09 18:42:29,901][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,497][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,504][oumi][rank5][pid:1196054][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,505][oumi][rank5][pid:1196054][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.33 GB
+[2025-07-09 21:46:14,505][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,591][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,592][oumi][rank5][pid:1196054][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,949][oumi][rank5][pid:1196054][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,371][oumi][rank5][pid:1196054][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:58,279][oumi][rank5][pid:1196054][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0006.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,070][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,902][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:03,025][oumi][rank6][pid:2619056][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:03,026][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,222][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,724][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,068][oumi][rank6][pid:2619056][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6)
-[2025-07-04 10:07:21,070][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,070][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,071][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6))...
-[2025-07-04 10:07:21,106][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,500][oumi][rank6][pid:2619056][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,500][oumi][rank6][pid:2619056][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,822][oumi][rank6][pid:2619056][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,293][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:510] Training init time: 29.334s
-[2025-07-04 10:07:25,293][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,401][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,402][oumi][rank6][pid:2619056][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,406][oumi][rank6][pid:2619056][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.56 GB
-[2025-07-05 04:37:21,406][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,452][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,452][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,619][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,620][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:06,598][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:02,772][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,162][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,508][oumi][rank6][pid:1196055][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,508][oumi][rank6][pid:1196055][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,880][oumi][rank6][pid:1196055][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,525][oumi][rank6][pid:1196055][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,183][oumi][rank6][pid:1196055][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6)
+[2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,185][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6))...
+[2025-07-09 18:42:26,217][oumi][rank6][pid:1196055][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,227][oumi][rank6][pid:1196055][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,228][oumi][rank6][pid:1196055][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,672][oumi][rank6][pid:1196055][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.01, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,895][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:506] Training init time: 27.202s
+[2025-07-09 18:42:29,895][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,530][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,539][oumi][rank6][pid:1196055][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,540][oumi][rank6][pid:1196055][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.32 GB
+[2025-07-09 21:46:14,540][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,592][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,592][oumi][rank6][pid:1196055][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,951][oumi][rank6][pid:1196055][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,573][oumi][rank6][pid:1196055][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:57,874][oumi][rank6][pid:1196055][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

logs/rank_0007.log CHANGED Viewed

@@ -1,40 +1,41 @@
-[2025-07-04 10:06:56,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
-[2025-07-04 10:06:57,900][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
-[2025-07-04 10:07:03,049][oumi][rank7][pid:2619057][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
-[2025-07-04 10:07:03,049][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
-[2025-07-04 10:07:05,217][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
-	Dataset size: 57058499
-	Download size: 48789762
-	Size: 105848261 bytes
-	Rows: 4286
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
-[2025-07-04 10:07:07,979][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
-[2025-07-04 10:07:21,057][oumi][rank7][pid:2619057][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7)
-[2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
-[2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
-[2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7))...
-[2025-07-04 10:07:21,086][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
-[2025-07-04 10:07:24,262][oumi][rank7][pid:2619057][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
-[2025-07-04 10:07:24,262][oumi][rank7][pid:2619057][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
-[2025-07-04 10:07:24,740][oumi][rank7][pid:2619057][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-04 10:07:25,299][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:510] Training init time: 29.341s
-[2025-07-04 10:07:25,299][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
-[2025-07-05 04:37:21,399][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:518] Training is Complete.
-[2025-07-05 04:37:21,401][oumi][rank7][pid:2619057][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
-[2025-07-05 04:37:21,402][oumi][rank7][pid:2619057][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 30.10 GB
-[2025-07-05 04:37:21,402][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:525] Saving final state...
-[2025-07-05 04:37:21,454][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:530] Saving final model...
-[2025-07-05 04:37:21,454][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
-[2025-07-05 04:40:16,621][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:40:20,698][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
-[2025-07-05 04:43:06,529][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

+[2025-07-09 18:42:03,107][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-07-09 18:42:04,432][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+[2025-07-09 18:42:08,894][oumi][rank7][pid:1196056][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-07-09 18:42:08,895][oumi][rank7][pid:1196056][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+[2025-07-09 18:42:11,909][oumi][rank7][pid:1196056][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
 	Split: train
 	Version: 0.0.0
+	Dataset size: 109368497
+	Download size: 96255659
+	Size: 205624156 bytes
+	Rows: 10000
 	Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+[2025-07-09 18:42:14,543][oumi][rank7][pid:1196056][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (10000, 5). Columns:
 problem              object
 solution             object
 original_question    object
 original_answer      object
 image                object
 dtype: object
+[2025-07-09 18:42:26,194][oumi][rank7][pid:1196056][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7)
+[2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][distributed.py:481] 'FSDP_TRANSFORMER_CLS_TO_WRAP' is set to 'Qwen2_5_VLDecoderLayer' based on 'transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer'.
+[2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:402] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'FULL_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'TRANSFORMER_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_TRANSFORMER_CLS_TO_WRAP': 'Qwen2_5_VLDecoderLayer', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+[2025-07-09 18:42:26,195][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7))...
+[2025-07-09 18:42:26,224][oumi][rank7][pid:1196056][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-07-09 18:42:29,448][oumi][rank7][pid:1196056][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-07-09 18:42:29,448][oumi][rank7][pid:1196056][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-07-09 18:42:29,796][oumi][rank7][pid:1196056][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.00200000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 18:42:29,904][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:506] Training init time: 27.211s
+[2025-07-09 18:42:29,904][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:507] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+[2025-07-09 21:46:14,494][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:514] Training is Complete.
+[2025-07-09 21:46:14,503][oumi][rank7][pid:1196056][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=27264.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=124.11800000000001, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+[2025-07-09 21:46:14,503][oumi][rank7][pid:1196056][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 17.32 GB
+[2025-07-09 21:46:14,503][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:521] Saving final state...
+[2025-07-09 21:46:14,590][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:526] Saving final model...
+[2025-07-09 21:46:14,591][oumi][rank7][pid:1196056][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+[2025-07-09 21:47:28,953][oumi][rank7][pid:1196056][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:47:32,602][oumi][rank7][pid:1196056][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+[2025-07-09 21:48:58,154][oumi][rank7][pid:1196056][MainThread][INFO]][train.py:214]
 » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0ace3109f45472b8e455e628311825ac55b941a759bcfb3b8f60af343cc4b8f
+size 4968243304

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b938b06fe81e84b3431ccc418f2c977e9cde81a76923429b7a0addcab0e830e3
+size 4991495816

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f63c2b1eb4774528c039a90d9f6cb97ce1e2ccb5dbb078d32e6b64e6df39cb7
+size 4932751040

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8b4dbc975b9c0281451a3a61daed283465565d20f3343f2a786c44c6631cf8f
+size 1691924384

model.safetensors.index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

runs/Jul09_18-42-29_oumi-compute002/events.out.tfevents.1752086716.oumi-compute002.1196049.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8e69fbc3d95d79cdeeeaceaf06ab1f6e42eb2ec84533cf923f22899e8351d14
+size 16776

telemetry/telemetry_callback_metrics_rank0000.json CHANGED Viewed

@@ -1,10 +1,10 @@
 {
-  "train_runtime": 66532.5206,
-  "train_samples_per_second": 0.451,
-  "train_steps_per_second": 0.056,
-  "train_tokens_per_second": 70.398,
-  "total_flos": 1.4113858770069094e+17,
-  "train_loss": 0.21003443336486816,
-  "epoch": 6.996268656716418,
-  "num_input_tokens_seen": 24290780
 }

 {
+  "train_runtime": 10837.9895,
+  "train_samples_per_second": 0.923,
+  "train_steps_per_second": 0.115,
+  "train_tokens_per_second": 41.751,
+  "total_flos": 3.3689960431222784e+16,
+  "train_loss": 0.5481613777160644,
+  "epoch": 1.0,
+  "num_input_tokens_seen": 5798240
 }

telemetry/telemetry_callback_rank0000.json CHANGED Viewed

@@ -1,36 +1,36 @@
 {
-  "hostname": "oumi-compute004",
-  "total_time": 66595.79496112792,
   "timers": {
     "epochs": {
-      "count": 7.0,
-      "mean": 9503.914948322012,
-      "median": 9508.863333210349,
-      "std_dev": 24.585813027685713,
-      "min": 9471.0654975418,
-      "max": 9534.607692892198,
-      "total": 66527.40463825408,
-      "percentage": 99.89730534350741
     },
     "microsteps": {
-      "count": 3748.0,
-      "mean": 17.662540777144716,
-      "median": 17.44136324687861,
-      "std_dev": 1.5278314346422193,
-      "min": 15.953362683299929,
-      "max": 25.46622396213934,
-      "total": 66199.2028327384,
-      "percentage": 99.40447872328726
     },
     "steps": {
-      "count": 3748.0,
-      "mean": 17.662547419246746,
-      "median": 17.441364587983117,
-      "std_dev": 1.5278260400852177,
-      "min": 15.953363878186792,
-      "max": 25.46622501220554,
-      "total": 66199.22772733681,
-      "percentage": 99.40451610492437
     }
   },
   "cuda_timers": {},

 {
+  "hostname": "oumi-compute002",
+  "total_time": 11004.431441733148,
   "timers": {
     "epochs": {
+      "count": 1.0,
+      "mean": 10836.051646457054,
+      "median": 10836.051646457054,
+      "std_dev": 0,
+      "min": 10836.051646457054,
+      "max": 10836.051646457054,
+      "total": 10836.051646457054,
+      "percentage": 98.46989100556772
     },
     "microsteps": {
+      "count": 1248.0,
+      "mean": 8.3604591369908,
+      "median": 8.76854655193165,
+      "std_dev": 1.778935564684271,
+      "min": 3.7527454742230475,
+      "max": 17.169754883274436,
+      "total": 10433.853002964519,
+      "percentage": 94.81501209953682
     },
     "steps": {
+      "count": 1248.0,
+      "mean": 8.360481434502454,
+      "median": 8.768546879524365,
+      "std_dev": 1.778942205649326,
+      "min": 3.7527463790029287,
+      "max": 17.169755581766367,
+      "total": 10433.880830259062,
+      "percentage": 94.81526497306956
     }
   },
   "cuda_timers": {},

telemetry/telemetry_callback_wandb_rank0000.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "id": "p2r4ua3y",
   "name": "output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered",
-  "url": "https://wandb.ai/nyu-dice-lab/huggingface/runs/p2r4ua3y"
 }

 {
+  "id": "kb5s58m1",
   "name": "output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered",
+  "url": "https://wandb.ai/nyu-dice-lab/huggingface/runs/kb5s58m1"
 }

telemetry/training_config.yaml CHANGED Viewed

@@ -59,13 +59,13 @@ model:
   tokenizer_pad_token: null
   tokenizer_kwargs: {}
   processor_kwargs: {}
-  model_max_length: 10000
   load_pretrained_weights: true
   trust_remote_code: true
   torch_dtype_str: bfloat16
   compile: false
   chat_template: qwen2-vl-instruct
-  attn_implementation: sdpa
   device_map: auto
   model_kwargs: {}
   enable_liger_kernel: false
@@ -82,7 +82,7 @@ training:
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 8
   gradient_accumulation_steps: 1
-  max_steps: 3750
   num_train_epochs: 5
   save_epoch: false
   save_steps: 0
@@ -103,7 +103,7 @@ training:
     remove_unused_columns: false
     repetition_penalty: 1.0
     use_vllm: false
-    vllm_device: null
     vllm_gpu_memory_utilization: 0.9
     vllm_dtype: null
     vllm_max_model_len: null
@@ -138,14 +138,14 @@ training:
   log_model_summary: false
   resume_from_checkpoint: null
   try_resume_from_last_checkpoint: false
-  dataloader_num_workers: 2
   dataloader_persistent_workers: false
-  dataloader_prefetch_factor: 8
-  dataloader_main_process_only: false
   ddp_find_unused_parameters: false
   max_grad_norm: 1.0
   trainer_kwargs:
-    max_seq_length: 10000
     remove_unused_columns: false
     dataset_kwargs:
       skip_prepare_dataset: true
@@ -193,14 +193,14 @@ peft:
   peft_save_mode: ADAPTER_ONLY
 fsdp:
   enable_fsdp: true
-  sharding_strategy: HYBRID_SHARD
   cpu_offload: false
-  mixed_precision: bf16
   backward_prefetch: BACKWARD_PRE
   forward_prefetch: true
-  use_orig_params: null
   state_dict_type: FULL_STATE_DICT
-  auto_wrap_policy: SIZE_BASED_WRAP
   min_num_params: 100000
-  transformer_layer_cls: null
   sync_module_states: true

   tokenizer_pad_token: null
   tokenizer_kwargs: {}
   processor_kwargs: {}
+  model_max_length: 16384
   load_pretrained_weights: true
   trust_remote_code: true
   torch_dtype_str: bfloat16
   compile: false
   chat_template: qwen2-vl-instruct
+  attn_implementation: flash_attention_2
   device_map: auto
   model_kwargs: {}
   enable_liger_kernel: false
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 8
   gradient_accumulation_steps: 1
+  max_steps: 1250
   num_train_epochs: 5
   save_epoch: false
   save_steps: 0
     remove_unused_columns: false
     repetition_penalty: 1.0
     use_vllm: false
+    vllm_mode: null
     vllm_gpu_memory_utilization: 0.9
     vllm_dtype: null
     vllm_max_model_len: null
   log_model_summary: false
   resume_from_checkpoint: null
   try_resume_from_last_checkpoint: false
+  dataloader_num_workers: 64
   dataloader_persistent_workers: false
+  dataloader_prefetch_factor: 32
+  dataloader_main_process_only: null
   ddp_find_unused_parameters: false
   max_grad_norm: 1.0
   trainer_kwargs:
+    max_seq_length: 16384
     remove_unused_columns: false
     dataset_kwargs:
       skip_prepare_dataset: true
   peft_save_mode: ADAPTER_ONLY
 fsdp:
   enable_fsdp: true
+  sharding_strategy: FULL_SHARD
   cpu_offload: false
+  mixed_precision: null
   backward_prefetch: BACKWARD_PRE
   forward_prefetch: true
+  use_orig_params: true
   state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
   min_num_params: 100000
+  transformer_layer_cls: transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLDecoderLayer
   sync_module_states: true

tokenizer_config.json CHANGED Viewed

@@ -200,7 +200,7 @@
   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
-  "model_max_length": 10000,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
   "processor_class": "Qwen2_5_VLProcessor",

   "eos_token": "<|im_end|>",
   "errors": "replace",
   "extra_special_tokens": {},
+  "model_max_length": 16384,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
   "processor_class": "Qwen2_5_VLProcessor",

trainer_state.json CHANGED Viewed

@@ -2,779 +2,279 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 6.996268656716418,
   "eval_steps": 500,
-  "global_step": 3750,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.09328358208955224,
-      "grad_norm": 2.4583044052124023,
-      "learning_rate": 8.672566371681418e-06,
-      "loss": 0.8247,
-      "mean_token_accuracy": 0.8019651556015015,
-      "num_input_tokens_seen": 323038,
-      "num_tokens": 323038.0,
       "step": 50
     },
     {
-      "epoch": 0.1865671641791045,
-      "grad_norm": 2.1223530769348145,
-      "learning_rate": 1.7522123893805313e-05,
-      "loss": 0.6247,
-      "mean_token_accuracy": 0.8224787962436676,
-      "num_input_tokens_seen": 642936,
-      "num_tokens": 642936.0,
       "step": 100
     },
     {
-      "epoch": 0.2798507462686567,
-      "grad_norm": 5.335413932800293,
-      "learning_rate": 1.9995165482321775e-05,
-      "loss": 0.6278,
-      "mean_token_accuracy": 0.8260715854167938,
-      "num_input_tokens_seen": 976988,
-      "num_tokens": 976988.0,
       "step": 150
     },
     {
-      "epoch": 0.373134328358209,
-      "grad_norm": 5.00244140625,
-      "learning_rate": 1.9972420885061576e-05,
-      "loss": 0.6247,
-      "mean_token_accuracy": 0.8278178679943085,
-      "num_input_tokens_seen": 1303490,
-      "num_tokens": 1303490.0,
       "step": 200
     },
     {
-      "epoch": 0.4664179104477612,
-      "grad_norm": 5.494905948638916,
-      "learning_rate": 1.9931077431357095e-05,
-      "loss": 0.649,
-      "mean_token_accuracy": 0.8199629688262939,
-      "num_input_tokens_seen": 1633598,
-      "num_tokens": 1633598.0,
       "step": 250
     },
     {
-      "epoch": 0.5597014925373134,
-      "grad_norm": 5.7639923095703125,
-      "learning_rate": 1.9871212227957962e-05,
-      "loss": 0.6276,
-      "mean_token_accuracy": 0.8245489084720612,
-      "num_input_tokens_seen": 1959940,
-      "num_tokens": 1959940.0,
       "step": 300
     },
     {
-      "epoch": 0.6529850746268657,
-      "grad_norm": 5.357442378997803,
-      "learning_rate": 1.979293692521837e-05,
-      "loss": 0.6308,
-      "mean_token_accuracy": 0.8240770590305329,
-      "num_input_tokens_seen": 2273744,
-      "num_tokens": 2273744.0,
       "step": 350
     },
     {
-      "epoch": 0.746268656716418,
-      "grad_norm": 7.147578239440918,
-      "learning_rate": 1.9696397508865917e-05,
-      "loss": 0.6753,
-      "mean_token_accuracy": 0.8150083267688751,
-      "num_input_tokens_seen": 2594534,
-      "num_tokens": 2594534.0,
       "step": 400
     },
     {
-      "epoch": 0.8395522388059702,
-      "grad_norm": 5.7040605545043945,
-      "learning_rate": 1.9581774027733947e-05,
-      "loss": 0.6221,
-      "mean_token_accuracy": 0.8259347748756408,
-      "num_input_tokens_seen": 2913982,
-      "num_tokens": 2913982.0,
       "step": 450
     },
     {
-      "epoch": 0.9328358208955224,
-      "grad_norm": 5.201114654541016,
-      "learning_rate": 1.944928025796521e-05,
-      "loss": 0.6178,
-      "mean_token_accuracy": 0.8253031682968139,
-      "num_input_tokens_seen": 3233948,
-      "num_tokens": 3233948.0,
       "step": 500
     },
     {
-      "epoch": 1.0261194029850746,
-      "grad_norm": 5.712339878082275,
-      "learning_rate": 1.929916330431312e-05,
-      "loss": 0.6321,
-      "mean_token_accuracy": 0.8240388989448547,
-      "num_input_tokens_seen": 3568746,
-      "num_tokens": 3568746.0,
       "step": 550
     },
     {
-      "epoch": 1.1194029850746268,
-      "grad_norm": 3.397759199142456,
-      "learning_rate": 1.9131703139284143e-05,
-      "loss": 0.4505,
-      "mean_token_accuracy": 0.8675171172618866,
-      "num_input_tokens_seen": 3886940,
-      "num_tokens": 3886940.0,
       "step": 600
     },
     {
-      "epoch": 1.212686567164179,
-      "grad_norm": 3.1189541816711426,
-      "learning_rate": 1.894721208098092e-05,
-      "loss": 0.3317,
-      "mean_token_accuracy": 0.9006148743629455,
-      "num_input_tokens_seen": 4206696,
-      "num_tokens": 4206696.0,
       "step": 650
     },
     {
-      "epoch": 1.3059701492537314,
-      "grad_norm": 2.917012929916382,
-      "learning_rate": 1.874603421061986e-05,
-      "loss": 0.3288,
-      "mean_token_accuracy": 0.900334278345108,
-      "num_input_tokens_seen": 4538022,
-      "num_tokens": 4538022.0,
       "step": 700
     },
     {
-      "epoch": 1.3992537313432836,
-      "grad_norm": 3.822690963745117,
-      "learning_rate": 1.852854473080961e-05,
-      "loss": 0.4135,
-      "mean_token_accuracy": 0.8779956555366516,
-      "num_input_tokens_seen": 4868192,
-      "num_tokens": 4868192.0,
       "step": 750
     },
     {
-      "epoch": 1.4925373134328357,
-      "grad_norm": 3.944916009902954,
-      "learning_rate": 1.8295149265787224e-05,
-      "loss": 0.4413,
-      "mean_token_accuracy": 0.8670336186885834,
-      "num_input_tokens_seen": 5201704,
-      "num_tokens": 5201704.0,
       "step": 800
     },
     {
-      "epoch": 1.585820895522388,
-      "grad_norm": 4.3602423667907715,
-      "learning_rate": 1.8046283104917116e-05,
-      "loss": 0.4167,
-      "mean_token_accuracy": 0.87647913813591,
-      "num_input_tokens_seen": 5521682,
-      "num_tokens": 5521682.0,
       "step": 850
     },
     {
-      "epoch": 1.6791044776119404,
-      "grad_norm": 4.468040943145752,
-      "learning_rate": 1.7782410390863664e-05,
-      "loss": 0.4282,
-      "mean_token_accuracy": 0.8733036065101624,
-      "num_input_tokens_seen": 5840636,
-      "num_tokens": 5840636.0,
       "step": 900
     },
     {
-      "epoch": 1.7723880597014925,
-      "grad_norm": 5.0916666984558105,
-      "learning_rate": 1.750402325395156e-05,
-      "loss": 0.422,
-      "mean_token_accuracy": 0.8751583611965179,
-      "num_input_tokens_seen": 6152288,
-      "num_tokens": 6152288.0,
       "step": 950
     },
     {
-      "epoch": 1.8656716417910446,
-      "grad_norm": 3.583630084991455,
-      "learning_rate": 1.7211640894328413e-05,
-      "loss": 0.4014,
-      "mean_token_accuracy": 0.8789325177669525,
-      "num_input_tokens_seen": 6475382,
-      "num_tokens": 6475382.0,
       "step": 1000
     },
     {
-      "epoch": 1.9589552238805972,
-      "grad_norm": 4.270807266235352,
-      "learning_rate": 1.6905808613641233e-05,
-      "loss": 0.4074,
-      "mean_token_accuracy": 0.8787497889995575,
-      "num_input_tokens_seen": 6799610,
-      "num_tokens": 6799610.0,
       "step": 1050
     },
     {
-      "epoch": 2.0522388059701493,
-      "grad_norm": 3.991750717163086,
-      "learning_rate": 1.6587096798032984e-05,
-      "loss": 0.3727,
-      "mean_token_accuracy": 0.8879767000675202,
-      "num_input_tokens_seen": 7129112,
-      "num_tokens": 7129112.0,
       "step": 1100
     },
     {
-      "epoch": 2.1455223880597014,
-      "grad_norm": 2.423408269882202,
-      "learning_rate": 1.625609985435571e-05,
-      "loss": 0.2241,
-      "mean_token_accuracy": 0.9320108902454376,
-      "num_input_tokens_seen": 7449332,
-      "num_tokens": 7449332.0,
       "step": 1150
     },
     {
-      "epoch": 2.2388059701492535,
-      "grad_norm": 1.7369468212127686,
-      "learning_rate": 1.59134351015844e-05,
-      "loss": 0.1621,
-      "mean_token_accuracy": 0.9507821369171142,
-      "num_input_tokens_seen": 7773514,
-      "num_tokens": 7773514.0,
       "step": 1200
     },
     {
-      "epoch": 2.332089552238806,
-      "grad_norm": 1.9339622259140015,
-      "learning_rate": 1.555974161949906e-05,
-      "loss": 0.174,
-      "mean_token_accuracy": 0.9477302300930023,
-      "num_input_tokens_seen": 8102664,
-      "num_tokens": 8102664.0,
       "step": 1250
     },
     {
-      "epoch": 2.425373134328358,
-      "grad_norm": 2.86017107963562,
-      "learning_rate": 1.519567905678223e-05,
-      "loss": 0.2275,
-      "mean_token_accuracy": 0.9304351592063904,
-      "num_input_tokens_seen": 8432668,
-      "num_tokens": 8432668.0,
-      "step": 1300
-    },
-    {
-      "epoch": 2.5186567164179103,
-      "grad_norm": 2.300647735595703,
-      "learning_rate": 1.4821926400754915e-05,
-      "loss": 0.2328,
-      "mean_token_accuracy": 0.9280073237419129,
-      "num_input_tokens_seen": 8759670,
-      "num_tokens": 8759670.0,
-      "step": 1350
-    },
-    {
-      "epoch": 2.611940298507463,
-      "grad_norm": 2.8485522270202637,
-      "learning_rate": 1.4439180711045395e-05,
-      "loss": 0.2274,
-      "mean_token_accuracy": 0.9305808675289154,
-      "num_input_tokens_seen": 9083690,
-      "num_tokens": 9083690.0,
-      "step": 1400
-    },
-    {
-      "epoch": 2.705223880597015,
-      "grad_norm": 2.8772568702697754,
-      "learning_rate": 1.4048155819552617e-05,
-      "loss": 0.2385,
-      "mean_token_accuracy": 0.9277240431308746,
-      "num_input_tokens_seen": 9400762,
-      "num_tokens": 9400762.0,
-      "step": 1450
-    },
-    {
-      "epoch": 2.798507462686567,
-      "grad_norm": 3.3837602138519287,
-      "learning_rate": 1.3649580999128871e-05,
-      "loss": 0.2225,
-      "mean_token_accuracy": 0.9319508814811707,
-      "num_input_tokens_seen": 9719416,
-      "num_tokens": 9719416.0,
-      "step": 1500
-    },
-    {
-      "epoch": 2.8917910447761193,
-      "grad_norm": 3.256094217300415,
-      "learning_rate": 1.3244199603464581e-05,
-      "loss": 0.2307,
-      "mean_token_accuracy": 0.9294045794010163,
-      "num_input_tokens_seen": 10038208,
-      "num_tokens": 10038208.0,
-      "step": 1550
-    },
-    {
-      "epoch": 2.9850746268656714,
-      "grad_norm": 2.5858240127563477,
-      "learning_rate": 1.2832767680711941e-05,
-      "loss": 0.2196,
-      "mean_token_accuracy": 0.9315053272247314,
-      "num_input_tokens_seen": 10366444,
-      "num_tokens": 10366444.0,
-      "step": 1600
-    },
-    {
-      "epoch": 3.078358208955224,
-      "grad_norm": 1.8756778240203857,
-      "learning_rate": 1.2416052563433043e-05,
-      "loss": 0.19,
-      "mean_token_accuracy": 0.9421073424816132,
-      "num_input_tokens_seen": 10687314,
-      "num_tokens": 10687314.0,
-      "step": 1650
-    },
-    {
-      "epoch": 3.171641791044776,
-      "grad_norm": 0.8857208490371704,
-      "learning_rate": 1.1994831437502172e-05,
-      "loss": 0.1102,
-      "mean_token_accuracy": 0.9673730087280273,
-      "num_input_tokens_seen": 11009694,
-      "num_tokens": 11009694.0,
-      "step": 1700
-    },
-    {
-      "epoch": 3.264925373134328,
-      "grad_norm": 1.2341127395629883,
-      "learning_rate": 1.1569889892631488e-05,
-      "loss": 0.0797,
-      "mean_token_accuracy": 0.9754443645477295,
-      "num_input_tokens_seen": 11337744,
-      "num_tokens": 11337744.0,
-      "step": 1750
-    },
-    {
-      "epoch": 3.3582089552238807,
-      "grad_norm": 1.431805968284607,
-      "learning_rate": 1.1142020457223195e-05,
-      "loss": 0.0912,
-      "mean_token_accuracy": 0.9724087870121002,
-      "num_input_tokens_seen": 11665956,
-      "num_tokens": 11665956.0,
-      "step": 1800
-    },
-    {
-      "epoch": 3.451492537313433,
-      "grad_norm": 1.9885361194610596,
-      "learning_rate": 1.0712021120280951e-05,
-      "loss": 0.1156,
-      "mean_token_accuracy": 0.9630080580711364,
-      "num_input_tokens_seen": 12000080,
-      "num_tokens": 12000080.0,
-      "step": 1850
-    },
-    {
-      "epoch": 3.544776119402985,
-      "grad_norm": 1.1434038877487183,
-      "learning_rate": 1.028069384313702e-05,
-      "loss": 0.1109,
-      "mean_token_accuracy": 0.9657398784160613,
-      "num_input_tokens_seen": 12322044,
-      "num_tokens": 12322044.0,
-      "step": 1900
-    },
-    {
-      "epoch": 3.638059701492537,
-      "grad_norm": 1.1466773748397827,
-      "learning_rate": 9.848843063770963e-06,
-      "loss": 0.1098,
-      "mean_token_accuracy": 0.966309015750885,
-      "num_input_tokens_seen": 12642090,
-      "num_tokens": 12642090.0,
-      "step": 1950
-    },
-    {
-      "epoch": 3.7313432835820897,
-      "grad_norm": 1.5840574502944946,
-      "learning_rate": 9.41727419650929e-06,
-      "loss": 0.121,
-      "mean_token_accuracy": 0.961807359457016,
-      "num_input_tokens_seen": 12961610,
-      "num_tokens": 12961610.0,
-      "step": 2000
-    },
-    {
-      "epoch": 3.824626865671642,
-      "grad_norm": 1.5495262145996094,
-      "learning_rate": 8.986792129904186e-06,
-      "loss": 0.1102,
-      "mean_token_accuracy": 0.9657205975055695,
-      "num_input_tokens_seen": 13279862,
-      "num_tokens": 13279862.0,
-      "step": 2050
-    },
-    {
-      "epoch": 3.917910447761194,
-      "grad_norm": 1.7342835664749146,
-      "learning_rate": 8.558199725592856e-06,
-      "loss": 0.1156,
-      "mean_token_accuracy": 0.9632673525810241,
-      "num_input_tokens_seen": 13597816,
-      "num_tokens": 13597816.0,
-      "step": 2100
-    },
-    {
-      "epoch": 4.0111940298507465,
-      "grad_norm": 2.0673277378082275,
-      "learning_rate": 8.132296320937085e-06,
-      "loss": 0.118,
-      "mean_token_accuracy": 0.9625415456295013,
-      "num_input_tokens_seen": 13928436,
-      "num_tokens": 13928436.0,
-      "step": 2150
-    },
-    {
-      "epoch": 4.104477611940299,
-      "grad_norm": 1.0974746942520142,
-      "learning_rate": 7.709876238235702e-06,
-      "loss": 0.0839,
-      "mean_token_accuracy": 0.9738617813587189,
-      "num_input_tokens_seen": 14249072,
-      "num_tokens": 14249072.0,
-      "step": 2200
-    },
-    {
-      "epoch": 4.197761194029851,
-      "grad_norm": 1.0228750705718994,
-      "learning_rate": 7.29172730329028e-06,
-      "loss": 0.0498,
-      "mean_token_accuracy": 0.9851219677925109,
-      "num_input_tokens_seen": 14571152,
-      "num_tokens": 14571152.0,
-      "step": 2250
-    },
-    {
-      "epoch": 4.291044776119403,
-      "grad_norm": 0.6385033130645752,
-      "learning_rate": 6.8786293760869695e-06,
-      "loss": 0.0388,
-      "mean_token_accuracy": 0.9884346830844879,
-      "num_input_tokens_seen": 14903168,
-      "num_tokens": 14903168.0,
-      "step": 2300
-    },
-    {
-      "epoch": 4.384328358208955,
-      "grad_norm": 0.8694852590560913,
-      "learning_rate": 6.4713528963348506e-06,
-      "loss": 0.0411,
-      "mean_token_accuracy": 0.9874970281124115,
-      "num_input_tokens_seen": 15233224,
-      "num_tokens": 15233224.0,
-      "step": 2350
-    },
-    {
-      "epoch": 4.477611940298507,
-      "grad_norm": 0.929122805595398,
-      "learning_rate": 6.070657446573347e-06,
-      "loss": 0.0476,
-      "mean_token_accuracy": 0.985052285194397,
-      "num_input_tokens_seen": 15565618,
-      "num_tokens": 15565618.0,
-      "step": 2400
-    },
-    {
-      "epoch": 4.57089552238806,
-      "grad_norm": 0.6402145624160767,
-      "learning_rate": 5.677290335528576e-06,
-      "loss": 0.0694,
-      "mean_token_accuracy": 0.9864287662506104,
-      "num_input_tokens_seen": 15886728,
-      "num_tokens": 15886728.0,
-      "step": 2450
-    },
-    {
-      "epoch": 4.664179104477612,
-      "grad_norm": 0.9010692238807678,
-      "learning_rate": 5.291985204360754e-06,
-      "loss": 0.0424,
-      "mean_token_accuracy": 0.9870220470428467,
-      "num_input_tokens_seen": 16200844,
-      "num_tokens": 16200844.0,
-      "step": 2500
-    },
-    {
-      "epoch": 4.757462686567164,
-      "grad_norm": 0.8838453888893127,
-      "learning_rate": 4.9154606584019646e-06,
-      "loss": 0.0433,
-      "mean_token_accuracy": 0.9861760056018829,
-      "num_input_tokens_seen": 16518546,
-      "num_tokens": 16518546.0,
-      "step": 2550
-    },
-    {
-      "epoch": 4.850746268656716,
-      "grad_norm": 1.0508785247802734,
-      "learning_rate": 4.548418926936235e-06,
-      "loss": 0.0413,
-      "mean_token_accuracy": 0.9870886874198913,
-      "num_input_tokens_seen": 16837904,
-      "num_tokens": 16837904.0,
-      "step": 2600
-    },
-    {
-      "epoch": 4.9440298507462686,
-      "grad_norm": 1.0464740991592407,
-      "learning_rate": 4.191544553521355e-06,
-      "loss": 0.0428,
-      "mean_token_accuracy": 0.9860042917728424,
-      "num_input_tokens_seen": 17162930,
-      "num_tokens": 17162930.0,
-      "step": 2650
-    },
-    {
-      "epoch": 5.037313432835821,
-      "grad_norm": 1.167823076248169,
-      "learning_rate": 3.845503119295182e-06,
-      "loss": 0.0407,
-      "mean_token_accuracy": 0.9872146189212799,
-      "num_input_tokens_seen": 17492580,
-      "num_tokens": 17492580.0,
-      "step": 2700
-    },
-    {
-      "epoch": 5.130597014925373,
-      "grad_norm": 0.6037238836288452,
-      "learning_rate": 3.5109400016473338e-06,
-      "loss": 0.0259,
-      "mean_token_accuracy": 0.9919403278827668,
-      "num_input_tokens_seen": 17814314,
-      "num_tokens": 17814314.0,
-      "step": 2750
-    },
-    {
-      "epoch": 5.223880597014926,
-      "grad_norm": 0.4187396168708801,
-      "learning_rate": 3.1884791705714936e-06,
-      "loss": 0.0157,
-      "mean_token_accuracy": 0.9955093479156494,
-      "num_input_tokens_seen": 18139326,
-      "num_tokens": 18139326.0,
-      "step": 2800
-    },
-    {
-      "epoch": 5.317164179104478,
-      "grad_norm": 0.46902546286582947,
-      "learning_rate": 2.878722024943139e-06,
-      "loss": 0.0139,
-      "mean_token_accuracy": 0.995677514076233,
-      "num_input_tokens_seen": 18466486,
-      "num_tokens": 18466486.0,
-      "step": 2850
-    },
-    {
-      "epoch": 5.41044776119403,
-      "grad_norm": 0.3824739456176758,
-      "learning_rate": 2.5822462708930607e-06,
-      "loss": 0.0146,
-      "mean_token_accuracy": 0.9957832169532775,
-      "num_input_tokens_seen": 18796326,
-      "num_tokens": 18796326.0,
-      "step": 2900
-    },
-    {
-      "epoch": 5.503731343283582,
-      "grad_norm": 0.5814462304115295,
-      "learning_rate": 2.299604844368547e-06,
-      "loss": 0.0156,
-      "mean_token_accuracy": 0.9954328262805938,
-      "num_input_tokens_seen": 19129954,
-      "num_tokens": 19129954.0,
-      "step": 2950
-    },
-    {
-      "epoch": 5.597014925373134,
-      "grad_norm": 0.45167264342308044,
-      "learning_rate": 2.031324879891664e-06,
-      "loss": 0.013,
-      "mean_token_accuracy": 0.9963365316390991,
-      "num_input_tokens_seen": 19448322,
-      "num_tokens": 19448322.0,
-      "step": 3000
-    },
-    {
-      "epoch": 5.690298507462686,
-      "grad_norm": 0.5698373913764954,
-      "learning_rate": 1.777906727437979e-06,
-      "loss": 0.0127,
-      "mean_token_accuracy": 0.9959760665893554,
-      "num_input_tokens_seen": 19765536,
-      "num_tokens": 19765536.0,
-      "step": 3050
-    },
-    {
-      "epoch": 5.7835820895522385,
-      "grad_norm": 0.5558890104293823,
-      "learning_rate": 1.5398230192692275e-06,
-      "loss": 0.0117,
-      "mean_token_accuracy": 0.9963926291465759,
-      "num_input_tokens_seen": 20082350,
-      "num_tokens": 20082350.0,
-      "step": 3100
-    },
-    {
-      "epoch": 5.8768656716417915,
-      "grad_norm": 0.46126776933670044,
-      "learning_rate": 1.3175177884603252e-06,
-      "loss": 0.0112,
-      "mean_token_accuracy": 0.9968423485755921,
-      "num_input_tokens_seen": 20401204,
-      "num_tokens": 20401204.0,
-      "step": 3150
-    },
-    {
-      "epoch": 5.970149253731344,
-      "grad_norm": 0.4649079144001007,
-      "learning_rate": 1.1114056407647045e-06,
-      "loss": 0.0111,
-      "mean_token_accuracy": 0.9965043890476227,
-      "num_input_tokens_seen": 20730370,
-      "num_tokens": 20730370.0,
-      "step": 3200
-    },
-    {
-      "epoch": 6.063432835820896,
-      "grad_norm": 0.49266737699508667,
-      "learning_rate": 9.218709813624749e-07,
-      "loss": 0.012,
-      "mean_token_accuracy": 0.9967284095287323,
-      "num_input_tokens_seen": 21052760,
-      "num_tokens": 21052760.0,
-      "step": 3250
-    },
-    {
-      "epoch": 6.156716417910448,
-      "grad_norm": 0.38067445158958435,
-      "learning_rate": 7.492672979335147e-07,
-      "loss": 0.0069,
-      "mean_token_accuracy": 0.998069132566452,
-      "num_input_tokens_seen": 21373624,
-      "num_tokens": 21373624.0,
-      "step": 3300
-    },
-    {
-      "epoch": 6.25,
-      "grad_norm": 0.43708378076553345,
-      "learning_rate": 5.939165013926195e-07,
-      "loss": 0.0054,
-      "mean_token_accuracy": 0.9985744786262513,
-      "num_input_tokens_seen": 21701456,
-      "num_tokens": 21701456.0,
-      "step": 3350
-    },
-    {
-      "epoch": 6.343283582089552,
-      "grad_norm": 0.5964256525039673,
-      "learning_rate": 4.56108325516238e-07,
-      "loss": 0.0066,
-      "mean_token_accuracy": 0.9981687092781066,
-      "num_input_tokens_seen": 22028186,
-      "num_tokens": 22028186.0,
-      "step": 3400
-    },
-    {
-      "epoch": 6.436567164179104,
-      "grad_norm": 0.5558670163154602,
-      "learning_rate": 3.3609978658051045e-07,
-      "loss": 0.0065,
-      "mean_token_accuracy": 0.9981926000118255,
-      "num_input_tokens_seen": 22357840,
-      "num_tokens": 22357840.0,
-      "step": 3450
-    },
-    {
-      "epoch": 6.529850746268656,
-      "grad_norm": 0.5026367902755737,
-      "learning_rate": 2.341147040184011e-07,
-      "loss": 0.0052,
-      "mean_token_accuracy": 0.9985788524150848,
-      "num_input_tokens_seen": 22688738,
-      "num_tokens": 22688738.0,
-      "step": 3500
-    },
-    {
-      "epoch": 6.6231343283582085,
-      "grad_norm": 0.2897884249687195,
-      "learning_rate": 1.5034328298990652e-07,
-      "loss": 0.0046,
-      "mean_token_accuracy": 0.9987975347042084,
-      "num_input_tokens_seen": 23010742,
-      "num_tokens": 23010742.0,
-      "step": 3550
-    },
-    {
-      "epoch": 6.7164179104477615,
-      "grad_norm": 0.43537065386772156,
-      "learning_rate": 8.494175964388285e-08,
-      "loss": 0.0039,
-      "mean_token_accuracy": 0.9988950288295746,
-      "num_input_tokens_seen": 23329282,
-      "num_tokens": 23329282.0,
-      "step": 3600
-    },
-    {
-      "epoch": 6.809701492537314,
-      "grad_norm": 0.36725738644599915,
-      "learning_rate": 3.803210973305715e-08,
-      "loss": 0.003,
-      "mean_token_accuracy": 0.9991644871234894,
-      "num_input_tokens_seen": 23647522,
-      "num_tokens": 23647522.0,
-      "step": 3650
-    },
-    {
-      "epoch": 6.902985074626866,
-      "grad_norm": 0.531343400478363,
-      "learning_rate": 9.7018211256783e-09,
-      "loss": 0.0046,
-      "mean_token_accuracy": 0.9987628531455993,
-      "num_input_tokens_seen": 23963558,
-      "num_tokens": 23963558.0,
-      "step": 3700
-    },
-    {
-      "epoch": 6.996268656716418,
-      "grad_norm": 0.6237491369247437,
-      "learning_rate": 3.7306380940016486e-12,
-      "loss": 0.0051,
-      "mean_token_accuracy": 0.9986482226848602,
-      "num_input_tokens_seen": 24290780,
-      "num_tokens": 24290780.0,
-      "step": 3750
-    },
-    {
-      "epoch": 6.996268656716418,
-      "num_input_tokens_seen": 24290780,
-      "step": 3750,
-      "total_flos": 1.4113858770069094e+17,
-      "train_loss": 0.21003443336486816,
-      "train_runtime": 66532.5206,
-      "train_samples_per_second": 0.451,
-      "train_steps_per_second": 0.056,
-      "train_tokens_per_second": 70.398
     }
   ],
   "logging_steps": 50,
-  "max_steps": 3750,
-  "num_input_tokens_seen": 24290780,
-  "num_train_epochs": 7,
   "save_steps": 0,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -788,7 +288,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.4113858770069094e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 1250,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.04,
+      "grad_norm": 16.125,
+      "learning_rate": 1.9995935375248608e-05,
+      "loss": 0.6783,
+      "mean_token_accuracy": 0.861860990524292,
+      "num_input_tokens_seen": 216880,
+      "num_tokens": 216880.0,
       "step": 50
     },
     {
+      "epoch": 0.08,
+      "grad_norm": 9.0625,
+      "learning_rate": 1.98752561390399e-05,
+      "loss": 0.3562,
+      "mean_token_accuracy": 0.9200676095485687,
+      "num_input_tokens_seen": 462464,
+      "num_tokens": 462464.0,
       "step": 100
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 1.9588933215113926e-05,
+      "loss": 0.2534,
+      "mean_token_accuracy": 0.9416316163539886,
+      "num_input_tokens_seen": 724752,
+      "num_tokens": 724752.0,
       "step": 150
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.921875,
+      "learning_rate": 1.9141769272315857e-05,
+      "loss": 0.203,
+      "mean_token_accuracy": 0.9502561473846436,
+      "num_input_tokens_seen": 895104,
+      "num_tokens": 895104.0,
       "step": 200
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 22.875,
+      "learning_rate": 1.8541264863892755e-05,
+      "loss": 0.258,
+      "mean_token_accuracy": 0.939533520936966,
+      "num_input_tokens_seen": 1160016,
+      "num_tokens": 1160016.0,
       "step": 250
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 7.90625,
+      "learning_rate": 1.7797492616144256e-05,
+      "loss": 0.277,
+      "mean_token_accuracy": 0.932598946094513,
+      "num_input_tokens_seen": 1447552,
+      "num_tokens": 1447552.0,
       "step": 300
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 1.6922928274124887e-05,
+      "loss": 0.2399,
+      "mean_token_accuracy": 0.943250241279602,
+      "num_input_tokens_seen": 1751760,
+      "num_tokens": 1751760.0,
       "step": 350
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 1.593224143837142e-05,
+      "loss": 0.2545,
+      "mean_token_accuracy": 0.9372260522842407,
+      "num_input_tokens_seen": 2040064,
+      "num_tokens": 2040064.0,
       "step": 400
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 28.625,
+      "learning_rate": 1.484204950275565e-05,
+      "loss": 0.2971,
+      "mean_token_accuracy": 0.9284321248531342,
+      "num_input_tokens_seen": 2226672,
+      "num_tokens": 2226672.0,
       "step": 450
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 14.5,
+      "learning_rate": 1.36706389208128e-05,
+      "loss": 0.3084,
+      "mean_token_accuracy": 0.9283360493183136,
+      "num_input_tokens_seen": 2481312,
+      "num_tokens": 2481312.0,
       "step": 500
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 4.625,
+      "learning_rate": 1.2437658475915378e-05,
+      "loss": 0.285,
+      "mean_token_accuracy": 0.9289000463485718,
+      "num_input_tokens_seen": 2681408,
+      "num_tokens": 2681408.0,
       "step": 550
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 1.1163789700258656e-05,
+      "loss": 0.3252,
+      "mean_token_accuracy": 0.9217388617992401,
+      "num_input_tokens_seen": 2908288,
+      "num_tokens": 2908288.0,
       "step": 600
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 37.25,
+      "learning_rate": 9.870399970920932e-06,
+      "loss": 0.4363,
+      "mean_token_accuracy": 0.8960465312004089,
+      "num_input_tokens_seen": 3073056,
+      "num_tokens": 3073056.0,
       "step": 650
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 26.375,
+      "learning_rate": 8.579184101829734e-06,
+      "loss": 0.3453,
+      "mean_token_accuracy": 0.9128145408630371,
+      "num_input_tokens_seen": 3346848,
+      "num_tokens": 3346848.0,
       "step": 700
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 29.25,
+      "learning_rate": 7.311800443430251e-06,
+      "loss": 0.5762,
+      "mean_token_accuracy": 0.8654252612590789,
+      "num_input_tokens_seen": 3611920,
+      "num_tokens": 3611920.0,
       "step": 750
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 7.96875,
+      "learning_rate": 6.0895075939779705e-06,
+      "loss": 0.4856,
+      "mean_token_accuracy": 0.8799186503887176,
+      "num_input_tokens_seen": 3815936,
+      "num_tokens": 3815936.0,
       "step": 800
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 77.5,
+      "learning_rate": 4.932807816118347e-06,
+      "loss": 0.536,
+      "mean_token_accuracy": 0.8713834238052368,
+      "num_input_tokens_seen": 4003536,
+      "num_tokens": 4003536.0,
       "step": 850
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 36.5,
+      "learning_rate": 3.861103139944448e-06,
+      "loss": 0.6726,
+      "mean_token_accuracy": 0.8307892644405365,
+      "num_input_tokens_seen": 4219808,
+      "num_tokens": 4219808.0,
       "step": 900
     },
     {
+      "epoch": 0.76,
+      "grad_norm": 38.0,
+      "learning_rate": 2.8923699209255285e-06,
+      "loss": 0.7238,
+      "mean_token_accuracy": 0.8186173605918884,
+      "num_input_tokens_seen": 4497936,
+      "num_tokens": 4497936.0,
       "step": 950
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 41.0,
+      "learning_rate": 2.0428573115446394e-06,
+      "loss": 0.8584,
+      "mean_token_accuracy": 0.7786743235588074,
+      "num_input_tokens_seen": 4729216,
+      "num_tokens": 4729216.0,
       "step": 1000
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 47.5,
+      "learning_rate": 1.326814704364262e-06,
+      "loss": 1.0424,
+      "mean_token_accuracy": 0.7482965195178986,
+      "num_input_tokens_seen": 4953984,
+      "num_tokens": 4953984.0,
       "step": 1050
     },
     {
+      "epoch": 0.88,
+      "grad_norm": 57.0,
+      "learning_rate": 7.562527182833978e-07,
+      "loss": 1.0274,
+      "mean_token_accuracy": 0.7586082279682159,
+      "num_input_tokens_seen": 5136224,
+      "num_tokens": 5136224.0,
       "step": 1100
     },
     {
+      "epoch": 0.92,
+      "grad_norm": 64.0,
+      "learning_rate": 3.4074173710931804e-07,
+      "loss": 1.1236,
+      "mean_token_accuracy": 0.7324086999893189,
+      "num_input_tokens_seen": 5339136,
+      "num_tokens": 5339136.0,
       "step": 1150
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 103.0,
+      "learning_rate": 8.725137967920739e-08,
+      "loss": 1.1653,
+      "mean_token_accuracy": 0.7305689966678619,
+      "num_input_tokens_seen": 5558144,
+      "num_tokens": 5558144.0,
       "step": 1200
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 53.0,
+      "learning_rate": 3.3594197175190743e-11,
+      "loss": 0.9751,
+      "mean_token_accuracy": 0.7792558777332306,
+      "num_input_tokens_seen": 5798240,
+      "num_tokens": 5798240.0,
       "step": 1250
     },
     {
+      "epoch": 1.0,
+      "num_input_tokens_seen": 5798240,
+      "step": 1250,
+      "total_flos": 3.3689960431222784e+16,
+      "train_loss": 0.5481613777160644,
+      "train_runtime": 10837.9895,
+      "train_samples_per_second": 0.923,
+      "train_steps_per_second": 0.115,
+      "train_tokens_per_second": 41.751
     }
   ],
   "logging_steps": 50,
+  "max_steps": 1250,
+  "num_input_tokens_seen": 5798240,
+  "num_train_epochs": 1,
   "save_steps": 0,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 3.3689960431222784e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ceac200286f0cf175e77ab1763672204503403d9d5009bfa55db268c97ee492
 size 6161

 version https://git-lfs.github.com/spec/v1
+oid sha256:cf29339948365a32eba15c3574185bfe9655ed282514047a80622f117445ebf1
 size 6161