penfever committed
Commit 05ddcd3 · verified · 1 Parent(s): 062f9d9

Add files using upload-large-folder tool

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
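
Editorial note: a minimal sketch (not part of the commit; assumes `transformers` is installed and a local copy of this checkpoint) showing how the special-token IDs above can be cross-checked against the saved tokenizer. The path is hypothetical; it mirrors the output_dir in the logs below.

from transformers import AutoTokenizer

# Hypothetical local checkpoint directory; substitute your own copy.
tok = AutoTokenizer.from_pretrained("output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered")

# IDs should match added_tokens.json above.
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645     # also eos_token_id in config.json
assert tok.convert_tokens_to_ids("<|image_pad|>") == 151655  # image_token_id in config.json
print(tok.convert_ids_to_tokens([151643, 151644, 151645]))
# ['<|endoftext|>', '<|im_start|>', '<|im_end|>']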
chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- for message in messages -%}{%- if loop.first and message['role'] != 'system' -%}{{ '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}{%- endif -%}{{ '<|im_start|>' + message['role'] + '\\n' }}{%- if message['content'] is string -%}{{- message['content'] -}}{%- elif message['content'] is iterable -%}{%- for item in message['content'] -%}{%- if item['type'].startswith('image') -%}{%- set image_count.value = image_count.value + 1 -%}{%- if add_vision_id -%}{{ 'Picture ' + image_count.value + ': ' }}{%- endif -%}{{ '<|vision_start|><|image_pad|><|vision_end|>' }}{%- elif item['type'].startswith('video') -%}{%- set video_count.value = video_count.value + 1 -%}{%- if add_vision_id -%}{{ 'Video ' + video_count.value + ': ' }}{%- endif -%}{{ '<|vision_start|><|video_pad|><|vision_end|>' }}{%- elif item['type']=='text' -%}{{- item['text'] if 'text' in item else item['content'] -}}{%- endif -%}{%- endfor -%}{%- endif -%}{{ '<|im_end|>\\n' }}{%- endfor -%}{%- if add_generation_prompt -%}{{- '<|im_start|>assistant\\n' -}}{%- endif -%}"
+ }
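
Editorial note: a short usage sketch of the template above via `transformers`. The hub ID is the base model named in the logs below; loading from the local checkpoint directory should behave the same, since this chat_template.json ships with it.

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "What is the area of the shaded region?"},
    ]},
]
# The first non-system message triggers the default system prompt; the image item
# renders as <|vision_start|><|image_pad|><|vision_end|>, and add_generation_prompt
# appends '<|im_start|>assistant\n' per the template's final branch.
print(processor.apply_chat_template(messages, add_generation_prompt=True))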
config.json ADDED
@@ -0,0 +1,65 @@
+ {
+ "architectures": [
+ "Qwen2_5_VLForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "image_token_id": 151655,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "max_position_embeddings": 128000,
+ "max_window_layers": 28,
+ "model_type": "qwen2_5_vl",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ],
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 1000000.0,
+ "sliding_window": 32768,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.51.3",
+ "use_cache": false,
+ "use_sliding_window": false,
+ "video_token_id": 151656,
+ "vision_config": {
+ "depth": 32,
+ "fullatt_block_indexes": [
+ 7,
+ 15,
+ 23,
+ 31
+ ],
+ "hidden_act": "silu",
+ "hidden_size": 1280,
+ "in_channels": 3,
+ "in_chans": 3,
+ "intermediate_size": 3420,
+ "model_type": "qwen2_5_vl",
+ "num_heads": 16,
+ "out_hidden_size": 3584,
+ "patch_size": 14,
+ "spatial_merge_size": 2,
+ "spatial_patch_size": 14,
+ "temporal_patch_size": 2,
+ "tokens_per_second": 2,
+ "torch_dtype": "bfloat16",
+ "window_size": 112
+ },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vision_token_id": 151654,
+ "vocab_size": 152064
+ }
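
Editorial note: informal arithmetic on a few quantities implied by the config above (a sketch, not part of the commit).

hidden_size, n_heads, n_kv_heads = 3584, 28, 4
head_dim = hidden_size // n_heads   # 128
kv_group = n_heads // n_kv_heads    # 7 query heads share each KV head (grouped-query attention)

# Vision side: patch_size=14 with spatial_merge_size=2 means each 2x2 block of
# 14x14 patches (a 28x28 pixel region) is merged into one visual token projected
# to out_hidden_size=3584, matching the language model's hidden_size.
print(head_dim, kv_group)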
generation_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05,
+ "temperature": 1e-06,
+ "transformers_version": "4.51.3"
+ }
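
Editorial note: `do_sample` is true but `temperature` is 1e-06, which in practice behaves like greedy decoding with a mild repetition penalty. A minimal sketch (assuming a local copy of the checkpoint at the hypothetical path below) of loading these defaults:

from transformers import GenerationConfig

# Hypothetical local checkpoint directory containing the file above.
gen = GenerationConfig.from_pretrained("output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered")
print(gen.do_sample, gen.temperature, gen.repetition_penalty, gen.eos_token_id)
# True 1e-06 1.05 [151645, 151643]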
logs/rank_0000.log ADDED
@@ -0,0 +1,396 @@
+ [2025-07-04 10:07:00,006][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:283] TrainingConfig:
+ TrainingConfig(data=DataParams(train=DatasetSplitParams(datasets=[DatasetParams(dataset_name='hf_vision',
+ dataset_path=None,
+ subset=None,
+ split='train',
+ dataset_kwargs={'answer_column': 'solution',
+ 'hf_dataset_path': 'penfever/MM-MathInstruct-to-r1-format-filtered',
+ 'image_column': 'image',
+ 'processor_name': 'Qwen/Qwen2.5-VL-7B-Instruct',
+ 'question_column': 'problem',
+ 'return_conversations': True,
+ 'return_tensors': True},
+ sample_count=None,
+ mixture_proportion=None,
+ shuffle=True,
+ seed=42,
+ shuffle_buffer_size=1000,
+ trust_remote_code=True,
+ transform_num_workers='auto')],
+ collator_name='vision_language_sft',
+ collator_kwargs={'process_individually': True},
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=True),
+ test=DatasetSplitParams(datasets=[],
+ collator_name=None,
+ collator_kwargs={},
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=None),
+ validation=DatasetSplitParams(datasets=[],
+ collator_name=None,
+ collator_kwargs={},
+ pack=False,
+ stream=False,
+ target_col=None,
+ mixture_strategy='first_exhausted',
+ seed=None,
+ use_async_dataset=False,
+ use_torchdata=None)),
+ model=ModelParams(model_name='Qwen/Qwen2.5-VL-7B-Instruct',
+ adapter_model=None,
+ tokenizer_name=None,
+ tokenizer_pad_token=None,
+ tokenizer_kwargs={},
+ processor_kwargs={},
+ model_max_length=10000,
+ load_pretrained_weights=True,
+ trust_remote_code=True,
+ torch_dtype_str='bfloat16',
+ compile=False,
+ chat_template='qwen2-vl-instruct',
+ attn_implementation='sdpa',
+ device_map='auto',
+ model_kwargs={},
+ enable_liger_kernel=False,
+ shard_for_eval=False,
+ freeze_layers=[],
+ model_revision=None),
+ training=TrainingParams(use_peft=False,
+ trainer_type=<TrainerType.TRL_SFT: 'trl_sft'>,
+ enable_gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ output_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
+ per_device_train_batch_size=1,
+ per_device_eval_batch_size=8,
+ gradient_accumulation_steps=1,
+ max_steps=3750,
+ num_train_epochs=5,
+ save_epoch=False,
+ save_steps=0,
+ save_final_model=True,
+ seed=42,
+ data_seed=42,
+ use_deterministic=False,
+ full_determinism=False,
+ run_name=None,
+ metrics_function=None,
+ reward_functions=None,
+ grpo=GrpoParams(model_init_kwargs={},
+ max_prompt_length=None,
+ max_completion_length=None,
+ num_generations=None,
+ temperature=0.9,
+ remove_unused_columns=False,
+ repetition_penalty=1.0,
+ use_vllm=False,
+ vllm_device=None,
+ vllm_gpu_memory_utilization=0.9,
+ vllm_dtype=None,
+ vllm_max_model_len=None,
+ epsilon=0.2,
+ log_completions=False),
+ log_level='info',
+ dep_log_level='warning',
+ enable_wandb=True,
+ enable_mlflow=False,
+ enable_tensorboard=True,
+ logging_strategy='steps',
+ logging_dir=None,
+ logging_steps=50,
+ logging_first_step=False,
+ eval_strategy='no',
+ eval_steps=500,
+ learning_rate=2e-05,
+ lr_scheduler_type='cosine',
+ lr_scheduler_kwargs={},
+ warmup_ratio=0.03,
+ warmup_steps=None,
+ optimizer='adamw_torch_fused',
+ weight_decay=0.01,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ sgd_momentum=0.0,
+ mixed_precision_dtype=<MixedPrecisionDtype.NONE: 'none'>,
+ compile=False,
+ include_performance_metrics=True,
+ include_alternative_mfu_metrics=False,
+ log_model_summary=False,
+ resume_from_checkpoint=None,
+ try_resume_from_last_checkpoint=False,
+ dataloader_num_workers=2,
+ dataloader_persistent_workers=False,
+ dataloader_prefetch_factor=8,
+ dataloader_main_process_only=False,
+ ddp_find_unused_parameters=False,
+ max_grad_norm=1.0,
+ trainer_kwargs={'dataset_kwargs': {'skip_prepare_dataset': True},
+ 'max_seq_length': 10000,
+ 'remove_unused_columns': False},
+ verl_config_overrides={},
+ profiler=ProfilerParams(save_dir=None,
+ enable_cpu_profiling=False,
+ enable_cuda_profiling=False,
+ record_shapes=False,
+ profile_memory=False,
+ with_stack=False,
+ with_flops=False,
+ with_modules=False,
+ row_limit=50,
+ schedule=ProfilerScheduleParams(enable_schedule=False,
+ wait=0,
+ warmup=1,
+ active=3,
+ repeat=1,
+ skip_first=1)),
+ telemetry=TelemetryParams(telemetry_dir='telemetry',
+ collect_telemetry_for_all_ranks=False,
+ track_gpu_temperature=False),
+ empty_device_cache_steps=1,
+ nccl_default_timeout_minutes=None,
+ label_ignore_index=None),
+ peft=PeftParams(lora_r=8,
+ lora_alpha=8,
+ lora_dropout=0.0,
+ lora_target_modules=None,
+ lora_modules_to_save=None,
+ lora_bias='none',
+ lora_init_weights=<LoraWeightInitialization.DEFAULT: 'default'>,
+ lora_task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
+ q_lora=False,
+ q_lora_bits=4,
+ bnb_4bit_quant_type='fp4',
+ llm_int8_skip_modules=None,
+ use_bnb_nested_quant=False,
+ bnb_4bit_quant_storage='uint8',
+ bnb_4bit_compute_dtype='float32',
+ peft_save_mode=<PeftSaveMode.ADAPTER_ONLY: 'adapter_only'>),
+ fsdp=FSDPParams(enable_fsdp=True,
+ sharding_strategy=<ShardingStrategy.HYBRID_SHARD: 'HYBRID_SHARD'>,
+ cpu_offload=False,
+ mixed_precision='bf16',
+ backward_prefetch=<BackwardPrefetch.BACKWARD_PRE: 'BACKWARD_PRE'>,
+ forward_prefetch=True,
+ use_orig_params=None,
+ state_dict_type=<StateDictType.FULL_STATE_DICT: 'FULL_STATE_DICT'>,
+ auto_wrap_policy=<AutoWrapPolicy.SIZE_BASED_WRAP: 'SIZE_BASED_WRAP'>,
+ min_num_params=100000,
+ transformer_layer_cls=None,
+ sync_module_states=True))
+ [2025-07-04 10:07:00,227][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:07:02,023][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:03,883][oumi][rank0][pid:2619050][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:03,883][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,192][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,950][oumi][rank0][pid:2619050][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,068][oumi][rank0][pid:2619050][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0)
+ [2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,069][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=0, local_world_size=8, local_rank=0))...
+ [2025-07-04 10:07:21,105][oumi][rank0][pid:2619050][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:23,607][oumi][rank0][pid:2619050][MainThread][INFO]][torch_utils.py:289]
+ Model Parameters Summary:
+ 🔢 Total parameters: 8,292,166,656
+ 🔗 Embedding parameters: 544,997,376
+ 🎯 Trainable parameters: 8,292,166,656
+ 🔒 Frozen parameters: 0 (0.00%)
+
+ [2025-07-04 10:07:24,853][oumi][rank0][pid:2619050][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,853][oumi][rank0][pid:2619050][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:25,176][oumi][rank0][pid:2619050][MainThread][INFO]][training.py:62] SFTConfig(output_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
+ overwrite_output_dir=False,
+ do_train=False,
+ do_eval=False,
+ do_predict=False,
+ eval_strategy=<IntervalStrategy.NO: 'no'>,
+ prediction_loss_only=False,
+ per_device_train_batch_size=1,
+ per_device_eval_batch_size=8,
+ per_gpu_train_batch_size=None,
+ per_gpu_eval_batch_size=None,
+ gradient_accumulation_steps=1,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ torch_empty_cache_steps=1,
+ learning_rate=2e-05,
+ weight_decay=0.01,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ max_grad_norm=1.0,
+ num_train_epochs=5,
+ max_steps=3750,
+ lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>,
+ lr_scheduler_kwargs={},
+ warmup_ratio=0.03,
+ warmup_steps=0,
+ log_level='warning',
+ log_level_replica='warning',
+ log_on_each_node=True,
+ logging_dir='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/runs/Jul04_10-07-24_oumi-compute004',
+ logging_strategy=<IntervalStrategy.STEPS: 'steps'>,
+ logging_first_step=False,
+ logging_steps=50,
+ logging_nan_inf_filter=True,
+ save_strategy=<SaveStrategy.NO: 'no'>,
+ save_steps=0,
+ save_total_limit=None,
+ save_safetensors=True,
+ save_on_each_node=False,
+ save_only_model=False,
+ restore_callback_states_from_checkpoint=False,
+ no_cuda=False,
+ use_cpu=False,
+ use_mps_device=False,
+ seed=42,
+ data_seed=42,
+ jit_mode_eval=False,
+ use_ipex=False,
+ bf16=False,
+ fp16=False,
+ fp16_opt_level='O1',
+ half_precision_backend='auto',
+ bf16_full_eval=False,
+ fp16_full_eval=False,
+ tf32=None,
+ local_rank=0,
+ ddp_backend=None,
+ tpu_num_cores=None,
+ tpu_metrics_debug=False,
+ debug=[],
+ dataloader_drop_last=False,
+ eval_steps=500,
+ dataloader_num_workers=2,
+ dataloader_prefetch_factor=8,
+ past_index=-1,
+ run_name='output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered',
+ disable_tqdm=False,
+ remove_unused_columns=False,
+ label_names=None,
+ load_best_model_at_end=False,
+ metric_for_best_model=None,
+ greater_is_better=None,
+ ignore_data_skip=False,
+ fsdp=[],
+ fsdp_min_num_params=0,
+ fsdp_config={'min_num_params': 0,
+ 'xla': False,
+ 'xla_fsdp_grad_ckpt': False,
+ 'xla_fsdp_v2': False},
+ tp_size=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ accelerator_config=AcceleratorConfig(split_batches=False,
+ dispatch_batches=False,
+ even_batches=True,
+ use_seedable_sampler=True,
+ non_blocking=False,
+ gradient_accumulation_kwargs=None,
+ use_configured_state=False),
+ deepspeed=None,
+ label_smoothing_factor=0.0,
+ optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>,
+ optim_args=None,
+ adafactor=False,
+ group_by_length=False,
+ length_column_name='length',
+ report_to=['wandb', 'tensorboard'],
+ ddp_find_unused_parameters=False,
+ ddp_bucket_cap_mb=None,
+ ddp_broadcast_buffers=None,
+ dataloader_pin_memory=True,
+ dataloader_persistent_workers=False,
+ skip_memory_metrics=True,
+ use_legacy_prediction_loop=False,
+ push_to_hub=False,
+ resume_from_checkpoint=None,
+ hub_model_id=None,
+ hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>,
+ hub_token=None,
+ hub_private_repo=None,
+ hub_always_push=False,
+ gradient_checkpointing=False,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ include_inputs_for_metrics=False,
+ include_for_metrics=[],
+ eval_do_concat_batches=True,
+ fp16_backend='auto',
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_token=None,
+ mp_parameters='',
+ auto_find_batch_size=False,
+ full_determinism=False,
+ torchdynamo=None,
+ ray_scope='last',
+ ddp_timeout=1800,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ include_tokens_per_second=True,
+ include_num_input_tokens_seen=True,
+ neftune_noise_alpha=None,
+ optim_target_modules=None,
+ batch_eval_metrics=False,
+ eval_on_start=False,
+ use_liger_kernel=False,
+ eval_use_gather_object=False,
+ average_tokens_across_devices=False,
+ model_init_kwargs=None,
+ dataset_text_field='text',
+ dataset_kwargs={'skip_prepare_dataset': True},
+ dataset_num_proc=None,
+ pad_token=None,
+ max_length=10000,
+ packing=False,
+ padding_free=False,
+ eval_packing=None,
+ dataset_batch_size=None,
+ num_of_sequences=None,
+ chars_per_token=None,
+ max_seq_length=10000,
+ use_liger=None)
+ [2025-07-04 10:07:25,232][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.434, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,287][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:510] Training init time: 29.326s
+ [2025-07-04 10:07:25,287][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-04 12:47:15,514][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=193.085, power_limit_watts=700.0, gpu_utilization=11, memory_utilization=2, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 15:25:44,400][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=187.86, power_limit_watts=700.0, gpu_utilization=32, memory_utilization=9, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 18:04:39,027][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=188.725, power_limit_watts=700.0, gpu_utilization=49, memory_utilization=3, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 20:43:20,054][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=193.78, power_limit_watts=700.0, gpu_utilization=35, memory_utilization=2, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 23:21:14,672][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=191.45000000000002, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 01:59:05,749][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34750.0, temperature=35, fan_speed=None, fan_speeds=None, power_usage_watts=187.851, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:20,509][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=34, fan_speed=None, fan_speeds=None, power_usage_watts=191.013, power_limit_watts=700.0, gpu_utilization=36, memory_utilization=3, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:20,705][oumi][rank0][pid:2619050][MainThread][INFO]][telemetry_callback.py:242] Saving telemetry callback summary to output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered/telemetry/telemetry_callback_rank0000.json...
+ [2025-07-05 04:37:21,418][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,441][oumi][rank0][pid:2619050][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,442][oumi][rank0][pid:2619050][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.37 GB
+ [2025-07-05 04:37:21,442][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,451][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,454][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:43:01,650][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:05,405][oumi][rank0][pid:2619050][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:07,241][oumi][rank0][pid:2619050][MainThread][INFO]][train.py:214]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
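
Editorial note: a back-of-envelope check of the schedule recorded above. With 4,286 rows, world_size=8, per_device_train_batch_size=1, and gradient_accumulation_steps=1, the global batch is 8, so max_steps=3750 spans roughly seven passes over the data, matching the seven "On epoch end" entries in this log (in the HF Trainer, a positive max_steps takes precedence over num_train_epochs=5).

rows, world_size, per_device_bs, grad_accum = 4286, 8, 1, 1
global_batch = world_size * per_device_bs * grad_accum  # 8
steps_per_epoch = rows / global_batch                   # ~535.75
print(3750 / steps_per_epoch)                           # ~7.0 epochs of data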
logs/rank_0001.log ADDED
@@ -0,0 +1,40 @@
+ [2025-07-04 10:06:56,067][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:06:57,882][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:03,033][oumi][rank1][pid:2619051][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:03,033][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,285][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,699][oumi][rank1][pid:2619051][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,069][oumi][rank1][pid:2619051][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1)
+ [2025-07-04 10:07:21,070][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,070][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,071][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=1, local_world_size=8, local_rank=1))...
+ [2025-07-04 10:07:21,100][oumi][rank1][pid:2619051][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:24,479][oumi][rank1][pid:2619051][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,479][oumi][rank1][pid:2619051][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:24,827][oumi][rank1][pid:2619051][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,288][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:510] Training init time: 29.332s
+ [2025-07-04 10:07:25,288][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-05 04:37:21,400][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,406][oumi][rank1][pid:2619051][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,407][oumi][rank1][pid:2619051][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 26.58 GB
+ [2025-07-05 04:37:21,407][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,453][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,453][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:40:16,622][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:40:20,790][oumi][rank1][pid:2619051][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:06,891][oumi][rank1][pid:2619051][MainThread][INFO]][train.py:214]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0002.log ADDED
@@ -0,0 +1,40 @@
+ [2025-07-04 10:06:56,062][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:06:57,926][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:02,953][oumi][rank2][pid:2619052][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:02,953][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,190][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,667][oumi][rank2][pid:2619052][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,065][oumi][rank2][pid:2619052][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2)
+ [2025-07-04 10:07:21,065][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,066][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,066][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=2, local_world_size=8, local_rank=2))...
+ [2025-07-04 10:07:21,094][oumi][rank2][pid:2619052][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:24,380][oumi][rank2][pid:2619052][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,381][oumi][rank2][pid:2619052][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:24,794][oumi][rank2][pid:2619052][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,289][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:510] Training init time: 29.331s
+ [2025-07-04 10:07:25,289][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-05 04:37:21,430][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,437][oumi][rank2][pid:2619052][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=127.816, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,438][oumi][rank2][pid:2619052][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 26.67 GB
+ [2025-07-05 04:37:21,438][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,453][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,453][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:40:16,620][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:40:20,693][oumi][rank2][pid:2619052][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:06,319][oumi][rank2][pid:2619052][MainThread][INFO]][train.py:214]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0003.log ADDED
@@ -0,0 +1,40 @@
+ [2025-07-04 10:06:56,066][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:06:57,878][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:02,977][oumi][rank3][pid:2619053][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:02,978][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,195][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,990][oumi][rank3][pid:2619053][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,052][oumi][rank3][pid:2619053][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3)
+ [2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,054][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=3, local_world_size=8, local_rank=3))...
+ [2025-07-04 10:07:21,084][oumi][rank3][pid:2619053][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:24,294][oumi][rank3][pid:2619053][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,294][oumi][rank3][pid:2619053][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:24,748][oumi][rank3][pid:2619053][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,301][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:510] Training init time: 29.344s
+ [2025-07-04 10:07:25,301][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-05 04:37:21,335][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 31.98 GB
+ [2025-07-05 04:37:21,341][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,453][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,453][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:40:16,620][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:40:20,801][oumi][rank3][pid:2619053][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:06,312][oumi][rank3][pid:2619053][MainThread][INFO]][train.py:214]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0004.log ADDED
@@ -0,0 +1,40 @@
+ [2025-07-04 10:06:56,052][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:06:57,909][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:03,045][oumi][rank4][pid:2619054][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:03,045][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,190][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,804][oumi][rank4][pid:2619054][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,061][oumi][rank4][pid:2619054][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4)
+ [2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,062][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=4, local_world_size=8, local_rank=4))...
+ [2025-07-04 10:07:21,090][oumi][rank4][pid:2619054][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:24,417][oumi][rank4][pid:2619054][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,417][oumi][rank4][pid:2619054][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:24,786][oumi][rank4][pid:2619054][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,287][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:510] Training init time: 29.328s
+ [2025-07-04 10:07:25,287][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-05 04:37:21,286][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=135.197, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.58 GB
+ [2025-07-05 04:37:21,293][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,451][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,452][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:40:16,617][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:40:20,756][oumi][rank4][pid:2619054][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:06,046][oumi][rank4][pid:2619054][MainThread][INFO]][train.py:214]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0005.log ADDED
@@ -0,0 +1,40 @@
+ [2025-07-04 10:06:56,055][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:06:57,908][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:03,051][oumi][rank5][pid:2619055][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:03,051][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,197][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,706][oumi][rank5][pid:2619055][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,046][oumi][rank5][pid:2619055][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5)
+ [2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,047][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=5, local_world_size=8, local_rank=5))...
+ [2025-07-04 10:07:21,083][oumi][rank5][pid:2619055][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:24,308][oumi][rank5][pid:2619055][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,308][oumi][rank5][pid:2619055][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:24,772][oumi][rank5][pid:2619055][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,292][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:510] Training init time: 29.335s
+ [2025-07-04 10:07:25,292][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-05 04:37:21,368][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 28.73 GB
+ [2025-07-05 04:37:21,373][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,454][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,454][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:40:16,620][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:40:20,536][oumi][rank5][pid:2619055][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ [2025-07-05 04:43:05,901][oumi][rank5][pid:2619055][MainThread][INFO]][train.py:214]
+
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0006.log ADDED
@@ -0,0 +1,40 @@
+ [2025-07-04 10:06:56,070][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+ [2025-07-04 10:06:57,902][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
+ [2025-07-04 10:07:03,025][oumi][rank6][pid:2619056][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+ [2025-07-04 10:07:03,026][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
+ [2025-07-04 10:07:05,222][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+ Split: train
+ Version: 0.0.0
+ Dataset size: 57058499
+ Download size: 48789762
+ Size: 105848261 bytes
+ Rows: 4286
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
+ [2025-07-04 10:07:07,724][oumi][rank6][pid:2619056][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
+ problem object
+ solution object
+ original_question object
+ original_answer object
+ image object
+ dtype: object
+ [2025-07-04 10:07:21,068][oumi][rank6][pid:2619056][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6)
+ [2025-07-04 10:07:21,070][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+ [2025-07-04 10:07:21,070][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
+ [2025-07-04 10:07:21,071][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=6, local_world_size=8, local_rank=6))...
+ [2025-07-04 10:07:21,106][oumi][rank6][pid:2619056][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+ [2025-07-04 10:07:24,500][oumi][rank6][pid:2619056][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+ [2025-07-04 10:07:24,500][oumi][rank6][pid:2619056][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+ [2025-07-04 10:07:24,822][oumi][rank6][pid:2619056][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.266, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-04 10:07:25,293][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:510] Training init time: 29.334s
+ [2025-07-04 10:07:25,293][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
+ [2025-07-05 04:37:21,401][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:518] Training is Complete.
+ [2025-07-05 04:37:21,402][oumi][rank6][pid:2619056][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
+ [2025-07-05 04:37:21,406][oumi][rank6][pid:2619056][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 29.56 GB
+ [2025-07-05 04:37:21,406][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:525] Saving final state...
+ [2025-07-05 04:37:21,452][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:530] Saving final model...
+ [2025-07-05 04:37:21,452][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
+ [2025-07-05 04:40:16,619][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
+ [2025-07-05 04:40:20,620][oumi][rank6][pid:2619056][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-05 04:43:06,598][oumi][rank6][pid:2619056][MainThread][INFO]][train.py:214]
39
+
40
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
logs/rank_0007.log ADDED
@@ -0,0 +1,40 @@
1
+ [2025-07-04 10:06:56,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:433] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
2
+ [2025-07-04 10:06:57,900][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:479] Using the chat template 'qwen2-vl-instruct' specified in model config for model 'Qwen/Qwen2.5-VL-7B-Instruct'.
3
+ [2025-07-04 10:07:03,049][oumi][rank7][pid:2619057][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
4
+ [2025-07-04 10:07:03,049][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'penfever/MM-MathInstruct-to-r1-format-filtered'
5
+ [2025-07-04 10:07:05,217][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
6
+ Split: train
7
+ Version: 0.0.0
8
+ Dataset size: 57058499
9
+ Download size: 48789762
10
+ Size: 105848261 bytes
11
+ Rows: 4286
12
+ Columns: ['problem', 'solution', 'original_question', 'original_answer', 'image']
13
+ [2025-07-04 10:07:07,979][oumi][rank7][pid:2619057][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (4286, 5). Columns:
14
+ problem object
15
+ solution object
16
+ original_question object
17
+ original_answer object
18
+ image object
19
+ dtype: object
20
+ [2025-07-04 10:07:21,057][oumi][rank7][pid:2619057][MainThread][INFO]][distributed.py:303] Initialized distributed (True): DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7)
21
+ [2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:406] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
22
+ [2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:219] Accelerate FSDP run detected! Setting device_map to None.
23
+ [2025-07-04 10:07:21,058][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:228] Building model using device_map: None (DeviceRankInfo(world_size=8, rank=7, local_world_size=8, local_rank=7))...
24
+ [2025-07-04 10:07:21,086][oumi][rank7][pid:2619057][MainThread][INFO]][models.py:300] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
25
+ [2025-07-04 10:07:24,262][oumi][rank7][pid:2619057][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
26
+ [2025-07-04 10:07:24,262][oumi][rank7][pid:2619057][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
27
+ [2025-07-04 10:07:24,740][oumi][rank7][pid:2619057][MainThread][INFO]][device_utils.py:297] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=6166.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=121.277, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
28
+ [2025-07-04 10:07:25,299][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:510] Training init time: 29.341s
29
+ [2025-07-04 10:07:25,299][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:511] Starting training... (TrainerType.TRL_SFT, transformers: 4.51.3)
30
+ [2025-07-05 04:37:21,399][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:518] Training is Complete.
31
+ [2025-07-05 04:37:21,401][oumi][rank7][pid:2619057][MainThread][INFO]][device_utils.py:297] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=8, used_memory_mb=34792.0, temperature=33, fan_speed=None, fan_speeds=None, power_usage_watts=134.097, power_limit_watts=700.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1980, clock_speed_sm=1980, clock_speed_memory=2619).
32
+ [2025-07-05 04:37:21,402][oumi][rank7][pid:2619057][MainThread][INFO]][torch_utils.py:136] Peak GPU memory usage: 30.10 GB
33
+ [2025-07-05 04:37:21,402][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:525] Saving final state...
34
+ [2025-07-05 04:37:21,454][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:530] Saving final model...
35
+ [2025-07-05 04:37:21,454][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:142] Saving FULL_STATE_DICT for final model checkpoint.
36
+ [2025-07-05 04:40:16,621][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:149] Model has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
37
+ [2025-07-05 04:40:20,698][oumi][rank7][pid:2619057][MainThread][INFO]][hf_trainer.py:153] Processor config has been saved at output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
38
+ [2025-07-05 04:43:06,529][oumi][rank7][pid:2619057][MainThread][INFO]][train.py:214]
39
+
40
+ » We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25fbedaea8c11fdb479dba6c19b80b01f5d01b4986237aa3c1409183039395eb
3
+ size 4952311608
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f56ab8b5118155e68ee8b476ae7da83c842862316009d899f14c1fce1fa2b2a
3
+ size 4984124272
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d206ad218950e56775aef51a9c6f671364fef793e0b2458ea725fa11f00a4aed
3
+ size 4932743936
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1799f2b014c2734d7a51285c129bf4b8917f2d19a3ac5d9f202be5fde73ecb5
3
+ size 4998852296
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b37cd470db735b247d9bcfba76d012667a622750a5887fdc886bff5c148e53
3
+ size 4984124336
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f58c58bf796f45e1d33f0ee102b1f72031df2727d9c02f0eac860feca1b13a7
3
+ size 4932743992
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcaec0e85ec6b051d57e263aff6bd7f27926b79689a40086a5da56deb573b37d
3
+ size 3383846800
model.safetensors.index.json ADDED
@@ -0,0 +1,736 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 33168666624
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00007-of-00007.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00002-of-00007.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00007.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00004-of-00007.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00004-of-00007.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00004-of-00007.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00005-of-00007.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00005-of-00007.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00007.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00006-of-00007.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00006-of-00007.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00006-of-00007.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00007-of-00007.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00007-of-00007.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00007-of-00007.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00007-of-00007.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00007-of-00007.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00003-of-00007.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00003-of-00007.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00007.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
344
+ "model.norm.weight": "model-00007-of-00007.safetensors",
345
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00007.safetensors",
346
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00007.safetensors",
347
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00007.safetensors",
348
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00007.safetensors",
349
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
350
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
351
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
352
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
353
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
354
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
355
+ "visual.blocks.0.norm1.weight": "model-00001-of-00007.safetensors",
356
+ "visual.blocks.0.norm2.weight": "model-00001-of-00007.safetensors",
357
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00007.safetensors",
358
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00007.safetensors",
359
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00007.safetensors",
360
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00007.safetensors",
361
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
362
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
363
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
364
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
365
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
366
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
367
+ "visual.blocks.1.norm1.weight": "model-00001-of-00007.safetensors",
368
+ "visual.blocks.1.norm2.weight": "model-00001-of-00007.safetensors",
369
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00007.safetensors",
370
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00007.safetensors",
371
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00007.safetensors",
372
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00007.safetensors",
373
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
374
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
375
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
376
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
377
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
378
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
379
+ "visual.blocks.10.norm1.weight": "model-00001-of-00007.safetensors",
380
+ "visual.blocks.10.norm2.weight": "model-00001-of-00007.safetensors",
381
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00007.safetensors",
382
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00007.safetensors",
383
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00007.safetensors",
384
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00007.safetensors",
385
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
386
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
387
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
388
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
389
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
390
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
391
+ "visual.blocks.11.norm1.weight": "model-00001-of-00007.safetensors",
392
+ "visual.blocks.11.norm2.weight": "model-00001-of-00007.safetensors",
393
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00007.safetensors",
394
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00007.safetensors",
395
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00007.safetensors",
396
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00007.safetensors",
397
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
398
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
399
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
400
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
401
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
402
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
403
+ "visual.blocks.12.norm1.weight": "model-00001-of-00007.safetensors",
404
+ "visual.blocks.12.norm2.weight": "model-00001-of-00007.safetensors",
405
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00007.safetensors",
406
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00007.safetensors",
407
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00007.safetensors",
408
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00007.safetensors",
409
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
410
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
411
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
412
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
413
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
414
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
415
+ "visual.blocks.13.norm1.weight": "model-00001-of-00007.safetensors",
416
+ "visual.blocks.13.norm2.weight": "model-00001-of-00007.safetensors",
417
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00007.safetensors",
418
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00007.safetensors",
419
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00007.safetensors",
420
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00007.safetensors",
421
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
422
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
423
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
424
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
425
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
426
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
427
+ "visual.blocks.14.norm1.weight": "model-00001-of-00007.safetensors",
428
+ "visual.blocks.14.norm2.weight": "model-00001-of-00007.safetensors",
429
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00007.safetensors",
430
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00007.safetensors",
431
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00007.safetensors",
432
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00007.safetensors",
433
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
434
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
435
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
436
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
437
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
438
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
439
+ "visual.blocks.15.norm1.weight": "model-00001-of-00007.safetensors",
440
+ "visual.blocks.15.norm2.weight": "model-00001-of-00007.safetensors",
441
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00007.safetensors",
442
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00007.safetensors",
443
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00007.safetensors",
444
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00007.safetensors",
445
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
446
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
447
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
448
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
449
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
450
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
451
+ "visual.blocks.16.norm1.weight": "model-00001-of-00007.safetensors",
452
+ "visual.blocks.16.norm2.weight": "model-00001-of-00007.safetensors",
453
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00007.safetensors",
454
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00007.safetensors",
455
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00007.safetensors",
456
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00007.safetensors",
457
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
458
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
459
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
460
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
461
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
462
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
463
+ "visual.blocks.17.norm1.weight": "model-00001-of-00007.safetensors",
464
+ "visual.blocks.17.norm2.weight": "model-00001-of-00007.safetensors",
465
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00007.safetensors",
466
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00007.safetensors",
467
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00007.safetensors",
468
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00007.safetensors",
469
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
470
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
471
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
472
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
473
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
474
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
475
+ "visual.blocks.18.norm1.weight": "model-00001-of-00007.safetensors",
476
+ "visual.blocks.18.norm2.weight": "model-00001-of-00007.safetensors",
477
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00007.safetensors",
478
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00007.safetensors",
479
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00007.safetensors",
480
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00007.safetensors",
481
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
482
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
483
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
484
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
485
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
486
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
487
+ "visual.blocks.19.norm1.weight": "model-00001-of-00007.safetensors",
488
+ "visual.blocks.19.norm2.weight": "model-00001-of-00007.safetensors",
489
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00007.safetensors",
490
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00007.safetensors",
491
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00007.safetensors",
492
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00007.safetensors",
493
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
494
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
495
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
496
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
497
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
498
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
499
+ "visual.blocks.2.norm1.weight": "model-00001-of-00007.safetensors",
500
+ "visual.blocks.2.norm2.weight": "model-00001-of-00007.safetensors",
501
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00007.safetensors",
502
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00007.safetensors",
503
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00007.safetensors",
504
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00007.safetensors",
505
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
506
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
507
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
508
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
509
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
510
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
511
+ "visual.blocks.20.norm1.weight": "model-00001-of-00007.safetensors",
512
+ "visual.blocks.20.norm2.weight": "model-00001-of-00007.safetensors",
513
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00007.safetensors",
514
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00007.safetensors",
515
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00007.safetensors",
516
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00007.safetensors",
517
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
518
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
519
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
520
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
521
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
522
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
523
+ "visual.blocks.21.norm1.weight": "model-00001-of-00007.safetensors",
524
+ "visual.blocks.21.norm2.weight": "model-00001-of-00007.safetensors",
525
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00007.safetensors",
526
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00007.safetensors",
527
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00007.safetensors",
528
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00007.safetensors",
529
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
530
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
531
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
532
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
533
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
534
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
535
+ "visual.blocks.22.norm1.weight": "model-00001-of-00007.safetensors",
536
+ "visual.blocks.22.norm2.weight": "model-00001-of-00007.safetensors",
537
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00007.safetensors",
538
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00007.safetensors",
539
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00007.safetensors",
540
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00007.safetensors",
541
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
542
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
543
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
544
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
545
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
546
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
547
+ "visual.blocks.23.norm1.weight": "model-00001-of-00007.safetensors",
548
+ "visual.blocks.23.norm2.weight": "model-00001-of-00007.safetensors",
549
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00007.safetensors",
550
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00007.safetensors",
551
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00007.safetensors",
552
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00007.safetensors",
553
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
554
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
555
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
556
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
557
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
558
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
559
+ "visual.blocks.24.norm1.weight": "model-00001-of-00007.safetensors",
560
+ "visual.blocks.24.norm2.weight": "model-00001-of-00007.safetensors",
561
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00007.safetensors",
562
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00007.safetensors",
563
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00007.safetensors",
564
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00007.safetensors",
565
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
566
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
567
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
568
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
569
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
570
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
571
+ "visual.blocks.25.norm1.weight": "model-00001-of-00007.safetensors",
572
+ "visual.blocks.25.norm2.weight": "model-00001-of-00007.safetensors",
573
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00007.safetensors",
574
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00007.safetensors",
575
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00007.safetensors",
576
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00007.safetensors",
577
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
578
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
579
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
580
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
581
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
582
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
583
+ "visual.blocks.26.norm1.weight": "model-00001-of-00007.safetensors",
584
+ "visual.blocks.26.norm2.weight": "model-00001-of-00007.safetensors",
585
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00007.safetensors",
586
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00007.safetensors",
587
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00007.safetensors",
588
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00007.safetensors",
589
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
590
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
591
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
592
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
593
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
594
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
595
+ "visual.blocks.27.norm1.weight": "model-00001-of-00007.safetensors",
596
+ "visual.blocks.27.norm2.weight": "model-00001-of-00007.safetensors",
597
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00007.safetensors",
598
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00007.safetensors",
599
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00007.safetensors",
600
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00007.safetensors",
601
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
602
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
603
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
604
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
605
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
606
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
607
+ "visual.blocks.28.norm1.weight": "model-00001-of-00007.safetensors",
608
+ "visual.blocks.28.norm2.weight": "model-00001-of-00007.safetensors",
609
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00007.safetensors",
610
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00007.safetensors",
611
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00007.safetensors",
612
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00007.safetensors",
613
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
614
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
615
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
616
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
617
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
618
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
619
+ "visual.blocks.29.norm1.weight": "model-00001-of-00007.safetensors",
620
+ "visual.blocks.29.norm2.weight": "model-00001-of-00007.safetensors",
621
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00007.safetensors",
622
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00007.safetensors",
623
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00007.safetensors",
624
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00007.safetensors",
625
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
626
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
627
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
628
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
629
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
630
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
631
+ "visual.blocks.3.norm1.weight": "model-00001-of-00007.safetensors",
632
+ "visual.blocks.3.norm2.weight": "model-00001-of-00007.safetensors",
633
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00007.safetensors",
634
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00007.safetensors",
635
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00007.safetensors",
636
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00007.safetensors",
637
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
638
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
639
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
640
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
641
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
642
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
643
+ "visual.blocks.30.norm1.weight": "model-00001-of-00007.safetensors",
644
+ "visual.blocks.30.norm2.weight": "model-00001-of-00007.safetensors",
645
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00007.safetensors",
646
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00007.safetensors",
647
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00007.safetensors",
648
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00007.safetensors",
649
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
650
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
651
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
652
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
653
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
654
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
655
+ "visual.blocks.31.norm1.weight": "model-00001-of-00007.safetensors",
656
+ "visual.blocks.31.norm2.weight": "model-00001-of-00007.safetensors",
657
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00007.safetensors",
658
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00007.safetensors",
659
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00007.safetensors",
660
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00007.safetensors",
661
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
662
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
663
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
664
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
665
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
666
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
667
+ "visual.blocks.4.norm1.weight": "model-00001-of-00007.safetensors",
668
+ "visual.blocks.4.norm2.weight": "model-00001-of-00007.safetensors",
669
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00007.safetensors",
670
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00007.safetensors",
671
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00007.safetensors",
672
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00007.safetensors",
673
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
674
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
675
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
676
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
677
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
678
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
679
+ "visual.blocks.5.norm1.weight": "model-00001-of-00007.safetensors",
680
+ "visual.blocks.5.norm2.weight": "model-00001-of-00007.safetensors",
681
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00007.safetensors",
682
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00007.safetensors",
683
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00007.safetensors",
684
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00007.safetensors",
685
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
686
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
687
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
688
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
689
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
690
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
691
+ "visual.blocks.6.norm1.weight": "model-00001-of-00007.safetensors",
692
+ "visual.blocks.6.norm2.weight": "model-00001-of-00007.safetensors",
693
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00007.safetensors",
694
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00007.safetensors",
695
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00007.safetensors",
696
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00007.safetensors",
697
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
698
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
699
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
700
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
701
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
702
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
703
+ "visual.blocks.7.norm1.weight": "model-00001-of-00007.safetensors",
704
+ "visual.blocks.7.norm2.weight": "model-00001-of-00007.safetensors",
705
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00007.safetensors",
706
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00007.safetensors",
707
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00007.safetensors",
708
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00007.safetensors",
709
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
710
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
711
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
712
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
713
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
714
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
715
+ "visual.blocks.8.norm1.weight": "model-00001-of-00007.safetensors",
716
+ "visual.blocks.8.norm2.weight": "model-00001-of-00007.safetensors",
717
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00007.safetensors",
718
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00007.safetensors",
719
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00007.safetensors",
720
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00007.safetensors",
721
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
722
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
723
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
724
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
725
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
726
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
727
+ "visual.blocks.9.norm1.weight": "model-00001-of-00007.safetensors",
728
+ "visual.blocks.9.norm2.weight": "model-00001-of-00007.safetensors",
729
+ "visual.merger.ln_q.weight": "model-00001-of-00007.safetensors",
730
+ "visual.merger.mlp.0.bias": "model-00001-of-00007.safetensors",
731
+ "visual.merger.mlp.0.weight": "model-00001-of-00007.safetensors",
732
+ "visual.merger.mlp.2.bias": "model-00001-of-00007.safetensors",
733
+ "visual.merger.mlp.2.weight": "model-00001-of-00007.safetensors",
734
+ "visual.patch_embed.proj.weight": "model-00001-of-00007.safetensors"
735
+ }
736
+ }
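The map above is the tail of `model.safetensors.index.json`; every vision-tower tensor resolves to the first of the seven shards. A minimal sketch of querying the index with the standard library (the file name is the one committed in this repo):

```python
import json
from collections import Counter

# Load the sharded-checkpoint index that the weight map above belongs to.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]  # tensor name -> shard file name

# Every vision-tower tensor in this checkpoint lives in shard 1 of 7.
print(weight_map["visual.patch_embed.proj.weight"])  # model-00001-of-00007.safetensors

# Group tensors by shard to see how the checkpoint is split.
print(Counter(weight_map.values()))
```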
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_processor_type": "Qwen2VLImageProcessor",
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "max_pixels": 12845056,
+ "merge_size": 2,
+ "min_pixels": 3136,
+ "patch_size": 14,
+ "processor_class": "Qwen2_5_VLProcessor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "longest_edge": 12845056,
+ "shortest_edge": 3136
+ },
+ "temporal_patch_size": 2
+ }
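With `patch_size: 14` and `merge_size: 2`, the effective grid unit is 28 px, which is why `min_pixels` is 3136 (4 × 28 × 28) and `size` repeats the same pixel bounds. A sketch of the area-budget resize, modeled on Qwen2-VL's `smart_resize`; the helper name and exact rounding details are assumptions, not code taken from this repo:

```python
import math

def smart_resize(h: int, w: int, factor: int = 28,
                 min_pixels: int = 3136, max_pixels: int = 12845056):
    """Round (h, w) to multiples of `factor` while keeping
    the total area within [min_pixels, max_pixels]."""
    h_bar = round(h / factor) * factor
    w_bar = round(w / factor) * factor
    if h_bar * w_bar > max_pixels:        # shrink to fit the pixel budget
        beta = math.sqrt((h * w) / max_pixels)
        h_bar = math.floor(h / beta / factor) * factor
        w_bar = math.floor(w / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:      # grow to reach the floor
        beta = math.sqrt(min_pixels / (h * w))
        h_bar = math.ceil(h * beta / factor) * factor
        w_bar = math.ceil(w * beta / factor) * factor
    return h_bar, w_bar

print(smart_resize(1080, 1920))  # both sides come back as multiples of 28
```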
runs/Jul04_10-07-24_oumi-compute004/events.out.tfevents.1751623712.oumi-compute004.2619050.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a7a1178cb9df90d8edd3eea89965c5626528e98d603bc8da169ea35c9b65695
+ size 36575
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
telemetry/devices_info.txt ADDED
@@ -0,0 +1,9 @@
+ CPU cores: 208 CUDA devices: 8
+ device(0)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 75.08GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(1)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(2)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(3)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(4)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(5)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(6)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
+ device(7)='NVIDIA H100 80GB HBM3' Capability: (9, 0) Memory: [Total: 79.19GiB Free: 78.67GiB Allocated: 0.0GiB Cached: 0.0GiB]
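A report like the one above can be reproduced with PyTorch's CUDA introspection; a minimal sketch (the output formatting is approximate, not the exact telemetry code used here):

```python
import os
import torch

print(f"CPU cores: {os.cpu_count()} CUDA devices: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    free, total = torch.cuda.mem_get_info(i)  # bytes free / total on device i
    print(f"device({i})={props.name!r} "
          f"Capability: {(props.major, props.minor)} "
          f"Memory: [Total: {total / 2**30:.2f}GiB Free: {free / 2**30:.2f}GiB]")
```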
telemetry/telemetry_callback_metrics_rank0000.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "train_runtime": 66532.5206,
+ "train_samples_per_second": 0.451,
+ "train_steps_per_second": 0.056,
+ "train_tokens_per_second": 70.398,
+ "total_flos": 1.4113858770069094e+17,
+ "train_loss": 0.21003443336486816,
+ "epoch": 6.996268656716418,
+ "num_input_tokens_seen": 24290780
+ }
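Two of these rates follow directly from the step count and run time: with 3750 optimizer steps, 8 ranks (see `telemetry/world_size.json` below), and a per-device batch of 1, both `train_steps_per_second` and `train_samples_per_second` check out. A quick verification sketch:

```python
train_runtime = 66532.5206   # seconds, from the metrics above
global_steps = 3750          # from trainer_state.json
world_size = 8               # from telemetry/world_size.json
per_device_batch = 1         # from the training config

print(round(global_steps / train_runtime, 3))   # 0.056 -> train_steps_per_second
print(round(global_steps * world_size * per_device_batch
            / train_runtime, 3))                # 0.451 -> train_samples_per_second
```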
telemetry/telemetry_callback_rank0000.json ADDED
@@ -0,0 +1,39 @@
+ {
+ "hostname": "oumi-compute004",
+ "total_time": 66595.79496112792,
+ "timers": {
+ "epochs": {
+ "count": 7.0,
+ "mean": 9503.914948322012,
+ "median": 9508.863333210349,
+ "std_dev": 24.585813027685713,
+ "min": 9471.0654975418,
+ "max": 9534.607692892198,
+ "total": 66527.40463825408,
+ "percentage": 99.89730534350741
+ },
+ "microsteps": {
+ "count": 3748.0,
+ "mean": 17.662540777144716,
+ "median": 17.44136324687861,
+ "std_dev": 1.5278314346422193,
+ "min": 15.953362683299929,
+ "max": 25.46622396213934,
+ "total": 66199.2028327384,
+ "percentage": 99.40447872328726
+ },
+ "steps": {
+ "count": 3748.0,
+ "mean": 17.662547419246746,
+ "median": 17.441364587983117,
+ "std_dev": 1.5278260400852177,
+ "min": 15.953363878186792,
+ "max": 25.46622501220554,
+ "total": 66199.22772733681,
+ "percentage": 99.40451610492437
+ }
+ },
+ "cuda_timers": {},
+ "gpu_memory": [],
+ "gpu_temperature": {}
+ }
telemetry/telemetry_callback_wandb_rank0000.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "id": "p2r4ua3y",
+ "name": "output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered",
+ "url": "https://wandb.ai/nyu-dice-lab/huggingface/runs/p2r4ua3y"
+ }
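The run is addressable through the W&B public API; a sketch of fetching it (assumes the `wandb` client is installed and you are authenticated):

```python
import wandb

api = wandb.Api()
# Path format is entity/project/run_id, assembled from the JSON above.
run = api.run("nyu-dice-lab/huggingface/p2r4ua3y")
print(run.name)                          # output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
print(run.summary.get("train_loss"))     # final logged training loss
```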
telemetry/training_config.yaml ADDED
@@ -0,0 +1,206 @@
+ data:
+ train:
+ datasets:
+ - dataset_name: hf_vision
+ dataset_path: null
+ subset: null
+ split: train
+ dataset_kwargs:
+ hf_dataset_path: penfever/MM-MathInstruct-to-r1-format-filtered
+ image_column: image
+ question_column: problem
+ answer_column: solution
+ return_tensors: true
+ processor_name: Qwen/Qwen2.5-VL-7B-Instruct
+ return_conversations: true
+ sample_count: null
+ mixture_proportion: null
+ shuffle: true
+ seed: 42
+ shuffle_buffer_size: 1000
+ trust_remote_code: true
+ transform_num_workers: auto
+ collator_name: vision_language_sft
+ collator_kwargs:
+ process_individually: true
+ pack: false
+ stream: false
+ target_col: null
+ mixture_strategy: first_exhausted
+ seed: null
+ use_async_dataset: false
+ use_torchdata: true
+ test:
+ datasets: []
+ collator_name: null
+ collator_kwargs: {}
+ pack: false
+ stream: false
+ target_col: null
+ mixture_strategy: first_exhausted
+ seed: null
+ use_async_dataset: false
+ use_torchdata: null
+ validation:
+ datasets: []
+ collator_name: null
+ collator_kwargs: {}
+ pack: false
+ stream: false
+ target_col: null
+ mixture_strategy: first_exhausted
+ seed: null
+ use_async_dataset: false
+ use_torchdata: null
+ model:
+ model_name: Qwen/Qwen2.5-VL-7B-Instruct
+ adapter_model: null
+ tokenizer_name: null
+ tokenizer_pad_token: null
+ tokenizer_kwargs: {}
+ processor_kwargs: {}
+ model_max_length: 10000
+ load_pretrained_weights: true
+ trust_remote_code: true
+ torch_dtype_str: bfloat16
+ compile: false
+ chat_template: qwen2-vl-instruct
+ attn_implementation: sdpa
+ device_map: auto
+ model_kwargs: {}
+ enable_liger_kernel: false
+ shard_for_eval: false
+ freeze_layers: []
+ model_revision: null
+ training:
+ use_peft: false
+ trainer_type: TRL_SFT
+ enable_gradient_checkpointing: true
+ gradient_checkpointing_kwargs:
+ use_reentrant: false
+ output_dir: output/qwen2_5_vl_7b_MM-MathInstruct-to-r1-format-filtered
+ per_device_train_batch_size: 1
+ per_device_eval_batch_size: 8
+ gradient_accumulation_steps: 1
+ max_steps: 3750
+ num_train_epochs: 5
+ save_epoch: false
+ save_steps: 0
+ save_final_model: true
+ seed: 42
+ data_seed: 42
+ use_deterministic: false
+ full_determinism: false
+ run_name: null
+ metrics_function: null
+ reward_functions: null
+ grpo:
+ model_init_kwargs: {}
+ max_prompt_length: null
+ max_completion_length: null
+ num_generations: null
+ temperature: 0.9
+ remove_unused_columns: false
+ repetition_penalty: 1.0
+ use_vllm: false
+ vllm_device: null
+ vllm_gpu_memory_utilization: 0.9
+ vllm_dtype: null
+ vllm_max_model_len: null
+ epsilon: 0.2
+ log_completions: false
+ log_level: info
+ dep_log_level: warning
+ enable_wandb: true
+ enable_mlflow: false
+ enable_tensorboard: true
+ logging_strategy: steps
+ logging_dir: null
+ logging_steps: 50
+ logging_first_step: false
+ eval_strategy: 'no'
+ eval_steps: 500
+ learning_rate: 2.0e-05
+ lr_scheduler_type: cosine
+ lr_scheduler_kwargs: {}
+ warmup_ratio: 0.03
+ warmup_steps: null
+ optimizer: adamw_torch_fused
+ weight_decay: 0.01
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_epsilon: 1.0e-08
+ sgd_momentum: 0.0
+ mixed_precision_dtype: NONE
+ compile: false
+ include_performance_metrics: true
+ include_alternative_mfu_metrics: false
+ log_model_summary: false
+ resume_from_checkpoint: null
+ try_resume_from_last_checkpoint: false
+ dataloader_num_workers: 2
+ dataloader_persistent_workers: false
+ dataloader_prefetch_factor: 8
+ dataloader_main_process_only: false
+ ddp_find_unused_parameters: false
+ max_grad_norm: 1.0
+ trainer_kwargs:
+ max_seq_length: 10000
+ remove_unused_columns: false
+ dataset_kwargs:
+ skip_prepare_dataset: true
+ verl_config_overrides: {}
+ profiler:
+ save_dir: null
+ enable_cpu_profiling: false
+ enable_cuda_profiling: false
+ record_shapes: false
+ profile_memory: false
+ with_stack: false
+ with_flops: false
+ with_modules: false
+ row_limit: 50
+ schedule:
+ enable_schedule: false
+ wait: 0
+ warmup: 1
+ active: 3
+ repeat: 1
+ skip_first: 1
+ telemetry:
+ telemetry_dir: telemetry
+ collect_telemetry_for_all_ranks: false
+ track_gpu_temperature: false
+ empty_device_cache_steps: 1
+ nccl_default_timeout_minutes: null
+ label_ignore_index: null
+ peft:
+ lora_r: 8
+ lora_alpha: 8
+ lora_dropout: 0.0
+ lora_target_modules: null
+ lora_modules_to_save: null
+ lora_bias: none
+ lora_init_weights: DEFAULT
+ lora_task_type: CAUSAL_LM
+ q_lora: false
+ q_lora_bits: 4
+ bnb_4bit_quant_type: fp4
+ llm_int8_skip_modules: null
+ use_bnb_nested_quant: false
+ bnb_4bit_quant_storage: uint8
+ bnb_4bit_compute_dtype: float32
+ peft_save_mode: ADAPTER_ONLY
+ fsdp:
+ enable_fsdp: true
+ sharding_strategy: HYBRID_SHARD
+ cpu_offload: false
+ mixed_precision: bf16
+ backward_prefetch: BACKWARD_PRE
+ forward_prefetch: true
+ use_orig_params: null
+ state_dict_type: FULL_STATE_DICT
+ auto_wrap_policy: SIZE_BASED_WRAP
+ min_num_params: 100000
+ transformer_layer_cls: null
+ sync_module_states: true
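Note that `max_steps: 3750` takes precedence over `num_train_epochs: 5`: the run ended at epoch ~6.996 (see `trainer_state.json` below), which with a global batch of 8 (8 FSDP ranks × per-device batch 1 × no gradient accumulation) works out to exactly 536 optimizer steps, or about 4,288 samples, per epoch. A sketch of the arithmetic:

```python
world_size = 8
per_device_batch = 1
grad_accum = 1
max_steps = 3750
final_epoch = 6.996268656716418   # from trainer_state.json

global_batch = world_size * per_device_batch * grad_accum  # 8
steps_per_epoch = max_steps / final_epoch                  # 536.0
print(global_batch, steps_per_epoch, steps_per_epoch * global_batch)  # 8 536.0 4288.0
```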
telemetry/world_size.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "LOCAL_WORLD_SIZE": 8,
+ "WORLD_SIZE": 8
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,210 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- for message in messages -%}{%- if loop.first and message['role'] != 'system' -%}{{ '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}{%- endif -%}{{ '<|im_start|>' + message['role'] + '\\n' }}{%- if message['content'] is string -%}{{- message['content'] -}}{%- elif message['content'] is iterable -%}{%- for item in message['content'] -%}{%- if item['type'].startswith('image') -%}{%- set image_count.value = image_count.value + 1 -%}{%- if add_vision_id -%}{{ 'Picture ' + image_count.value + ': ' }}{%- endif -%}{{ '<|vision_start|><|image_pad|><|vision_end|>' }}{%- elif item['type'].startswith('video') -%}{%- set video_count.value = video_count.value + 1 -%}{%- if add_vision_id -%}{{ 'Video ' + video_count.value + ': ' }}{%- endif -%}{{ '<|vision_start|><|video_pad|><|vision_end|>' }}{%- elif item['type']=='text' -%}{{- item['text'] if 'text' in item else item['content'] -}}{%- endif -%}{%- endfor -%}{%- endif -%}{{ '<|im_end|>\\n' }}{%- endfor -%}{%- if add_generation_prompt -%}{{- '<|im_start|>assistant\\n' -}}{%- endif -%}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 10000,
+ "pad_token": "<|endoftext|>",
+ "padding_side": "right",
+ "processor_class": "Qwen2_5_VLProcessor",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
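The `chat_template` above injects a default system prompt and wraps each image in `<|vision_start|><|image_pad|><|vision_end|>`. A sketch of rendering it through the processor; the model id is the base checkpoint named in the training config, and loading the files committed in this repo behaves the same way:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "What is the area of the shaded region?"},
    ],
}]

prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <|vision_start|><|image_pad|><|vision_end|>What is the area of the shaded region?<|im_end|>
# <|im_start|>assistant
```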
trainer_state.json ADDED
@@ -0,0 +1,795 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 6.996268656716418,
+ "eval_steps": 500,
+ "global_step": 3750,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.09328358208955224,
+ "grad_norm": 2.4583044052124023,
+ "learning_rate": 8.672566371681418e-06,
+ "loss": 0.8247,
+ "mean_token_accuracy": 0.8019651556015015,
+ "num_input_tokens_seen": 323038,
+ "num_tokens": 323038.0,
+ "step": 50
+ },
+ {
+ "epoch": 0.1865671641791045,
+ "grad_norm": 2.1223530769348145,
+ "learning_rate": 1.7522123893805313e-05,
+ "loss": 0.6247,
+ "mean_token_accuracy": 0.8224787962436676,
+ "num_input_tokens_seen": 642936,
+ "num_tokens": 642936.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.2798507462686567,
+ "grad_norm": 5.335413932800293,
+ "learning_rate": 1.9995165482321775e-05,
+ "loss": 0.6278,
+ "mean_token_accuracy": 0.8260715854167938,
+ "num_input_tokens_seen": 976988,
+ "num_tokens": 976988.0,
+ "step": 150
+ },
+ {
+ "epoch": 0.373134328358209,
+ "grad_norm": 5.00244140625,
+ "learning_rate": 1.9972420885061576e-05,
+ "loss": 0.6247,
+ "mean_token_accuracy": 0.8278178679943085,
+ "num_input_tokens_seen": 1303490,
+ "num_tokens": 1303490.0,
+ "step": 200
+ },
+ {
+ "epoch": 0.4664179104477612,
+ "grad_norm": 5.494905948638916,
+ "learning_rate": 1.9931077431357095e-05,
+ "loss": 0.649,
+ "mean_token_accuracy": 0.8199629688262939,
+ "num_input_tokens_seen": 1633598,
+ "num_tokens": 1633598.0,
+ "step": 250
+ },
+ {
+ "epoch": 0.5597014925373134,
+ "grad_norm": 5.7639923095703125,
+ "learning_rate": 1.9871212227957962e-05,
+ "loss": 0.6276,
+ "mean_token_accuracy": 0.8245489084720612,
+ "num_input_tokens_seen": 1959940,
+ "num_tokens": 1959940.0,
+ "step": 300
+ },
+ {
+ "epoch": 0.6529850746268657,
+ "grad_norm": 5.357442378997803,
+ "learning_rate": 1.979293692521837e-05,
+ "loss": 0.6308,
+ "mean_token_accuracy": 0.8240770590305329,
+ "num_input_tokens_seen": 2273744,
+ "num_tokens": 2273744.0,
+ "step": 350
+ },
+ {
+ "epoch": 0.746268656716418,
+ "grad_norm": 7.147578239440918,
+ "learning_rate": 1.9696397508865917e-05,
+ "loss": 0.6753,
+ "mean_token_accuracy": 0.8150083267688751,
+ "num_input_tokens_seen": 2594534,
+ "num_tokens": 2594534.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.8395522388059702,
+ "grad_norm": 5.7040605545043945,
+ "learning_rate": 1.9581774027733947e-05,
+ "loss": 0.6221,
+ "mean_token_accuracy": 0.8259347748756408,
+ "num_input_tokens_seen": 2913982,
+ "num_tokens": 2913982.0,
+ "step": 450
+ },
+ {
+ "epoch": 0.9328358208955224,
+ "grad_norm": 5.201114654541016,
+ "learning_rate": 1.944928025796521e-05,
+ "loss": 0.6178,
+ "mean_token_accuracy": 0.8253031682968139,
+ "num_input_tokens_seen": 3233948,
+ "num_tokens": 3233948.0,
+ "step": 500
+ },
+ {
+ "epoch": 1.0261194029850746,
+ "grad_norm": 5.712339878082275,
+ "learning_rate": 1.929916330431312e-05,
+ "loss": 0.6321,
+ "mean_token_accuracy": 0.8240388989448547,
+ "num_input_tokens_seen": 3568746,
+ "num_tokens": 3568746.0,
+ "step": 550
+ },
+ {
+ "epoch": 1.1194029850746268,
+ "grad_norm": 3.397759199142456,
+ "learning_rate": 1.9131703139284143e-05,
+ "loss": 0.4505,
+ "mean_token_accuracy": 0.8675171172618866,
+ "num_input_tokens_seen": 3886940,
+ "num_tokens": 3886940.0,
+ "step": 600
+ },
+ {
+ "epoch": 1.212686567164179,
+ "grad_norm": 3.1189541816711426,
+ "learning_rate": 1.894721208098092e-05,
+ "loss": 0.3317,
+ "mean_token_accuracy": 0.9006148743629455,
+ "num_input_tokens_seen": 4206696,
+ "num_tokens": 4206696.0,
+ "step": 650
+ },
+ {
+ "epoch": 1.3059701492537314,
+ "grad_norm": 2.917012929916382,
+ "learning_rate": 1.874603421061986e-05,
+ "loss": 0.3288,
+ "mean_token_accuracy": 0.900334278345108,
+ "num_input_tokens_seen": 4538022,
+ "num_tokens": 4538022.0,
+ "step": 700
+ },
+ {
+ "epoch": 1.3992537313432836,
+ "grad_norm": 3.822690963745117,
+ "learning_rate": 1.852854473080961e-05,
+ "loss": 0.4135,
+ "mean_token_accuracy": 0.8779956555366516,
+ "num_input_tokens_seen": 4868192,
+ "num_tokens": 4868192.0,
+ "step": 750
+ },
+ {
+ "epoch": 1.4925373134328357,
+ "grad_norm": 3.944916009902954,
+ "learning_rate": 1.8295149265787224e-05,
+ "loss": 0.4413,
+ "mean_token_accuracy": 0.8670336186885834,
+ "num_input_tokens_seen": 5201704,
+ "num_tokens": 5201704.0,
+ "step": 800
+ },
+ {
+ "epoch": 1.585820895522388,
+ "grad_norm": 4.3602423667907715,
+ "learning_rate": 1.8046283104917116e-05,
+ "loss": 0.4167,
+ "mean_token_accuracy": 0.87647913813591,
+ "num_input_tokens_seen": 5521682,
+ "num_tokens": 5521682.0,
+ "step": 850
+ },
+ {
+ "epoch": 1.6791044776119404,
+ "grad_norm": 4.468040943145752,
+ "learning_rate": 1.7782410390863664e-05,
+ "loss": 0.4282,
+ "mean_token_accuracy": 0.8733036065101624,
+ "num_input_tokens_seen": 5840636,
+ "num_tokens": 5840636.0,
+ "step": 900
+ },
+ {
+ "epoch": 1.7723880597014925,
+ "grad_norm": 5.0916666984558105,
+ "learning_rate": 1.750402325395156e-05,
+ "loss": 0.422,
+ "mean_token_accuracy": 0.8751583611965179,
+ "num_input_tokens_seen": 6152288,
+ "num_tokens": 6152288.0,
+ "step": 950
+ },
+ {
+ "epoch": 1.8656716417910446,
+ "grad_norm": 3.583630084991455,
+ "learning_rate": 1.7211640894328413e-05,
+ "loss": 0.4014,
+ "mean_token_accuracy": 0.8789325177669525,
+ "num_input_tokens_seen": 6475382,
+ "num_tokens": 6475382.0,
+ "step": 1000
+ },
+ {
+ "epoch": 1.9589552238805972,
+ "grad_norm": 4.270807266235352,
+ "learning_rate": 1.6905808613641233e-05,
+ "loss": 0.4074,
+ "mean_token_accuracy": 0.8787497889995575,
+ "num_input_tokens_seen": 6799610,
+ "num_tokens": 6799610.0,
+ "step": 1050
+ },
+ {
+ "epoch": 2.0522388059701493,
+ "grad_norm": 3.991750717163086,
+ "learning_rate": 1.6587096798032984e-05,
+ "loss": 0.3727,
+ "mean_token_accuracy": 0.8879767000675202,
+ "num_input_tokens_seen": 7129112,
+ "num_tokens": 7129112.0,
+ "step": 1100
+ },
+ {
+ "epoch": 2.1455223880597014,
+ "grad_norm": 2.423408269882202,
+ "learning_rate": 1.625609985435571e-05,
+ "loss": 0.2241,
+ "mean_token_accuracy": 0.9320108902454376,
+ "num_input_tokens_seen": 7449332,
+ "num_tokens": 7449332.0,
+ "step": 1150
+ },
+ {
+ "epoch": 2.2388059701492535,
+ "grad_norm": 1.7369468212127686,
+ "learning_rate": 1.59134351015844e-05,
+ "loss": 0.1621,
+ "mean_token_accuracy": 0.9507821369171142,
+ "num_input_tokens_seen": 7773514,
+ "num_tokens": 7773514.0,
+ "step": 1200
+ },
+ {
+ "epoch": 2.332089552238806,
+ "grad_norm": 1.9339622259140015,
+ "learning_rate": 1.555974161949906e-05,
+ "loss": 0.174,
+ "mean_token_accuracy": 0.9477302300930023,
+ "num_input_tokens_seen": 8102664,
+ "num_tokens": 8102664.0,
+ "step": 1250
+ },
+ {
+ "epoch": 2.425373134328358,
+ "grad_norm": 2.86017107963562,
+ "learning_rate": 1.519567905678223e-05,
+ "loss": 0.2275,
+ "mean_token_accuracy": 0.9304351592063904,
+ "num_input_tokens_seen": 8432668,
+ "num_tokens": 8432668.0,
+ "step": 1300
+ },
+ {
+ "epoch": 2.5186567164179103,
+ "grad_norm": 2.300647735595703,
+ "learning_rate": 1.4821926400754915e-05,
+ "loss": 0.2328,
+ "mean_token_accuracy": 0.9280073237419129,
+ "num_input_tokens_seen": 8759670,
+ "num_tokens": 8759670.0,
+ "step": 1350
+ },
+ {
+ "epoch": 2.611940298507463,
+ "grad_norm": 2.8485522270202637,
+ "learning_rate": 1.4439180711045395e-05,
+ "loss": 0.2274,
+ "mean_token_accuracy": 0.9305808675289154,
+ "num_input_tokens_seen": 9083690,
+ "num_tokens": 9083690.0,
+ "step": 1400
+ },
+ {
+ "epoch": 2.705223880597015,
+ "grad_norm": 2.8772568702697754,
+ "learning_rate": 1.4048155819552617e-05,
+ "loss": 0.2385,
+ "mean_token_accuracy": 0.9277240431308746,
+ "num_input_tokens_seen": 9400762,
+ "num_tokens": 9400762.0,
+ "step": 1450
+ },
+ {
+ "epoch": 2.798507462686567,
+ "grad_norm": 3.3837602138519287,
+ "learning_rate": 1.3649580999128871e-05,
+ "loss": 0.2225,
+ "mean_token_accuracy": 0.9319508814811707,
+ "num_input_tokens_seen": 9719416,
+ "num_tokens": 9719416.0,
+ "step": 1500
+ },
+ {
+ "epoch": 2.8917910447761193,
+ "grad_norm": 3.256094217300415,
+ "learning_rate": 1.3244199603464581e-05,
+ "loss": 0.2307,
+ "mean_token_accuracy": 0.9294045794010163,
+ "num_input_tokens_seen": 10038208,
+ "num_tokens": 10038208.0,
+ "step": 1550
+ },
+ {
+ "epoch": 2.9850746268656714,
+ "grad_norm": 2.5858240127563477,
+ "learning_rate": 1.2832767680711941e-05,
+ "loss": 0.2196,
+ "mean_token_accuracy": 0.9315053272247314,
+ "num_input_tokens_seen": 10366444,
+ "num_tokens": 10366444.0,
+ "step": 1600
+ },
+ {
+ "epoch": 3.078358208955224,
+ "grad_norm": 1.8756778240203857,
+ "learning_rate": 1.2416052563433043e-05,
+ "loss": 0.19,
+ "mean_token_accuracy": 0.9421073424816132,
+ "num_input_tokens_seen": 10687314,
+ "num_tokens": 10687314.0,
+ "step": 1650
+ },
+ {
+ "epoch": 3.171641791044776,
+ "grad_norm": 0.8857208490371704,
+ "learning_rate": 1.1994831437502172e-05,
+ "loss": 0.1102,
+ "mean_token_accuracy": 0.9673730087280273,
+ "num_input_tokens_seen": 11009694,
+ "num_tokens": 11009694.0,
+ "step": 1700
+ },
+ {
+ "epoch": 3.264925373134328,
+ "grad_norm": 1.2341127395629883,
+ "learning_rate": 1.1569889892631488e-05,
+ "loss": 0.0797,
+ "mean_token_accuracy": 0.9754443645477295,
+ "num_input_tokens_seen": 11337744,
+ "num_tokens": 11337744.0,
+ "step": 1750
+ },
+ {
+ "epoch": 3.3582089552238807,
+ "grad_norm": 1.431805968284607,
+ "learning_rate": 1.1142020457223195e-05,
+ "loss": 0.0912,
+ "mean_token_accuracy": 0.9724087870121002,
+ "num_input_tokens_seen": 11665956,
+ "num_tokens": 11665956.0,
+ "step": 1800
+ },
+ {
+ "epoch": 3.451492537313433,
+ "grad_norm": 1.9885361194610596,
+ "learning_rate": 1.0712021120280951e-05,
+ "loss": 0.1156,
+ "mean_token_accuracy": 0.9630080580711364,
+ "num_input_tokens_seen": 12000080,
+ "num_tokens": 12000080.0,
+ "step": 1850
+ },
+ {
+ "epoch": 3.544776119402985,
+ "grad_norm": 1.1434038877487183,
+ "learning_rate": 1.028069384313702e-05,
+ "loss": 0.1109,
+ "mean_token_accuracy": 0.9657398784160613,
+ "num_input_tokens_seen": 12322044,
+ "num_tokens": 12322044.0,
+ "step": 1900
+ },
+ {
+ "epoch": 3.638059701492537,
+ "grad_norm": 1.1466773748397827,
+ "learning_rate": 9.848843063770963e-06,
+ "loss": 0.1098,
+ "mean_token_accuracy": 0.966309015750885,
+ "num_input_tokens_seen": 12642090,
+ "num_tokens": 12642090.0,
+ "step": 1950
+ },
+ {
+ "epoch": 3.7313432835820897,
+ "grad_norm": 1.5840574502944946,
+ "learning_rate": 9.41727419650929e-06,
+ "loss": 0.121,
+ "mean_token_accuracy": 0.961807359457016,
+ "num_input_tokens_seen": 12961610,
+ "num_tokens": 12961610.0,
+ "step": 2000
+ },
+ {
+ "epoch": 3.824626865671642,
+ "grad_norm": 1.5495262145996094,
+ "learning_rate": 8.986792129904186e-06,
+ "loss": 0.1102,
+ "mean_token_accuracy": 0.9657205975055695,
+ "num_input_tokens_seen": 13279862,
+ "num_tokens": 13279862.0,
+ "step": 2050
+ },
+ {
+ "epoch": 3.917910447761194,
+ "grad_norm": 1.7342835664749146,
+ "learning_rate": 8.558199725592856e-06,
+ "loss": 0.1156,
+ "mean_token_accuracy": 0.9632673525810241,
+ "num_input_tokens_seen": 13597816,
+ "num_tokens": 13597816.0,
+ "step": 2100
+ },
+ {
+ "epoch": 4.0111940298507465,
+ "grad_norm": 2.0673277378082275,
+ "learning_rate": 8.132296320937085e-06,
+ "loss": 0.118,
+ "mean_token_accuracy": 0.9625415456295013,
+ "num_input_tokens_seen": 13928436,
+ "num_tokens": 13928436.0,
+ "step": 2150
+ },
+ {
+ "epoch": 4.104477611940299,
+ "grad_norm": 1.0974746942520142,
+ "learning_rate": 7.709876238235702e-06,
+ "loss": 0.0839,
+ "mean_token_accuracy": 0.9738617813587189,
+ "num_input_tokens_seen": 14249072,
+ "num_tokens": 14249072.0,
+ "step": 2200
+ },
+ {
+ "epoch": 4.197761194029851,
+ "grad_norm": 1.0228750705718994,
+ "learning_rate": 7.29172730329028e-06,
+ "loss": 0.0498,
+ "mean_token_accuracy": 0.9851219677925109,
+ "num_input_tokens_seen": 14571152,
+ "num_tokens": 14571152.0,
+ "step": 2250
+ },
+ {
+ "epoch": 4.291044776119403,
+ "grad_norm": 0.6385033130645752,
+ "learning_rate": 6.8786293760869695e-06,
+ "loss": 0.0388,
+ "mean_token_accuracy": 0.9884346830844879,
+ "num_input_tokens_seen": 14903168,
+ "num_tokens": 14903168.0,
+ "step": 2300
+ },
+ {
+ "epoch": 4.384328358208955,
+ "grad_norm": 0.8694852590560913,
+ "learning_rate": 6.4713528963348506e-06,
+ "loss": 0.0411,
+ "mean_token_accuracy": 0.9874970281124115,
+ "num_input_tokens_seen": 15233224,
+ "num_tokens": 15233224.0,
+ "step": 2350
+ },
+ {
+ "epoch": 4.477611940298507,
+ "grad_norm": 0.929122805595398,
+ "learning_rate": 6.070657446573347e-06,
+ "loss": 0.0476,
+ "mean_token_accuracy": 0.985052285194397,
+ "num_input_tokens_seen": 15565618,
+ "num_tokens": 15565618.0,
+ "step": 2400
+ },
+ {
+ "epoch": 4.57089552238806,
+ "grad_norm": 0.6402145624160767,
+ "learning_rate": 5.677290335528576e-06,
+ "loss": 0.0694,
+ "mean_token_accuracy": 0.9864287662506104,
+ "num_input_tokens_seen": 15886728,
+ "num_tokens": 15886728.0,
+ "step": 2450
+ },
+ {
+ "epoch": 4.664179104477612,
+ "grad_norm": 0.9010692238807678,
+ "learning_rate": 5.291985204360754e-06,
+ "loss": 0.0424,
+ "mean_token_accuracy": 0.9870220470428467,
+ "num_input_tokens_seen": 16200844,
+ "num_tokens": 16200844.0,
+ "step": 2500
+ },
+ {
+ "epoch": 4.757462686567164,
+ "grad_norm": 0.8838453888893127,
+ "learning_rate": 4.9154606584019646e-06,
+ "loss": 0.0433,
+ "mean_token_accuracy": 0.9861760056018829,
+ "num_input_tokens_seen": 16518546,
+ "num_tokens": 16518546.0,
+ "step": 2550
+ },
+ {
+ "epoch": 4.850746268656716,
+ "grad_norm": 1.0508785247802734,
+ "learning_rate": 4.548418926936235e-06,
+ "loss": 0.0413,
+ "mean_token_accuracy": 0.9870886874198913,
+ "num_input_tokens_seen": 16837904,
+ "num_tokens": 16837904.0,
+ "step": 2600
+ },
+ {
+ "epoch": 4.9440298507462686,
+ "grad_norm": 1.0464740991592407,
+ "learning_rate": 4.191544553521355e-06,
+ "loss": 0.0428,
+ "mean_token_accuracy": 0.9860042917728424,
+ "num_input_tokens_seen": 17162930,
+ "num_tokens": 17162930.0,
+ "step": 2650
+ },
+ {
+ "epoch": 5.037313432835821,
+ "grad_norm": 1.167823076248169,
+ "learning_rate": 3.845503119295182e-06,
+ "loss": 0.0407,
+ "mean_token_accuracy": 0.9872146189212799,
+ "num_input_tokens_seen": 17492580,
+ "num_tokens": 17492580.0,
+ "step": 2700
+ },
+ {
+ "epoch": 5.130597014925373,
+ "grad_norm": 0.6037238836288452,
+ "learning_rate": 3.5109400016473338e-06,
+ "loss": 0.0259,
+ "mean_token_accuracy": 0.9919403278827668,
+ "num_input_tokens_seen": 17814314,
+ "num_tokens": 17814314.0,
+ "step": 2750
+ },
+ {
+ "epoch": 5.223880597014926,
+ "grad_norm": 0.4187396168708801,
+ "learning_rate": 3.1884791705714936e-06,
+ "loss": 0.0157,
+ "mean_token_accuracy": 0.9955093479156494,
+ "num_input_tokens_seen": 18139326,
+ "num_tokens": 18139326.0,
+ "step": 2800
+ },
+ {
+ "epoch": 5.317164179104478,
+ "grad_norm": 0.46902546286582947,
+ "learning_rate": 2.878722024943139e-06,
+ "loss": 0.0139,
+ "mean_token_accuracy": 0.995677514076233,
+ "num_input_tokens_seen": 18466486,
+ "num_tokens": 18466486.0,
+ "step": 2850
+ },
+ {
+ "epoch": 5.41044776119403,
+ "grad_norm": 0.3824739456176758,
+ "learning_rate": 2.5822462708930607e-06,
+ "loss": 0.0146,
+ "mean_token_accuracy": 0.9957832169532775,
+ "num_input_tokens_seen": 18796326,
+ "num_tokens": 18796326.0,
+ "step": 2900
+ },
+ {
+ "epoch": 5.503731343283582,
+ "grad_norm": 0.5814462304115295,
+ "learning_rate": 2.299604844368547e-06,
+ "loss": 0.0156,
+ "mean_token_accuracy": 0.9954328262805938,
+ "num_input_tokens_seen": 19129954,
+ "num_tokens": 19129954.0,
+ "step": 2950
+ },
+ {
+ "epoch": 5.597014925373134,
+ "grad_norm": 0.45167264342308044,
+ "learning_rate": 2.031324879891664e-06,
+ "loss": 0.013,
+ "mean_token_accuracy": 0.9963365316390991,
+ "num_input_tokens_seen": 19448322,
+ "num_tokens": 19448322.0,
+ "step": 3000
+ },
+ {
+ "epoch": 5.690298507462686,
+ "grad_norm": 0.5698373913764954,
+ "learning_rate": 1.777906727437979e-06,
+ "loss": 0.0127,
+ "mean_token_accuracy": 0.9959760665893554,
+ "num_input_tokens_seen": 19765536,
+ "num_tokens": 19765536.0,
+ "step": 3050
+ },
+ {
+ "epoch": 5.7835820895522385,
+ "grad_norm": 0.5558890104293823,
+ "learning_rate": 1.5398230192692275e-06,
+ "loss": 0.0117,
+ "mean_token_accuracy": 0.9963926291465759,
+ "num_input_tokens_seen": 20082350,
+ "num_tokens": 20082350.0,
+ "step": 3100
+ },
+ {
+ "epoch": 5.8768656716417915,
+ "grad_norm": 0.46126776933670044,
+ "learning_rate": 1.3175177884603252e-06,
+ "loss": 0.0112,
+ "mean_token_accuracy": 0.9968423485755921,
+ "num_input_tokens_seen": 20401204,
+ "num_tokens": 20401204.0,
+ "step": 3150
+ },
+ {
+ "epoch": 5.970149253731344,
+ "grad_norm": 0.4649079144001007,
+ "learning_rate": 1.1114056407647045e-06,
+ "loss": 0.0111,
+ "mean_token_accuracy": 0.9965043890476227,
+ "num_input_tokens_seen": 20730370,
+ "num_tokens": 20730370.0,
+ "step": 3200
+ },
+ {
+ "epoch": 6.063432835820896,
+ "grad_norm": 0.49266737699508667,
+ "learning_rate": 9.218709813624749e-07,
+ "loss": 0.012,
+ "mean_token_accuracy": 0.9967284095287323,
+ "num_input_tokens_seen": 21052760,
+ "num_tokens": 21052760.0,
+ "step": 3250
+ },
+ {
+ "epoch": 6.156716417910448,
+ "grad_norm": 0.38067445158958435,
+ "learning_rate": 7.492672979335147e-07,
+ "loss": 0.0069,
+ "mean_token_accuracy": 0.998069132566452,
+ "num_input_tokens_seen": 21373624,
+ "num_tokens": 21373624.0,
+ "step": 3300
+ },
+ {
+ "epoch": 6.25,
+ "grad_norm": 0.43708378076553345,
+ "learning_rate": 5.939165013926195e-07,
+ "loss": 0.0054,
+ "mean_token_accuracy": 0.9985744786262513,
+ "num_input_tokens_seen": 21701456,
+ "num_tokens": 21701456.0,
+ "step": 3350
+ },
+ {
+ "epoch": 6.343283582089552,
+ "grad_norm": 0.5964256525039673,
+ "learning_rate": 4.56108325516238e-07,
+ "loss": 0.0066,
+ "mean_token_accuracy": 0.9981687092781066,
+ "num_input_tokens_seen": 22028186,
+ "num_tokens": 22028186.0,
+ "step": 3400
+ },
+ {
+ "epoch": 6.436567164179104,
+ "grad_norm": 0.5558670163154602,
+ "learning_rate": 3.3609978658051045e-07,
+ "loss": 0.0065,
+ "mean_token_accuracy": 0.9981926000118255,
+ "num_input_tokens_seen": 22357840,
+ "num_tokens": 22357840.0,
+ "step": 3450
+ },
+ {
+ "epoch": 6.529850746268656,
+ "grad_norm": 0.5026367902755737,
+ "learning_rate": 2.341147040184011e-07,
+ "loss": 0.0052,
+ "mean_token_accuracy": 0.9985788524150848,
+ "num_input_tokens_seen": 22688738,
+ "num_tokens": 22688738.0,
+ "step": 3500
+ },
+ {
+ "epoch": 6.6231343283582085,
+ "grad_norm": 0.2897884249687195,
+ "learning_rate": 1.5034328298990652e-07,
+ "loss": 0.0046,
+ "mean_token_accuracy": 0.9987975347042084,
+ "num_input_tokens_seen": 23010742,
+ "num_tokens": 23010742.0,
+ "step": 3550
+ },
+ {
+ "epoch": 6.7164179104477615,
+ "grad_norm": 0.43537065386772156,
+ "learning_rate": 8.494175964388285e-08,
+ "loss": 0.0039,
+ "mean_token_accuracy": 0.9988950288295746,
+ "num_input_tokens_seen": 23329282,
+ "num_tokens": 23329282.0,
+ "step": 3600
+ },
+ {
+ "epoch": 6.809701492537314,
+ "grad_norm": 0.36725738644599915,
+ "learning_rate": 3.803210973305715e-08,
+ "loss": 0.003,
+ "mean_token_accuracy": 0.9991644871234894,
+ "num_input_tokens_seen": 23647522,
+ "num_tokens": 23647522.0,
+ "step": 3650
+ },
+ {
+ "epoch": 6.902985074626866,
+ "grad_norm": 0.531343400478363,
+ "learning_rate": 9.7018211256783e-09,
+ "loss": 0.0046,
+ "mean_token_accuracy": 0.9987628531455993,
+ "num_input_tokens_seen": 23963558,
+ "num_tokens": 23963558.0,
+ "step": 3700
+ },
+ {
+ "epoch": 6.996268656716418,
+ "grad_norm": 0.6237491369247437,
+ "learning_rate": 3.7306380940016486e-12,
+ "loss": 0.0051,
+ "mean_token_accuracy": 0.9986482226848602,
+ "num_input_tokens_seen": 24290780,
+ "num_tokens": 24290780.0,
+ "step": 3750
+ },
+ {
+ "epoch": 6.996268656716418,
+ "num_input_tokens_seen": 24290780,
+ "step": 3750,
+ "total_flos": 1.4113858770069094e+17,
+ "train_loss": 0.21003443336486816,
+ "train_runtime": 66532.5206,
+ "train_samples_per_second": 0.451,
+ "train_steps_per_second": 0.056,
+ "train_tokens_per_second": 70.398
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 3750,
+ "num_input_tokens_seen": 24290780,
+ "num_train_epochs": 7,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": false,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.4113858770069094e+17,
+ "train_batch_size": 1,
+ "trial_name": null,
+ "trial_params": null
+ }
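`log_history` holds the metrics logged every 50 steps (the same series written to the tfevents file above). A sketch of pulling the loss curve out of it:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Regular logging entries carry "loss"; the final summary entry does not.
curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(curve[0], curve[-1])                      # (50, 0.8247) ... (3750, 0.0051)
print(state["log_history"][-1]["train_loss"])   # 0.21003443336486816 (run average)
```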
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ceac200286f0cf175e77ab1763672204503403d9d5009bfa55db268c97ee492
+ size 6161
vocab.json ADDED
The diff for this file is too large to render. See raw diff