onlyoneplease commited on
Commit
d9593ff
·
verified ·
1 Parent(s): 522c6a0

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. output/training/v1-20260117-010840-10e/args.json +353 -0
  2. output/training/v1-20260117-010840-10e/checkpoint-400/README.md +207 -0
  3. output/training/v1-20260117-010840-10e/checkpoint-400/adapter_config.json +38 -0
  4. output/training/v1-20260117-010840-10e/checkpoint-400/adapter_model.safetensors +3 -0
  5. output/training/v1-20260117-010840-10e/checkpoint-400/additional_config.json +1 -0
  6. output/training/v1-20260117-010840-10e/checkpoint-400/args.json +353 -0
  7. output/training/v1-20260117-010840-10e/checkpoint-400/optimizer.pt +3 -0
  8. output/training/v1-20260117-010840-10e/checkpoint-400/rng_state.pth +3 -0
  9. output/training/v1-20260117-010840-10e/checkpoint-400/scheduler.pt +3 -0
  10. output/training/v1-20260117-010840-10e/checkpoint-400/trainer_state.json +362 -0
  11. output/training/v1-20260117-010840-10e/checkpoint-400/training_args.bin +3 -0
  12. output/training/v1-20260117-010840-10e/checkpoint-500/README.md +207 -0
  13. output/training/v1-20260117-010840-10e/checkpoint-500/adapter_config.json +38 -0
  14. output/training/v1-20260117-010840-10e/checkpoint-500/adapter_model.safetensors +3 -0
  15. output/training/v1-20260117-010840-10e/checkpoint-500/additional_config.json +1 -0
  16. output/training/v1-20260117-010840-10e/checkpoint-500/args.json +353 -0
  17. output/training/v1-20260117-010840-10e/checkpoint-500/optimizer.pt +3 -0
  18. output/training/v1-20260117-010840-10e/checkpoint-500/rng_state.pth +3 -0
  19. output/training/v1-20260117-010840-10e/checkpoint-500/scheduler.pt +3 -0
  20. output/training/v1-20260117-010840-10e/checkpoint-500/trainer_state.json +442 -0
  21. output/training/v1-20260117-010840-10e/checkpoint-500/training_args.bin +3 -0
  22. output/training/v1-20260117-010840-10e/checkpoint-580/README.md +207 -0
  23. output/training/v1-20260117-010840-10e/checkpoint-580/adapter_config.json +38 -0
  24. output/training/v1-20260117-010840-10e/checkpoint-580/adapter_model.safetensors +3 -0
  25. output/training/v1-20260117-010840-10e/checkpoint-580/additional_config.json +1 -0
  26. output/training/v1-20260117-010840-10e/checkpoint-580/args.json +353 -0
  27. output/training/v1-20260117-010840-10e/checkpoint-580/optimizer.pt +3 -0
  28. output/training/v1-20260117-010840-10e/checkpoint-580/rng_state.pth +3 -0
  29. output/training/v1-20260117-010840-10e/checkpoint-580/scheduler.pt +3 -0
  30. output/training/v1-20260117-010840-10e/checkpoint-580/trainer_state.json +506 -0
  31. output/training/v1-20260117-010840-10e/checkpoint-580/training_args.bin +3 -0
  32. output/training/v1-20260117-010840-10e/images/train_epoch.png +0 -0
  33. output/training/v1-20260117-010840-10e/images/train_grad_norm.png +0 -0
  34. output/training/v1-20260117-010840-10e/images/train_learning_rate.png +0 -0
  35. output/training/v1-20260117-010840-10e/images/train_loss.png +0 -0
  36. output/training/v1-20260117-010840-10e/images/train_token_acc.png +0 -0
  37. output/training/v1-20260117-010840-10e/images/train_total_flos.png +0 -0
  38. output/training/v1-20260117-010840-10e/images/train_train_loss.png +0 -0
  39. output/training/v1-20260117-010840-10e/images/train_train_runtime.png +0 -0
  40. output/training/v1-20260117-010840-10e/images/train_train_samples_per_second.png +0 -0
  41. output/training/v1-20260117-010840-10e/images/train_train_steps_per_second.png +0 -0
  42. output/training/v1-20260117-010840-10e/logging.jsonl +61 -0
  43. output/training/v1-20260117-010840-10e/runs/events.out.tfevents.1768612131.5090.2113421.0 +3 -0
output/training/v1-20260117-010840-10e/args.json ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "no",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 8,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 0.0001,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 10.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 10,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "steps",
38
+ "save_steps": 100.0,
39
+ "save_total_limit": 3,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "bf16": true,
51
+ "fp16": false,
52
+ "fp16_opt_level": "O1",
53
+ "half_precision_backend": "auto",
54
+ "bf16_full_eval": false,
55
+ "fp16_full_eval": false,
56
+ "tf32": null,
57
+ "local_rank": -1,
58
+ "ddp_backend": null,
59
+ "tpu_num_cores": null,
60
+ "tpu_metrics_debug": false,
61
+ "debug": null,
62
+ "dataloader_drop_last": false,
63
+ "eval_steps": 100.0,
64
+ "dataloader_num_workers": 4,
65
+ "dataloader_prefetch_factor": null,
66
+ "past_index": -1,
67
+ "run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
68
+ "disable_tqdm": null,
69
+ "remove_unused_columns": true,
70
+ "label_names": null,
71
+ "load_best_model_at_end": false,
72
+ "metric_for_best_model": "loss",
73
+ "greater_is_better": false,
74
+ "ignore_data_skip": false,
75
+ "fsdp": [],
76
+ "fsdp_min_num_params": 0,
77
+ "fsdp_config": null,
78
+ "fsdp_transformer_layer_cls_to_wrap": null,
79
+ "accelerator_config": {
80
+ "dispatch_batches": false
81
+ },
82
+ "parallelism_config": null,
83
+ "deepspeed": null,
84
+ "label_smoothing_factor": 0.0,
85
+ "optim": "adamw_torch_fused",
86
+ "optim_args": null,
87
+ "adafactor": false,
88
+ "group_by_length": false,
89
+ "length_column_name": "length",
90
+ "report_to": [
91
+ "tensorboard"
92
+ ],
93
+ "project": "huggingface",
94
+ "trackio_space_id": "trackio",
95
+ "ddp_find_unused_parameters": null,
96
+ "ddp_bucket_cap_mb": null,
97
+ "ddp_broadcast_buffers": null,
98
+ "dataloader_pin_memory": true,
99
+ "dataloader_persistent_workers": false,
100
+ "skip_memory_metrics": true,
101
+ "use_legacy_prediction_loop": false,
102
+ "push_to_hub": false,
103
+ "resume_from_checkpoint": null,
104
+ "hub_model_id": null,
105
+ "hub_strategy": "every_save",
106
+ "hub_token": null,
107
+ "hub_private_repo": null,
108
+ "hub_always_push": false,
109
+ "hub_revision": null,
110
+ "gradient_checkpointing": true,
111
+ "gradient_checkpointing_kwargs": null,
112
+ "include_inputs_for_metrics": false,
113
+ "include_for_metrics": [],
114
+ "eval_do_concat_batches": true,
115
+ "fp16_backend": "auto",
116
+ "push_to_hub_model_id": null,
117
+ "push_to_hub_organization": null,
118
+ "push_to_hub_token": null,
119
+ "mp_parameters": "",
120
+ "auto_find_batch_size": false,
121
+ "full_determinism": false,
122
+ "torchdynamo": null,
123
+ "ray_scope": "last",
124
+ "ddp_timeout": 18000000,
125
+ "torch_compile": false,
126
+ "torch_compile_backend": null,
127
+ "torch_compile_mode": null,
128
+ "include_tokens_per_second": false,
129
+ "include_num_input_tokens_seen": false,
130
+ "neftune_noise_alpha": null,
131
+ "optim_target_modules": null,
132
+ "batch_eval_metrics": false,
133
+ "eval_on_start": false,
134
+ "use_liger_kernel": false,
135
+ "liger_kernel_config": null,
136
+ "eval_use_gather_object": false,
137
+ "average_tokens_across_devices": true,
138
+ "sortish_sampler": false,
139
+ "predict_with_generate": false,
140
+ "generation_max_length": null,
141
+ "generation_num_beams": null,
142
+ "generation_config": null,
143
+ "tuner_backend": "peft",
144
+ "vit_gradient_checkpointing": null,
145
+ "router_aux_loss_coef": 0.0,
146
+ "enable_dft_loss": false,
147
+ "enable_channel_loss": false,
148
+ "check_model": true,
149
+ "acc_strategy": "token",
150
+ "train_dataloader_shuffle": true,
151
+ "max_epochs": null,
152
+ "aligner_lr": null,
153
+ "vit_lr": null,
154
+ "use_logits_to_keep": null,
155
+ "ds3_gather_for_generation": true,
156
+ "resume_only_model": false,
157
+ "optimizer": null,
158
+ "loss_type": null,
159
+ "metric": null,
160
+ "eval_use_evalscope": false,
161
+ "eval_dataset": [],
162
+ "eval_dataset_args": null,
163
+ "eval_limit": null,
164
+ "eval_generation_config": null,
165
+ "extra_eval_args": null,
166
+ "use_flash_ckpt": false,
167
+ "use_ray": false,
168
+ "ray_exp_name": null,
169
+ "device_groups": null,
170
+ "model": "nanonets/Nanonets-OCR2-3B",
171
+ "model_type": "qwen2_5_vl",
172
+ "model_revision": null,
173
+ "task_type": "causal_lm",
174
+ "torch_dtype": "bfloat16",
175
+ "attn_impl": null,
176
+ "new_special_tokens": [],
177
+ "num_labels": null,
178
+ "problem_type": null,
179
+ "rope_scaling": null,
180
+ "device_map": null,
181
+ "max_memory": {},
182
+ "max_model_len": null,
183
+ "local_repo_path": null,
184
+ "init_strategy": null,
185
+ "template": "qwen2_5_vl",
186
+ "system": null,
187
+ "max_length": 8192,
188
+ "truncation_strategy": "delete",
189
+ "max_pixels": null,
190
+ "agent_template": null,
191
+ "norm_bbox": null,
192
+ "use_chat_template": true,
193
+ "padding_side": "right",
194
+ "padding_free": false,
195
+ "loss_scale": "default",
196
+ "sequence_parallel_size": 1,
197
+ "template_backend": "swift",
198
+ "response_prefix": null,
199
+ "enable_thinking": null,
200
+ "add_non_thinking_prefix": true,
201
+ "dataset": [
202
+ "/home/ab/document-parsing/output/datasets/train.jsonl"
203
+ ],
204
+ "val_dataset": [],
205
+ "cached_dataset": [],
206
+ "cached_val_dataset": [],
207
+ "split_dataset_ratio": 0.0,
208
+ "dataset_num_proc": 1,
209
+ "load_from_cache_file": false,
210
+ "dataset_shuffle": true,
211
+ "val_dataset_shuffle": false,
212
+ "streaming": false,
213
+ "interleave_prob": null,
214
+ "stopping_strategy": "first_exhausted",
215
+ "shuffle_buffer_size": 1000,
216
+ "download_mode": "reuse_dataset_if_exists",
217
+ "columns": {},
218
+ "strict": false,
219
+ "model_name": null,
220
+ "model_author": null,
221
+ "custom_dataset_info": [],
222
+ "quant_method": null,
223
+ "quant_bits": null,
224
+ "hqq_axis": null,
225
+ "bnb_4bit_compute_dtype": "bfloat16",
226
+ "bnb_4bit_quant_type": "nf4",
227
+ "bnb_4bit_use_double_quant": true,
228
+ "bnb_4bit_quant_storage": null,
229
+ "max_new_tokens": 64,
230
+ "temperature": 0.0,
231
+ "top_k": null,
232
+ "top_p": null,
233
+ "repetition_penalty": null,
234
+ "num_beams": 1,
235
+ "stream": false,
236
+ "stop_words": [],
237
+ "logprobs": false,
238
+ "top_logprobs": null,
239
+ "structured_outputs_regex": null,
240
+ "ckpt_dir": null,
241
+ "lora_modules": [],
242
+ "train_type": "lora",
243
+ "adapters": [],
244
+ "external_plugins": [],
245
+ "model_kwargs": {},
246
+ "load_args": false,
247
+ "load_data_args": false,
248
+ "packing": false,
249
+ "packing_length": null,
250
+ "packing_num_proc": 1,
251
+ "lazy_tokenize": true,
252
+ "custom_register_path": [],
253
+ "use_hf": false,
254
+ "ignore_args_error": false,
255
+ "use_swift_lora": false,
256
+ "freeze_parameters": [],
257
+ "freeze_parameters_regex": null,
258
+ "freeze_parameters_ratio": 0.0,
259
+ "trainable_parameters": [],
260
+ "trainable_parameters_regex": null,
261
+ "freeze_llm": false,
262
+ "freeze_vit": false,
263
+ "freeze_aligner": true,
264
+ "target_modules": [
265
+ "all-linear"
266
+ ],
267
+ "target_regex": null,
268
+ "target_parameters": null,
269
+ "modules_to_save": [],
270
+ "lora_rank": 64,
271
+ "lora_alpha": 16,
272
+ "lora_dropout": 0.05,
273
+ "lora_bias": "none",
274
+ "lora_dtype": null,
275
+ "lorap_lr_ratio": null,
276
+ "use_rslora": false,
277
+ "use_dora": false,
278
+ "lora_ga_batch_size": 2,
279
+ "lora_ga_iters": 2,
280
+ "lora_ga_max_length": 1024,
281
+ "lora_ga_direction": "ArB2r",
282
+ "lora_ga_scale": "stable",
283
+ "lora_ga_stable_gamma": 16,
284
+ "init_weights": true,
285
+ "fourier_n_frequency": 2000,
286
+ "fourier_scaling": 300.0,
287
+ "boft_block_size": 4,
288
+ "boft_block_num": 0,
289
+ "boft_n_butterfly_factor": 1,
290
+ "boft_dropout": 0.0,
291
+ "vera_rank": 256,
292
+ "vera_projection_prng_key": 0,
293
+ "vera_dropout": 0.0,
294
+ "vera_d_initial": 0.1,
295
+ "adapter_act": "gelu",
296
+ "adapter_length": 128,
297
+ "use_galore": false,
298
+ "galore_target_modules": null,
299
+ "galore_rank": 128,
300
+ "galore_update_proj_gap": 50,
301
+ "galore_scale": 1.0,
302
+ "galore_proj_type": "std",
303
+ "galore_optim_per_parameter": false,
304
+ "galore_with_embedding": false,
305
+ "galore_quantization": false,
306
+ "galore_proj_quant": false,
307
+ "galore_proj_bits": 4,
308
+ "galore_proj_group_size": 256,
309
+ "galore_cos_threshold": 0.4,
310
+ "galore_gamma_proj": 2,
311
+ "galore_queue_size": 5,
312
+ "adalora_target_r": 8,
313
+ "adalora_init_r": 12,
314
+ "adalora_tinit": 0,
315
+ "adalora_tfinal": 0,
316
+ "adalora_deltaT": 1,
317
+ "adalora_beta1": 0.85,
318
+ "adalora_beta2": 0.85,
319
+ "adalora_orth_reg_weight": 0.5,
320
+ "llamapro_num_new_blocks": 4,
321
+ "llamapro_num_groups": null,
322
+ "lisa_activated_layers": 0,
323
+ "lisa_step_interval": 20,
324
+ "reft_layer_key": null,
325
+ "reft_layers": null,
326
+ "reft_rank": 4,
327
+ "reft_intervention_type": "LoreftIntervention",
328
+ "reft_args": null,
329
+ "swanlab_token": null,
330
+ "swanlab_project": "ms-swift",
331
+ "swanlab_workspace": null,
332
+ "swanlab_exp_name": null,
333
+ "swanlab_notification_method": null,
334
+ "swanlab_webhook_url": null,
335
+ "swanlab_secret": null,
336
+ "swanlab_mode": "cloud",
337
+ "add_version": true,
338
+ "create_checkpoint_symlink": false,
339
+ "zero_hpz_partition_size": null,
340
+ "deepspeed_autotp_size": null,
341
+ "early_stop_interval": null,
342
+ "rank": -1,
343
+ "global_world_size": 1,
344
+ "local_world_size": 1,
345
+ "model_suffix": "Nanonets-OCR2-3B",
346
+ "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
347
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
348
+ "model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
349
+ "_val_dataset_exists": [],
350
+ "hub": "<class 'swift.hub.hub.MSHub'>",
351
+ "evaluation_strategy": "steps",
352
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
353
+ }
output/training/v1-20260117-010840-10e/checkpoint-400/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ''
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
output/training/v1-20260117-010840-10e/checkpoint-400/adapter_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [],
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": "^(model.language_model.*\\.(down_proj|up_proj|gate_proj|v_proj|k_proj|q_proj|o_proj)|(?!(model.visual.merger))model.visual.*\\.(mlp.0|down_proj|up_proj|gate_proj|mlp.2|qkv|attn.proj))$",
32
+ "target_parameters": null,
33
+ "task_type": "CAUSAL_LM",
34
+ "trainable_token_indices": null,
35
+ "use_dora": false,
36
+ "use_qalora": false,
37
+ "use_rslora": false
38
+ }
output/training/v1-20260117-010840-10e/checkpoint-400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2b5ba0bce8b712e8f48caae7682b785de24c5632eb5b9ada4c276878e3e846c
3
+ size 657478696
output/training/v1-20260117-010840-10e/checkpoint-400/additional_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
output/training/v1-20260117-010840-10e/checkpoint-400/args.json ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "no",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 8,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 0.0001,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 10.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 10,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "steps",
38
+ "save_steps": 100.0,
39
+ "save_total_limit": 3,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "bf16": true,
51
+ "fp16": false,
52
+ "fp16_opt_level": "O1",
53
+ "half_precision_backend": "auto",
54
+ "bf16_full_eval": false,
55
+ "fp16_full_eval": false,
56
+ "tf32": null,
57
+ "local_rank": -1,
58
+ "ddp_backend": null,
59
+ "tpu_num_cores": null,
60
+ "tpu_metrics_debug": false,
61
+ "debug": null,
62
+ "dataloader_drop_last": false,
63
+ "eval_steps": 100.0,
64
+ "dataloader_num_workers": 4,
65
+ "dataloader_prefetch_factor": null,
66
+ "past_index": -1,
67
+ "run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
68
+ "disable_tqdm": null,
69
+ "remove_unused_columns": true,
70
+ "label_names": null,
71
+ "load_best_model_at_end": false,
72
+ "metric_for_best_model": "loss",
73
+ "greater_is_better": false,
74
+ "ignore_data_skip": false,
75
+ "fsdp": [],
76
+ "fsdp_min_num_params": 0,
77
+ "fsdp_config": null,
78
+ "fsdp_transformer_layer_cls_to_wrap": null,
79
+ "accelerator_config": {
80
+ "dispatch_batches": false
81
+ },
82
+ "parallelism_config": null,
83
+ "deepspeed": null,
84
+ "label_smoothing_factor": 0.0,
85
+ "optim": "adamw_torch_fused",
86
+ "optim_args": null,
87
+ "adafactor": false,
88
+ "group_by_length": false,
89
+ "length_column_name": "length",
90
+ "report_to": [
91
+ "tensorboard"
92
+ ],
93
+ "project": "huggingface",
94
+ "trackio_space_id": "trackio",
95
+ "ddp_find_unused_parameters": null,
96
+ "ddp_bucket_cap_mb": null,
97
+ "ddp_broadcast_buffers": null,
98
+ "dataloader_pin_memory": true,
99
+ "dataloader_persistent_workers": false,
100
+ "skip_memory_metrics": true,
101
+ "use_legacy_prediction_loop": false,
102
+ "push_to_hub": false,
103
+ "resume_from_checkpoint": null,
104
+ "hub_model_id": null,
105
+ "hub_strategy": "every_save",
106
+ "hub_token": null,
107
+ "hub_private_repo": null,
108
+ "hub_always_push": false,
109
+ "hub_revision": null,
110
+ "gradient_checkpointing": true,
111
+ "gradient_checkpointing_kwargs": null,
112
+ "include_inputs_for_metrics": false,
113
+ "include_for_metrics": [],
114
+ "eval_do_concat_batches": true,
115
+ "fp16_backend": "auto",
116
+ "push_to_hub_model_id": null,
117
+ "push_to_hub_organization": null,
118
+ "push_to_hub_token": null,
119
+ "mp_parameters": "",
120
+ "auto_find_batch_size": false,
121
+ "full_determinism": false,
122
+ "torchdynamo": null,
123
+ "ray_scope": "last",
124
+ "ddp_timeout": 18000000,
125
+ "torch_compile": false,
126
+ "torch_compile_backend": null,
127
+ "torch_compile_mode": null,
128
+ "include_tokens_per_second": false,
129
+ "include_num_input_tokens_seen": false,
130
+ "neftune_noise_alpha": null,
131
+ "optim_target_modules": null,
132
+ "batch_eval_metrics": false,
133
+ "eval_on_start": false,
134
+ "use_liger_kernel": false,
135
+ "liger_kernel_config": null,
136
+ "eval_use_gather_object": false,
137
+ "average_tokens_across_devices": true,
138
+ "sortish_sampler": false,
139
+ "predict_with_generate": false,
140
+ "generation_max_length": null,
141
+ "generation_num_beams": null,
142
+ "generation_config": null,
143
+ "tuner_backend": "peft",
144
+ "vit_gradient_checkpointing": null,
145
+ "router_aux_loss_coef": 0.0,
146
+ "enable_dft_loss": false,
147
+ "enable_channel_loss": false,
148
+ "check_model": true,
149
+ "acc_strategy": "token",
150
+ "train_dataloader_shuffle": true,
151
+ "max_epochs": null,
152
+ "aligner_lr": null,
153
+ "vit_lr": null,
154
+ "use_logits_to_keep": null,
155
+ "ds3_gather_for_generation": true,
156
+ "resume_only_model": false,
157
+ "optimizer": null,
158
+ "loss_type": null,
159
+ "metric": null,
160
+ "eval_use_evalscope": false,
161
+ "eval_dataset": [],
162
+ "eval_dataset_args": null,
163
+ "eval_limit": null,
164
+ "eval_generation_config": null,
165
+ "extra_eval_args": null,
166
+ "use_flash_ckpt": false,
167
+ "use_ray": false,
168
+ "ray_exp_name": null,
169
+ "device_groups": null,
170
+ "model": "nanonets/Nanonets-OCR2-3B",
171
+ "model_type": "qwen2_5_vl",
172
+ "model_revision": null,
173
+ "task_type": "causal_lm",
174
+ "torch_dtype": "bfloat16",
175
+ "attn_impl": null,
176
+ "new_special_tokens": [],
177
+ "num_labels": null,
178
+ "problem_type": null,
179
+ "rope_scaling": null,
180
+ "device_map": null,
181
+ "max_memory": {},
182
+ "max_model_len": null,
183
+ "local_repo_path": null,
184
+ "init_strategy": null,
185
+ "template": "qwen2_5_vl",
186
+ "system": null,
187
+ "max_length": 8192,
188
+ "truncation_strategy": "delete",
189
+ "max_pixels": null,
190
+ "agent_template": null,
191
+ "norm_bbox": null,
192
+ "use_chat_template": true,
193
+ "padding_side": "right",
194
+ "padding_free": false,
195
+ "loss_scale": "default",
196
+ "sequence_parallel_size": 1,
197
+ "template_backend": "swift",
198
+ "response_prefix": null,
199
+ "enable_thinking": null,
200
+ "add_non_thinking_prefix": true,
201
+ "dataset": [
202
+ "/home/ab/document-parsing/output/datasets/train.jsonl"
203
+ ],
204
+ "val_dataset": [],
205
+ "cached_dataset": [],
206
+ "cached_val_dataset": [],
207
+ "split_dataset_ratio": 0.0,
208
+ "dataset_num_proc": 1,
209
+ "load_from_cache_file": false,
210
+ "dataset_shuffle": true,
211
+ "val_dataset_shuffle": false,
212
+ "streaming": false,
213
+ "interleave_prob": null,
214
+ "stopping_strategy": "first_exhausted",
215
+ "shuffle_buffer_size": 1000,
216
+ "download_mode": "reuse_dataset_if_exists",
217
+ "columns": {},
218
+ "strict": false,
219
+ "model_name": null,
220
+ "model_author": null,
221
+ "custom_dataset_info": [],
222
+ "quant_method": null,
223
+ "quant_bits": null,
224
+ "hqq_axis": null,
225
+ "bnb_4bit_compute_dtype": "bfloat16",
226
+ "bnb_4bit_quant_type": "nf4",
227
+ "bnb_4bit_use_double_quant": true,
228
+ "bnb_4bit_quant_storage": null,
229
+ "max_new_tokens": 64,
230
+ "temperature": 0.0,
231
+ "top_k": null,
232
+ "top_p": null,
233
+ "repetition_penalty": null,
234
+ "num_beams": 1,
235
+ "stream": false,
236
+ "stop_words": [],
237
+ "logprobs": false,
238
+ "top_logprobs": null,
239
+ "structured_outputs_regex": null,
240
+ "ckpt_dir": null,
241
+ "lora_modules": [],
242
+ "train_type": "lora",
243
+ "adapters": [],
244
+ "external_plugins": [],
245
+ "model_kwargs": {},
246
+ "load_args": false,
247
+ "load_data_args": false,
248
+ "packing": false,
249
+ "packing_length": null,
250
+ "packing_num_proc": 1,
251
+ "lazy_tokenize": true,
252
+ "custom_register_path": [],
253
+ "use_hf": false,
254
+ "ignore_args_error": false,
255
+ "use_swift_lora": false,
256
+ "freeze_parameters": [],
257
+ "freeze_parameters_regex": null,
258
+ "freeze_parameters_ratio": 0.0,
259
+ "trainable_parameters": [],
260
+ "trainable_parameters_regex": null,
261
+ "freeze_llm": false,
262
+ "freeze_vit": false,
263
+ "freeze_aligner": true,
264
+ "target_modules": [
265
+ "all-linear"
266
+ ],
267
+ "target_regex": null,
268
+ "target_parameters": null,
269
+ "modules_to_save": [],
270
+ "lora_rank": 64,
271
+ "lora_alpha": 16,
272
+ "lora_dropout": 0.05,
273
+ "lora_bias": "none",
274
+ "lora_dtype": null,
275
+ "lorap_lr_ratio": null,
276
+ "use_rslora": false,
277
+ "use_dora": false,
278
+ "lora_ga_batch_size": 2,
279
+ "lora_ga_iters": 2,
280
+ "lora_ga_max_length": 1024,
281
+ "lora_ga_direction": "ArB2r",
282
+ "lora_ga_scale": "stable",
283
+ "lora_ga_stable_gamma": 16,
284
+ "init_weights": true,
285
+ "fourier_n_frequency": 2000,
286
+ "fourier_scaling": 300.0,
287
+ "boft_block_size": 4,
288
+ "boft_block_num": 0,
289
+ "boft_n_butterfly_factor": 1,
290
+ "boft_dropout": 0.0,
291
+ "vera_rank": 256,
292
+ "vera_projection_prng_key": 0,
293
+ "vera_dropout": 0.0,
294
+ "vera_d_initial": 0.1,
295
+ "adapter_act": "gelu",
296
+ "adapter_length": 128,
297
+ "use_galore": false,
298
+ "galore_target_modules": null,
299
+ "galore_rank": 128,
300
+ "galore_update_proj_gap": 50,
301
+ "galore_scale": 1.0,
302
+ "galore_proj_type": "std",
303
+ "galore_optim_per_parameter": false,
304
+ "galore_with_embedding": false,
305
+ "galore_quantization": false,
306
+ "galore_proj_quant": false,
307
+ "galore_proj_bits": 4,
308
+ "galore_proj_group_size": 256,
309
+ "galore_cos_threshold": 0.4,
310
+ "galore_gamma_proj": 2,
311
+ "galore_queue_size": 5,
312
+ "adalora_target_r": 8,
313
+ "adalora_init_r": 12,
314
+ "adalora_tinit": 0,
315
+ "adalora_tfinal": 0,
316
+ "adalora_deltaT": 1,
317
+ "adalora_beta1": 0.85,
318
+ "adalora_beta2": 0.85,
319
+ "adalora_orth_reg_weight": 0.5,
320
+ "llamapro_num_new_blocks": 4,
321
+ "llamapro_num_groups": null,
322
+ "lisa_activated_layers": 0,
323
+ "lisa_step_interval": 20,
324
+ "reft_layer_key": null,
325
+ "reft_layers": null,
326
+ "reft_rank": 4,
327
+ "reft_intervention_type": "LoreftIntervention",
328
+ "reft_args": null,
329
+ "swanlab_token": null,
330
+ "swanlab_project": "ms-swift",
331
+ "swanlab_workspace": null,
332
+ "swanlab_exp_name": null,
333
+ "swanlab_notification_method": null,
334
+ "swanlab_webhook_url": null,
335
+ "swanlab_secret": null,
336
+ "swanlab_mode": "cloud",
337
+ "add_version": true,
338
+ "create_checkpoint_symlink": false,
339
+ "zero_hpz_partition_size": null,
340
+ "deepspeed_autotp_size": null,
341
+ "early_stop_interval": null,
342
+ "rank": -1,
343
+ "global_world_size": 1,
344
+ "local_world_size": 1,
345
+ "model_suffix": "Nanonets-OCR2-3B",
346
+ "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
347
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
348
+ "model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
349
+ "_val_dataset_exists": [],
350
+ "hub": "<class 'swift.hub.hub.MSHub'>",
351
+ "evaluation_strategy": "steps",
352
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
353
+ }
output/training/v1-20260117-010840-10e/checkpoint-400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0afd7d8505ae4933e4e78ce4c55d839caaabc686b92aa786281b243459ae37b4
3
+ size 1315426955
output/training/v1-20260117-010840-10e/checkpoint-400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc4a4100c327fe3f0fcd1d4d8851acffbbca0e1e3e5eb0db757b527d667f5693
3
+ size 14645
output/training/v1-20260117-010840-10e/checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d691cf1f75a0b30db18024d2926eda7b28204001f31010c4675f4b4a4df90aaa
3
+ size 1465
output/training/v1-20260117-010840-10e/checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 6.9004329004329,
6
+ "eval_steps": 100.0,
7
+ "global_step": 400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.017316017316017316,
14
+ "grad_norm": 0.4092565178871155,
15
+ "learning_rate": 3.448275862068966e-06,
16
+ "loss": 1.4861114025115967,
17
+ "step": 1,
18
+ "token_acc": 0.6811960725974412
19
+ },
20
+ {
21
+ "epoch": 0.17316017316017315,
22
+ "grad_norm": 0.3977337181568146,
23
+ "learning_rate": 3.4482758620689657e-05,
24
+ "loss": 1.4343115488688152,
25
+ "step": 10,
26
+ "token_acc": 0.6920024476626676
27
+ },
28
+ {
29
+ "epoch": 0.3463203463203463,
30
+ "grad_norm": 0.2495131641626358,
31
+ "learning_rate": 6.896551724137931e-05,
32
+ "loss": 1.3693717956542968,
33
+ "step": 20,
34
+ "token_acc": 0.7011260365349897
35
+ },
36
+ {
37
+ "epoch": 0.5194805194805194,
38
+ "grad_norm": 0.24984458088874817,
39
+ "learning_rate": 9.999918729041868e-05,
40
+ "loss": 1.1922229766845702,
41
+ "step": 30,
42
+ "token_acc": 0.726987948088823
43
+ },
44
+ {
45
+ "epoch": 0.6926406926406926,
46
+ "grad_norm": 0.3221384584903717,
47
+ "learning_rate": 9.990169410465536e-05,
48
+ "loss": 1.0192347526550294,
49
+ "step": 40,
50
+ "token_acc": 0.7609010955099522
51
+ },
52
+ {
53
+ "epoch": 0.8658008658008658,
54
+ "grad_norm": 0.40206295251846313,
55
+ "learning_rate": 9.964202208175834e-05,
56
+ "loss": 0.9150349617004394,
57
+ "step": 50,
58
+ "token_acc": 0.7773335965518376
59
+ },
60
+ {
61
+ "epoch": 1.0346320346320346,
62
+ "grad_norm": 0.20406530797481537,
63
+ "learning_rate": 9.922101514711866e-05,
64
+ "loss": 0.7742667198181152,
65
+ "step": 60,
66
+ "token_acc": 0.8123942631570925
67
+ },
68
+ {
69
+ "epoch": 1.2077922077922079,
70
+ "grad_norm": 1.4768069982528687,
71
+ "learning_rate": 9.864004155919543e-05,
72
+ "loss": 0.6983946800231934,
73
+ "step": 70,
74
+ "token_acc": 0.8248333138378757
75
+ },
76
+ {
77
+ "epoch": 1.380952380952381,
78
+ "grad_norm": 0.611409604549408,
79
+ "learning_rate": 9.790098946272177e-05,
80
+ "loss": 0.6138243198394775,
81
+ "step": 80,
82
+ "token_acc": 0.8442561143531572
83
+ },
84
+ {
85
+ "epoch": 1.554112554112554,
86
+ "grad_norm": 0.3051394820213318,
87
+ "learning_rate": 9.700626075229738e-05,
88
+ "loss": 0.5975491523742675,
89
+ "step": 90,
90
+ "token_acc": 0.8483123092893768
91
+ },
92
+ {
93
+ "epoch": 1.7272727272727273,
94
+ "grad_norm": 0.3783220648765564,
95
+ "learning_rate": 9.595876326631154e-05,
96
+ "loss": 0.5410520553588867,
97
+ "step": 100,
98
+ "token_acc": 0.8605094145609629
99
+ },
100
+ {
101
+ "epoch": 1.9004329004329006,
102
+ "grad_norm": 0.6039865612983704,
103
+ "learning_rate": 9.476190133656548e-05,
104
+ "loss": 0.5531170845031739,
105
+ "step": 110,
106
+ "token_acc": 0.8547892544963617
107
+ },
108
+ {
109
+ "epoch": 2.069264069264069,
110
+ "grad_norm": 0.5374985337257385,
111
+ "learning_rate": 9.341956472430801e-05,
112
+ "loss": 0.5079349040985107,
113
+ "step": 120,
114
+ "token_acc": 0.864488826645558
115
+ },
116
+ {
117
+ "epoch": 2.242424242424242,
118
+ "grad_norm": 0.364619642496109,
119
+ "learning_rate": 9.193611597864139e-05,
120
+ "loss": 0.44995865821838377,
121
+ "step": 130,
122
+ "token_acc": 0.8797397710240138
123
+ },
124
+ {
125
+ "epoch": 2.4155844155844157,
126
+ "grad_norm": 1.59947669506073,
127
+ "learning_rate": 9.031637625838265e-05,
128
+ "loss": 0.429323148727417,
129
+ "step": 140,
130
+ "token_acc": 0.8858490566037736
131
+ },
132
+ {
133
+ "epoch": 2.588744588744589,
134
+ "grad_norm": 0.46518200635910034,
135
+ "learning_rate": 8.856560966345877e-05,
136
+ "loss": 0.4315037727355957,
137
+ "step": 150,
138
+ "token_acc": 0.8819307344821817
139
+ },
140
+ {
141
+ "epoch": 2.761904761904762,
142
+ "grad_norm": 0.691148579120636,
143
+ "learning_rate": 8.668950612675785e-05,
144
+ "loss": 0.40119166374206544,
145
+ "step": 160,
146
+ "token_acc": 0.8896224924972358
147
+ },
148
+ {
149
+ "epoch": 2.935064935064935,
150
+ "grad_norm": 0.3540444076061249,
151
+ "learning_rate": 8.469416292203747e-05,
152
+ "loss": 0.40500435829162595,
153
+ "step": 170,
154
+ "token_acc": 0.8917646715924161
155
+ },
156
+ {
157
+ "epoch": 3.103896103896104,
158
+ "grad_norm": 0.3412817418575287,
159
+ "learning_rate": 8.258606484798897e-05,
160
+ "loss": 0.37092483043670654,
161
+ "step": 180,
162
+ "token_acc": 0.8977291233149371
163
+ },
164
+ {
165
+ "epoch": 3.277056277056277,
166
+ "grad_norm": 0.34155094623565674,
167
+ "learning_rate": 8.037206315285843e-05,
168
+ "loss": 0.344103741645813,
169
+ "step": 190,
170
+ "token_acc": 0.9065206570433051
171
+ },
172
+ {
173
+ "epoch": 3.45021645021645,
174
+ "grad_norm": 0.3627335727214813,
175
+ "learning_rate": 7.805935326811912e-05,
176
+ "loss": 0.3504387140274048,
177
+ "step": 200,
178
+ "token_acc": 0.9002762340096682
179
+ },
180
+ {
181
+ "epoch": 3.6233766233766236,
182
+ "grad_norm": 0.8141089677810669,
183
+ "learning_rate": 7.565545142355971e-05,
184
+ "loss": 0.3558197498321533,
185
+ "step": 210,
186
+ "token_acc": 0.8999160043936163
187
+ },
188
+ {
189
+ "epoch": 3.7965367965367967,
190
+ "grad_norm": 0.6176502108573914,
191
+ "learning_rate": 7.316817021978884e-05,
192
+ "loss": 0.33676347732543943,
193
+ "step": 220,
194
+ "token_acc": 0.904816147992892
195
+ },
196
+ {
197
+ "epoch": 3.9696969696969697,
198
+ "grad_norm": 0.49287620186805725,
199
+ "learning_rate": 7.060559323754435e-05,
200
+ "loss": 0.35226542949676515,
201
+ "step": 230,
202
+ "token_acc": 0.9020813028578615
203
+ },
204
+ {
205
+ "epoch": 4.138528138528138,
206
+ "grad_norm": 0.6057422161102295,
207
+ "learning_rate": 6.797604876632633e-05,
208
+ "loss": 0.3057840585708618,
209
+ "step": 240,
210
+ "token_acc": 0.9123896645803242
211
+ },
212
+ {
213
+ "epoch": 4.311688311688312,
214
+ "grad_norm": 12.585014343261719,
215
+ "learning_rate": 6.528808273773461e-05,
216
+ "loss": 0.301344108581543,
217
+ "step": 250,
218
+ "token_acc": 0.9142363149996737
219
+ },
220
+ {
221
+ "epoch": 4.484848484848484,
222
+ "grad_norm": 0.32902830839157104,
223
+ "learning_rate": 6.255043095147679e-05,
224
+ "loss": 0.2898148775100708,
225
+ "step": 260,
226
+ "token_acc": 0.9177889157552563
227
+ },
228
+ {
229
+ "epoch": 4.658008658008658,
230
+ "grad_norm": 0.39732787013053894,
231
+ "learning_rate": 5.9771990684311544e-05,
232
+ "loss": 0.29072208404541017,
233
+ "step": 270,
234
+ "token_acc": 0.917258875717698
235
+ },
236
+ {
237
+ "epoch": 4.8311688311688314,
238
+ "grad_norm": 0.44461533427238464,
239
+ "learning_rate": 5.6961791774196424e-05,
240
+ "loss": 0.2852530241012573,
241
+ "step": 280,
242
+ "token_acc": 0.9166775180675826
243
+ },
244
+ {
245
+ "epoch": 5.0,
246
+ "grad_norm": 0.35245048999786377,
247
+ "learning_rate": 5.4128967273616625e-05,
248
+ "loss": 0.3020582675933838,
249
+ "step": 290,
250
+ "token_acc": 0.9138208862720794
251
+ },
252
+ {
253
+ "epoch": 5.1731601731601735,
254
+ "grad_norm": 0.36154425144195557,
255
+ "learning_rate": 5.128272376746972e-05,
256
+ "loss": 0.23758175373077392,
257
+ "step": 300,
258
+ "token_acc": 0.9282945419454031
259
+ },
260
+ {
261
+ "epoch": 5.346320346320346,
262
+ "grad_norm": 0.40296199917793274,
263
+ "learning_rate": 4.8432311451972665e-05,
264
+ "loss": 0.27498042583465576,
265
+ "step": 310,
266
+ "token_acc": 0.9217681765679143
267
+ },
268
+ {
269
+ "epoch": 5.51948051948052,
270
+ "grad_norm": 0.9700812697410583,
271
+ "learning_rate": 4.558699407183338e-05,
272
+ "loss": 0.2576076745986938,
273
+ "step": 320,
274
+ "token_acc": 0.9252093233763294
275
+ },
276
+ {
277
+ "epoch": 5.692640692640692,
278
+ "grad_norm": 0.4304976761341095,
279
+ "learning_rate": 4.2756018813390274e-05,
280
+ "loss": 0.2424612522125244,
281
+ "step": 330,
282
+ "token_acc": 0.9276378041152792
283
+ },
284
+ {
285
+ "epoch": 5.865800865800866,
286
+ "grad_norm": 0.4652138650417328,
287
+ "learning_rate": 3.9948586251565825e-05,
288
+ "loss": 0.259202766418457,
289
+ "step": 340,
290
+ "token_acc": 0.9240967292621122
291
+ },
292
+ {
293
+ "epoch": 6.034632034632034,
294
+ "grad_norm": 0.37480419874191284,
295
+ "learning_rate": 3.7173820448305755e-05,
296
+ "loss": 0.2334808111190796,
297
+ "step": 350,
298
+ "token_acc": 0.9299400823867182
299
+ },
300
+ {
301
+ "epoch": 6.207792207792208,
302
+ "grad_norm": 0.5389286279678345,
303
+ "learning_rate": 3.444073929968284e-05,
304
+ "loss": 0.23487865924835205,
305
+ "step": 360,
306
+ "token_acc": 0.9300512852684243
307
+ },
308
+ {
309
+ "epoch": 6.380952380952381,
310
+ "grad_norm": 0.4614177942276001,
311
+ "learning_rate": 3.175822522803623e-05,
312
+ "loss": 0.21724979877471923,
313
+ "step": 370,
314
+ "token_acc": 0.9360088365243004
315
+ },
316
+ {
317
+ "epoch": 6.554112554112554,
318
+ "grad_norm": 0.3773002326488495,
319
+ "learning_rate": 2.9134996314395818e-05,
320
+ "loss": 0.20992758274078369,
321
+ "step": 380,
322
+ "token_acc": 0.9362415581566618
323
+ },
324
+ {
325
+ "epoch": 6.7272727272727275,
326
+ "grad_norm": 1.1898497343063354,
327
+ "learning_rate": 2.65795779650105e-05,
328
+ "loss": 0.2153007745742798,
329
+ "step": 390,
330
+ "token_acc": 0.9367496189220204
331
+ },
332
+ {
333
+ "epoch": 6.9004329004329,
334
+ "grad_norm": 0.8586929440498352,
335
+ "learning_rate": 2.41002752040629e-05,
336
+ "loss": 0.22280852794647216,
337
+ "step": 400,
338
+ "token_acc": 0.9341588229918669
339
+ }
340
+ ],
341
+ "logging_steps": 10,
342
+ "max_steps": 580,
343
+ "num_input_tokens_seen": 0,
344
+ "num_train_epochs": 10,
345
+ "save_steps": 100,
346
+ "stateful_callbacks": {
347
+ "TrainerControl": {
348
+ "args": {
349
+ "should_epoch_stop": false,
350
+ "should_evaluate": false,
351
+ "should_log": false,
352
+ "should_save": true,
353
+ "should_training_stop": false
354
+ },
355
+ "attributes": {}
356
+ }
357
+ },
358
+ "total_flos": 1.5383232587218944e+17,
359
+ "train_batch_size": 1,
360
+ "trial_name": null,
361
+ "trial_params": null
362
+ }
output/training/v1-20260117-010840-10e/checkpoint-400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e5274be8af993948bcfc3f1251ec27de22bce224d71e604e5b270f182b3aac2
3
+ size 6993
output/training/v1-20260117-010840-10e/checkpoint-500/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ''
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
output/training/v1-20260117-010840-10e/checkpoint-500/adapter_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [],
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": "^(model.language_model.*\\.(down_proj|up_proj|gate_proj|v_proj|k_proj|q_proj|o_proj)|(?!(model.visual.merger))model.visual.*\\.(mlp.0|down_proj|up_proj|gate_proj|mlp.2|qkv|attn.proj))$",
32
+ "target_parameters": null,
33
+ "task_type": "CAUSAL_LM",
34
+ "trainable_token_indices": null,
35
+ "use_dora": false,
36
+ "use_qalora": false,
37
+ "use_rslora": false
38
+ }
output/training/v1-20260117-010840-10e/checkpoint-500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:144e3e85649e1f95c3bb79452fc4d9c71cadc539fa8e343f61d82c3f80d5b711
3
+ size 657478696
output/training/v1-20260117-010840-10e/checkpoint-500/additional_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
output/training/v1-20260117-010840-10e/checkpoint-500/args.json ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "no",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 8,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 0.0001,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 10.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 10,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "steps",
38
+ "save_steps": 100.0,
39
+ "save_total_limit": 3,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "bf16": true,
51
+ "fp16": false,
52
+ "fp16_opt_level": "O1",
53
+ "half_precision_backend": "auto",
54
+ "bf16_full_eval": false,
55
+ "fp16_full_eval": false,
56
+ "tf32": null,
57
+ "local_rank": -1,
58
+ "ddp_backend": null,
59
+ "tpu_num_cores": null,
60
+ "tpu_metrics_debug": false,
61
+ "debug": null,
62
+ "dataloader_drop_last": false,
63
+ "eval_steps": 100.0,
64
+ "dataloader_num_workers": 4,
65
+ "dataloader_prefetch_factor": null,
66
+ "past_index": -1,
67
+ "run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
68
+ "disable_tqdm": null,
69
+ "remove_unused_columns": true,
70
+ "label_names": null,
71
+ "load_best_model_at_end": false,
72
+ "metric_for_best_model": "loss",
73
+ "greater_is_better": false,
74
+ "ignore_data_skip": false,
75
+ "fsdp": [],
76
+ "fsdp_min_num_params": 0,
77
+ "fsdp_config": null,
78
+ "fsdp_transformer_layer_cls_to_wrap": null,
79
+ "accelerator_config": {
80
+ "dispatch_batches": false
81
+ },
82
+ "parallelism_config": null,
83
+ "deepspeed": null,
84
+ "label_smoothing_factor": 0.0,
85
+ "optim": "adamw_torch_fused",
86
+ "optim_args": null,
87
+ "adafactor": false,
88
+ "group_by_length": false,
89
+ "length_column_name": "length",
90
+ "report_to": [
91
+ "tensorboard"
92
+ ],
93
+ "project": "huggingface",
94
+ "trackio_space_id": "trackio",
95
+ "ddp_find_unused_parameters": null,
96
+ "ddp_bucket_cap_mb": null,
97
+ "ddp_broadcast_buffers": null,
98
+ "dataloader_pin_memory": true,
99
+ "dataloader_persistent_workers": false,
100
+ "skip_memory_metrics": true,
101
+ "use_legacy_prediction_loop": false,
102
+ "push_to_hub": false,
103
+ "resume_from_checkpoint": null,
104
+ "hub_model_id": null,
105
+ "hub_strategy": "every_save",
106
+ "hub_token": null,
107
+ "hub_private_repo": null,
108
+ "hub_always_push": false,
109
+ "hub_revision": null,
110
+ "gradient_checkpointing": true,
111
+ "gradient_checkpointing_kwargs": null,
112
+ "include_inputs_for_metrics": false,
113
+ "include_for_metrics": [],
114
+ "eval_do_concat_batches": true,
115
+ "fp16_backend": "auto",
116
+ "push_to_hub_model_id": null,
117
+ "push_to_hub_organization": null,
118
+ "push_to_hub_token": null,
119
+ "mp_parameters": "",
120
+ "auto_find_batch_size": false,
121
+ "full_determinism": false,
122
+ "torchdynamo": null,
123
+ "ray_scope": "last",
124
+ "ddp_timeout": 18000000,
125
+ "torch_compile": false,
126
+ "torch_compile_backend": null,
127
+ "torch_compile_mode": null,
128
+ "include_tokens_per_second": false,
129
+ "include_num_input_tokens_seen": false,
130
+ "neftune_noise_alpha": null,
131
+ "optim_target_modules": null,
132
+ "batch_eval_metrics": false,
133
+ "eval_on_start": false,
134
+ "use_liger_kernel": false,
135
+ "liger_kernel_config": null,
136
+ "eval_use_gather_object": false,
137
+ "average_tokens_across_devices": true,
138
+ "sortish_sampler": false,
139
+ "predict_with_generate": false,
140
+ "generation_max_length": null,
141
+ "generation_num_beams": null,
142
+ "generation_config": null,
143
+ "tuner_backend": "peft",
144
+ "vit_gradient_checkpointing": null,
145
+ "router_aux_loss_coef": 0.0,
146
+ "enable_dft_loss": false,
147
+ "enable_channel_loss": false,
148
+ "check_model": true,
149
+ "acc_strategy": "token",
150
+ "train_dataloader_shuffle": true,
151
+ "max_epochs": null,
152
+ "aligner_lr": null,
153
+ "vit_lr": null,
154
+ "use_logits_to_keep": null,
155
+ "ds3_gather_for_generation": true,
156
+ "resume_only_model": false,
157
+ "optimizer": null,
158
+ "loss_type": null,
159
+ "metric": null,
160
+ "eval_use_evalscope": false,
161
+ "eval_dataset": [],
162
+ "eval_dataset_args": null,
163
+ "eval_limit": null,
164
+ "eval_generation_config": null,
165
+ "extra_eval_args": null,
166
+ "use_flash_ckpt": false,
167
+ "use_ray": false,
168
+ "ray_exp_name": null,
169
+ "device_groups": null,
170
+ "model": "nanonets/Nanonets-OCR2-3B",
171
+ "model_type": "qwen2_5_vl",
172
+ "model_revision": null,
173
+ "task_type": "causal_lm",
174
+ "torch_dtype": "bfloat16",
175
+ "attn_impl": null,
176
+ "new_special_tokens": [],
177
+ "num_labels": null,
178
+ "problem_type": null,
179
+ "rope_scaling": null,
180
+ "device_map": null,
181
+ "max_memory": {},
182
+ "max_model_len": null,
183
+ "local_repo_path": null,
184
+ "init_strategy": null,
185
+ "template": "qwen2_5_vl",
186
+ "system": null,
187
+ "max_length": 8192,
188
+ "truncation_strategy": "delete",
189
+ "max_pixels": null,
190
+ "agent_template": null,
191
+ "norm_bbox": null,
192
+ "use_chat_template": true,
193
+ "padding_side": "right",
194
+ "padding_free": false,
195
+ "loss_scale": "default",
196
+ "sequence_parallel_size": 1,
197
+ "template_backend": "swift",
198
+ "response_prefix": null,
199
+ "enable_thinking": null,
200
+ "add_non_thinking_prefix": true,
201
+ "dataset": [
202
+ "/home/ab/document-parsing/output/datasets/train.jsonl"
203
+ ],
204
+ "val_dataset": [],
205
+ "cached_dataset": [],
206
+ "cached_val_dataset": [],
207
+ "split_dataset_ratio": 0.0,
208
+ "dataset_num_proc": 1,
209
+ "load_from_cache_file": false,
210
+ "dataset_shuffle": true,
211
+ "val_dataset_shuffle": false,
212
+ "streaming": false,
213
+ "interleave_prob": null,
214
+ "stopping_strategy": "first_exhausted",
215
+ "shuffle_buffer_size": 1000,
216
+ "download_mode": "reuse_dataset_if_exists",
217
+ "columns": {},
218
+ "strict": false,
219
+ "model_name": null,
220
+ "model_author": null,
221
+ "custom_dataset_info": [],
222
+ "quant_method": null,
223
+ "quant_bits": null,
224
+ "hqq_axis": null,
225
+ "bnb_4bit_compute_dtype": "bfloat16",
226
+ "bnb_4bit_quant_type": "nf4",
227
+ "bnb_4bit_use_double_quant": true,
228
+ "bnb_4bit_quant_storage": null,
229
+ "max_new_tokens": 64,
230
+ "temperature": 0.0,
231
+ "top_k": null,
232
+ "top_p": null,
233
+ "repetition_penalty": null,
234
+ "num_beams": 1,
235
+ "stream": false,
236
+ "stop_words": [],
237
+ "logprobs": false,
238
+ "top_logprobs": null,
239
+ "structured_outputs_regex": null,
240
+ "ckpt_dir": null,
241
+ "lora_modules": [],
242
+ "train_type": "lora",
243
+ "adapters": [],
244
+ "external_plugins": [],
245
+ "model_kwargs": {},
246
+ "load_args": false,
247
+ "load_data_args": false,
248
+ "packing": false,
249
+ "packing_length": null,
250
+ "packing_num_proc": 1,
251
+ "lazy_tokenize": true,
252
+ "custom_register_path": [],
253
+ "use_hf": false,
254
+ "ignore_args_error": false,
255
+ "use_swift_lora": false,
256
+ "freeze_parameters": [],
257
+ "freeze_parameters_regex": null,
258
+ "freeze_parameters_ratio": 0.0,
259
+ "trainable_parameters": [],
260
+ "trainable_parameters_regex": null,
261
+ "freeze_llm": false,
262
+ "freeze_vit": false,
263
+ "freeze_aligner": true,
264
+ "target_modules": [
265
+ "all-linear"
266
+ ],
267
+ "target_regex": null,
268
+ "target_parameters": null,
269
+ "modules_to_save": [],
270
+ "lora_rank": 64,
271
+ "lora_alpha": 16,
272
+ "lora_dropout": 0.05,
273
+ "lora_bias": "none",
274
+ "lora_dtype": null,
275
+ "lorap_lr_ratio": null,
276
+ "use_rslora": false,
277
+ "use_dora": false,
278
+ "lora_ga_batch_size": 2,
279
+ "lora_ga_iters": 2,
280
+ "lora_ga_max_length": 1024,
281
+ "lora_ga_direction": "ArB2r",
282
+ "lora_ga_scale": "stable",
283
+ "lora_ga_stable_gamma": 16,
284
+ "init_weights": true,
285
+ "fourier_n_frequency": 2000,
286
+ "fourier_scaling": 300.0,
287
+ "boft_block_size": 4,
288
+ "boft_block_num": 0,
289
+ "boft_n_butterfly_factor": 1,
290
+ "boft_dropout": 0.0,
291
+ "vera_rank": 256,
292
+ "vera_projection_prng_key": 0,
293
+ "vera_dropout": 0.0,
294
+ "vera_d_initial": 0.1,
295
+ "adapter_act": "gelu",
296
+ "adapter_length": 128,
297
+ "use_galore": false,
298
+ "galore_target_modules": null,
299
+ "galore_rank": 128,
300
+ "galore_update_proj_gap": 50,
301
+ "galore_scale": 1.0,
302
+ "galore_proj_type": "std",
303
+ "galore_optim_per_parameter": false,
304
+ "galore_with_embedding": false,
305
+ "galore_quantization": false,
306
+ "galore_proj_quant": false,
307
+ "galore_proj_bits": 4,
308
+ "galore_proj_group_size": 256,
309
+ "galore_cos_threshold": 0.4,
310
+ "galore_gamma_proj": 2,
311
+ "galore_queue_size": 5,
312
+ "adalora_target_r": 8,
313
+ "adalora_init_r": 12,
314
+ "adalora_tinit": 0,
315
+ "adalora_tfinal": 0,
316
+ "adalora_deltaT": 1,
317
+ "adalora_beta1": 0.85,
318
+ "adalora_beta2": 0.85,
319
+ "adalora_orth_reg_weight": 0.5,
320
+ "llamapro_num_new_blocks": 4,
321
+ "llamapro_num_groups": null,
322
+ "lisa_activated_layers": 0,
323
+ "lisa_step_interval": 20,
324
+ "reft_layer_key": null,
325
+ "reft_layers": null,
326
+ "reft_rank": 4,
327
+ "reft_intervention_type": "LoreftIntervention",
328
+ "reft_args": null,
329
+ "swanlab_token": null,
330
+ "swanlab_project": "ms-swift",
331
+ "swanlab_workspace": null,
332
+ "swanlab_exp_name": null,
333
+ "swanlab_notification_method": null,
334
+ "swanlab_webhook_url": null,
335
+ "swanlab_secret": null,
336
+ "swanlab_mode": "cloud",
337
+ "add_version": true,
338
+ "create_checkpoint_symlink": false,
339
+ "zero_hpz_partition_size": null,
340
+ "deepspeed_autotp_size": null,
341
+ "early_stop_interval": null,
342
+ "rank": -1,
343
+ "global_world_size": 1,
344
+ "local_world_size": 1,
345
+ "model_suffix": "Nanonets-OCR2-3B",
346
+ "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
347
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
348
+ "model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
349
+ "_val_dataset_exists": [],
350
+ "hub": "<class 'swift.hub.hub.MSHub'>",
351
+ "evaluation_strategy": "steps",
352
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
353
+ }
output/training/v1-20260117-010840-10e/checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35902d5d1198fb62622da98a4840274c8a8331dc3e77a6e5e7b95a3d8231fac5
3
+ size 1315426955
output/training/v1-20260117-010840-10e/checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dea05ecba7432f5405ea0b1af074f578def0664083423526d4ab725022c5bdc
3
+ size 14645
output/training/v1-20260117-010840-10e/checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94c11ced83f5ac31b306f251ad9a334516c5d69155e85aa8d0a2db0dc5539a56
3
+ size 1465
output/training/v1-20260117-010840-10e/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 8.623376623376624,
6
+ "eval_steps": 100.0,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.017316017316017316,
14
+ "grad_norm": 0.4092565178871155,
15
+ "learning_rate": 3.448275862068966e-06,
16
+ "loss": 1.4861114025115967,
17
+ "step": 1,
18
+ "token_acc": 0.6811960725974412
19
+ },
20
+ {
21
+ "epoch": 0.17316017316017315,
22
+ "grad_norm": 0.3977337181568146,
23
+ "learning_rate": 3.4482758620689657e-05,
24
+ "loss": 1.4343115488688152,
25
+ "step": 10,
26
+ "token_acc": 0.6920024476626676
27
+ },
28
+ {
29
+ "epoch": 0.3463203463203463,
30
+ "grad_norm": 0.2495131641626358,
31
+ "learning_rate": 6.896551724137931e-05,
32
+ "loss": 1.3693717956542968,
33
+ "step": 20,
34
+ "token_acc": 0.7011260365349897
35
+ },
36
+ {
37
+ "epoch": 0.5194805194805194,
38
+ "grad_norm": 0.24984458088874817,
39
+ "learning_rate": 9.999918729041868e-05,
40
+ "loss": 1.1922229766845702,
41
+ "step": 30,
42
+ "token_acc": 0.726987948088823
43
+ },
44
+ {
45
+ "epoch": 0.6926406926406926,
46
+ "grad_norm": 0.3221384584903717,
47
+ "learning_rate": 9.990169410465536e-05,
48
+ "loss": 1.0192347526550294,
49
+ "step": 40,
50
+ "token_acc": 0.7609010955099522
51
+ },
52
+ {
53
+ "epoch": 0.8658008658008658,
54
+ "grad_norm": 0.40206295251846313,
55
+ "learning_rate": 9.964202208175834e-05,
56
+ "loss": 0.9150349617004394,
57
+ "step": 50,
58
+ "token_acc": 0.7773335965518376
59
+ },
60
+ {
61
+ "epoch": 1.0346320346320346,
62
+ "grad_norm": 0.20406530797481537,
63
+ "learning_rate": 9.922101514711866e-05,
64
+ "loss": 0.7742667198181152,
65
+ "step": 60,
66
+ "token_acc": 0.8123942631570925
67
+ },
68
+ {
69
+ "epoch": 1.2077922077922079,
70
+ "grad_norm": 1.4768069982528687,
71
+ "learning_rate": 9.864004155919543e-05,
72
+ "loss": 0.6983946800231934,
73
+ "step": 70,
74
+ "token_acc": 0.8248333138378757
75
+ },
76
+ {
77
+ "epoch": 1.380952380952381,
78
+ "grad_norm": 0.611409604549408,
79
+ "learning_rate": 9.790098946272177e-05,
80
+ "loss": 0.6138243198394775,
81
+ "step": 80,
82
+ "token_acc": 0.8442561143531572
83
+ },
84
+ {
85
+ "epoch": 1.554112554112554,
86
+ "grad_norm": 0.3051394820213318,
87
+ "learning_rate": 9.700626075229738e-05,
88
+ "loss": 0.5975491523742675,
89
+ "step": 90,
90
+ "token_acc": 0.8483123092893768
91
+ },
92
+ {
93
+ "epoch": 1.7272727272727273,
94
+ "grad_norm": 0.3783220648765564,
95
+ "learning_rate": 9.595876326631154e-05,
96
+ "loss": 0.5410520553588867,
97
+ "step": 100,
98
+ "token_acc": 0.8605094145609629
99
+ },
100
+ {
101
+ "epoch": 1.9004329004329006,
102
+ "grad_norm": 0.6039865612983704,
103
+ "learning_rate": 9.476190133656548e-05,
104
+ "loss": 0.5531170845031739,
105
+ "step": 110,
106
+ "token_acc": 0.8547892544963617
107
+ },
108
+ {
109
+ "epoch": 2.069264069264069,
110
+ "grad_norm": 0.5374985337257385,
111
+ "learning_rate": 9.341956472430801e-05,
112
+ "loss": 0.5079349040985107,
113
+ "step": 120,
114
+ "token_acc": 0.864488826645558
115
+ },
116
+ {
117
+ "epoch": 2.242424242424242,
118
+ "grad_norm": 0.364619642496109,
119
+ "learning_rate": 9.193611597864139e-05,
120
+ "loss": 0.44995865821838377,
121
+ "step": 130,
122
+ "token_acc": 0.8797397710240138
123
+ },
124
+ {
125
+ "epoch": 2.4155844155844157,
126
+ "grad_norm": 1.59947669506073,
127
+ "learning_rate": 9.031637625838265e-05,
128
+ "loss": 0.429323148727417,
129
+ "step": 140,
130
+ "token_acc": 0.8858490566037736
131
+ },
132
+ {
133
+ "epoch": 2.588744588744589,
134
+ "grad_norm": 0.46518200635910034,
135
+ "learning_rate": 8.856560966345877e-05,
136
+ "loss": 0.4315037727355957,
137
+ "step": 150,
138
+ "token_acc": 0.8819307344821817
139
+ },
140
+ {
141
+ "epoch": 2.761904761904762,
142
+ "grad_norm": 0.691148579120636,
143
+ "learning_rate": 8.668950612675785e-05,
144
+ "loss": 0.40119166374206544,
145
+ "step": 160,
146
+ "token_acc": 0.8896224924972358
147
+ },
148
+ {
149
+ "epoch": 2.935064935064935,
150
+ "grad_norm": 0.3540444076061249,
151
+ "learning_rate": 8.469416292203747e-05,
152
+ "loss": 0.40500435829162595,
153
+ "step": 170,
154
+ "token_acc": 0.8917646715924161
155
+ },
156
+ {
157
+ "epoch": 3.103896103896104,
158
+ "grad_norm": 0.3412817418575287,
159
+ "learning_rate": 8.258606484798897e-05,
160
+ "loss": 0.37092483043670654,
161
+ "step": 180,
162
+ "token_acc": 0.8977291233149371
163
+ },
164
+ {
165
+ "epoch": 3.277056277056277,
166
+ "grad_norm": 0.34155094623565674,
167
+ "learning_rate": 8.037206315285843e-05,
168
+ "loss": 0.344103741645813,
169
+ "step": 190,
170
+ "token_acc": 0.9065206570433051
171
+ },
172
+ {
173
+ "epoch": 3.45021645021645,
174
+ "grad_norm": 0.3627335727214813,
175
+ "learning_rate": 7.805935326811912e-05,
176
+ "loss": 0.3504387140274048,
177
+ "step": 200,
178
+ "token_acc": 0.9002762340096682
179
+ },
180
+ {
181
+ "epoch": 3.6233766233766236,
182
+ "grad_norm": 0.8141089677810669,
183
+ "learning_rate": 7.565545142355971e-05,
184
+ "loss": 0.3558197498321533,
185
+ "step": 210,
186
+ "token_acc": 0.8999160043936163
187
+ },
188
+ {
189
+ "epoch": 3.7965367965367967,
190
+ "grad_norm": 0.6176502108573914,
191
+ "learning_rate": 7.316817021978884e-05,
192
+ "loss": 0.33676347732543943,
193
+ "step": 220,
194
+ "token_acc": 0.904816147992892
195
+ },
196
+ {
197
+ "epoch": 3.9696969696969697,
198
+ "grad_norm": 0.49287620186805725,
199
+ "learning_rate": 7.060559323754435e-05,
200
+ "loss": 0.35226542949676515,
201
+ "step": 230,
202
+ "token_acc": 0.9020813028578615
203
+ },
204
+ {
205
+ "epoch": 4.138528138528138,
206
+ "grad_norm": 0.6057422161102295,
207
+ "learning_rate": 6.797604876632633e-05,
208
+ "loss": 0.3057840585708618,
209
+ "step": 240,
210
+ "token_acc": 0.9123896645803242
211
+ },
212
+ {
213
+ "epoch": 4.311688311688312,
214
+ "grad_norm": 12.585014343261719,
215
+ "learning_rate": 6.528808273773461e-05,
216
+ "loss": 0.301344108581543,
217
+ "step": 250,
218
+ "token_acc": 0.9142363149996737
219
+ },
220
+ {
221
+ "epoch": 4.484848484848484,
222
+ "grad_norm": 0.32902830839157104,
223
+ "learning_rate": 6.255043095147679e-05,
224
+ "loss": 0.2898148775100708,
225
+ "step": 260,
226
+ "token_acc": 0.9177889157552563
227
+ },
228
+ {
229
+ "epoch": 4.658008658008658,
230
+ "grad_norm": 0.39732787013053894,
231
+ "learning_rate": 5.9771990684311544e-05,
232
+ "loss": 0.29072208404541017,
233
+ "step": 270,
234
+ "token_acc": 0.917258875717698
235
+ },
236
+ {
237
+ "epoch": 4.8311688311688314,
238
+ "grad_norm": 0.44461533427238464,
239
+ "learning_rate": 5.6961791774196424e-05,
240
+ "loss": 0.2852530241012573,
241
+ "step": 280,
242
+ "token_acc": 0.9166775180675826
243
+ },
244
+ {
245
+ "epoch": 5.0,
246
+ "grad_norm": 0.35245048999786377,
247
+ "learning_rate": 5.4128967273616625e-05,
248
+ "loss": 0.3020582675933838,
249
+ "step": 290,
250
+ "token_acc": 0.9138208862720794
251
+ },
252
+ {
253
+ "epoch": 5.1731601731601735,
254
+ "grad_norm": 0.36154425144195557,
255
+ "learning_rate": 5.128272376746972e-05,
256
+ "loss": 0.23758175373077392,
257
+ "step": 300,
258
+ "token_acc": 0.9282945419454031
259
+ },
260
+ {
261
+ "epoch": 5.346320346320346,
262
+ "grad_norm": 0.40296199917793274,
263
+ "learning_rate": 4.8432311451972665e-05,
264
+ "loss": 0.27498042583465576,
265
+ "step": 310,
266
+ "token_acc": 0.9217681765679143
267
+ },
268
+ {
269
+ "epoch": 5.51948051948052,
270
+ "grad_norm": 0.9700812697410583,
271
+ "learning_rate": 4.558699407183338e-05,
272
+ "loss": 0.2576076745986938,
273
+ "step": 320,
274
+ "token_acc": 0.9252093233763294
275
+ },
276
+ {
277
+ "epoch": 5.692640692640692,
278
+ "grad_norm": 0.4304976761341095,
279
+ "learning_rate": 4.2756018813390274e-05,
280
+ "loss": 0.2424612522125244,
281
+ "step": 330,
282
+ "token_acc": 0.9276378041152792
283
+ },
284
+ {
285
+ "epoch": 5.865800865800866,
286
+ "grad_norm": 0.4652138650417328,
287
+ "learning_rate": 3.9948586251565825e-05,
288
+ "loss": 0.259202766418457,
289
+ "step": 340,
290
+ "token_acc": 0.9240967292621122
291
+ },
292
+ {
293
+ "epoch": 6.034632034632034,
294
+ "grad_norm": 0.37480419874191284,
295
+ "learning_rate": 3.7173820448305755e-05,
296
+ "loss": 0.2334808111190796,
297
+ "step": 350,
298
+ "token_acc": 0.9299400823867182
299
+ },
300
+ {
301
+ "epoch": 6.207792207792208,
302
+ "grad_norm": 0.5389286279678345,
303
+ "learning_rate": 3.444073929968284e-05,
304
+ "loss": 0.23487865924835205,
305
+ "step": 360,
306
+ "token_acc": 0.9300512852684243
307
+ },
308
+ {
309
+ "epoch": 6.380952380952381,
310
+ "grad_norm": 0.4614177942276001,
311
+ "learning_rate": 3.175822522803623e-05,
312
+ "loss": 0.21724979877471923,
313
+ "step": 370,
314
+ "token_acc": 0.9360088365243004
315
+ },
316
+ {
317
+ "epoch": 6.554112554112554,
318
+ "grad_norm": 0.3773002326488495,
319
+ "learning_rate": 2.9134996314395818e-05,
320
+ "loss": 0.20992758274078369,
321
+ "step": 380,
322
+ "token_acc": 0.9362415581566618
323
+ },
324
+ {
325
+ "epoch": 6.7272727272727275,
326
+ "grad_norm": 1.1898497343063354,
327
+ "learning_rate": 2.65795779650105e-05,
328
+ "loss": 0.2153007745742798,
329
+ "step": 390,
330
+ "token_acc": 0.9367496189220204
331
+ },
332
+ {
333
+ "epoch": 6.9004329004329,
334
+ "grad_norm": 0.8586929440498352,
335
+ "learning_rate": 2.41002752040629e-05,
336
+ "loss": 0.22280852794647216,
337
+ "step": 400,
338
+ "token_acc": 0.9341588229918669
339
+ },
340
+ {
341
+ "epoch": 7.06926406926407,
342
+ "grad_norm": 0.5149306058883667,
343
+ "learning_rate": 2.1705145682618505e-05,
344
+ "loss": 0.21320977210998535,
345
+ "step": 410,
346
+ "token_acc": 0.9383294431477159
347
+ },
348
+ {
349
+ "epoch": 7.242424242424242,
350
+ "grad_norm": 0.4976541996002197,
351
+ "learning_rate": 1.940197349152923e-05,
352
+ "loss": 0.1985553979873657,
353
+ "step": 420,
354
+ "token_acc": 0.9401391309809833
355
+ },
356
+ {
357
+ "epoch": 7.415584415584416,
358
+ "grad_norm": 0.4779481589794159,
359
+ "learning_rate": 1.7198243863398273e-05,
360
+ "loss": 0.20875980854034423,
361
+ "step": 430,
362
+ "token_acc": 0.9373778262148182
363
+ },
364
+ {
365
+ "epoch": 7.588744588744589,
366
+ "grad_norm": 0.6022359132766724,
367
+ "learning_rate": 1.510111884582463e-05,
368
+ "loss": 0.19188997745513917,
369
+ "step": 440,
370
+ "token_acc": 0.942989444333798
371
+ },
372
+ {
373
+ "epoch": 7.761904761904762,
374
+ "grad_norm": 0.497090607881546,
375
+ "learning_rate": 1.3117414024987823e-05,
376
+ "loss": 0.1933382511138916,
377
+ "step": 450,
378
+ "token_acc": 0.9423271204556436
379
+ },
380
+ {
381
+ "epoch": 7.935064935064935,
382
+ "grad_norm": 0.488971084356308,
383
+ "learning_rate": 1.125357637522072e-05,
384
+ "loss": 0.1843361496925354,
385
+ "step": 460,
386
+ "token_acc": 0.9436703366987985
387
+ },
388
+ {
389
+ "epoch": 8.103896103896103,
390
+ "grad_norm": 0.767144501209259,
391
+ "learning_rate": 9.51566330655857e-06,
392
+ "loss": 0.19610201120376586,
393
+ "step": 470,
394
+ "token_acc": 0.9421800227876946
395
+ },
396
+ {
397
+ "epoch": 8.277056277056277,
398
+ "grad_norm": 0.4893112778663635,
399
+ "learning_rate": 7.909322978358913e-06,
400
+ "loss": 0.170158052444458,
401
+ "step": 480,
402
+ "token_acc": 0.9497098970386021
403
+ },
404
+ {
405
+ "epoch": 8.45021645021645,
406
+ "grad_norm": 0.5407018661499023,
407
+ "learning_rate": 6.439775942972609e-06,
408
+ "loss": 0.1650066614151001,
409
+ "step": 490,
410
+ "token_acc": 0.9508892299359032
411
+ },
412
+ {
413
+ "epoch": 8.623376623376624,
414
+ "grad_norm": 0.41522547602653503,
415
+ "learning_rate": 5.111798179123173e-06,
416
+ "loss": 0.1943192720413208,
417
+ "step": 500,
418
+ "token_acc": 0.9430037937960277
419
+ }
420
+ ],
421
+ "logging_steps": 10,
422
+ "max_steps": 580,
423
+ "num_input_tokens_seen": 0,
424
+ "num_train_epochs": 10,
425
+ "save_steps": 100,
426
+ "stateful_callbacks": {
427
+ "TrainerControl": {
428
+ "args": {
429
+ "should_epoch_stop": false,
430
+ "should_evaluate": false,
431
+ "should_log": false,
432
+ "should_save": true,
433
+ "should_training_stop": false
434
+ },
435
+ "attributes": {}
436
+ }
437
+ },
438
+ "total_flos": 1.9231358022524928e+17,
439
+ "train_batch_size": 1,
440
+ "trial_name": null,
441
+ "trial_params": null
442
+ }
output/training/v1-20260117-010840-10e/checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e5274be8af993948bcfc3f1251ec27de22bce224d71e604e5b270f182b3aac2
3
+ size 6993
output/training/v1-20260117-010840-10e/checkpoint-580/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ''
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
output/training/v1-20260117-010840-10e/checkpoint-580/adapter_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [],
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": "^(model.language_model.*\\.(down_proj|up_proj|gate_proj|v_proj|k_proj|q_proj|o_proj)|(?!(model.visual.merger))model.visual.*\\.(mlp.0|down_proj|up_proj|gate_proj|mlp.2|qkv|attn.proj))$",
32
+ "target_parameters": null,
33
+ "task_type": "CAUSAL_LM",
34
+ "trainable_token_indices": null,
35
+ "use_dora": false,
36
+ "use_qalora": false,
37
+ "use_rslora": false
38
+ }
output/training/v1-20260117-010840-10e/checkpoint-580/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fbac9ce4144065f68ad19a5930a57921a1aad93aaa5d6ed500b386e5584010c
3
+ size 657478696
output/training/v1-20260117-010840-10e/checkpoint-580/additional_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
output/training/v1-20260117-010840-10e/checkpoint-580/args.json ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "no",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 1,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 8,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 0.0001,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 10.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.05,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 10,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "steps",
38
+ "save_steps": 100.0,
39
+ "save_total_limit": 3,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "bf16": true,
51
+ "fp16": false,
52
+ "fp16_opt_level": "O1",
53
+ "half_precision_backend": "auto",
54
+ "bf16_full_eval": false,
55
+ "fp16_full_eval": false,
56
+ "tf32": null,
57
+ "local_rank": -1,
58
+ "ddp_backend": null,
59
+ "tpu_num_cores": null,
60
+ "tpu_metrics_debug": false,
61
+ "debug": null,
62
+ "dataloader_drop_last": false,
63
+ "eval_steps": 100.0,
64
+ "dataloader_num_workers": 4,
65
+ "dataloader_prefetch_factor": null,
66
+ "past_index": -1,
67
+ "run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
68
+ "disable_tqdm": null,
69
+ "remove_unused_columns": true,
70
+ "label_names": null,
71
+ "load_best_model_at_end": false,
72
+ "metric_for_best_model": "loss",
73
+ "greater_is_better": false,
74
+ "ignore_data_skip": false,
75
+ "fsdp": [],
76
+ "fsdp_min_num_params": 0,
77
+ "fsdp_config": null,
78
+ "fsdp_transformer_layer_cls_to_wrap": null,
79
+ "accelerator_config": {
80
+ "dispatch_batches": false
81
+ },
82
+ "parallelism_config": null,
83
+ "deepspeed": null,
84
+ "label_smoothing_factor": 0.0,
85
+ "optim": "adamw_torch_fused",
86
+ "optim_args": null,
87
+ "adafactor": false,
88
+ "group_by_length": false,
89
+ "length_column_name": "length",
90
+ "report_to": [
91
+ "tensorboard"
92
+ ],
93
+ "project": "huggingface",
94
+ "trackio_space_id": "trackio",
95
+ "ddp_find_unused_parameters": null,
96
+ "ddp_bucket_cap_mb": null,
97
+ "ddp_broadcast_buffers": null,
98
+ "dataloader_pin_memory": true,
99
+ "dataloader_persistent_workers": false,
100
+ "skip_memory_metrics": true,
101
+ "use_legacy_prediction_loop": false,
102
+ "push_to_hub": false,
103
+ "resume_from_checkpoint": null,
104
+ "hub_model_id": null,
105
+ "hub_strategy": "every_save",
106
+ "hub_token": null,
107
+ "hub_private_repo": null,
108
+ "hub_always_push": false,
109
+ "hub_revision": null,
110
+ "gradient_checkpointing": true,
111
+ "gradient_checkpointing_kwargs": null,
112
+ "include_inputs_for_metrics": false,
113
+ "include_for_metrics": [],
114
+ "eval_do_concat_batches": true,
115
+ "fp16_backend": "auto",
116
+ "push_to_hub_model_id": null,
117
+ "push_to_hub_organization": null,
118
+ "push_to_hub_token": null,
119
+ "mp_parameters": "",
120
+ "auto_find_batch_size": false,
121
+ "full_determinism": false,
122
+ "torchdynamo": null,
123
+ "ray_scope": "last",
124
+ "ddp_timeout": 18000000,
125
+ "torch_compile": false,
126
+ "torch_compile_backend": null,
127
+ "torch_compile_mode": null,
128
+ "include_tokens_per_second": false,
129
+ "include_num_input_tokens_seen": false,
130
+ "neftune_noise_alpha": null,
131
+ "optim_target_modules": null,
132
+ "batch_eval_metrics": false,
133
+ "eval_on_start": false,
134
+ "use_liger_kernel": false,
135
+ "liger_kernel_config": null,
136
+ "eval_use_gather_object": false,
137
+ "average_tokens_across_devices": true,
138
+ "sortish_sampler": false,
139
+ "predict_with_generate": false,
140
+ "generation_max_length": null,
141
+ "generation_num_beams": null,
142
+ "generation_config": null,
143
+ "tuner_backend": "peft",
144
+ "vit_gradient_checkpointing": null,
145
+ "router_aux_loss_coef": 0.0,
146
+ "enable_dft_loss": false,
147
+ "enable_channel_loss": false,
148
+ "check_model": true,
149
+ "acc_strategy": "token",
150
+ "train_dataloader_shuffle": true,
151
+ "max_epochs": null,
152
+ "aligner_lr": null,
153
+ "vit_lr": null,
154
+ "use_logits_to_keep": null,
155
+ "ds3_gather_for_generation": true,
156
+ "resume_only_model": false,
157
+ "optimizer": null,
158
+ "loss_type": null,
159
+ "metric": null,
160
+ "eval_use_evalscope": false,
161
+ "eval_dataset": [],
162
+ "eval_dataset_args": null,
163
+ "eval_limit": null,
164
+ "eval_generation_config": null,
165
+ "extra_eval_args": null,
166
+ "use_flash_ckpt": false,
167
+ "use_ray": false,
168
+ "ray_exp_name": null,
169
+ "device_groups": null,
170
+ "model": "nanonets/Nanonets-OCR2-3B",
171
+ "model_type": "qwen2_5_vl",
172
+ "model_revision": null,
173
+ "task_type": "causal_lm",
174
+ "torch_dtype": "bfloat16",
175
+ "attn_impl": null,
176
+ "new_special_tokens": [],
177
+ "num_labels": null,
178
+ "problem_type": null,
179
+ "rope_scaling": null,
180
+ "device_map": null,
181
+ "max_memory": {},
182
+ "max_model_len": null,
183
+ "local_repo_path": null,
184
+ "init_strategy": null,
185
+ "template": "qwen2_5_vl",
186
+ "system": null,
187
+ "max_length": 8192,
188
+ "truncation_strategy": "delete",
189
+ "max_pixels": null,
190
+ "agent_template": null,
191
+ "norm_bbox": null,
192
+ "use_chat_template": true,
193
+ "padding_side": "right",
194
+ "padding_free": false,
195
+ "loss_scale": "default",
196
+ "sequence_parallel_size": 1,
197
+ "template_backend": "swift",
198
+ "response_prefix": null,
199
+ "enable_thinking": null,
200
+ "add_non_thinking_prefix": true,
201
+ "dataset": [
202
+ "/home/ab/document-parsing/output/datasets/train.jsonl"
203
+ ],
204
+ "val_dataset": [],
205
+ "cached_dataset": [],
206
+ "cached_val_dataset": [],
207
+ "split_dataset_ratio": 0.0,
208
+ "dataset_num_proc": 1,
209
+ "load_from_cache_file": false,
210
+ "dataset_shuffle": true,
211
+ "val_dataset_shuffle": false,
212
+ "streaming": false,
213
+ "interleave_prob": null,
214
+ "stopping_strategy": "first_exhausted",
215
+ "shuffle_buffer_size": 1000,
216
+ "download_mode": "reuse_dataset_if_exists",
217
+ "columns": {},
218
+ "strict": false,
219
+ "model_name": null,
220
+ "model_author": null,
221
+ "custom_dataset_info": [],
222
+ "quant_method": null,
223
+ "quant_bits": null,
224
+ "hqq_axis": null,
225
+ "bnb_4bit_compute_dtype": "bfloat16",
226
+ "bnb_4bit_quant_type": "nf4",
227
+ "bnb_4bit_use_double_quant": true,
228
+ "bnb_4bit_quant_storage": null,
229
+ "max_new_tokens": 64,
230
+ "temperature": 0.0,
231
+ "top_k": null,
232
+ "top_p": null,
233
+ "repetition_penalty": null,
234
+ "num_beams": 1,
235
+ "stream": false,
236
+ "stop_words": [],
237
+ "logprobs": false,
238
+ "top_logprobs": null,
239
+ "structured_outputs_regex": null,
240
+ "ckpt_dir": null,
241
+ "lora_modules": [],
242
+ "train_type": "lora",
243
+ "adapters": [],
244
+ "external_plugins": [],
245
+ "model_kwargs": {},
246
+ "load_args": false,
247
+ "load_data_args": false,
248
+ "packing": false,
249
+ "packing_length": null,
250
+ "packing_num_proc": 1,
251
+ "lazy_tokenize": true,
252
+ "custom_register_path": [],
253
+ "use_hf": false,
254
+ "ignore_args_error": false,
255
+ "use_swift_lora": false,
256
+ "freeze_parameters": [],
257
+ "freeze_parameters_regex": null,
258
+ "freeze_parameters_ratio": 0.0,
259
+ "trainable_parameters": [],
260
+ "trainable_parameters_regex": null,
261
+ "freeze_llm": false,
262
+ "freeze_vit": false,
263
+ "freeze_aligner": true,
264
+ "target_modules": [
265
+ "all-linear"
266
+ ],
267
+ "target_regex": null,
268
+ "target_parameters": null,
269
+ "modules_to_save": [],
270
+ "lora_rank": 64,
271
+ "lora_alpha": 16,
272
+ "lora_dropout": 0.05,
273
+ "lora_bias": "none",
274
+ "lora_dtype": null,
275
+ "lorap_lr_ratio": null,
276
+ "use_rslora": false,
277
+ "use_dora": false,
278
+ "lora_ga_batch_size": 2,
279
+ "lora_ga_iters": 2,
280
+ "lora_ga_max_length": 1024,
281
+ "lora_ga_direction": "ArB2r",
282
+ "lora_ga_scale": "stable",
283
+ "lora_ga_stable_gamma": 16,
284
+ "init_weights": true,
285
+ "fourier_n_frequency": 2000,
286
+ "fourier_scaling": 300.0,
287
+ "boft_block_size": 4,
288
+ "boft_block_num": 0,
289
+ "boft_n_butterfly_factor": 1,
290
+ "boft_dropout": 0.0,
291
+ "vera_rank": 256,
292
+ "vera_projection_prng_key": 0,
293
+ "vera_dropout": 0.0,
294
+ "vera_d_initial": 0.1,
295
+ "adapter_act": "gelu",
296
+ "adapter_length": 128,
297
+ "use_galore": false,
298
+ "galore_target_modules": null,
299
+ "galore_rank": 128,
300
+ "galore_update_proj_gap": 50,
301
+ "galore_scale": 1.0,
302
+ "galore_proj_type": "std",
303
+ "galore_optim_per_parameter": false,
304
+ "galore_with_embedding": false,
305
+ "galore_quantization": false,
306
+ "galore_proj_quant": false,
307
+ "galore_proj_bits": 4,
308
+ "galore_proj_group_size": 256,
309
+ "galore_cos_threshold": 0.4,
310
+ "galore_gamma_proj": 2,
311
+ "galore_queue_size": 5,
312
+ "adalora_target_r": 8,
313
+ "adalora_init_r": 12,
314
+ "adalora_tinit": 0,
315
+ "adalora_tfinal": 0,
316
+ "adalora_deltaT": 1,
317
+ "adalora_beta1": 0.85,
318
+ "adalora_beta2": 0.85,
319
+ "adalora_orth_reg_weight": 0.5,
320
+ "llamapro_num_new_blocks": 4,
321
+ "llamapro_num_groups": null,
322
+ "lisa_activated_layers": 0,
323
+ "lisa_step_interval": 20,
324
+ "reft_layer_key": null,
325
+ "reft_layers": null,
326
+ "reft_rank": 4,
327
+ "reft_intervention_type": "LoreftIntervention",
328
+ "reft_args": null,
329
+ "swanlab_token": null,
330
+ "swanlab_project": "ms-swift",
331
+ "swanlab_workspace": null,
332
+ "swanlab_exp_name": null,
333
+ "swanlab_notification_method": null,
334
+ "swanlab_webhook_url": null,
335
+ "swanlab_secret": null,
336
+ "swanlab_mode": "cloud",
337
+ "add_version": true,
338
+ "create_checkpoint_symlink": false,
339
+ "zero_hpz_partition_size": null,
340
+ "deepspeed_autotp_size": null,
341
+ "early_stop_interval": null,
342
+ "rank": -1,
343
+ "global_world_size": 1,
344
+ "local_world_size": 1,
345
+ "model_suffix": "Nanonets-OCR2-3B",
346
+ "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
347
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
348
+ "model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
349
+ "_val_dataset_exists": [],
350
+ "hub": "<class 'swift.hub.hub.MSHub'>",
351
+ "evaluation_strategy": "steps",
352
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
353
+ }
output/training/v1-20260117-010840-10e/checkpoint-580/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5739078c612d9264a84be014dcee923bd18a89769ff3d54d05e7bf6c600c656a
3
+ size 1315426955
output/training/v1-20260117-010840-10e/checkpoint-580/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b97d67bbbd39fd5a492faaf39d45ee3dddc989273d366f3048f720147cbb4b3
3
+ size 14645
output/training/v1-20260117-010840-10e/checkpoint-580/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6112fa30c5dbad7af1b976693a28071346fb21ab769e7a2fde80a53c550ea0
3
+ size 1465
output/training/v1-20260117-010840-10e/checkpoint-580/trainer_state.json ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 100.0,
7
+ "global_step": 580,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.017316017316017316,
14
+ "grad_norm": 0.4092565178871155,
15
+ "learning_rate": 3.448275862068966e-06,
16
+ "loss": 1.4861114025115967,
17
+ "step": 1,
18
+ "token_acc": 0.6811960725974412
19
+ },
20
+ {
21
+ "epoch": 0.17316017316017315,
22
+ "grad_norm": 0.3977337181568146,
23
+ "learning_rate": 3.4482758620689657e-05,
24
+ "loss": 1.4343115488688152,
25
+ "step": 10,
26
+ "token_acc": 0.6920024476626676
27
+ },
28
+ {
29
+ "epoch": 0.3463203463203463,
30
+ "grad_norm": 0.2495131641626358,
31
+ "learning_rate": 6.896551724137931e-05,
32
+ "loss": 1.3693717956542968,
33
+ "step": 20,
34
+ "token_acc": 0.7011260365349897
35
+ },
36
+ {
37
+ "epoch": 0.5194805194805194,
38
+ "grad_norm": 0.24984458088874817,
39
+ "learning_rate": 9.999918729041868e-05,
40
+ "loss": 1.1922229766845702,
41
+ "step": 30,
42
+ "token_acc": 0.726987948088823
43
+ },
44
+ {
45
+ "epoch": 0.6926406926406926,
46
+ "grad_norm": 0.3221384584903717,
47
+ "learning_rate": 9.990169410465536e-05,
48
+ "loss": 1.0192347526550294,
49
+ "step": 40,
50
+ "token_acc": 0.7609010955099522
51
+ },
52
+ {
53
+ "epoch": 0.8658008658008658,
54
+ "grad_norm": 0.40206295251846313,
55
+ "learning_rate": 9.964202208175834e-05,
56
+ "loss": 0.9150349617004394,
57
+ "step": 50,
58
+ "token_acc": 0.7773335965518376
59
+ },
60
+ {
61
+ "epoch": 1.0346320346320346,
62
+ "grad_norm": 0.20406530797481537,
63
+ "learning_rate": 9.922101514711866e-05,
64
+ "loss": 0.7742667198181152,
65
+ "step": 60,
66
+ "token_acc": 0.8123942631570925
67
+ },
68
+ {
69
+ "epoch": 1.2077922077922079,
70
+ "grad_norm": 1.4768069982528687,
71
+ "learning_rate": 9.864004155919543e-05,
72
+ "loss": 0.6983946800231934,
73
+ "step": 70,
74
+ "token_acc": 0.8248333138378757
75
+ },
76
+ {
77
+ "epoch": 1.380952380952381,
78
+ "grad_norm": 0.611409604549408,
79
+ "learning_rate": 9.790098946272177e-05,
80
+ "loss": 0.6138243198394775,
81
+ "step": 80,
82
+ "token_acc": 0.8442561143531572
83
+ },
84
+ {
85
+ "epoch": 1.554112554112554,
86
+ "grad_norm": 0.3051394820213318,
87
+ "learning_rate": 9.700626075229738e-05,
88
+ "loss": 0.5975491523742675,
89
+ "step": 90,
90
+ "token_acc": 0.8483123092893768
91
+ },
92
+ {
93
+ "epoch": 1.7272727272727273,
94
+ "grad_norm": 0.3783220648765564,
95
+ "learning_rate": 9.595876326631154e-05,
96
+ "loss": 0.5410520553588867,
97
+ "step": 100,
98
+ "token_acc": 0.8605094145609629
99
+ },
100
+ {
101
+ "epoch": 1.9004329004329006,
102
+ "grad_norm": 0.6039865612983704,
103
+ "learning_rate": 9.476190133656548e-05,
104
+ "loss": 0.5531170845031739,
105
+ "step": 110,
106
+ "token_acc": 0.8547892544963617
107
+ },
108
+ {
109
+ "epoch": 2.069264069264069,
110
+ "grad_norm": 0.5374985337257385,
111
+ "learning_rate": 9.341956472430801e-05,
112
+ "loss": 0.5079349040985107,
113
+ "step": 120,
114
+ "token_acc": 0.864488826645558
115
+ },
116
+ {
117
+ "epoch": 2.242424242424242,
118
+ "grad_norm": 0.364619642496109,
119
+ "learning_rate": 9.193611597864139e-05,
120
+ "loss": 0.44995865821838377,
121
+ "step": 130,
122
+ "token_acc": 0.8797397710240138
123
+ },
124
+ {
125
+ "epoch": 2.4155844155844157,
126
+ "grad_norm": 1.59947669506073,
127
+ "learning_rate": 9.031637625838265e-05,
128
+ "loss": 0.429323148727417,
129
+ "step": 140,
130
+ "token_acc": 0.8858490566037736
131
+ },
132
+ {
133
+ "epoch": 2.588744588744589,
134
+ "grad_norm": 0.46518200635910034,
135
+ "learning_rate": 8.856560966345877e-05,
136
+ "loss": 0.4315037727355957,
137
+ "step": 150,
138
+ "token_acc": 0.8819307344821817
139
+ },
140
+ {
141
+ "epoch": 2.761904761904762,
142
+ "grad_norm": 0.691148579120636,
143
+ "learning_rate": 8.668950612675785e-05,
144
+ "loss": 0.40119166374206544,
145
+ "step": 160,
146
+ "token_acc": 0.8896224924972358
147
+ },
148
+ {
149
+ "epoch": 2.935064935064935,
150
+ "grad_norm": 0.3540444076061249,
151
+ "learning_rate": 8.469416292203747e-05,
152
+ "loss": 0.40500435829162595,
153
+ "step": 170,
154
+ "token_acc": 0.8917646715924161
155
+ },
156
+ {
157
+ "epoch": 3.103896103896104,
158
+ "grad_norm": 0.3412817418575287,
159
+ "learning_rate": 8.258606484798897e-05,
160
+ "loss": 0.37092483043670654,
161
+ "step": 180,
162
+ "token_acc": 0.8977291233149371
163
+ },
164
+ {
165
+ "epoch": 3.277056277056277,
166
+ "grad_norm": 0.34155094623565674,
167
+ "learning_rate": 8.037206315285843e-05,
168
+ "loss": 0.344103741645813,
169
+ "step": 190,
170
+ "token_acc": 0.9065206570433051
171
+ },
172
+ {
173
+ "epoch": 3.45021645021645,
174
+ "grad_norm": 0.3627335727214813,
175
+ "learning_rate": 7.805935326811912e-05,
176
+ "loss": 0.3504387140274048,
177
+ "step": 200,
178
+ "token_acc": 0.9002762340096682
179
+ },
180
+ {
181
+ "epoch": 3.6233766233766236,
182
+ "grad_norm": 0.8141089677810669,
183
+ "learning_rate": 7.565545142355971e-05,
184
+ "loss": 0.3558197498321533,
185
+ "step": 210,
186
+ "token_acc": 0.8999160043936163
187
+ },
188
+ {
189
+ "epoch": 3.7965367965367967,
190
+ "grad_norm": 0.6176502108573914,
191
+ "learning_rate": 7.316817021978884e-05,
192
+ "loss": 0.33676347732543943,
193
+ "step": 220,
194
+ "token_acc": 0.904816147992892
195
+ },
196
+ {
197
+ "epoch": 3.9696969696969697,
198
+ "grad_norm": 0.49287620186805725,
199
+ "learning_rate": 7.060559323754435e-05,
200
+ "loss": 0.35226542949676515,
201
+ "step": 230,
202
+ "token_acc": 0.9020813028578615
203
+ },
204
+ {
205
+ "epoch": 4.138528138528138,
206
+ "grad_norm": 0.6057422161102295,
207
+ "learning_rate": 6.797604876632633e-05,
208
+ "loss": 0.3057840585708618,
209
+ "step": 240,
210
+ "token_acc": 0.9123896645803242
211
+ },
212
+ {
213
+ "epoch": 4.311688311688312,
214
+ "grad_norm": 12.585014343261719,
215
+ "learning_rate": 6.528808273773461e-05,
216
+ "loss": 0.301344108581543,
217
+ "step": 250,
218
+ "token_acc": 0.9142363149996737
219
+ },
220
+ {
221
+ "epoch": 4.484848484848484,
222
+ "grad_norm": 0.32902830839157104,
223
+ "learning_rate": 6.255043095147679e-05,
224
+ "loss": 0.2898148775100708,
225
+ "step": 260,
226
+ "token_acc": 0.9177889157552563
227
+ },
228
+ {
229
+ "epoch": 4.658008658008658,
230
+ "grad_norm": 0.39732787013053894,
231
+ "learning_rate": 5.9771990684311544e-05,
232
+ "loss": 0.29072208404541017,
233
+ "step": 270,
234
+ "token_acc": 0.917258875717698
235
+ },
236
+ {
237
+ "epoch": 4.8311688311688314,
238
+ "grad_norm": 0.44461533427238464,
239
+ "learning_rate": 5.6961791774196424e-05,
240
+ "loss": 0.2852530241012573,
241
+ "step": 280,
242
+ "token_acc": 0.9166775180675826
243
+ },
244
+ {
245
+ "epoch": 5.0,
246
+ "grad_norm": 0.35245048999786377,
247
+ "learning_rate": 5.4128967273616625e-05,
248
+ "loss": 0.3020582675933838,
249
+ "step": 290,
250
+ "token_acc": 0.9138208862720794
251
+ },
252
+ {
253
+ "epoch": 5.1731601731601735,
254
+ "grad_norm": 0.36154425144195557,
255
+ "learning_rate": 5.128272376746972e-05,
256
+ "loss": 0.23758175373077392,
257
+ "step": 300,
258
+ "token_acc": 0.9282945419454031
259
+ },
260
+ {
261
+ "epoch": 5.346320346320346,
262
+ "grad_norm": 0.40296199917793274,
263
+ "learning_rate": 4.8432311451972665e-05,
264
+ "loss": 0.27498042583465576,
265
+ "step": 310,
266
+ "token_acc": 0.9217681765679143
267
+ },
268
+ {
269
+ "epoch": 5.51948051948052,
270
+ "grad_norm": 0.9700812697410583,
271
+ "learning_rate": 4.558699407183338e-05,
272
+ "loss": 0.2576076745986938,
273
+ "step": 320,
274
+ "token_acc": 0.9252093233763294
275
+ },
276
+ {
277
+ "epoch": 5.692640692640692,
278
+ "grad_norm": 0.4304976761341095,
279
+ "learning_rate": 4.2756018813390274e-05,
280
+ "loss": 0.2424612522125244,
281
+ "step": 330,
282
+ "token_acc": 0.9276378041152792
283
+ },
284
+ {
285
+ "epoch": 5.865800865800866,
286
+ "grad_norm": 0.4652138650417328,
287
+ "learning_rate": 3.9948586251565825e-05,
288
+ "loss": 0.259202766418457,
289
+ "step": 340,
290
+ "token_acc": 0.9240967292621122
291
+ },
292
+ {
293
+ "epoch": 6.034632034632034,
294
+ "grad_norm": 0.37480419874191284,
295
+ "learning_rate": 3.7173820448305755e-05,
296
+ "loss": 0.2334808111190796,
297
+ "step": 350,
298
+ "token_acc": 0.9299400823867182
299
+ },
300
+ {
301
+ "epoch": 6.207792207792208,
302
+ "grad_norm": 0.5389286279678345,
303
+ "learning_rate": 3.444073929968284e-05,
304
+ "loss": 0.23487865924835205,
305
+ "step": 360,
306
+ "token_acc": 0.9300512852684243
307
+ },
308
+ {
309
+ "epoch": 6.380952380952381,
310
+ "grad_norm": 0.4614177942276001,
311
+ "learning_rate": 3.175822522803623e-05,
312
+ "loss": 0.21724979877471923,
313
+ "step": 370,
314
+ "token_acc": 0.9360088365243004
315
+ },
316
+ {
317
+ "epoch": 6.554112554112554,
318
+ "grad_norm": 0.3773002326488495,
319
+ "learning_rate": 2.9134996314395818e-05,
320
+ "loss": 0.20992758274078369,
321
+ "step": 380,
322
+ "token_acc": 0.9362415581566618
323
+ },
324
+ {
325
+ "epoch": 6.7272727272727275,
326
+ "grad_norm": 1.1898497343063354,
327
+ "learning_rate": 2.65795779650105e-05,
328
+ "loss": 0.2153007745742798,
329
+ "step": 390,
330
+ "token_acc": 0.9367496189220204
331
+ },
332
+ {
333
+ "epoch": 6.9004329004329,
334
+ "grad_norm": 0.8586929440498352,
335
+ "learning_rate": 2.41002752040629e-05,
336
+ "loss": 0.22280852794647216,
337
+ "step": 400,
338
+ "token_acc": 0.9341588229918669
339
+ },
340
+ {
341
+ "epoch": 7.06926406926407,
342
+ "grad_norm": 0.5149306058883667,
343
+ "learning_rate": 2.1705145682618505e-05,
344
+ "loss": 0.21320977210998535,
345
+ "step": 410,
346
+ "token_acc": 0.9383294431477159
347
+ },
348
+ {
349
+ "epoch": 7.242424242424242,
350
+ "grad_norm": 0.4976541996002197,
351
+ "learning_rate": 1.940197349152923e-05,
352
+ "loss": 0.1985553979873657,
353
+ "step": 420,
354
+ "token_acc": 0.9401391309809833
355
+ },
356
+ {
357
+ "epoch": 7.415584415584416,
358
+ "grad_norm": 0.4779481589794159,
359
+ "learning_rate": 1.7198243863398273e-05,
360
+ "loss": 0.20875980854034423,
361
+ "step": 430,
362
+ "token_acc": 0.9373778262148182
363
+ },
364
+ {
365
+ "epoch": 7.588744588744589,
366
+ "grad_norm": 0.6022359132766724,
367
+ "learning_rate": 1.510111884582463e-05,
368
+ "loss": 0.19188997745513917,
369
+ "step": 440,
370
+ "token_acc": 0.942989444333798
371
+ },
372
+ {
373
+ "epoch": 7.761904761904762,
374
+ "grad_norm": 0.497090607881546,
375
+ "learning_rate": 1.3117414024987823e-05,
376
+ "loss": 0.1933382511138916,
377
+ "step": 450,
378
+ "token_acc": 0.9423271204556436
379
+ },
380
+ {
381
+ "epoch": 7.935064935064935,
382
+ "grad_norm": 0.488971084356308,
383
+ "learning_rate": 1.125357637522072e-05,
384
+ "loss": 0.1843361496925354,
385
+ "step": 460,
386
+ "token_acc": 0.9436703366987985
387
+ },
388
+ {
389
+ "epoch": 8.103896103896103,
390
+ "grad_norm": 0.767144501209259,
391
+ "learning_rate": 9.51566330655857e-06,
392
+ "loss": 0.19610201120376586,
393
+ "step": 470,
394
+ "token_acc": 0.9421800227876946
395
+ },
396
+ {
397
+ "epoch": 8.277056277056277,
398
+ "grad_norm": 0.4893112778663635,
399
+ "learning_rate": 7.909322978358913e-06,
400
+ "loss": 0.170158052444458,
401
+ "step": 480,
402
+ "token_acc": 0.9497098970386021
403
+ },
404
+ {
405
+ "epoch": 8.45021645021645,
406
+ "grad_norm": 0.5407018661499023,
407
+ "learning_rate": 6.439775942972609e-06,
408
+ "loss": 0.1650066614151001,
409
+ "step": 490,
410
+ "token_acc": 0.9508892299359032
411
+ },
412
+ {
413
+ "epoch": 8.623376623376624,
414
+ "grad_norm": 0.41522547602653503,
415
+ "learning_rate": 5.111798179123173e-06,
416
+ "loss": 0.1943192720413208,
417
+ "step": 500,
418
+ "token_acc": 0.9430037937960277
419
+ },
420
+ {
421
+ "epoch": 8.796536796536797,
422
+ "grad_norm": 0.5257052183151245,
423
+ "learning_rate": 3.929705570135711e-06,
424
+ "loss": 0.16702849864959718,
425
+ "step": 510,
426
+ "token_acc": 0.9501815248083905
427
+ },
428
+ {
429
+ "epoch": 8.969696969696969,
430
+ "grad_norm": 0.48933619260787964,
431
+ "learning_rate": 2.897339877460398e-06,
432
+ "loss": 0.19309405088424683,
433
+ "step": 520,
434
+ "token_acc": 0.9438778813778814
435
+ },
436
+ {
437
+ "epoch": 9.13852813852814,
438
+ "grad_norm": 0.6073329448699951,
439
+ "learning_rate": 2.018056255076256e-06,
440
+ "loss": 0.17578216791152954,
441
+ "step": 530,
442
+ "token_acc": 0.949875481814
443
+ },
444
+ {
445
+ "epoch": 9.311688311688311,
446
+ "grad_norm": 5.020083427429199,
447
+ "learning_rate": 1.2947123453528886e-06,
448
+ "loss": 0.18189191818237305,
449
+ "step": 540,
450
+ "token_acc": 0.9454742254092816
451
+ },
452
+ {
453
+ "epoch": 9.484848484848484,
454
+ "grad_norm": 0.6125385761260986,
455
+ "learning_rate": 7.296589918083685e-07,
456
+ "loss": 0.16662927865982055,
457
+ "step": 550,
458
+ "token_acc": 0.9502154609558632
459
+ },
460
+ {
461
+ "epoch": 9.658008658008658,
462
+ "grad_norm": 0.4245486259460449,
463
+ "learning_rate": 3.2473259894640894e-07,
464
+ "loss": 0.16942204236984254,
465
+ "step": 560,
466
+ "token_acc": 0.9508478741705578
467
+ },
468
+ {
469
+ "epoch": 9.831168831168831,
470
+ "grad_norm": 1.1829816102981567,
471
+ "learning_rate": 8.124916400311655e-08,
472
+ "loss": 0.17350658178329467,
473
+ "step": 570,
474
+ "token_acc": 0.9498697127620894
475
+ },
476
+ {
477
+ "epoch": 10.0,
478
+ "grad_norm": 0.5661698579788208,
479
+ "learning_rate": 0.0,
480
+ "loss": 0.1673359751701355,
481
+ "step": 580,
482
+ "token_acc": 0.9504393101204035
483
+ }
484
+ ],
485
+ "logging_steps": 10,
486
+ "max_steps": 580,
487
+ "num_input_tokens_seen": 0,
488
+ "num_train_epochs": 10,
489
+ "save_steps": 100,
490
+ "stateful_callbacks": {
491
+ "TrainerControl": {
492
+ "args": {
493
+ "should_epoch_stop": false,
494
+ "should_evaluate": false,
495
+ "should_log": false,
496
+ "should_save": true,
497
+ "should_training_stop": true
498
+ },
499
+ "attributes": {}
500
+ }
501
+ },
502
+ "total_flos": 2.229367912955904e+17,
503
+ "train_batch_size": 1,
504
+ "trial_name": null,
505
+ "trial_params": null
506
+ }
output/training/v1-20260117-010840-10e/checkpoint-580/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e5274be8af993948bcfc3f1251ec27de22bce224d71e604e5b270f182b3aac2
3
+ size 6993
output/training/v1-20260117-010840-10e/images/train_epoch.png ADDED
output/training/v1-20260117-010840-10e/images/train_grad_norm.png ADDED
output/training/v1-20260117-010840-10e/images/train_learning_rate.png ADDED
output/training/v1-20260117-010840-10e/images/train_loss.png ADDED
output/training/v1-20260117-010840-10e/images/train_token_acc.png ADDED
output/training/v1-20260117-010840-10e/images/train_total_flos.png ADDED
output/training/v1-20260117-010840-10e/images/train_train_loss.png ADDED
output/training/v1-20260117-010840-10e/images/train_train_runtime.png ADDED
output/training/v1-20260117-010840-10e/images/train_train_samples_per_second.png ADDED
output/training/v1-20260117-010840-10e/images/train_train_steps_per_second.png ADDED
output/training/v1-20260117-010840-10e/logging.jsonl ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"loss": 1.4861114, "grad_norm": 0.40925652, "learning_rate": 3.45e-06, "token_acc": 0.68119607, "epoch": 0.01731602, "global_step/max_steps": "1/580", "percentage": "0.17%", "elapsed_time": "11s", "remaining_time": "1h 52m 13s", "memory(GiB)": 20.84, "train_speed(iter/s)": 0.085986}
2
+ {"loss": 1.43431155, "grad_norm": 0.39773372, "learning_rate": 3.448e-05, "token_acc": 0.69200245, "epoch": 0.17316017, "global_step/max_steps": "10/580", "percentage": "1.72%", "elapsed_time": "1m 22s", "remaining_time": "1h 18m 37s", "memory(GiB)": 20.95, "train_speed(iter/s)": 0.120821}
3
+ {"loss": 1.3693718, "grad_norm": 0.24951316, "learning_rate": 6.897e-05, "token_acc": 0.70112604, "epoch": 0.34632035, "global_step/max_steps": "20/580", "percentage": "3.45%", "elapsed_time": "2m 44s", "remaining_time": "1h 16m 46s", "memory(GiB)": 28.17, "train_speed(iter/s)": 0.121558}
4
+ {"loss": 1.19222298, "grad_norm": 0.24984458, "learning_rate": 0.0001, "token_acc": 0.72698795, "epoch": 0.51948052, "global_step/max_steps": "30/580", "percentage": "5.17%", "elapsed_time": "4m 3s", "remaining_time": "1h 14m 23s", "memory(GiB)": 28.17, "train_speed(iter/s)": 0.123231}
5
+ {"loss": 1.01923475, "grad_norm": 0.32213846, "learning_rate": 9.99e-05, "token_acc": 0.7609011, "epoch": 0.69264069, "global_step/max_steps": "40/580", "percentage": "6.90%", "elapsed_time": "5m 23s", "remaining_time": "1h 12m 49s", "memory(GiB)": 28.66, "train_speed(iter/s)": 0.123595}
6
+ {"loss": 0.91503496, "grad_norm": 0.40206295, "learning_rate": 9.964e-05, "token_acc": 0.7773336, "epoch": 0.86580087, "global_step/max_steps": "50/580", "percentage": "8.62%", "elapsed_time": "6m 41s", "remaining_time": "1h 10m 55s", "memory(GiB)": 28.67, "train_speed(iter/s)": 0.124536}
7
+ {"loss": 0.77426672, "grad_norm": 0.20406531, "learning_rate": 9.922e-05, "token_acc": 0.81239426, "epoch": 1.03463203, "global_step/max_steps": "60/580", "percentage": "10.34%", "elapsed_time": "8m 2s", "remaining_time": "1h 9m 38s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124458}
8
+ {"loss": 0.69839468, "grad_norm": 1.476807, "learning_rate": 9.864e-05, "token_acc": 0.82483331, "epoch": 1.20779221, "global_step/max_steps": "70/580", "percentage": "12.07%", "elapsed_time": "9m 22s", "remaining_time": "1h 8m 14s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12455}
9
+ {"loss": 0.61382432, "grad_norm": 0.6114096, "learning_rate": 9.79e-05, "token_acc": 0.84425611, "epoch": 1.38095238, "global_step/max_steps": "80/580", "percentage": "13.79%", "elapsed_time": "10m 42s", "remaining_time": "1h 6m 57s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124452}
10
+ {"loss": 0.59754915, "grad_norm": 0.30513948, "learning_rate": 9.701e-05, "token_acc": 0.84831231, "epoch": 1.55411255, "global_step/max_steps": "90/580", "percentage": "15.52%", "elapsed_time": "12m 4s", "remaining_time": "1h 5m 42s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124286}
11
+ {"loss": 0.54105206, "grad_norm": 0.37832206, "learning_rate": 9.596e-05, "token_acc": 0.86050941, "epoch": 1.72727273, "global_step/max_steps": "100/580", "percentage": "17.24%", "elapsed_time": "13m 22s", "remaining_time": "1h 4m 11s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124632}
12
+ {"loss": 0.55311708, "grad_norm": 0.60398656, "learning_rate": 9.476e-05, "token_acc": 0.85478925, "epoch": 1.9004329, "global_step/max_steps": "110/580", "percentage": "18.97%", "elapsed_time": "14m 43s", "remaining_time": "1h 2m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124471}
13
+ {"loss": 0.5079349, "grad_norm": 0.53749853, "learning_rate": 9.342e-05, "token_acc": 0.86448883, "epoch": 2.06926407, "global_step/max_steps": "120/580", "percentage": "20.69%", "elapsed_time": "16m 2s", "remaining_time": "1h 1m 30s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124655}
14
+ {"loss": 0.44995866, "grad_norm": 0.36461964, "learning_rate": 9.194e-05, "token_acc": 0.87973977, "epoch": 2.24242424, "global_step/max_steps": "130/580", "percentage": "22.41%", "elapsed_time": "17m 21s", "remaining_time": "1h 0m 5s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124797}
15
+ {"loss": 0.42932315, "grad_norm": 1.5994767, "learning_rate": 9.032e-05, "token_acc": 0.88584906, "epoch": 2.41558442, "global_step/max_steps": "140/580", "percentage": "24.14%", "elapsed_time": "18m 44s", "remaining_time": "58m 53s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124527}
16
+ {"loss": 0.43150377, "grad_norm": 0.46518201, "learning_rate": 8.857e-05, "token_acc": 0.88193073, "epoch": 2.58874459, "global_step/max_steps": "150/580", "percentage": "25.86%", "elapsed_time": "20m 3s", "remaining_time": "57m 31s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124597}
17
+ {"loss": 0.40119166, "grad_norm": 0.69114858, "learning_rate": 8.669e-05, "token_acc": 0.88962249, "epoch": 2.76190476, "global_step/max_steps": "160/580", "percentage": "27.59%", "elapsed_time": "21m 21s", "remaining_time": "56m 5s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124813}
18
+ {"loss": 0.40500436, "grad_norm": 0.35404441, "learning_rate": 8.469e-05, "token_acc": 0.89176467, "epoch": 2.93506494, "global_step/max_steps": "170/580", "percentage": "29.31%", "elapsed_time": "22m 42s", "remaining_time": "54m 47s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124732}
19
+ {"loss": 0.37092483, "grad_norm": 0.34128174, "learning_rate": 8.259e-05, "token_acc": 0.89772912, "epoch": 3.1038961, "global_step/max_steps": "180/580", "percentage": "31.03%", "elapsed_time": "23m 59s", "remaining_time": "53m 19s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125023}
20
+ {"loss": 0.34410374, "grad_norm": 0.34155095, "learning_rate": 8.037e-05, "token_acc": 0.90652066, "epoch": 3.27705628, "global_step/max_steps": "190/580", "percentage": "32.76%", "elapsed_time": "25m 22s", "remaining_time": "52m 5s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124775}
21
+ {"loss": 0.35043871, "grad_norm": 0.36273357, "learning_rate": 7.806e-05, "token_acc": 0.90027623, "epoch": 3.45021645, "global_step/max_steps": "200/580", "percentage": "34.48%", "elapsed_time": "26m 40s", "remaining_time": "50m 41s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124947}
22
+ {"loss": 0.35581975, "grad_norm": 0.81410897, "learning_rate": 7.566e-05, "token_acc": 0.899916, "epoch": 3.62337662, "global_step/max_steps": "210/580", "percentage": "36.21%", "elapsed_time": "28m 0s", "remaining_time": "49m 20s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124974}
23
+ {"loss": 0.33676348, "grad_norm": 0.61765021, "learning_rate": 7.317e-05, "token_acc": 0.90481615, "epoch": 3.7965368, "global_step/max_steps": "220/580", "percentage": "37.93%", "elapsed_time": "29m 21s", "remaining_time": "48m 1s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124928}
24
+ {"loss": 0.35226543, "grad_norm": 0.4928762, "learning_rate": 7.061e-05, "token_acc": 0.9020813, "epoch": 3.96969697, "global_step/max_steps": "230/580", "percentage": "39.66%", "elapsed_time": "30m 40s", "remaining_time": "46m 40s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124975}
25
+ {"loss": 0.30578406, "grad_norm": 0.60574222, "learning_rate": 6.798e-05, "token_acc": 0.91238966, "epoch": 4.13852814, "global_step/max_steps": "240/580", "percentage": "41.38%", "elapsed_time": "32m 0s", "remaining_time": "45m 21s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124951}
26
+ {"loss": 0.30134411, "grad_norm": 12.58501434, "learning_rate": 6.529e-05, "token_acc": 0.91423631, "epoch": 4.31168831, "global_step/max_steps": "250/580", "percentage": "43.10%", "elapsed_time": "33m 18s", "remaining_time": "43m 58s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125065}
27
+ {"loss": 0.28981488, "grad_norm": 0.32902831, "learning_rate": 6.255e-05, "token_acc": 0.91778892, "epoch": 4.48484848, "global_step/max_steps": "260/580", "percentage": "44.83%", "elapsed_time": "34m 36s", "remaining_time": "42m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125195}
28
+ {"loss": 0.29072208, "grad_norm": 0.39732787, "learning_rate": 5.977e-05, "token_acc": 0.91725888, "epoch": 4.65800866, "global_step/max_steps": "270/580", "percentage": "46.55%", "elapsed_time": "35m 57s", "remaining_time": "41m 17s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12514}
29
+ {"loss": 0.28525302, "grad_norm": 0.44461533, "learning_rate": 5.696e-05, "token_acc": 0.91667752, "epoch": 4.83116883, "global_step/max_steps": "280/580", "percentage": "48.28%", "elapsed_time": "37m 17s", "remaining_time": "39m 56s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125165}
30
+ {"loss": 0.30205827, "grad_norm": 0.35245049, "learning_rate": 5.413e-05, "token_acc": 0.91382089, "epoch": 5.0, "global_step/max_steps": "290/580", "percentage": "50.00%", "elapsed_time": "38m 35s", "remaining_time": "38m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125245}
31
+ {"loss": 0.23758175, "grad_norm": 0.36154425, "learning_rate": 5.128e-05, "token_acc": 0.92829454, "epoch": 5.17316017, "global_step/max_steps": "300/580", "percentage": "51.72%", "elapsed_time": "39m 53s", "remaining_time": "37m 14s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12533}
32
+ {"loss": 0.27498043, "grad_norm": 0.402962, "learning_rate": 4.843e-05, "token_acc": 0.92176818, "epoch": 5.34632035, "global_step/max_steps": "310/580", "percentage": "53.45%", "elapsed_time": "41m 13s", "remaining_time": "35m 54s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125338}
33
+ {"loss": 0.25760767, "grad_norm": 0.97008127, "learning_rate": 4.559e-05, "token_acc": 0.92520932, "epoch": 5.51948052, "global_step/max_steps": "320/580", "percentage": "55.17%", "elapsed_time": "42m 36s", "remaining_time": "34m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125186}
34
+ {"loss": 0.24246125, "grad_norm": 0.43049768, "learning_rate": 4.276e-05, "token_acc": 0.9276378, "epoch": 5.69264069, "global_step/max_steps": "330/580", "percentage": "56.90%", "elapsed_time": "43m 55s", "remaining_time": "33m 16s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125224}
35
+ {"loss": 0.25920277, "grad_norm": 0.46521387, "learning_rate": 3.995e-05, "token_acc": 0.92409673, "epoch": 5.86580087, "global_step/max_steps": "340/580", "percentage": "58.62%", "elapsed_time": "45m 13s", "remaining_time": "31m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125279}
36
+ {"loss": 0.23348081, "grad_norm": 0.3748042, "learning_rate": 3.717e-05, "token_acc": 0.92994008, "epoch": 6.03463203, "global_step/max_steps": "350/580", "percentage": "60.34%", "elapsed_time": "46m 33s", "remaining_time": "30m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125308}
37
+ {"loss": 0.23487866, "grad_norm": 0.53892863, "learning_rate": 3.444e-05, "token_acc": 0.93005129, "epoch": 6.20779221, "global_step/max_steps": "360/580", "percentage": "62.07%", "elapsed_time": "47m 52s", "remaining_time": "29m 15s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125347}
38
+ {"loss": 0.2172498, "grad_norm": 0.46141779, "learning_rate": 3.176e-05, "token_acc": 0.93600884, "epoch": 6.38095238, "global_step/max_steps": "370/580", "percentage": "63.79%", "elapsed_time": "49m 13s", "remaining_time": "27m 56s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12527}
39
+ {"loss": 0.20992758, "grad_norm": 0.37730023, "learning_rate": 2.913e-05, "token_acc": 0.93624156, "epoch": 6.55411255, "global_step/max_steps": "380/580", "percentage": "65.52%", "elapsed_time": "50m 31s", "remaining_time": "26m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125335}
40
+ {"loss": 0.21530077, "grad_norm": 1.18984973, "learning_rate": 2.658e-05, "token_acc": 0.93674962, "epoch": 6.72727273, "global_step/max_steps": "390/580", "percentage": "67.24%", "elapsed_time": "51m 50s", "remaining_time": "25m 15s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125393}
41
+ {"loss": 0.22280853, "grad_norm": 0.85869294, "learning_rate": 2.41e-05, "token_acc": 0.93415882, "epoch": 6.9004329, "global_step/max_steps": "400/580", "percentage": "68.97%", "elapsed_time": "53m 10s", "remaining_time": "23m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125373}
42
+ {"loss": 0.21320977, "grad_norm": 0.51493061, "learning_rate": 2.171e-05, "token_acc": 0.93832944, "epoch": 7.06926407, "global_step/max_steps": "410/580", "percentage": "70.69%", "elapsed_time": "54m 30s", "remaining_time": "22m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125348}
43
+ {"loss": 0.1985554, "grad_norm": 0.4976542, "learning_rate": 1.94e-05, "token_acc": 0.94013913, "epoch": 7.24242424, "global_step/max_steps": "420/580", "percentage": "72.41%", "elapsed_time": "55m 49s", "remaining_time": "21m 15s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125401}
44
+ {"loss": 0.20875981, "grad_norm": 0.47794816, "learning_rate": 1.72e-05, "token_acc": 0.93737783, "epoch": 7.41558442, "global_step/max_steps": "430/580", "percentage": "74.14%", "elapsed_time": "57m 8s", "remaining_time": "19m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125431}
45
+ {"loss": 0.19188998, "grad_norm": 0.60223591, "learning_rate": 1.51e-05, "token_acc": 0.94298944, "epoch": 7.58874459, "global_step/max_steps": "440/580", "percentage": "75.86%", "elapsed_time": "58m 26s", "remaining_time": "18m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125498}
46
+ {"loss": 0.19333825, "grad_norm": 0.49709061, "learning_rate": 1.312e-05, "token_acc": 0.94232712, "epoch": 7.76190476, "global_step/max_steps": "450/580", "percentage": "77.59%", "elapsed_time": "59m 48s", "remaining_time": "17m 16s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125417}
47
+ {"loss": 0.18433615, "grad_norm": 0.48897108, "learning_rate": 1.125e-05, "token_acc": 0.94367034, "epoch": 7.93506494, "global_step/max_steps": "460/580", "percentage": "79.31%", "elapsed_time": "1h 1m 6s", "remaining_time": "15m 56s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125455}
48
+ {"loss": 0.19610201, "grad_norm": 0.7671445, "learning_rate": 9.52e-06, "token_acc": 0.94218002, "epoch": 8.1038961, "global_step/max_steps": "470/580", "percentage": "81.03%", "elapsed_time": "1h 2m 26s", "remaining_time": "14m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125464}
49
+ {"loss": 0.17015805, "grad_norm": 0.48931128, "learning_rate": 7.91e-06, "token_acc": 0.9497099, "epoch": 8.27705628, "global_step/max_steps": "480/580", "percentage": "82.76%", "elapsed_time": "1h 3m 44s", "remaining_time": "13m 16s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125511}
50
+ {"loss": 0.16500666, "grad_norm": 0.54070187, "learning_rate": 6.44e-06, "token_acc": 0.95088923, "epoch": 8.45021645, "global_step/max_steps": "490/580", "percentage": "84.48%", "elapsed_time": "1h 5m 4s", "remaining_time": "11m 57s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125482}
51
+ {"loss": 0.19431927, "grad_norm": 0.41522548, "learning_rate": 5.11e-06, "token_acc": 0.94300379, "epoch": 8.62337662, "global_step/max_steps": "500/580", "percentage": "86.21%", "elapsed_time": "1h 6m 26s", "remaining_time": "10m 37s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125411}
52
+ {"loss": 0.1670285, "grad_norm": 0.52570522, "learning_rate": 3.93e-06, "token_acc": 0.95018152, "epoch": 8.7965368, "global_step/max_steps": "510/580", "percentage": "87.93%", "elapsed_time": "1h 7m 47s", "remaining_time": "9m 18s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125396}
53
+ {"loss": 0.19309405, "grad_norm": 0.48933619, "learning_rate": 2.9e-06, "token_acc": 0.94387788, "epoch": 8.96969697, "global_step/max_steps": "520/580", "percentage": "89.66%", "elapsed_time": "1h 9m 6s", "remaining_time": "7m 58s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125407}
54
+ {"loss": 0.17578217, "grad_norm": 0.60733294, "learning_rate": 2.02e-06, "token_acc": 0.94987548, "epoch": 9.13852814, "global_step/max_steps": "530/580", "percentage": "91.38%", "elapsed_time": "1h 10m 29s", "remaining_time": "6m 39s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125309}
55
+ {"loss": 0.18189192, "grad_norm": 5.02008343, "learning_rate": 1.29e-06, "token_acc": 0.94547423, "epoch": 9.31168831, "global_step/max_steps": "540/580", "percentage": "93.10%", "elapsed_time": "1h 11m 48s", "remaining_time": "5m 19s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125319}
56
+ {"loss": 0.16662928, "grad_norm": 0.61253858, "learning_rate": 7.3e-07, "token_acc": 0.95021546, "epoch": 9.48484848, "global_step/max_steps": "550/580", "percentage": "94.83%", "elapsed_time": "1h 13m 7s", "remaining_time": "3m 59s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125367}
57
+ {"loss": 0.16942204, "grad_norm": 0.42454863, "learning_rate": 3.2e-07, "token_acc": 0.95084787, "epoch": 9.65800866, "global_step/max_steps": "560/580", "percentage": "96.55%", "elapsed_time": "1h 14m 25s", "remaining_time": "2m 39s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125401}
58
+ {"loss": 0.17350658, "grad_norm": 1.18298161, "learning_rate": 8e-08, "token_acc": 0.94986971, "epoch": 9.83116883, "global_step/max_steps": "570/580", "percentage": "98.28%", "elapsed_time": "1h 15m 44s", "remaining_time": "1m 19s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125429}
59
+ {"loss": 0.16733598, "grad_norm": 0.56616986, "learning_rate": 0.0, "token_acc": 0.95043931, "epoch": 10.0, "global_step/max_steps": "580/580", "percentage": "100.00%", "elapsed_time": "1h 17m 0s", "remaining_time": "0s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125516}
60
+ {"train_runtime": 4622.8006, "train_samples_per_second": 0.999, "train_steps_per_second": 0.125, "total_flos": 2.229367912955904e+17, "train_loss": 0.3817175, "epoch": 10.0, "global_step/max_steps": "580/580", "percentage": "100.00%", "elapsed_time": "1h 17m 2s", "remaining_time": "0s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125465}
61
+ {"model_parameter_info": "PeftModelForCausalLM: 3918.9627M Params (164.3397M Trainable [4.1934%]), 0.0024M Buffers.", "last_model_checkpoint": "/home/ab/document-parsing/output/training/v1-20260117-010840/checkpoint-580", "best_model_checkpoint": null, "best_metric": null, "global_step": 580, "log_history": [{"loss": 1.4861114025115967, "grad_norm": 0.4092565178871155, "learning_rate": 3.448275862068966e-06, "token_acc": 0.6811960725974412, "epoch": 0.017316017316017316, "step": 1}, {"loss": 1.4343115488688152, "grad_norm": 0.3977337181568146, "learning_rate": 3.4482758620689657e-05, "token_acc": 0.6920024476626676, "epoch": 0.17316017316017315, "step": 10}, {"loss": 1.3693717956542968, "grad_norm": 0.2495131641626358, "learning_rate": 6.896551724137931e-05, "token_acc": 0.7011260365349897, "epoch": 0.3463203463203463, "step": 20}, {"loss": 1.1922229766845702, "grad_norm": 0.24984458088874817, "learning_rate": 9.999918729041868e-05, "token_acc": 0.726987948088823, "epoch": 0.5194805194805194, "step": 30}, {"loss": 1.0192347526550294, "grad_norm": 0.3221384584903717, "learning_rate": 9.990169410465536e-05, "token_acc": 0.7609010955099522, "epoch": 0.6926406926406926, "step": 40}, {"loss": 0.9150349617004394, "grad_norm": 0.40206295251846313, "learning_rate": 9.964202208175834e-05, "token_acc": 0.7773335965518376, "epoch": 0.8658008658008658, "step": 50}, {"loss": 0.7742667198181152, "grad_norm": 0.20406530797481537, "learning_rate": 9.922101514711866e-05, "token_acc": 0.8123942631570925, "epoch": 1.0346320346320346, "step": 60}, {"loss": 0.6983946800231934, "grad_norm": 1.4768069982528687, "learning_rate": 9.864004155919543e-05, "token_acc": 0.8248333138378757, "epoch": 1.2077922077922079, "step": 70}, {"loss": 0.6138243198394775, "grad_norm": 0.611409604549408, "learning_rate": 9.790098946272177e-05, "token_acc": 0.8442561143531572, "epoch": 1.380952380952381, "step": 80}, {"loss": 0.5975491523742675, "grad_norm": 0.3051394820213318, "learning_rate": 9.700626075229738e-05, "token_acc": 0.8483123092893768, "epoch": 1.554112554112554, "step": 90}, {"loss": 0.5410520553588867, "grad_norm": 0.3783220648765564, "learning_rate": 9.595876326631154e-05, "token_acc": 0.8605094145609629, "epoch": 1.7272727272727273, "step": 100}, {"loss": 0.5531170845031739, "grad_norm": 0.6039865612983704, "learning_rate": 9.476190133656548e-05, "token_acc": 0.8547892544963617, "epoch": 1.9004329004329006, "step": 110}, {"loss": 0.5079349040985107, "grad_norm": 0.5374985337257385, "learning_rate": 9.341956472430801e-05, "token_acc": 0.864488826645558, "epoch": 2.069264069264069, "step": 120}, {"loss": 0.44995865821838377, "grad_norm": 0.364619642496109, "learning_rate": 9.193611597864139e-05, "token_acc": 0.8797397710240138, "epoch": 2.242424242424242, "step": 130}, {"loss": 0.429323148727417, "grad_norm": 1.59947669506073, "learning_rate": 9.031637625838265e-05, "token_acc": 0.8858490566037736, "epoch": 2.4155844155844157, "step": 140}, {"loss": 0.4315037727355957, "grad_norm": 0.46518200635910034, "learning_rate": 8.856560966345877e-05, "token_acc": 0.8819307344821817, "epoch": 2.588744588744589, "step": 150}, {"loss": 0.40119166374206544, "grad_norm": 0.691148579120636, "learning_rate": 8.668950612675785e-05, "token_acc": 0.8896224924972358, "epoch": 2.761904761904762, "step": 160}, {"loss": 0.40500435829162595, "grad_norm": 0.3540444076061249, "learning_rate": 8.469416292203747e-05, "token_acc": 0.8917646715924161, "epoch": 2.935064935064935, "step": 170}, {"loss": 0.37092483043670654, "grad_norm": 0.3412817418575287, "learning_rate": 8.258606484798897e-05, "token_acc": 0.8977291233149371, "epoch": 3.103896103896104, "step": 180}, {"loss": 0.344103741645813, "grad_norm": 0.34155094623565674, "learning_rate": 8.037206315285843e-05, "token_acc": 0.9065206570433051, "epoch": 3.277056277056277, "step": 190}, {"loss": 0.3504387140274048, "grad_norm": 0.3627335727214813, "learning_rate": 7.805935326811912e-05, "token_acc": 0.9002762340096682, "epoch": 3.45021645021645, "step": 200}, {"loss": 0.3558197498321533, "grad_norm": 0.8141089677810669, "learning_rate": 7.565545142355971e-05, "token_acc": 0.8999160043936163, "epoch": 3.6233766233766236, "step": 210}, {"loss": 0.33676347732543943, "grad_norm": 0.6176502108573914, "learning_rate": 7.316817021978884e-05, "token_acc": 0.904816147992892, "epoch": 3.7965367965367967, "step": 220}, {"loss": 0.35226542949676515, "grad_norm": 0.49287620186805725, "learning_rate": 7.060559323754435e-05, "token_acc": 0.9020813028578615, "epoch": 3.9696969696969697, "step": 230}, {"loss": 0.3057840585708618, "grad_norm": 0.6057422161102295, "learning_rate": 6.797604876632633e-05, "token_acc": 0.9123896645803242, "epoch": 4.138528138528138, "step": 240}, {"loss": 0.301344108581543, "grad_norm": 12.585014343261719, "learning_rate": 6.528808273773461e-05, "token_acc": 0.9142363149996737, "epoch": 4.311688311688312, "step": 250}, {"loss": 0.2898148775100708, "grad_norm": 0.32902830839157104, "learning_rate": 6.255043095147679e-05, "token_acc": 0.9177889157552563, "epoch": 4.484848484848484, "step": 260}, {"loss": 0.29072208404541017, "grad_norm": 0.39732787013053894, "learning_rate": 5.9771990684311544e-05, "token_acc": 0.917258875717698, "epoch": 4.658008658008658, "step": 270}, {"loss": 0.2852530241012573, "grad_norm": 0.44461533427238464, "learning_rate": 5.6961791774196424e-05, "token_acc": 0.9166775180675826, "epoch": 4.8311688311688314, "step": 280}, {"loss": 0.3020582675933838, "grad_norm": 0.35245048999786377, "learning_rate": 5.4128967273616625e-05, "token_acc": 0.9138208862720794, "epoch": 5.0, "step": 290}, {"loss": 0.23758175373077392, "grad_norm": 0.36154425144195557, "learning_rate": 5.128272376746972e-05, "token_acc": 0.9282945419454031, "epoch": 5.1731601731601735, "step": 300}, {"loss": 0.27498042583465576, "grad_norm": 0.40296199917793274, "learning_rate": 4.8432311451972665e-05, "token_acc": 0.9217681765679143, "epoch": 5.346320346320346, "step": 310}, {"loss": 0.2576076745986938, "grad_norm": 0.9700812697410583, "learning_rate": 4.558699407183338e-05, "token_acc": 0.9252093233763294, "epoch": 5.51948051948052, "step": 320}, {"loss": 0.2424612522125244, "grad_norm": 0.4304976761341095, "learning_rate": 4.2756018813390274e-05, "token_acc": 0.9276378041152792, "epoch": 5.692640692640692, "step": 330}, {"loss": 0.259202766418457, "grad_norm": 0.4652138650417328, "learning_rate": 3.9948586251565825e-05, "token_acc": 0.9240967292621122, "epoch": 5.865800865800866, "step": 340}, {"loss": 0.2334808111190796, "grad_norm": 0.37480419874191284, "learning_rate": 3.7173820448305755e-05, "token_acc": 0.9299400823867182, "epoch": 6.034632034632034, "step": 350}, {"loss": 0.23487865924835205, "grad_norm": 0.5389286279678345, "learning_rate": 3.444073929968284e-05, "token_acc": 0.9300512852684243, "epoch": 6.207792207792208, "step": 360}, {"loss": 0.21724979877471923, "grad_norm": 0.4614177942276001, "learning_rate": 3.175822522803623e-05, "token_acc": 0.9360088365243004, "epoch": 6.380952380952381, "step": 370}, {"loss": 0.20992758274078369, "grad_norm": 0.3773002326488495, "learning_rate": 2.9134996314395818e-05, "token_acc": 0.9362415581566618, "epoch": 6.554112554112554, "step": 380}, {"loss": 0.2153007745742798, "grad_norm": 1.1898497343063354, "learning_rate": 2.65795779650105e-05, "token_acc": 0.9367496189220204, "epoch": 6.7272727272727275, "step": 390}, {"loss": 0.22280852794647216, "grad_norm": 0.8586929440498352, "learning_rate": 2.41002752040629e-05, "token_acc": 0.9341588229918669, "epoch": 6.9004329004329, "step": 400}, {"loss": 0.21320977210998535, "grad_norm": 0.5149306058883667, "learning_rate": 2.1705145682618505e-05, "token_acc": 0.9383294431477159, "epoch": 7.06926406926407, "step": 410}, {"loss": 0.1985553979873657, "grad_norm": 0.4976541996002197, "learning_rate": 1.940197349152923e-05, "token_acc": 0.9401391309809833, "epoch": 7.242424242424242, "step": 420}, {"loss": 0.20875980854034423, "grad_norm": 0.4779481589794159, "learning_rate": 1.7198243863398273e-05, "token_acc": 0.9373778262148182, "epoch": 7.415584415584416, "step": 430}, {"loss": 0.19188997745513917, "grad_norm": 0.6022359132766724, "learning_rate": 1.510111884582463e-05, "token_acc": 0.942989444333798, "epoch": 7.588744588744589, "step": 440}, {"loss": 0.1933382511138916, "grad_norm": 0.497090607881546, "learning_rate": 1.3117414024987823e-05, "token_acc": 0.9423271204556436, "epoch": 7.761904761904762, "step": 450}, {"loss": 0.1843361496925354, "grad_norm": 0.488971084356308, "learning_rate": 1.125357637522072e-05, "token_acc": 0.9436703366987985, "epoch": 7.935064935064935, "step": 460}, {"loss": 0.19610201120376586, "grad_norm": 0.767144501209259, "learning_rate": 9.51566330655857e-06, "token_acc": 0.9421800227876946, "epoch": 8.103896103896103, "step": 470}, {"loss": 0.170158052444458, "grad_norm": 0.4893112778663635, "learning_rate": 7.909322978358913e-06, "token_acc": 0.9497098970386021, "epoch": 8.277056277056277, "step": 480}, {"loss": 0.1650066614151001, "grad_norm": 0.5407018661499023, "learning_rate": 6.439775942972609e-06, "token_acc": 0.9508892299359032, "epoch": 8.45021645021645, "step": 490}, {"loss": 0.1943192720413208, "grad_norm": 0.41522547602653503, "learning_rate": 5.111798179123173e-06, "token_acc": 0.9430037937960277, "epoch": 8.623376623376624, "step": 500}, {"loss": 0.16702849864959718, "grad_norm": 0.5257052183151245, "learning_rate": 3.929705570135711e-06, "token_acc": 0.9501815248083905, "epoch": 8.796536796536797, "step": 510}, {"loss": 0.19309405088424683, "grad_norm": 0.48933619260787964, "learning_rate": 2.897339877460398e-06, "token_acc": 0.9438778813778814, "epoch": 8.969696969696969, "step": 520}, {"loss": 0.17578216791152954, "grad_norm": 0.6073329448699951, "learning_rate": 2.018056255076256e-06, "token_acc": 0.949875481814, "epoch": 9.13852813852814, "step": 530}, {"loss": 0.18189191818237305, "grad_norm": 5.020083427429199, "learning_rate": 1.2947123453528886e-06, "token_acc": 0.9454742254092816, "epoch": 9.311688311688311, "step": 540}, {"loss": 0.16662927865982055, "grad_norm": 0.6125385761260986, "learning_rate": 7.296589918083685e-07, "token_acc": 0.9502154609558632, "epoch": 9.484848484848484, "step": 550}, {"loss": 0.16942204236984254, "grad_norm": 0.4245486259460449, "learning_rate": 3.2473259894640894e-07, "token_acc": 0.9508478741705578, "epoch": 9.658008658008658, "step": 560}, {"loss": 0.17350658178329467, "grad_norm": 1.1829816102981567, "learning_rate": 8.124916400311655e-08, "token_acc": 0.9498697127620894, "epoch": 9.831168831168831, "step": 570}, {"loss": 0.1673359751701355, "grad_norm": 0.5661698579788208, "learning_rate": 0.0, "token_acc": 0.9504393101204035, "epoch": 10.0, "step": 580}, {"train_runtime": 4622.8006, "train_samples_per_second": 0.999, "train_steps_per_second": 0.125, "total_flos": 2.229367912955904e+17, "train_loss": 0.3817174964937671, "epoch": 10.0, "step": 580}], "memory": 29.8046875}
output/training/v1-20260117-010840-10e/runs/events.out.tfevents.1768612131.5090.2113421.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b70de2b89c6fd0f0eae0667654df5c6c822d5e1f96e7052470d4c55216928190
3
+ size 25008