Upload folder using huggingface_hub
Browse files- output/training/v1-20260117-010840-10e/args.json +353 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/README.md +207 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/adapter_config.json +38 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/adapter_model.safetensors +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/additional_config.json +1 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/args.json +353 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/optimizer.pt +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/rng_state.pth +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/scheduler.pt +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/trainer_state.json +362 -0
- output/training/v1-20260117-010840-10e/checkpoint-400/training_args.bin +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/README.md +207 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/adapter_config.json +38 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/adapter_model.safetensors +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/additional_config.json +1 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/args.json +353 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/optimizer.pt +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/rng_state.pth +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/scheduler.pt +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/trainer_state.json +442 -0
- output/training/v1-20260117-010840-10e/checkpoint-500/training_args.bin +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/README.md +207 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/adapter_config.json +38 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/adapter_model.safetensors +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/additional_config.json +1 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/args.json +353 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/optimizer.pt +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/rng_state.pth +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/scheduler.pt +3 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/trainer_state.json +506 -0
- output/training/v1-20260117-010840-10e/checkpoint-580/training_args.bin +3 -0
- output/training/v1-20260117-010840-10e/images/train_epoch.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_grad_norm.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_learning_rate.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_loss.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_token_acc.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_total_flos.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_train_loss.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_train_runtime.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_train_samples_per_second.png +0 -0
- output/training/v1-20260117-010840-10e/images/train_train_steps_per_second.png +0 -0
- output/training/v1-20260117-010840-10e/logging.jsonl +61 -0
- output/training/v1-20260117-010840-10e/runs/events.out.tfevents.1768612131.5090.2113421.0 +3 -0
output/training/v1-20260117-010840-10e/args.json
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "no",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 1,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 8,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 0.0001,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 10.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 10,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "steps",
|
| 38 |
+
"save_steps": 100.0,
|
| 39 |
+
"save_total_limit": 3,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"bf16": true,
|
| 51 |
+
"fp16": false,
|
| 52 |
+
"fp16_opt_level": "O1",
|
| 53 |
+
"half_precision_backend": "auto",
|
| 54 |
+
"bf16_full_eval": false,
|
| 55 |
+
"fp16_full_eval": false,
|
| 56 |
+
"tf32": null,
|
| 57 |
+
"local_rank": -1,
|
| 58 |
+
"ddp_backend": null,
|
| 59 |
+
"tpu_num_cores": null,
|
| 60 |
+
"tpu_metrics_debug": false,
|
| 61 |
+
"debug": null,
|
| 62 |
+
"dataloader_drop_last": false,
|
| 63 |
+
"eval_steps": 100.0,
|
| 64 |
+
"dataloader_num_workers": 4,
|
| 65 |
+
"dataloader_prefetch_factor": null,
|
| 66 |
+
"past_index": -1,
|
| 67 |
+
"run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 68 |
+
"disable_tqdm": null,
|
| 69 |
+
"remove_unused_columns": true,
|
| 70 |
+
"label_names": null,
|
| 71 |
+
"load_best_model_at_end": false,
|
| 72 |
+
"metric_for_best_model": "loss",
|
| 73 |
+
"greater_is_better": false,
|
| 74 |
+
"ignore_data_skip": false,
|
| 75 |
+
"fsdp": [],
|
| 76 |
+
"fsdp_min_num_params": 0,
|
| 77 |
+
"fsdp_config": null,
|
| 78 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 79 |
+
"accelerator_config": {
|
| 80 |
+
"dispatch_batches": false
|
| 81 |
+
},
|
| 82 |
+
"parallelism_config": null,
|
| 83 |
+
"deepspeed": null,
|
| 84 |
+
"label_smoothing_factor": 0.0,
|
| 85 |
+
"optim": "adamw_torch_fused",
|
| 86 |
+
"optim_args": null,
|
| 87 |
+
"adafactor": false,
|
| 88 |
+
"group_by_length": false,
|
| 89 |
+
"length_column_name": "length",
|
| 90 |
+
"report_to": [
|
| 91 |
+
"tensorboard"
|
| 92 |
+
],
|
| 93 |
+
"project": "huggingface",
|
| 94 |
+
"trackio_space_id": "trackio",
|
| 95 |
+
"ddp_find_unused_parameters": null,
|
| 96 |
+
"ddp_bucket_cap_mb": null,
|
| 97 |
+
"ddp_broadcast_buffers": null,
|
| 98 |
+
"dataloader_pin_memory": true,
|
| 99 |
+
"dataloader_persistent_workers": false,
|
| 100 |
+
"skip_memory_metrics": true,
|
| 101 |
+
"use_legacy_prediction_loop": false,
|
| 102 |
+
"push_to_hub": false,
|
| 103 |
+
"resume_from_checkpoint": null,
|
| 104 |
+
"hub_model_id": null,
|
| 105 |
+
"hub_strategy": "every_save",
|
| 106 |
+
"hub_token": null,
|
| 107 |
+
"hub_private_repo": null,
|
| 108 |
+
"hub_always_push": false,
|
| 109 |
+
"hub_revision": null,
|
| 110 |
+
"gradient_checkpointing": true,
|
| 111 |
+
"gradient_checkpointing_kwargs": null,
|
| 112 |
+
"include_inputs_for_metrics": false,
|
| 113 |
+
"include_for_metrics": [],
|
| 114 |
+
"eval_do_concat_batches": true,
|
| 115 |
+
"fp16_backend": "auto",
|
| 116 |
+
"push_to_hub_model_id": null,
|
| 117 |
+
"push_to_hub_organization": null,
|
| 118 |
+
"push_to_hub_token": null,
|
| 119 |
+
"mp_parameters": "",
|
| 120 |
+
"auto_find_batch_size": false,
|
| 121 |
+
"full_determinism": false,
|
| 122 |
+
"torchdynamo": null,
|
| 123 |
+
"ray_scope": "last",
|
| 124 |
+
"ddp_timeout": 18000000,
|
| 125 |
+
"torch_compile": false,
|
| 126 |
+
"torch_compile_backend": null,
|
| 127 |
+
"torch_compile_mode": null,
|
| 128 |
+
"include_tokens_per_second": false,
|
| 129 |
+
"include_num_input_tokens_seen": false,
|
| 130 |
+
"neftune_noise_alpha": null,
|
| 131 |
+
"optim_target_modules": null,
|
| 132 |
+
"batch_eval_metrics": false,
|
| 133 |
+
"eval_on_start": false,
|
| 134 |
+
"use_liger_kernel": false,
|
| 135 |
+
"liger_kernel_config": null,
|
| 136 |
+
"eval_use_gather_object": false,
|
| 137 |
+
"average_tokens_across_devices": true,
|
| 138 |
+
"sortish_sampler": false,
|
| 139 |
+
"predict_with_generate": false,
|
| 140 |
+
"generation_max_length": null,
|
| 141 |
+
"generation_num_beams": null,
|
| 142 |
+
"generation_config": null,
|
| 143 |
+
"tuner_backend": "peft",
|
| 144 |
+
"vit_gradient_checkpointing": null,
|
| 145 |
+
"router_aux_loss_coef": 0.0,
|
| 146 |
+
"enable_dft_loss": false,
|
| 147 |
+
"enable_channel_loss": false,
|
| 148 |
+
"check_model": true,
|
| 149 |
+
"acc_strategy": "token",
|
| 150 |
+
"train_dataloader_shuffle": true,
|
| 151 |
+
"max_epochs": null,
|
| 152 |
+
"aligner_lr": null,
|
| 153 |
+
"vit_lr": null,
|
| 154 |
+
"use_logits_to_keep": null,
|
| 155 |
+
"ds3_gather_for_generation": true,
|
| 156 |
+
"resume_only_model": false,
|
| 157 |
+
"optimizer": null,
|
| 158 |
+
"loss_type": null,
|
| 159 |
+
"metric": null,
|
| 160 |
+
"eval_use_evalscope": false,
|
| 161 |
+
"eval_dataset": [],
|
| 162 |
+
"eval_dataset_args": null,
|
| 163 |
+
"eval_limit": null,
|
| 164 |
+
"eval_generation_config": null,
|
| 165 |
+
"extra_eval_args": null,
|
| 166 |
+
"use_flash_ckpt": false,
|
| 167 |
+
"use_ray": false,
|
| 168 |
+
"ray_exp_name": null,
|
| 169 |
+
"device_groups": null,
|
| 170 |
+
"model": "nanonets/Nanonets-OCR2-3B",
|
| 171 |
+
"model_type": "qwen2_5_vl",
|
| 172 |
+
"model_revision": null,
|
| 173 |
+
"task_type": "causal_lm",
|
| 174 |
+
"torch_dtype": "bfloat16",
|
| 175 |
+
"attn_impl": null,
|
| 176 |
+
"new_special_tokens": [],
|
| 177 |
+
"num_labels": null,
|
| 178 |
+
"problem_type": null,
|
| 179 |
+
"rope_scaling": null,
|
| 180 |
+
"device_map": null,
|
| 181 |
+
"max_memory": {},
|
| 182 |
+
"max_model_len": null,
|
| 183 |
+
"local_repo_path": null,
|
| 184 |
+
"init_strategy": null,
|
| 185 |
+
"template": "qwen2_5_vl",
|
| 186 |
+
"system": null,
|
| 187 |
+
"max_length": 8192,
|
| 188 |
+
"truncation_strategy": "delete",
|
| 189 |
+
"max_pixels": null,
|
| 190 |
+
"agent_template": null,
|
| 191 |
+
"norm_bbox": null,
|
| 192 |
+
"use_chat_template": true,
|
| 193 |
+
"padding_side": "right",
|
| 194 |
+
"padding_free": false,
|
| 195 |
+
"loss_scale": "default",
|
| 196 |
+
"sequence_parallel_size": 1,
|
| 197 |
+
"template_backend": "swift",
|
| 198 |
+
"response_prefix": null,
|
| 199 |
+
"enable_thinking": null,
|
| 200 |
+
"add_non_thinking_prefix": true,
|
| 201 |
+
"dataset": [
|
| 202 |
+
"/home/ab/document-parsing/output/datasets/train.jsonl"
|
| 203 |
+
],
|
| 204 |
+
"val_dataset": [],
|
| 205 |
+
"cached_dataset": [],
|
| 206 |
+
"cached_val_dataset": [],
|
| 207 |
+
"split_dataset_ratio": 0.0,
|
| 208 |
+
"dataset_num_proc": 1,
|
| 209 |
+
"load_from_cache_file": false,
|
| 210 |
+
"dataset_shuffle": true,
|
| 211 |
+
"val_dataset_shuffle": false,
|
| 212 |
+
"streaming": false,
|
| 213 |
+
"interleave_prob": null,
|
| 214 |
+
"stopping_strategy": "first_exhausted",
|
| 215 |
+
"shuffle_buffer_size": 1000,
|
| 216 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 217 |
+
"columns": {},
|
| 218 |
+
"strict": false,
|
| 219 |
+
"model_name": null,
|
| 220 |
+
"model_author": null,
|
| 221 |
+
"custom_dataset_info": [],
|
| 222 |
+
"quant_method": null,
|
| 223 |
+
"quant_bits": null,
|
| 224 |
+
"hqq_axis": null,
|
| 225 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 226 |
+
"bnb_4bit_quant_type": "nf4",
|
| 227 |
+
"bnb_4bit_use_double_quant": true,
|
| 228 |
+
"bnb_4bit_quant_storage": null,
|
| 229 |
+
"max_new_tokens": 64,
|
| 230 |
+
"temperature": 0.0,
|
| 231 |
+
"top_k": null,
|
| 232 |
+
"top_p": null,
|
| 233 |
+
"repetition_penalty": null,
|
| 234 |
+
"num_beams": 1,
|
| 235 |
+
"stream": false,
|
| 236 |
+
"stop_words": [],
|
| 237 |
+
"logprobs": false,
|
| 238 |
+
"top_logprobs": null,
|
| 239 |
+
"structured_outputs_regex": null,
|
| 240 |
+
"ckpt_dir": null,
|
| 241 |
+
"lora_modules": [],
|
| 242 |
+
"train_type": "lora",
|
| 243 |
+
"adapters": [],
|
| 244 |
+
"external_plugins": [],
|
| 245 |
+
"model_kwargs": {},
|
| 246 |
+
"load_args": false,
|
| 247 |
+
"load_data_args": false,
|
| 248 |
+
"packing": false,
|
| 249 |
+
"packing_length": null,
|
| 250 |
+
"packing_num_proc": 1,
|
| 251 |
+
"lazy_tokenize": true,
|
| 252 |
+
"custom_register_path": [],
|
| 253 |
+
"use_hf": false,
|
| 254 |
+
"ignore_args_error": false,
|
| 255 |
+
"use_swift_lora": false,
|
| 256 |
+
"freeze_parameters": [],
|
| 257 |
+
"freeze_parameters_regex": null,
|
| 258 |
+
"freeze_parameters_ratio": 0.0,
|
| 259 |
+
"trainable_parameters": [],
|
| 260 |
+
"trainable_parameters_regex": null,
|
| 261 |
+
"freeze_llm": false,
|
| 262 |
+
"freeze_vit": false,
|
| 263 |
+
"freeze_aligner": true,
|
| 264 |
+
"target_modules": [
|
| 265 |
+
"all-linear"
|
| 266 |
+
],
|
| 267 |
+
"target_regex": null,
|
| 268 |
+
"target_parameters": null,
|
| 269 |
+
"modules_to_save": [],
|
| 270 |
+
"lora_rank": 64,
|
| 271 |
+
"lora_alpha": 16,
|
| 272 |
+
"lora_dropout": 0.05,
|
| 273 |
+
"lora_bias": "none",
|
| 274 |
+
"lora_dtype": null,
|
| 275 |
+
"lorap_lr_ratio": null,
|
| 276 |
+
"use_rslora": false,
|
| 277 |
+
"use_dora": false,
|
| 278 |
+
"lora_ga_batch_size": 2,
|
| 279 |
+
"lora_ga_iters": 2,
|
| 280 |
+
"lora_ga_max_length": 1024,
|
| 281 |
+
"lora_ga_direction": "ArB2r",
|
| 282 |
+
"lora_ga_scale": "stable",
|
| 283 |
+
"lora_ga_stable_gamma": 16,
|
| 284 |
+
"init_weights": true,
|
| 285 |
+
"fourier_n_frequency": 2000,
|
| 286 |
+
"fourier_scaling": 300.0,
|
| 287 |
+
"boft_block_size": 4,
|
| 288 |
+
"boft_block_num": 0,
|
| 289 |
+
"boft_n_butterfly_factor": 1,
|
| 290 |
+
"boft_dropout": 0.0,
|
| 291 |
+
"vera_rank": 256,
|
| 292 |
+
"vera_projection_prng_key": 0,
|
| 293 |
+
"vera_dropout": 0.0,
|
| 294 |
+
"vera_d_initial": 0.1,
|
| 295 |
+
"adapter_act": "gelu",
|
| 296 |
+
"adapter_length": 128,
|
| 297 |
+
"use_galore": false,
|
| 298 |
+
"galore_target_modules": null,
|
| 299 |
+
"galore_rank": 128,
|
| 300 |
+
"galore_update_proj_gap": 50,
|
| 301 |
+
"galore_scale": 1.0,
|
| 302 |
+
"galore_proj_type": "std",
|
| 303 |
+
"galore_optim_per_parameter": false,
|
| 304 |
+
"galore_with_embedding": false,
|
| 305 |
+
"galore_quantization": false,
|
| 306 |
+
"galore_proj_quant": false,
|
| 307 |
+
"galore_proj_bits": 4,
|
| 308 |
+
"galore_proj_group_size": 256,
|
| 309 |
+
"galore_cos_threshold": 0.4,
|
| 310 |
+
"galore_gamma_proj": 2,
|
| 311 |
+
"galore_queue_size": 5,
|
| 312 |
+
"adalora_target_r": 8,
|
| 313 |
+
"adalora_init_r": 12,
|
| 314 |
+
"adalora_tinit": 0,
|
| 315 |
+
"adalora_tfinal": 0,
|
| 316 |
+
"adalora_deltaT": 1,
|
| 317 |
+
"adalora_beta1": 0.85,
|
| 318 |
+
"adalora_beta2": 0.85,
|
| 319 |
+
"adalora_orth_reg_weight": 0.5,
|
| 320 |
+
"llamapro_num_new_blocks": 4,
|
| 321 |
+
"llamapro_num_groups": null,
|
| 322 |
+
"lisa_activated_layers": 0,
|
| 323 |
+
"lisa_step_interval": 20,
|
| 324 |
+
"reft_layer_key": null,
|
| 325 |
+
"reft_layers": null,
|
| 326 |
+
"reft_rank": 4,
|
| 327 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 328 |
+
"reft_args": null,
|
| 329 |
+
"swanlab_token": null,
|
| 330 |
+
"swanlab_project": "ms-swift",
|
| 331 |
+
"swanlab_workspace": null,
|
| 332 |
+
"swanlab_exp_name": null,
|
| 333 |
+
"swanlab_notification_method": null,
|
| 334 |
+
"swanlab_webhook_url": null,
|
| 335 |
+
"swanlab_secret": null,
|
| 336 |
+
"swanlab_mode": "cloud",
|
| 337 |
+
"add_version": true,
|
| 338 |
+
"create_checkpoint_symlink": false,
|
| 339 |
+
"zero_hpz_partition_size": null,
|
| 340 |
+
"deepspeed_autotp_size": null,
|
| 341 |
+
"early_stop_interval": null,
|
| 342 |
+
"rank": -1,
|
| 343 |
+
"global_world_size": 1,
|
| 344 |
+
"local_world_size": 1,
|
| 345 |
+
"model_suffix": "Nanonets-OCR2-3B",
|
| 346 |
+
"model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
|
| 347 |
+
"model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
|
| 348 |
+
"model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 349 |
+
"_val_dataset_exists": [],
|
| 350 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 351 |
+
"evaluation_strategy": "steps",
|
| 352 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
|
| 353 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-400/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: ''
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
output/training/v1-20260117-010840-10e/checkpoint-400/adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": [],
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 64,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": "^(model.language_model.*\\.(down_proj|up_proj|gate_proj|v_proj|k_proj|q_proj|o_proj)|(?!(model.visual.merger))model.visual.*\\.(mlp.0|down_proj|up_proj|gate_proj|mlp.2|qkv|attn.proj))$",
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-400/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2b5ba0bce8b712e8f48caae7682b785de24c5632eb5b9ada4c276878e3e846c
|
| 3 |
+
size 657478696
|
output/training/v1-20260117-010840-10e/checkpoint-400/additional_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
|
output/training/v1-20260117-010840-10e/checkpoint-400/args.json
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "no",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 1,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 8,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 0.0001,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 10.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 10,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "steps",
|
| 38 |
+
"save_steps": 100.0,
|
| 39 |
+
"save_total_limit": 3,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"bf16": true,
|
| 51 |
+
"fp16": false,
|
| 52 |
+
"fp16_opt_level": "O1",
|
| 53 |
+
"half_precision_backend": "auto",
|
| 54 |
+
"bf16_full_eval": false,
|
| 55 |
+
"fp16_full_eval": false,
|
| 56 |
+
"tf32": null,
|
| 57 |
+
"local_rank": -1,
|
| 58 |
+
"ddp_backend": null,
|
| 59 |
+
"tpu_num_cores": null,
|
| 60 |
+
"tpu_metrics_debug": false,
|
| 61 |
+
"debug": null,
|
| 62 |
+
"dataloader_drop_last": false,
|
| 63 |
+
"eval_steps": 100.0,
|
| 64 |
+
"dataloader_num_workers": 4,
|
| 65 |
+
"dataloader_prefetch_factor": null,
|
| 66 |
+
"past_index": -1,
|
| 67 |
+
"run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 68 |
+
"disable_tqdm": null,
|
| 69 |
+
"remove_unused_columns": true,
|
| 70 |
+
"label_names": null,
|
| 71 |
+
"load_best_model_at_end": false,
|
| 72 |
+
"metric_for_best_model": "loss",
|
| 73 |
+
"greater_is_better": false,
|
| 74 |
+
"ignore_data_skip": false,
|
| 75 |
+
"fsdp": [],
|
| 76 |
+
"fsdp_min_num_params": 0,
|
| 77 |
+
"fsdp_config": null,
|
| 78 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 79 |
+
"accelerator_config": {
|
| 80 |
+
"dispatch_batches": false
|
| 81 |
+
},
|
| 82 |
+
"parallelism_config": null,
|
| 83 |
+
"deepspeed": null,
|
| 84 |
+
"label_smoothing_factor": 0.0,
|
| 85 |
+
"optim": "adamw_torch_fused",
|
| 86 |
+
"optim_args": null,
|
| 87 |
+
"adafactor": false,
|
| 88 |
+
"group_by_length": false,
|
| 89 |
+
"length_column_name": "length",
|
| 90 |
+
"report_to": [
|
| 91 |
+
"tensorboard"
|
| 92 |
+
],
|
| 93 |
+
"project": "huggingface",
|
| 94 |
+
"trackio_space_id": "trackio",
|
| 95 |
+
"ddp_find_unused_parameters": null,
|
| 96 |
+
"ddp_bucket_cap_mb": null,
|
| 97 |
+
"ddp_broadcast_buffers": null,
|
| 98 |
+
"dataloader_pin_memory": true,
|
| 99 |
+
"dataloader_persistent_workers": false,
|
| 100 |
+
"skip_memory_metrics": true,
|
| 101 |
+
"use_legacy_prediction_loop": false,
|
| 102 |
+
"push_to_hub": false,
|
| 103 |
+
"resume_from_checkpoint": null,
|
| 104 |
+
"hub_model_id": null,
|
| 105 |
+
"hub_strategy": "every_save",
|
| 106 |
+
"hub_token": null,
|
| 107 |
+
"hub_private_repo": null,
|
| 108 |
+
"hub_always_push": false,
|
| 109 |
+
"hub_revision": null,
|
| 110 |
+
"gradient_checkpointing": true,
|
| 111 |
+
"gradient_checkpointing_kwargs": null,
|
| 112 |
+
"include_inputs_for_metrics": false,
|
| 113 |
+
"include_for_metrics": [],
|
| 114 |
+
"eval_do_concat_batches": true,
|
| 115 |
+
"fp16_backend": "auto",
|
| 116 |
+
"push_to_hub_model_id": null,
|
| 117 |
+
"push_to_hub_organization": null,
|
| 118 |
+
"push_to_hub_token": null,
|
| 119 |
+
"mp_parameters": "",
|
| 120 |
+
"auto_find_batch_size": false,
|
| 121 |
+
"full_determinism": false,
|
| 122 |
+
"torchdynamo": null,
|
| 123 |
+
"ray_scope": "last",
|
| 124 |
+
"ddp_timeout": 18000000,
|
| 125 |
+
"torch_compile": false,
|
| 126 |
+
"torch_compile_backend": null,
|
| 127 |
+
"torch_compile_mode": null,
|
| 128 |
+
"include_tokens_per_second": false,
|
| 129 |
+
"include_num_input_tokens_seen": false,
|
| 130 |
+
"neftune_noise_alpha": null,
|
| 131 |
+
"optim_target_modules": null,
|
| 132 |
+
"batch_eval_metrics": false,
|
| 133 |
+
"eval_on_start": false,
|
| 134 |
+
"use_liger_kernel": false,
|
| 135 |
+
"liger_kernel_config": null,
|
| 136 |
+
"eval_use_gather_object": false,
|
| 137 |
+
"average_tokens_across_devices": true,
|
| 138 |
+
"sortish_sampler": false,
|
| 139 |
+
"predict_with_generate": false,
|
| 140 |
+
"generation_max_length": null,
|
| 141 |
+
"generation_num_beams": null,
|
| 142 |
+
"generation_config": null,
|
| 143 |
+
"tuner_backend": "peft",
|
| 144 |
+
"vit_gradient_checkpointing": null,
|
| 145 |
+
"router_aux_loss_coef": 0.0,
|
| 146 |
+
"enable_dft_loss": false,
|
| 147 |
+
"enable_channel_loss": false,
|
| 148 |
+
"check_model": true,
|
| 149 |
+
"acc_strategy": "token",
|
| 150 |
+
"train_dataloader_shuffle": true,
|
| 151 |
+
"max_epochs": null,
|
| 152 |
+
"aligner_lr": null,
|
| 153 |
+
"vit_lr": null,
|
| 154 |
+
"use_logits_to_keep": null,
|
| 155 |
+
"ds3_gather_for_generation": true,
|
| 156 |
+
"resume_only_model": false,
|
| 157 |
+
"optimizer": null,
|
| 158 |
+
"loss_type": null,
|
| 159 |
+
"metric": null,
|
| 160 |
+
"eval_use_evalscope": false,
|
| 161 |
+
"eval_dataset": [],
|
| 162 |
+
"eval_dataset_args": null,
|
| 163 |
+
"eval_limit": null,
|
| 164 |
+
"eval_generation_config": null,
|
| 165 |
+
"extra_eval_args": null,
|
| 166 |
+
"use_flash_ckpt": false,
|
| 167 |
+
"use_ray": false,
|
| 168 |
+
"ray_exp_name": null,
|
| 169 |
+
"device_groups": null,
|
| 170 |
+
"model": "nanonets/Nanonets-OCR2-3B",
|
| 171 |
+
"model_type": "qwen2_5_vl",
|
| 172 |
+
"model_revision": null,
|
| 173 |
+
"task_type": "causal_lm",
|
| 174 |
+
"torch_dtype": "bfloat16",
|
| 175 |
+
"attn_impl": null,
|
| 176 |
+
"new_special_tokens": [],
|
| 177 |
+
"num_labels": null,
|
| 178 |
+
"problem_type": null,
|
| 179 |
+
"rope_scaling": null,
|
| 180 |
+
"device_map": null,
|
| 181 |
+
"max_memory": {},
|
| 182 |
+
"max_model_len": null,
|
| 183 |
+
"local_repo_path": null,
|
| 184 |
+
"init_strategy": null,
|
| 185 |
+
"template": "qwen2_5_vl",
|
| 186 |
+
"system": null,
|
| 187 |
+
"max_length": 8192,
|
| 188 |
+
"truncation_strategy": "delete",
|
| 189 |
+
"max_pixels": null,
|
| 190 |
+
"agent_template": null,
|
| 191 |
+
"norm_bbox": null,
|
| 192 |
+
"use_chat_template": true,
|
| 193 |
+
"padding_side": "right",
|
| 194 |
+
"padding_free": false,
|
| 195 |
+
"loss_scale": "default",
|
| 196 |
+
"sequence_parallel_size": 1,
|
| 197 |
+
"template_backend": "swift",
|
| 198 |
+
"response_prefix": null,
|
| 199 |
+
"enable_thinking": null,
|
| 200 |
+
"add_non_thinking_prefix": true,
|
| 201 |
+
"dataset": [
|
| 202 |
+
"/home/ab/document-parsing/output/datasets/train.jsonl"
|
| 203 |
+
],
|
| 204 |
+
"val_dataset": [],
|
| 205 |
+
"cached_dataset": [],
|
| 206 |
+
"cached_val_dataset": [],
|
| 207 |
+
"split_dataset_ratio": 0.0,
|
| 208 |
+
"dataset_num_proc": 1,
|
| 209 |
+
"load_from_cache_file": false,
|
| 210 |
+
"dataset_shuffle": true,
|
| 211 |
+
"val_dataset_shuffle": false,
|
| 212 |
+
"streaming": false,
|
| 213 |
+
"interleave_prob": null,
|
| 214 |
+
"stopping_strategy": "first_exhausted",
|
| 215 |
+
"shuffle_buffer_size": 1000,
|
| 216 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 217 |
+
"columns": {},
|
| 218 |
+
"strict": false,
|
| 219 |
+
"model_name": null,
|
| 220 |
+
"model_author": null,
|
| 221 |
+
"custom_dataset_info": [],
|
| 222 |
+
"quant_method": null,
|
| 223 |
+
"quant_bits": null,
|
| 224 |
+
"hqq_axis": null,
|
| 225 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 226 |
+
"bnb_4bit_quant_type": "nf4",
|
| 227 |
+
"bnb_4bit_use_double_quant": true,
|
| 228 |
+
"bnb_4bit_quant_storage": null,
|
| 229 |
+
"max_new_tokens": 64,
|
| 230 |
+
"temperature": 0.0,
|
| 231 |
+
"top_k": null,
|
| 232 |
+
"top_p": null,
|
| 233 |
+
"repetition_penalty": null,
|
| 234 |
+
"num_beams": 1,
|
| 235 |
+
"stream": false,
|
| 236 |
+
"stop_words": [],
|
| 237 |
+
"logprobs": false,
|
| 238 |
+
"top_logprobs": null,
|
| 239 |
+
"structured_outputs_regex": null,
|
| 240 |
+
"ckpt_dir": null,
|
| 241 |
+
"lora_modules": [],
|
| 242 |
+
"train_type": "lora",
|
| 243 |
+
"adapters": [],
|
| 244 |
+
"external_plugins": [],
|
| 245 |
+
"model_kwargs": {},
|
| 246 |
+
"load_args": false,
|
| 247 |
+
"load_data_args": false,
|
| 248 |
+
"packing": false,
|
| 249 |
+
"packing_length": null,
|
| 250 |
+
"packing_num_proc": 1,
|
| 251 |
+
"lazy_tokenize": true,
|
| 252 |
+
"custom_register_path": [],
|
| 253 |
+
"use_hf": false,
|
| 254 |
+
"ignore_args_error": false,
|
| 255 |
+
"use_swift_lora": false,
|
| 256 |
+
"freeze_parameters": [],
|
| 257 |
+
"freeze_parameters_regex": null,
|
| 258 |
+
"freeze_parameters_ratio": 0.0,
|
| 259 |
+
"trainable_parameters": [],
|
| 260 |
+
"trainable_parameters_regex": null,
|
| 261 |
+
"freeze_llm": false,
|
| 262 |
+
"freeze_vit": false,
|
| 263 |
+
"freeze_aligner": true,
|
| 264 |
+
"target_modules": [
|
| 265 |
+
"all-linear"
|
| 266 |
+
],
|
| 267 |
+
"target_regex": null,
|
| 268 |
+
"target_parameters": null,
|
| 269 |
+
"modules_to_save": [],
|
| 270 |
+
"lora_rank": 64,
|
| 271 |
+
"lora_alpha": 16,
|
| 272 |
+
"lora_dropout": 0.05,
|
| 273 |
+
"lora_bias": "none",
|
| 274 |
+
"lora_dtype": null,
|
| 275 |
+
"lorap_lr_ratio": null,
|
| 276 |
+
"use_rslora": false,
|
| 277 |
+
"use_dora": false,
|
| 278 |
+
"lora_ga_batch_size": 2,
|
| 279 |
+
"lora_ga_iters": 2,
|
| 280 |
+
"lora_ga_max_length": 1024,
|
| 281 |
+
"lora_ga_direction": "ArB2r",
|
| 282 |
+
"lora_ga_scale": "stable",
|
| 283 |
+
"lora_ga_stable_gamma": 16,
|
| 284 |
+
"init_weights": true,
|
| 285 |
+
"fourier_n_frequency": 2000,
|
| 286 |
+
"fourier_scaling": 300.0,
|
| 287 |
+
"boft_block_size": 4,
|
| 288 |
+
"boft_block_num": 0,
|
| 289 |
+
"boft_n_butterfly_factor": 1,
|
| 290 |
+
"boft_dropout": 0.0,
|
| 291 |
+
"vera_rank": 256,
|
| 292 |
+
"vera_projection_prng_key": 0,
|
| 293 |
+
"vera_dropout": 0.0,
|
| 294 |
+
"vera_d_initial": 0.1,
|
| 295 |
+
"adapter_act": "gelu",
|
| 296 |
+
"adapter_length": 128,
|
| 297 |
+
"use_galore": false,
|
| 298 |
+
"galore_target_modules": null,
|
| 299 |
+
"galore_rank": 128,
|
| 300 |
+
"galore_update_proj_gap": 50,
|
| 301 |
+
"galore_scale": 1.0,
|
| 302 |
+
"galore_proj_type": "std",
|
| 303 |
+
"galore_optim_per_parameter": false,
|
| 304 |
+
"galore_with_embedding": false,
|
| 305 |
+
"galore_quantization": false,
|
| 306 |
+
"galore_proj_quant": false,
|
| 307 |
+
"galore_proj_bits": 4,
|
| 308 |
+
"galore_proj_group_size": 256,
|
| 309 |
+
"galore_cos_threshold": 0.4,
|
| 310 |
+
"galore_gamma_proj": 2,
|
| 311 |
+
"galore_queue_size": 5,
|
| 312 |
+
"adalora_target_r": 8,
|
| 313 |
+
"adalora_init_r": 12,
|
| 314 |
+
"adalora_tinit": 0,
|
| 315 |
+
"adalora_tfinal": 0,
|
| 316 |
+
"adalora_deltaT": 1,
|
| 317 |
+
"adalora_beta1": 0.85,
|
| 318 |
+
"adalora_beta2": 0.85,
|
| 319 |
+
"adalora_orth_reg_weight": 0.5,
|
| 320 |
+
"llamapro_num_new_blocks": 4,
|
| 321 |
+
"llamapro_num_groups": null,
|
| 322 |
+
"lisa_activated_layers": 0,
|
| 323 |
+
"lisa_step_interval": 20,
|
| 324 |
+
"reft_layer_key": null,
|
| 325 |
+
"reft_layers": null,
|
| 326 |
+
"reft_rank": 4,
|
| 327 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 328 |
+
"reft_args": null,
|
| 329 |
+
"swanlab_token": null,
|
| 330 |
+
"swanlab_project": "ms-swift",
|
| 331 |
+
"swanlab_workspace": null,
|
| 332 |
+
"swanlab_exp_name": null,
|
| 333 |
+
"swanlab_notification_method": null,
|
| 334 |
+
"swanlab_webhook_url": null,
|
| 335 |
+
"swanlab_secret": null,
|
| 336 |
+
"swanlab_mode": "cloud",
|
| 337 |
+
"add_version": true,
|
| 338 |
+
"create_checkpoint_symlink": false,
|
| 339 |
+
"zero_hpz_partition_size": null,
|
| 340 |
+
"deepspeed_autotp_size": null,
|
| 341 |
+
"early_stop_interval": null,
|
| 342 |
+
"rank": -1,
|
| 343 |
+
"global_world_size": 1,
|
| 344 |
+
"local_world_size": 1,
|
| 345 |
+
"model_suffix": "Nanonets-OCR2-3B",
|
| 346 |
+
"model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
|
| 347 |
+
"model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
|
| 348 |
+
"model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 349 |
+
"_val_dataset_exists": [],
|
| 350 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 351 |
+
"evaluation_strategy": "steps",
|
| 352 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
|
| 353 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-400/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0afd7d8505ae4933e4e78ce4c55d839caaabc686b92aa786281b243459ae37b4
|
| 3 |
+
size 1315426955
|
output/training/v1-20260117-010840-10e/checkpoint-400/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc4a4100c327fe3f0fcd1d4d8851acffbbca0e1e3e5eb0db757b527d667f5693
|
| 3 |
+
size 14645
|
output/training/v1-20260117-010840-10e/checkpoint-400/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d691cf1f75a0b30db18024d2926eda7b28204001f31010c4675f4b4a4df90aaa
|
| 3 |
+
size 1465
|
output/training/v1-20260117-010840-10e/checkpoint-400/trainer_state.json
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 6.9004329004329,
|
| 6 |
+
"eval_steps": 100.0,
|
| 7 |
+
"global_step": 400,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.017316017316017316,
|
| 14 |
+
"grad_norm": 0.4092565178871155,
|
| 15 |
+
"learning_rate": 3.448275862068966e-06,
|
| 16 |
+
"loss": 1.4861114025115967,
|
| 17 |
+
"step": 1,
|
| 18 |
+
"token_acc": 0.6811960725974412
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.17316017316017315,
|
| 22 |
+
"grad_norm": 0.3977337181568146,
|
| 23 |
+
"learning_rate": 3.4482758620689657e-05,
|
| 24 |
+
"loss": 1.4343115488688152,
|
| 25 |
+
"step": 10,
|
| 26 |
+
"token_acc": 0.6920024476626676
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.3463203463203463,
|
| 30 |
+
"grad_norm": 0.2495131641626358,
|
| 31 |
+
"learning_rate": 6.896551724137931e-05,
|
| 32 |
+
"loss": 1.3693717956542968,
|
| 33 |
+
"step": 20,
|
| 34 |
+
"token_acc": 0.7011260365349897
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.5194805194805194,
|
| 38 |
+
"grad_norm": 0.24984458088874817,
|
| 39 |
+
"learning_rate": 9.999918729041868e-05,
|
| 40 |
+
"loss": 1.1922229766845702,
|
| 41 |
+
"step": 30,
|
| 42 |
+
"token_acc": 0.726987948088823
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.6926406926406926,
|
| 46 |
+
"grad_norm": 0.3221384584903717,
|
| 47 |
+
"learning_rate": 9.990169410465536e-05,
|
| 48 |
+
"loss": 1.0192347526550294,
|
| 49 |
+
"step": 40,
|
| 50 |
+
"token_acc": 0.7609010955099522
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.8658008658008658,
|
| 54 |
+
"grad_norm": 0.40206295251846313,
|
| 55 |
+
"learning_rate": 9.964202208175834e-05,
|
| 56 |
+
"loss": 0.9150349617004394,
|
| 57 |
+
"step": 50,
|
| 58 |
+
"token_acc": 0.7773335965518376
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 1.0346320346320346,
|
| 62 |
+
"grad_norm": 0.20406530797481537,
|
| 63 |
+
"learning_rate": 9.922101514711866e-05,
|
| 64 |
+
"loss": 0.7742667198181152,
|
| 65 |
+
"step": 60,
|
| 66 |
+
"token_acc": 0.8123942631570925
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 1.2077922077922079,
|
| 70 |
+
"grad_norm": 1.4768069982528687,
|
| 71 |
+
"learning_rate": 9.864004155919543e-05,
|
| 72 |
+
"loss": 0.6983946800231934,
|
| 73 |
+
"step": 70,
|
| 74 |
+
"token_acc": 0.8248333138378757
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 1.380952380952381,
|
| 78 |
+
"grad_norm": 0.611409604549408,
|
| 79 |
+
"learning_rate": 9.790098946272177e-05,
|
| 80 |
+
"loss": 0.6138243198394775,
|
| 81 |
+
"step": 80,
|
| 82 |
+
"token_acc": 0.8442561143531572
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 1.554112554112554,
|
| 86 |
+
"grad_norm": 0.3051394820213318,
|
| 87 |
+
"learning_rate": 9.700626075229738e-05,
|
| 88 |
+
"loss": 0.5975491523742675,
|
| 89 |
+
"step": 90,
|
| 90 |
+
"token_acc": 0.8483123092893768
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 1.7272727272727273,
|
| 94 |
+
"grad_norm": 0.3783220648765564,
|
| 95 |
+
"learning_rate": 9.595876326631154e-05,
|
| 96 |
+
"loss": 0.5410520553588867,
|
| 97 |
+
"step": 100,
|
| 98 |
+
"token_acc": 0.8605094145609629
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 1.9004329004329006,
|
| 102 |
+
"grad_norm": 0.6039865612983704,
|
| 103 |
+
"learning_rate": 9.476190133656548e-05,
|
| 104 |
+
"loss": 0.5531170845031739,
|
| 105 |
+
"step": 110,
|
| 106 |
+
"token_acc": 0.8547892544963617
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 2.069264069264069,
|
| 110 |
+
"grad_norm": 0.5374985337257385,
|
| 111 |
+
"learning_rate": 9.341956472430801e-05,
|
| 112 |
+
"loss": 0.5079349040985107,
|
| 113 |
+
"step": 120,
|
| 114 |
+
"token_acc": 0.864488826645558
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 2.242424242424242,
|
| 118 |
+
"grad_norm": 0.364619642496109,
|
| 119 |
+
"learning_rate": 9.193611597864139e-05,
|
| 120 |
+
"loss": 0.44995865821838377,
|
| 121 |
+
"step": 130,
|
| 122 |
+
"token_acc": 0.8797397710240138
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 2.4155844155844157,
|
| 126 |
+
"grad_norm": 1.59947669506073,
|
| 127 |
+
"learning_rate": 9.031637625838265e-05,
|
| 128 |
+
"loss": 0.429323148727417,
|
| 129 |
+
"step": 140,
|
| 130 |
+
"token_acc": 0.8858490566037736
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 2.588744588744589,
|
| 134 |
+
"grad_norm": 0.46518200635910034,
|
| 135 |
+
"learning_rate": 8.856560966345877e-05,
|
| 136 |
+
"loss": 0.4315037727355957,
|
| 137 |
+
"step": 150,
|
| 138 |
+
"token_acc": 0.8819307344821817
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 2.761904761904762,
|
| 142 |
+
"grad_norm": 0.691148579120636,
|
| 143 |
+
"learning_rate": 8.668950612675785e-05,
|
| 144 |
+
"loss": 0.40119166374206544,
|
| 145 |
+
"step": 160,
|
| 146 |
+
"token_acc": 0.8896224924972358
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 2.935064935064935,
|
| 150 |
+
"grad_norm": 0.3540444076061249,
|
| 151 |
+
"learning_rate": 8.469416292203747e-05,
|
| 152 |
+
"loss": 0.40500435829162595,
|
| 153 |
+
"step": 170,
|
| 154 |
+
"token_acc": 0.8917646715924161
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 3.103896103896104,
|
| 158 |
+
"grad_norm": 0.3412817418575287,
|
| 159 |
+
"learning_rate": 8.258606484798897e-05,
|
| 160 |
+
"loss": 0.37092483043670654,
|
| 161 |
+
"step": 180,
|
| 162 |
+
"token_acc": 0.8977291233149371
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 3.277056277056277,
|
| 166 |
+
"grad_norm": 0.34155094623565674,
|
| 167 |
+
"learning_rate": 8.037206315285843e-05,
|
| 168 |
+
"loss": 0.344103741645813,
|
| 169 |
+
"step": 190,
|
| 170 |
+
"token_acc": 0.9065206570433051
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 3.45021645021645,
|
| 174 |
+
"grad_norm": 0.3627335727214813,
|
| 175 |
+
"learning_rate": 7.805935326811912e-05,
|
| 176 |
+
"loss": 0.3504387140274048,
|
| 177 |
+
"step": 200,
|
| 178 |
+
"token_acc": 0.9002762340096682
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 3.6233766233766236,
|
| 182 |
+
"grad_norm": 0.8141089677810669,
|
| 183 |
+
"learning_rate": 7.565545142355971e-05,
|
| 184 |
+
"loss": 0.3558197498321533,
|
| 185 |
+
"step": 210,
|
| 186 |
+
"token_acc": 0.8999160043936163
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 3.7965367965367967,
|
| 190 |
+
"grad_norm": 0.6176502108573914,
|
| 191 |
+
"learning_rate": 7.316817021978884e-05,
|
| 192 |
+
"loss": 0.33676347732543943,
|
| 193 |
+
"step": 220,
|
| 194 |
+
"token_acc": 0.904816147992892
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 3.9696969696969697,
|
| 198 |
+
"grad_norm": 0.49287620186805725,
|
| 199 |
+
"learning_rate": 7.060559323754435e-05,
|
| 200 |
+
"loss": 0.35226542949676515,
|
| 201 |
+
"step": 230,
|
| 202 |
+
"token_acc": 0.9020813028578615
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 4.138528138528138,
|
| 206 |
+
"grad_norm": 0.6057422161102295,
|
| 207 |
+
"learning_rate": 6.797604876632633e-05,
|
| 208 |
+
"loss": 0.3057840585708618,
|
| 209 |
+
"step": 240,
|
| 210 |
+
"token_acc": 0.9123896645803242
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 4.311688311688312,
|
| 214 |
+
"grad_norm": 12.585014343261719,
|
| 215 |
+
"learning_rate": 6.528808273773461e-05,
|
| 216 |
+
"loss": 0.301344108581543,
|
| 217 |
+
"step": 250,
|
| 218 |
+
"token_acc": 0.9142363149996737
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 4.484848484848484,
|
| 222 |
+
"grad_norm": 0.32902830839157104,
|
| 223 |
+
"learning_rate": 6.255043095147679e-05,
|
| 224 |
+
"loss": 0.2898148775100708,
|
| 225 |
+
"step": 260,
|
| 226 |
+
"token_acc": 0.9177889157552563
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 4.658008658008658,
|
| 230 |
+
"grad_norm": 0.39732787013053894,
|
| 231 |
+
"learning_rate": 5.9771990684311544e-05,
|
| 232 |
+
"loss": 0.29072208404541017,
|
| 233 |
+
"step": 270,
|
| 234 |
+
"token_acc": 0.917258875717698
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 4.8311688311688314,
|
| 238 |
+
"grad_norm": 0.44461533427238464,
|
| 239 |
+
"learning_rate": 5.6961791774196424e-05,
|
| 240 |
+
"loss": 0.2852530241012573,
|
| 241 |
+
"step": 280,
|
| 242 |
+
"token_acc": 0.9166775180675826
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 5.0,
|
| 246 |
+
"grad_norm": 0.35245048999786377,
|
| 247 |
+
"learning_rate": 5.4128967273616625e-05,
|
| 248 |
+
"loss": 0.3020582675933838,
|
| 249 |
+
"step": 290,
|
| 250 |
+
"token_acc": 0.9138208862720794
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 5.1731601731601735,
|
| 254 |
+
"grad_norm": 0.36154425144195557,
|
| 255 |
+
"learning_rate": 5.128272376746972e-05,
|
| 256 |
+
"loss": 0.23758175373077392,
|
| 257 |
+
"step": 300,
|
| 258 |
+
"token_acc": 0.9282945419454031
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 5.346320346320346,
|
| 262 |
+
"grad_norm": 0.40296199917793274,
|
| 263 |
+
"learning_rate": 4.8432311451972665e-05,
|
| 264 |
+
"loss": 0.27498042583465576,
|
| 265 |
+
"step": 310,
|
| 266 |
+
"token_acc": 0.9217681765679143
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 5.51948051948052,
|
| 270 |
+
"grad_norm": 0.9700812697410583,
|
| 271 |
+
"learning_rate": 4.558699407183338e-05,
|
| 272 |
+
"loss": 0.2576076745986938,
|
| 273 |
+
"step": 320,
|
| 274 |
+
"token_acc": 0.9252093233763294
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 5.692640692640692,
|
| 278 |
+
"grad_norm": 0.4304976761341095,
|
| 279 |
+
"learning_rate": 4.2756018813390274e-05,
|
| 280 |
+
"loss": 0.2424612522125244,
|
| 281 |
+
"step": 330,
|
| 282 |
+
"token_acc": 0.9276378041152792
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 5.865800865800866,
|
| 286 |
+
"grad_norm": 0.4652138650417328,
|
| 287 |
+
"learning_rate": 3.9948586251565825e-05,
|
| 288 |
+
"loss": 0.259202766418457,
|
| 289 |
+
"step": 340,
|
| 290 |
+
"token_acc": 0.9240967292621122
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 6.034632034632034,
|
| 294 |
+
"grad_norm": 0.37480419874191284,
|
| 295 |
+
"learning_rate": 3.7173820448305755e-05,
|
| 296 |
+
"loss": 0.2334808111190796,
|
| 297 |
+
"step": 350,
|
| 298 |
+
"token_acc": 0.9299400823867182
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 6.207792207792208,
|
| 302 |
+
"grad_norm": 0.5389286279678345,
|
| 303 |
+
"learning_rate": 3.444073929968284e-05,
|
| 304 |
+
"loss": 0.23487865924835205,
|
| 305 |
+
"step": 360,
|
| 306 |
+
"token_acc": 0.9300512852684243
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 6.380952380952381,
|
| 310 |
+
"grad_norm": 0.4614177942276001,
|
| 311 |
+
"learning_rate": 3.175822522803623e-05,
|
| 312 |
+
"loss": 0.21724979877471923,
|
| 313 |
+
"step": 370,
|
| 314 |
+
"token_acc": 0.9360088365243004
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 6.554112554112554,
|
| 318 |
+
"grad_norm": 0.3773002326488495,
|
| 319 |
+
"learning_rate": 2.9134996314395818e-05,
|
| 320 |
+
"loss": 0.20992758274078369,
|
| 321 |
+
"step": 380,
|
| 322 |
+
"token_acc": 0.9362415581566618
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 6.7272727272727275,
|
| 326 |
+
"grad_norm": 1.1898497343063354,
|
| 327 |
+
"learning_rate": 2.65795779650105e-05,
|
| 328 |
+
"loss": 0.2153007745742798,
|
| 329 |
+
"step": 390,
|
| 330 |
+
"token_acc": 0.9367496189220204
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 6.9004329004329,
|
| 334 |
+
"grad_norm": 0.8586929440498352,
|
| 335 |
+
"learning_rate": 2.41002752040629e-05,
|
| 336 |
+
"loss": 0.22280852794647216,
|
| 337 |
+
"step": 400,
|
| 338 |
+
"token_acc": 0.9341588229918669
|
| 339 |
+
}
|
| 340 |
+
],
|
| 341 |
+
"logging_steps": 10,
|
| 342 |
+
"max_steps": 580,
|
| 343 |
+
"num_input_tokens_seen": 0,
|
| 344 |
+
"num_train_epochs": 10,
|
| 345 |
+
"save_steps": 100,
|
| 346 |
+
"stateful_callbacks": {
|
| 347 |
+
"TrainerControl": {
|
| 348 |
+
"args": {
|
| 349 |
+
"should_epoch_stop": false,
|
| 350 |
+
"should_evaluate": false,
|
| 351 |
+
"should_log": false,
|
| 352 |
+
"should_save": true,
|
| 353 |
+
"should_training_stop": false
|
| 354 |
+
},
|
| 355 |
+
"attributes": {}
|
| 356 |
+
}
|
| 357 |
+
},
|
| 358 |
+
"total_flos": 1.5383232587218944e+17,
|
| 359 |
+
"train_batch_size": 1,
|
| 360 |
+
"trial_name": null,
|
| 361 |
+
"trial_params": null
|
| 362 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-400/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e5274be8af993948bcfc3f1251ec27de22bce224d71e604e5b270f182b3aac2
|
| 3 |
+
size 6993
|
output/training/v1-20260117-010840-10e/checkpoint-500/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: ''
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
output/training/v1-20260117-010840-10e/checkpoint-500/adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": [],
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 64,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": "^(model.language_model.*\\.(down_proj|up_proj|gate_proj|v_proj|k_proj|q_proj|o_proj)|(?!(model.visual.merger))model.visual.*\\.(mlp.0|down_proj|up_proj|gate_proj|mlp.2|qkv|attn.proj))$",
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-500/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:144e3e85649e1f95c3bb79452fc4d9c71cadc539fa8e343f61d82c3f80d5b711
|
| 3 |
+
size 657478696
|
output/training/v1-20260117-010840-10e/checkpoint-500/additional_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
|
output/training/v1-20260117-010840-10e/checkpoint-500/args.json
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "no",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 1,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 8,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 0.0001,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 10.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 10,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "steps",
|
| 38 |
+
"save_steps": 100.0,
|
| 39 |
+
"save_total_limit": 3,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"bf16": true,
|
| 51 |
+
"fp16": false,
|
| 52 |
+
"fp16_opt_level": "O1",
|
| 53 |
+
"half_precision_backend": "auto",
|
| 54 |
+
"bf16_full_eval": false,
|
| 55 |
+
"fp16_full_eval": false,
|
| 56 |
+
"tf32": null,
|
| 57 |
+
"local_rank": -1,
|
| 58 |
+
"ddp_backend": null,
|
| 59 |
+
"tpu_num_cores": null,
|
| 60 |
+
"tpu_metrics_debug": false,
|
| 61 |
+
"debug": null,
|
| 62 |
+
"dataloader_drop_last": false,
|
| 63 |
+
"eval_steps": 100.0,
|
| 64 |
+
"dataloader_num_workers": 4,
|
| 65 |
+
"dataloader_prefetch_factor": null,
|
| 66 |
+
"past_index": -1,
|
| 67 |
+
"run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 68 |
+
"disable_tqdm": null,
|
| 69 |
+
"remove_unused_columns": true,
|
| 70 |
+
"label_names": null,
|
| 71 |
+
"load_best_model_at_end": false,
|
| 72 |
+
"metric_for_best_model": "loss",
|
| 73 |
+
"greater_is_better": false,
|
| 74 |
+
"ignore_data_skip": false,
|
| 75 |
+
"fsdp": [],
|
| 76 |
+
"fsdp_min_num_params": 0,
|
| 77 |
+
"fsdp_config": null,
|
| 78 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 79 |
+
"accelerator_config": {
|
| 80 |
+
"dispatch_batches": false
|
| 81 |
+
},
|
| 82 |
+
"parallelism_config": null,
|
| 83 |
+
"deepspeed": null,
|
| 84 |
+
"label_smoothing_factor": 0.0,
|
| 85 |
+
"optim": "adamw_torch_fused",
|
| 86 |
+
"optim_args": null,
|
| 87 |
+
"adafactor": false,
|
| 88 |
+
"group_by_length": false,
|
| 89 |
+
"length_column_name": "length",
|
| 90 |
+
"report_to": [
|
| 91 |
+
"tensorboard"
|
| 92 |
+
],
|
| 93 |
+
"project": "huggingface",
|
| 94 |
+
"trackio_space_id": "trackio",
|
| 95 |
+
"ddp_find_unused_parameters": null,
|
| 96 |
+
"ddp_bucket_cap_mb": null,
|
| 97 |
+
"ddp_broadcast_buffers": null,
|
| 98 |
+
"dataloader_pin_memory": true,
|
| 99 |
+
"dataloader_persistent_workers": false,
|
| 100 |
+
"skip_memory_metrics": true,
|
| 101 |
+
"use_legacy_prediction_loop": false,
|
| 102 |
+
"push_to_hub": false,
|
| 103 |
+
"resume_from_checkpoint": null,
|
| 104 |
+
"hub_model_id": null,
|
| 105 |
+
"hub_strategy": "every_save",
|
| 106 |
+
"hub_token": null,
|
| 107 |
+
"hub_private_repo": null,
|
| 108 |
+
"hub_always_push": false,
|
| 109 |
+
"hub_revision": null,
|
| 110 |
+
"gradient_checkpointing": true,
|
| 111 |
+
"gradient_checkpointing_kwargs": null,
|
| 112 |
+
"include_inputs_for_metrics": false,
|
| 113 |
+
"include_for_metrics": [],
|
| 114 |
+
"eval_do_concat_batches": true,
|
| 115 |
+
"fp16_backend": "auto",
|
| 116 |
+
"push_to_hub_model_id": null,
|
| 117 |
+
"push_to_hub_organization": null,
|
| 118 |
+
"push_to_hub_token": null,
|
| 119 |
+
"mp_parameters": "",
|
| 120 |
+
"auto_find_batch_size": false,
|
| 121 |
+
"full_determinism": false,
|
| 122 |
+
"torchdynamo": null,
|
| 123 |
+
"ray_scope": "last",
|
| 124 |
+
"ddp_timeout": 18000000,
|
| 125 |
+
"torch_compile": false,
|
| 126 |
+
"torch_compile_backend": null,
|
| 127 |
+
"torch_compile_mode": null,
|
| 128 |
+
"include_tokens_per_second": false,
|
| 129 |
+
"include_num_input_tokens_seen": false,
|
| 130 |
+
"neftune_noise_alpha": null,
|
| 131 |
+
"optim_target_modules": null,
|
| 132 |
+
"batch_eval_metrics": false,
|
| 133 |
+
"eval_on_start": false,
|
| 134 |
+
"use_liger_kernel": false,
|
| 135 |
+
"liger_kernel_config": null,
|
| 136 |
+
"eval_use_gather_object": false,
|
| 137 |
+
"average_tokens_across_devices": true,
|
| 138 |
+
"sortish_sampler": false,
|
| 139 |
+
"predict_with_generate": false,
|
| 140 |
+
"generation_max_length": null,
|
| 141 |
+
"generation_num_beams": null,
|
| 142 |
+
"generation_config": null,
|
| 143 |
+
"tuner_backend": "peft",
|
| 144 |
+
"vit_gradient_checkpointing": null,
|
| 145 |
+
"router_aux_loss_coef": 0.0,
|
| 146 |
+
"enable_dft_loss": false,
|
| 147 |
+
"enable_channel_loss": false,
|
| 148 |
+
"check_model": true,
|
| 149 |
+
"acc_strategy": "token",
|
| 150 |
+
"train_dataloader_shuffle": true,
|
| 151 |
+
"max_epochs": null,
|
| 152 |
+
"aligner_lr": null,
|
| 153 |
+
"vit_lr": null,
|
| 154 |
+
"use_logits_to_keep": null,
|
| 155 |
+
"ds3_gather_for_generation": true,
|
| 156 |
+
"resume_only_model": false,
|
| 157 |
+
"optimizer": null,
|
| 158 |
+
"loss_type": null,
|
| 159 |
+
"metric": null,
|
| 160 |
+
"eval_use_evalscope": false,
|
| 161 |
+
"eval_dataset": [],
|
| 162 |
+
"eval_dataset_args": null,
|
| 163 |
+
"eval_limit": null,
|
| 164 |
+
"eval_generation_config": null,
|
| 165 |
+
"extra_eval_args": null,
|
| 166 |
+
"use_flash_ckpt": false,
|
| 167 |
+
"use_ray": false,
|
| 168 |
+
"ray_exp_name": null,
|
| 169 |
+
"device_groups": null,
|
| 170 |
+
"model": "nanonets/Nanonets-OCR2-3B",
|
| 171 |
+
"model_type": "qwen2_5_vl",
|
| 172 |
+
"model_revision": null,
|
| 173 |
+
"task_type": "causal_lm",
|
| 174 |
+
"torch_dtype": "bfloat16",
|
| 175 |
+
"attn_impl": null,
|
| 176 |
+
"new_special_tokens": [],
|
| 177 |
+
"num_labels": null,
|
| 178 |
+
"problem_type": null,
|
| 179 |
+
"rope_scaling": null,
|
| 180 |
+
"device_map": null,
|
| 181 |
+
"max_memory": {},
|
| 182 |
+
"max_model_len": null,
|
| 183 |
+
"local_repo_path": null,
|
| 184 |
+
"init_strategy": null,
|
| 185 |
+
"template": "qwen2_5_vl",
|
| 186 |
+
"system": null,
|
| 187 |
+
"max_length": 8192,
|
| 188 |
+
"truncation_strategy": "delete",
|
| 189 |
+
"max_pixels": null,
|
| 190 |
+
"agent_template": null,
|
| 191 |
+
"norm_bbox": null,
|
| 192 |
+
"use_chat_template": true,
|
| 193 |
+
"padding_side": "right",
|
| 194 |
+
"padding_free": false,
|
| 195 |
+
"loss_scale": "default",
|
| 196 |
+
"sequence_parallel_size": 1,
|
| 197 |
+
"template_backend": "swift",
|
| 198 |
+
"response_prefix": null,
|
| 199 |
+
"enable_thinking": null,
|
| 200 |
+
"add_non_thinking_prefix": true,
|
| 201 |
+
"dataset": [
|
| 202 |
+
"/home/ab/document-parsing/output/datasets/train.jsonl"
|
| 203 |
+
],
|
| 204 |
+
"val_dataset": [],
|
| 205 |
+
"cached_dataset": [],
|
| 206 |
+
"cached_val_dataset": [],
|
| 207 |
+
"split_dataset_ratio": 0.0,
|
| 208 |
+
"dataset_num_proc": 1,
|
| 209 |
+
"load_from_cache_file": false,
|
| 210 |
+
"dataset_shuffle": true,
|
| 211 |
+
"val_dataset_shuffle": false,
|
| 212 |
+
"streaming": false,
|
| 213 |
+
"interleave_prob": null,
|
| 214 |
+
"stopping_strategy": "first_exhausted",
|
| 215 |
+
"shuffle_buffer_size": 1000,
|
| 216 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 217 |
+
"columns": {},
|
| 218 |
+
"strict": false,
|
| 219 |
+
"model_name": null,
|
| 220 |
+
"model_author": null,
|
| 221 |
+
"custom_dataset_info": [],
|
| 222 |
+
"quant_method": null,
|
| 223 |
+
"quant_bits": null,
|
| 224 |
+
"hqq_axis": null,
|
| 225 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 226 |
+
"bnb_4bit_quant_type": "nf4",
|
| 227 |
+
"bnb_4bit_use_double_quant": true,
|
| 228 |
+
"bnb_4bit_quant_storage": null,
|
| 229 |
+
"max_new_tokens": 64,
|
| 230 |
+
"temperature": 0.0,
|
| 231 |
+
"top_k": null,
|
| 232 |
+
"top_p": null,
|
| 233 |
+
"repetition_penalty": null,
|
| 234 |
+
"num_beams": 1,
|
| 235 |
+
"stream": false,
|
| 236 |
+
"stop_words": [],
|
| 237 |
+
"logprobs": false,
|
| 238 |
+
"top_logprobs": null,
|
| 239 |
+
"structured_outputs_regex": null,
|
| 240 |
+
"ckpt_dir": null,
|
| 241 |
+
"lora_modules": [],
|
| 242 |
+
"train_type": "lora",
|
| 243 |
+
"adapters": [],
|
| 244 |
+
"external_plugins": [],
|
| 245 |
+
"model_kwargs": {},
|
| 246 |
+
"load_args": false,
|
| 247 |
+
"load_data_args": false,
|
| 248 |
+
"packing": false,
|
| 249 |
+
"packing_length": null,
|
| 250 |
+
"packing_num_proc": 1,
|
| 251 |
+
"lazy_tokenize": true,
|
| 252 |
+
"custom_register_path": [],
|
| 253 |
+
"use_hf": false,
|
| 254 |
+
"ignore_args_error": false,
|
| 255 |
+
"use_swift_lora": false,
|
| 256 |
+
"freeze_parameters": [],
|
| 257 |
+
"freeze_parameters_regex": null,
|
| 258 |
+
"freeze_parameters_ratio": 0.0,
|
| 259 |
+
"trainable_parameters": [],
|
| 260 |
+
"trainable_parameters_regex": null,
|
| 261 |
+
"freeze_llm": false,
|
| 262 |
+
"freeze_vit": false,
|
| 263 |
+
"freeze_aligner": true,
|
| 264 |
+
"target_modules": [
|
| 265 |
+
"all-linear"
|
| 266 |
+
],
|
| 267 |
+
"target_regex": null,
|
| 268 |
+
"target_parameters": null,
|
| 269 |
+
"modules_to_save": [],
|
| 270 |
+
"lora_rank": 64,
|
| 271 |
+
"lora_alpha": 16,
|
| 272 |
+
"lora_dropout": 0.05,
|
| 273 |
+
"lora_bias": "none",
|
| 274 |
+
"lora_dtype": null,
|
| 275 |
+
"lorap_lr_ratio": null,
|
| 276 |
+
"use_rslora": false,
|
| 277 |
+
"use_dora": false,
|
| 278 |
+
"lora_ga_batch_size": 2,
|
| 279 |
+
"lora_ga_iters": 2,
|
| 280 |
+
"lora_ga_max_length": 1024,
|
| 281 |
+
"lora_ga_direction": "ArB2r",
|
| 282 |
+
"lora_ga_scale": "stable",
|
| 283 |
+
"lora_ga_stable_gamma": 16,
|
| 284 |
+
"init_weights": true,
|
| 285 |
+
"fourier_n_frequency": 2000,
|
| 286 |
+
"fourier_scaling": 300.0,
|
| 287 |
+
"boft_block_size": 4,
|
| 288 |
+
"boft_block_num": 0,
|
| 289 |
+
"boft_n_butterfly_factor": 1,
|
| 290 |
+
"boft_dropout": 0.0,
|
| 291 |
+
"vera_rank": 256,
|
| 292 |
+
"vera_projection_prng_key": 0,
|
| 293 |
+
"vera_dropout": 0.0,
|
| 294 |
+
"vera_d_initial": 0.1,
|
| 295 |
+
"adapter_act": "gelu",
|
| 296 |
+
"adapter_length": 128,
|
| 297 |
+
"use_galore": false,
|
| 298 |
+
"galore_target_modules": null,
|
| 299 |
+
"galore_rank": 128,
|
| 300 |
+
"galore_update_proj_gap": 50,
|
| 301 |
+
"galore_scale": 1.0,
|
| 302 |
+
"galore_proj_type": "std",
|
| 303 |
+
"galore_optim_per_parameter": false,
|
| 304 |
+
"galore_with_embedding": false,
|
| 305 |
+
"galore_quantization": false,
|
| 306 |
+
"galore_proj_quant": false,
|
| 307 |
+
"galore_proj_bits": 4,
|
| 308 |
+
"galore_proj_group_size": 256,
|
| 309 |
+
"galore_cos_threshold": 0.4,
|
| 310 |
+
"galore_gamma_proj": 2,
|
| 311 |
+
"galore_queue_size": 5,
|
| 312 |
+
"adalora_target_r": 8,
|
| 313 |
+
"adalora_init_r": 12,
|
| 314 |
+
"adalora_tinit": 0,
|
| 315 |
+
"adalora_tfinal": 0,
|
| 316 |
+
"adalora_deltaT": 1,
|
| 317 |
+
"adalora_beta1": 0.85,
|
| 318 |
+
"adalora_beta2": 0.85,
|
| 319 |
+
"adalora_orth_reg_weight": 0.5,
|
| 320 |
+
"llamapro_num_new_blocks": 4,
|
| 321 |
+
"llamapro_num_groups": null,
|
| 322 |
+
"lisa_activated_layers": 0,
|
| 323 |
+
"lisa_step_interval": 20,
|
| 324 |
+
"reft_layer_key": null,
|
| 325 |
+
"reft_layers": null,
|
| 326 |
+
"reft_rank": 4,
|
| 327 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 328 |
+
"reft_args": null,
|
| 329 |
+
"swanlab_token": null,
|
| 330 |
+
"swanlab_project": "ms-swift",
|
| 331 |
+
"swanlab_workspace": null,
|
| 332 |
+
"swanlab_exp_name": null,
|
| 333 |
+
"swanlab_notification_method": null,
|
| 334 |
+
"swanlab_webhook_url": null,
|
| 335 |
+
"swanlab_secret": null,
|
| 336 |
+
"swanlab_mode": "cloud",
|
| 337 |
+
"add_version": true,
|
| 338 |
+
"create_checkpoint_symlink": false,
|
| 339 |
+
"zero_hpz_partition_size": null,
|
| 340 |
+
"deepspeed_autotp_size": null,
|
| 341 |
+
"early_stop_interval": null,
|
| 342 |
+
"rank": -1,
|
| 343 |
+
"global_world_size": 1,
|
| 344 |
+
"local_world_size": 1,
|
| 345 |
+
"model_suffix": "Nanonets-OCR2-3B",
|
| 346 |
+
"model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
|
| 347 |
+
"model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
|
| 348 |
+
"model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 349 |
+
"_val_dataset_exists": [],
|
| 350 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 351 |
+
"evaluation_strategy": "steps",
|
| 352 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
|
| 353 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-500/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35902d5d1198fb62622da98a4840274c8a8331dc3e77a6e5e7b95a3d8231fac5
|
| 3 |
+
size 1315426955
|
output/training/v1-20260117-010840-10e/checkpoint-500/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9dea05ecba7432f5405ea0b1af074f578def0664083423526d4ab725022c5bdc
|
| 3 |
+
size 14645
|
output/training/v1-20260117-010840-10e/checkpoint-500/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94c11ced83f5ac31b306f251ad9a334516c5d69155e85aa8d0a2db0dc5539a56
|
| 3 |
+
size 1465
|
output/training/v1-20260117-010840-10e/checkpoint-500/trainer_state.json
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 8.623376623376624,
|
| 6 |
+
"eval_steps": 100.0,
|
| 7 |
+
"global_step": 500,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.017316017316017316,
|
| 14 |
+
"grad_norm": 0.4092565178871155,
|
| 15 |
+
"learning_rate": 3.448275862068966e-06,
|
| 16 |
+
"loss": 1.4861114025115967,
|
| 17 |
+
"step": 1,
|
| 18 |
+
"token_acc": 0.6811960725974412
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.17316017316017315,
|
| 22 |
+
"grad_norm": 0.3977337181568146,
|
| 23 |
+
"learning_rate": 3.4482758620689657e-05,
|
| 24 |
+
"loss": 1.4343115488688152,
|
| 25 |
+
"step": 10,
|
| 26 |
+
"token_acc": 0.6920024476626676
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.3463203463203463,
|
| 30 |
+
"grad_norm": 0.2495131641626358,
|
| 31 |
+
"learning_rate": 6.896551724137931e-05,
|
| 32 |
+
"loss": 1.3693717956542968,
|
| 33 |
+
"step": 20,
|
| 34 |
+
"token_acc": 0.7011260365349897
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.5194805194805194,
|
| 38 |
+
"grad_norm": 0.24984458088874817,
|
| 39 |
+
"learning_rate": 9.999918729041868e-05,
|
| 40 |
+
"loss": 1.1922229766845702,
|
| 41 |
+
"step": 30,
|
| 42 |
+
"token_acc": 0.726987948088823
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.6926406926406926,
|
| 46 |
+
"grad_norm": 0.3221384584903717,
|
| 47 |
+
"learning_rate": 9.990169410465536e-05,
|
| 48 |
+
"loss": 1.0192347526550294,
|
| 49 |
+
"step": 40,
|
| 50 |
+
"token_acc": 0.7609010955099522
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.8658008658008658,
|
| 54 |
+
"grad_norm": 0.40206295251846313,
|
| 55 |
+
"learning_rate": 9.964202208175834e-05,
|
| 56 |
+
"loss": 0.9150349617004394,
|
| 57 |
+
"step": 50,
|
| 58 |
+
"token_acc": 0.7773335965518376
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 1.0346320346320346,
|
| 62 |
+
"grad_norm": 0.20406530797481537,
|
| 63 |
+
"learning_rate": 9.922101514711866e-05,
|
| 64 |
+
"loss": 0.7742667198181152,
|
| 65 |
+
"step": 60,
|
| 66 |
+
"token_acc": 0.8123942631570925
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 1.2077922077922079,
|
| 70 |
+
"grad_norm": 1.4768069982528687,
|
| 71 |
+
"learning_rate": 9.864004155919543e-05,
|
| 72 |
+
"loss": 0.6983946800231934,
|
| 73 |
+
"step": 70,
|
| 74 |
+
"token_acc": 0.8248333138378757
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 1.380952380952381,
|
| 78 |
+
"grad_norm": 0.611409604549408,
|
| 79 |
+
"learning_rate": 9.790098946272177e-05,
|
| 80 |
+
"loss": 0.6138243198394775,
|
| 81 |
+
"step": 80,
|
| 82 |
+
"token_acc": 0.8442561143531572
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 1.554112554112554,
|
| 86 |
+
"grad_norm": 0.3051394820213318,
|
| 87 |
+
"learning_rate": 9.700626075229738e-05,
|
| 88 |
+
"loss": 0.5975491523742675,
|
| 89 |
+
"step": 90,
|
| 90 |
+
"token_acc": 0.8483123092893768
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 1.7272727272727273,
|
| 94 |
+
"grad_norm": 0.3783220648765564,
|
| 95 |
+
"learning_rate": 9.595876326631154e-05,
|
| 96 |
+
"loss": 0.5410520553588867,
|
| 97 |
+
"step": 100,
|
| 98 |
+
"token_acc": 0.8605094145609629
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 1.9004329004329006,
|
| 102 |
+
"grad_norm": 0.6039865612983704,
|
| 103 |
+
"learning_rate": 9.476190133656548e-05,
|
| 104 |
+
"loss": 0.5531170845031739,
|
| 105 |
+
"step": 110,
|
| 106 |
+
"token_acc": 0.8547892544963617
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 2.069264069264069,
|
| 110 |
+
"grad_norm": 0.5374985337257385,
|
| 111 |
+
"learning_rate": 9.341956472430801e-05,
|
| 112 |
+
"loss": 0.5079349040985107,
|
| 113 |
+
"step": 120,
|
| 114 |
+
"token_acc": 0.864488826645558
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 2.242424242424242,
|
| 118 |
+
"grad_norm": 0.364619642496109,
|
| 119 |
+
"learning_rate": 9.193611597864139e-05,
|
| 120 |
+
"loss": 0.44995865821838377,
|
| 121 |
+
"step": 130,
|
| 122 |
+
"token_acc": 0.8797397710240138
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 2.4155844155844157,
|
| 126 |
+
"grad_norm": 1.59947669506073,
|
| 127 |
+
"learning_rate": 9.031637625838265e-05,
|
| 128 |
+
"loss": 0.429323148727417,
|
| 129 |
+
"step": 140,
|
| 130 |
+
"token_acc": 0.8858490566037736
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 2.588744588744589,
|
| 134 |
+
"grad_norm": 0.46518200635910034,
|
| 135 |
+
"learning_rate": 8.856560966345877e-05,
|
| 136 |
+
"loss": 0.4315037727355957,
|
| 137 |
+
"step": 150,
|
| 138 |
+
"token_acc": 0.8819307344821817
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 2.761904761904762,
|
| 142 |
+
"grad_norm": 0.691148579120636,
|
| 143 |
+
"learning_rate": 8.668950612675785e-05,
|
| 144 |
+
"loss": 0.40119166374206544,
|
| 145 |
+
"step": 160,
|
| 146 |
+
"token_acc": 0.8896224924972358
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 2.935064935064935,
|
| 150 |
+
"grad_norm": 0.3540444076061249,
|
| 151 |
+
"learning_rate": 8.469416292203747e-05,
|
| 152 |
+
"loss": 0.40500435829162595,
|
| 153 |
+
"step": 170,
|
| 154 |
+
"token_acc": 0.8917646715924161
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 3.103896103896104,
|
| 158 |
+
"grad_norm": 0.3412817418575287,
|
| 159 |
+
"learning_rate": 8.258606484798897e-05,
|
| 160 |
+
"loss": 0.37092483043670654,
|
| 161 |
+
"step": 180,
|
| 162 |
+
"token_acc": 0.8977291233149371
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 3.277056277056277,
|
| 166 |
+
"grad_norm": 0.34155094623565674,
|
| 167 |
+
"learning_rate": 8.037206315285843e-05,
|
| 168 |
+
"loss": 0.344103741645813,
|
| 169 |
+
"step": 190,
|
| 170 |
+
"token_acc": 0.9065206570433051
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 3.45021645021645,
|
| 174 |
+
"grad_norm": 0.3627335727214813,
|
| 175 |
+
"learning_rate": 7.805935326811912e-05,
|
| 176 |
+
"loss": 0.3504387140274048,
|
| 177 |
+
"step": 200,
|
| 178 |
+
"token_acc": 0.9002762340096682
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 3.6233766233766236,
|
| 182 |
+
"grad_norm": 0.8141089677810669,
|
| 183 |
+
"learning_rate": 7.565545142355971e-05,
|
| 184 |
+
"loss": 0.3558197498321533,
|
| 185 |
+
"step": 210,
|
| 186 |
+
"token_acc": 0.8999160043936163
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 3.7965367965367967,
|
| 190 |
+
"grad_norm": 0.6176502108573914,
|
| 191 |
+
"learning_rate": 7.316817021978884e-05,
|
| 192 |
+
"loss": 0.33676347732543943,
|
| 193 |
+
"step": 220,
|
| 194 |
+
"token_acc": 0.904816147992892
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 3.9696969696969697,
|
| 198 |
+
"grad_norm": 0.49287620186805725,
|
| 199 |
+
"learning_rate": 7.060559323754435e-05,
|
| 200 |
+
"loss": 0.35226542949676515,
|
| 201 |
+
"step": 230,
|
| 202 |
+
"token_acc": 0.9020813028578615
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 4.138528138528138,
|
| 206 |
+
"grad_norm": 0.6057422161102295,
|
| 207 |
+
"learning_rate": 6.797604876632633e-05,
|
| 208 |
+
"loss": 0.3057840585708618,
|
| 209 |
+
"step": 240,
|
| 210 |
+
"token_acc": 0.9123896645803242
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 4.311688311688312,
|
| 214 |
+
"grad_norm": 12.585014343261719,
|
| 215 |
+
"learning_rate": 6.528808273773461e-05,
|
| 216 |
+
"loss": 0.301344108581543,
|
| 217 |
+
"step": 250,
|
| 218 |
+
"token_acc": 0.9142363149996737
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 4.484848484848484,
|
| 222 |
+
"grad_norm": 0.32902830839157104,
|
| 223 |
+
"learning_rate": 6.255043095147679e-05,
|
| 224 |
+
"loss": 0.2898148775100708,
|
| 225 |
+
"step": 260,
|
| 226 |
+
"token_acc": 0.9177889157552563
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 4.658008658008658,
|
| 230 |
+
"grad_norm": 0.39732787013053894,
|
| 231 |
+
"learning_rate": 5.9771990684311544e-05,
|
| 232 |
+
"loss": 0.29072208404541017,
|
| 233 |
+
"step": 270,
|
| 234 |
+
"token_acc": 0.917258875717698
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 4.8311688311688314,
|
| 238 |
+
"grad_norm": 0.44461533427238464,
|
| 239 |
+
"learning_rate": 5.6961791774196424e-05,
|
| 240 |
+
"loss": 0.2852530241012573,
|
| 241 |
+
"step": 280,
|
| 242 |
+
"token_acc": 0.9166775180675826
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 5.0,
|
| 246 |
+
"grad_norm": 0.35245048999786377,
|
| 247 |
+
"learning_rate": 5.4128967273616625e-05,
|
| 248 |
+
"loss": 0.3020582675933838,
|
| 249 |
+
"step": 290,
|
| 250 |
+
"token_acc": 0.9138208862720794
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 5.1731601731601735,
|
| 254 |
+
"grad_norm": 0.36154425144195557,
|
| 255 |
+
"learning_rate": 5.128272376746972e-05,
|
| 256 |
+
"loss": 0.23758175373077392,
|
| 257 |
+
"step": 300,
|
| 258 |
+
"token_acc": 0.9282945419454031
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 5.346320346320346,
|
| 262 |
+
"grad_norm": 0.40296199917793274,
|
| 263 |
+
"learning_rate": 4.8432311451972665e-05,
|
| 264 |
+
"loss": 0.27498042583465576,
|
| 265 |
+
"step": 310,
|
| 266 |
+
"token_acc": 0.9217681765679143
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 5.51948051948052,
|
| 270 |
+
"grad_norm": 0.9700812697410583,
|
| 271 |
+
"learning_rate": 4.558699407183338e-05,
|
| 272 |
+
"loss": 0.2576076745986938,
|
| 273 |
+
"step": 320,
|
| 274 |
+
"token_acc": 0.9252093233763294
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 5.692640692640692,
|
| 278 |
+
"grad_norm": 0.4304976761341095,
|
| 279 |
+
"learning_rate": 4.2756018813390274e-05,
|
| 280 |
+
"loss": 0.2424612522125244,
|
| 281 |
+
"step": 330,
|
| 282 |
+
"token_acc": 0.9276378041152792
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 5.865800865800866,
|
| 286 |
+
"grad_norm": 0.4652138650417328,
|
| 287 |
+
"learning_rate": 3.9948586251565825e-05,
|
| 288 |
+
"loss": 0.259202766418457,
|
| 289 |
+
"step": 340,
|
| 290 |
+
"token_acc": 0.9240967292621122
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 6.034632034632034,
|
| 294 |
+
"grad_norm": 0.37480419874191284,
|
| 295 |
+
"learning_rate": 3.7173820448305755e-05,
|
| 296 |
+
"loss": 0.2334808111190796,
|
| 297 |
+
"step": 350,
|
| 298 |
+
"token_acc": 0.9299400823867182
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 6.207792207792208,
|
| 302 |
+
"grad_norm": 0.5389286279678345,
|
| 303 |
+
"learning_rate": 3.444073929968284e-05,
|
| 304 |
+
"loss": 0.23487865924835205,
|
| 305 |
+
"step": 360,
|
| 306 |
+
"token_acc": 0.9300512852684243
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 6.380952380952381,
|
| 310 |
+
"grad_norm": 0.4614177942276001,
|
| 311 |
+
"learning_rate": 3.175822522803623e-05,
|
| 312 |
+
"loss": 0.21724979877471923,
|
| 313 |
+
"step": 370,
|
| 314 |
+
"token_acc": 0.9360088365243004
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 6.554112554112554,
|
| 318 |
+
"grad_norm": 0.3773002326488495,
|
| 319 |
+
"learning_rate": 2.9134996314395818e-05,
|
| 320 |
+
"loss": 0.20992758274078369,
|
| 321 |
+
"step": 380,
|
| 322 |
+
"token_acc": 0.9362415581566618
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 6.7272727272727275,
|
| 326 |
+
"grad_norm": 1.1898497343063354,
|
| 327 |
+
"learning_rate": 2.65795779650105e-05,
|
| 328 |
+
"loss": 0.2153007745742798,
|
| 329 |
+
"step": 390,
|
| 330 |
+
"token_acc": 0.9367496189220204
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 6.9004329004329,
|
| 334 |
+
"grad_norm": 0.8586929440498352,
|
| 335 |
+
"learning_rate": 2.41002752040629e-05,
|
| 336 |
+
"loss": 0.22280852794647216,
|
| 337 |
+
"step": 400,
|
| 338 |
+
"token_acc": 0.9341588229918669
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 7.06926406926407,
|
| 342 |
+
"grad_norm": 0.5149306058883667,
|
| 343 |
+
"learning_rate": 2.1705145682618505e-05,
|
| 344 |
+
"loss": 0.21320977210998535,
|
| 345 |
+
"step": 410,
|
| 346 |
+
"token_acc": 0.9383294431477159
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 7.242424242424242,
|
| 350 |
+
"grad_norm": 0.4976541996002197,
|
| 351 |
+
"learning_rate": 1.940197349152923e-05,
|
| 352 |
+
"loss": 0.1985553979873657,
|
| 353 |
+
"step": 420,
|
| 354 |
+
"token_acc": 0.9401391309809833
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"epoch": 7.415584415584416,
|
| 358 |
+
"grad_norm": 0.4779481589794159,
|
| 359 |
+
"learning_rate": 1.7198243863398273e-05,
|
| 360 |
+
"loss": 0.20875980854034423,
|
| 361 |
+
"step": 430,
|
| 362 |
+
"token_acc": 0.9373778262148182
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 7.588744588744589,
|
| 366 |
+
"grad_norm": 0.6022359132766724,
|
| 367 |
+
"learning_rate": 1.510111884582463e-05,
|
| 368 |
+
"loss": 0.19188997745513917,
|
| 369 |
+
"step": 440,
|
| 370 |
+
"token_acc": 0.942989444333798
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 7.761904761904762,
|
| 374 |
+
"grad_norm": 0.497090607881546,
|
| 375 |
+
"learning_rate": 1.3117414024987823e-05,
|
| 376 |
+
"loss": 0.1933382511138916,
|
| 377 |
+
"step": 450,
|
| 378 |
+
"token_acc": 0.9423271204556436
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"epoch": 7.935064935064935,
|
| 382 |
+
"grad_norm": 0.488971084356308,
|
| 383 |
+
"learning_rate": 1.125357637522072e-05,
|
| 384 |
+
"loss": 0.1843361496925354,
|
| 385 |
+
"step": 460,
|
| 386 |
+
"token_acc": 0.9436703366987985
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"epoch": 8.103896103896103,
|
| 390 |
+
"grad_norm": 0.767144501209259,
|
| 391 |
+
"learning_rate": 9.51566330655857e-06,
|
| 392 |
+
"loss": 0.19610201120376586,
|
| 393 |
+
"step": 470,
|
| 394 |
+
"token_acc": 0.9421800227876946
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 8.277056277056277,
|
| 398 |
+
"grad_norm": 0.4893112778663635,
|
| 399 |
+
"learning_rate": 7.909322978358913e-06,
|
| 400 |
+
"loss": 0.170158052444458,
|
| 401 |
+
"step": 480,
|
| 402 |
+
"token_acc": 0.9497098970386021
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 8.45021645021645,
|
| 406 |
+
"grad_norm": 0.5407018661499023,
|
| 407 |
+
"learning_rate": 6.439775942972609e-06,
|
| 408 |
+
"loss": 0.1650066614151001,
|
| 409 |
+
"step": 490,
|
| 410 |
+
"token_acc": 0.9508892299359032
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 8.623376623376624,
|
| 414 |
+
"grad_norm": 0.41522547602653503,
|
| 415 |
+
"learning_rate": 5.111798179123173e-06,
|
| 416 |
+
"loss": 0.1943192720413208,
|
| 417 |
+
"step": 500,
|
| 418 |
+
"token_acc": 0.9430037937960277
|
| 419 |
+
}
|
| 420 |
+
],
|
| 421 |
+
"logging_steps": 10,
|
| 422 |
+
"max_steps": 580,
|
| 423 |
+
"num_input_tokens_seen": 0,
|
| 424 |
+
"num_train_epochs": 10,
|
| 425 |
+
"save_steps": 100,
|
| 426 |
+
"stateful_callbacks": {
|
| 427 |
+
"TrainerControl": {
|
| 428 |
+
"args": {
|
| 429 |
+
"should_epoch_stop": false,
|
| 430 |
+
"should_evaluate": false,
|
| 431 |
+
"should_log": false,
|
| 432 |
+
"should_save": true,
|
| 433 |
+
"should_training_stop": false
|
| 434 |
+
},
|
| 435 |
+
"attributes": {}
|
| 436 |
+
}
|
| 437 |
+
},
|
| 438 |
+
"total_flos": 1.9231358022524928e+17,
|
| 439 |
+
"train_batch_size": 1,
|
| 440 |
+
"trial_name": null,
|
| 441 |
+
"trial_params": null
|
| 442 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-500/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e5274be8af993948bcfc3f1251ec27de22bce224d71e604e5b270f182b3aac2
|
| 3 |
+
size 6993
|
output/training/v1-20260117-010840-10e/checkpoint-580/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: ''
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
output/training/v1-20260117-010840-10e/checkpoint-580/adapter_config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": [],
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 64,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": "^(model.language_model.*\\.(down_proj|up_proj|gate_proj|v_proj|k_proj|q_proj|o_proj)|(?!(model.visual.merger))model.visual.*\\.(mlp.0|down_proj|up_proj|gate_proj|mlp.2|qkv|attn.proj))$",
|
| 32 |
+
"target_parameters": null,
|
| 33 |
+
"task_type": "CAUSAL_LM",
|
| 34 |
+
"trainable_token_indices": null,
|
| 35 |
+
"use_dora": false,
|
| 36 |
+
"use_qalora": false,
|
| 37 |
+
"use_rslora": false
|
| 38 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-580/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7fbac9ce4144065f68ad19a5930a57921a1aad93aaa5d6ed500b386e5584010c
|
| 3 |
+
size 657478696
|
output/training/v1-20260117-010840-10e/checkpoint-580/additional_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
|
output/training/v1-20260117-010840-10e/checkpoint-580/args.json
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 3 |
+
"overwrite_output_dir": false,
|
| 4 |
+
"do_train": false,
|
| 5 |
+
"do_eval": false,
|
| 6 |
+
"do_predict": false,
|
| 7 |
+
"eval_strategy": "no",
|
| 8 |
+
"prediction_loss_only": false,
|
| 9 |
+
"per_device_train_batch_size": 1,
|
| 10 |
+
"per_device_eval_batch_size": 1,
|
| 11 |
+
"per_gpu_train_batch_size": null,
|
| 12 |
+
"per_gpu_eval_batch_size": null,
|
| 13 |
+
"gradient_accumulation_steps": 8,
|
| 14 |
+
"eval_accumulation_steps": null,
|
| 15 |
+
"eval_delay": 0,
|
| 16 |
+
"torch_empty_cache_steps": null,
|
| 17 |
+
"learning_rate": 0.0001,
|
| 18 |
+
"weight_decay": 0.1,
|
| 19 |
+
"adam_beta1": 0.9,
|
| 20 |
+
"adam_beta2": 0.95,
|
| 21 |
+
"adam_epsilon": 1e-08,
|
| 22 |
+
"max_grad_norm": 1.0,
|
| 23 |
+
"num_train_epochs": 10.0,
|
| 24 |
+
"max_steps": -1,
|
| 25 |
+
"lr_scheduler_type": "cosine",
|
| 26 |
+
"lr_scheduler_kwargs": null,
|
| 27 |
+
"warmup_ratio": 0.05,
|
| 28 |
+
"warmup_steps": 0,
|
| 29 |
+
"log_level": "passive",
|
| 30 |
+
"log_level_replica": "warning",
|
| 31 |
+
"log_on_each_node": true,
|
| 32 |
+
"logging_dir": "/home/ab/document-parsing/output/training/v1-20260117-010840/runs",
|
| 33 |
+
"logging_strategy": "steps",
|
| 34 |
+
"logging_first_step": true,
|
| 35 |
+
"logging_steps": 10,
|
| 36 |
+
"logging_nan_inf_filter": true,
|
| 37 |
+
"save_strategy": "steps",
|
| 38 |
+
"save_steps": 100.0,
|
| 39 |
+
"save_total_limit": 3,
|
| 40 |
+
"save_safetensors": true,
|
| 41 |
+
"save_on_each_node": false,
|
| 42 |
+
"save_only_model": false,
|
| 43 |
+
"restore_callback_states_from_checkpoint": false,
|
| 44 |
+
"no_cuda": false,
|
| 45 |
+
"use_cpu": false,
|
| 46 |
+
"use_mps_device": false,
|
| 47 |
+
"seed": 42,
|
| 48 |
+
"data_seed": 42,
|
| 49 |
+
"jit_mode_eval": false,
|
| 50 |
+
"bf16": true,
|
| 51 |
+
"fp16": false,
|
| 52 |
+
"fp16_opt_level": "O1",
|
| 53 |
+
"half_precision_backend": "auto",
|
| 54 |
+
"bf16_full_eval": false,
|
| 55 |
+
"fp16_full_eval": false,
|
| 56 |
+
"tf32": null,
|
| 57 |
+
"local_rank": -1,
|
| 58 |
+
"ddp_backend": null,
|
| 59 |
+
"tpu_num_cores": null,
|
| 60 |
+
"tpu_metrics_debug": false,
|
| 61 |
+
"debug": null,
|
| 62 |
+
"dataloader_drop_last": false,
|
| 63 |
+
"eval_steps": 100.0,
|
| 64 |
+
"dataloader_num_workers": 4,
|
| 65 |
+
"dataloader_prefetch_factor": null,
|
| 66 |
+
"past_index": -1,
|
| 67 |
+
"run_name": "/home/ab/document-parsing/output/training/v1-20260117-010840",
|
| 68 |
+
"disable_tqdm": null,
|
| 69 |
+
"remove_unused_columns": true,
|
| 70 |
+
"label_names": null,
|
| 71 |
+
"load_best_model_at_end": false,
|
| 72 |
+
"metric_for_best_model": "loss",
|
| 73 |
+
"greater_is_better": false,
|
| 74 |
+
"ignore_data_skip": false,
|
| 75 |
+
"fsdp": [],
|
| 76 |
+
"fsdp_min_num_params": 0,
|
| 77 |
+
"fsdp_config": null,
|
| 78 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 79 |
+
"accelerator_config": {
|
| 80 |
+
"dispatch_batches": false
|
| 81 |
+
},
|
| 82 |
+
"parallelism_config": null,
|
| 83 |
+
"deepspeed": null,
|
| 84 |
+
"label_smoothing_factor": 0.0,
|
| 85 |
+
"optim": "adamw_torch_fused",
|
| 86 |
+
"optim_args": null,
|
| 87 |
+
"adafactor": false,
|
| 88 |
+
"group_by_length": false,
|
| 89 |
+
"length_column_name": "length",
|
| 90 |
+
"report_to": [
|
| 91 |
+
"tensorboard"
|
| 92 |
+
],
|
| 93 |
+
"project": "huggingface",
|
| 94 |
+
"trackio_space_id": "trackio",
|
| 95 |
+
"ddp_find_unused_parameters": null,
|
| 96 |
+
"ddp_bucket_cap_mb": null,
|
| 97 |
+
"ddp_broadcast_buffers": null,
|
| 98 |
+
"dataloader_pin_memory": true,
|
| 99 |
+
"dataloader_persistent_workers": false,
|
| 100 |
+
"skip_memory_metrics": true,
|
| 101 |
+
"use_legacy_prediction_loop": false,
|
| 102 |
+
"push_to_hub": false,
|
| 103 |
+
"resume_from_checkpoint": null,
|
| 104 |
+
"hub_model_id": null,
|
| 105 |
+
"hub_strategy": "every_save",
|
| 106 |
+
"hub_token": null,
|
| 107 |
+
"hub_private_repo": null,
|
| 108 |
+
"hub_always_push": false,
|
| 109 |
+
"hub_revision": null,
|
| 110 |
+
"gradient_checkpointing": true,
|
| 111 |
+
"gradient_checkpointing_kwargs": null,
|
| 112 |
+
"include_inputs_for_metrics": false,
|
| 113 |
+
"include_for_metrics": [],
|
| 114 |
+
"eval_do_concat_batches": true,
|
| 115 |
+
"fp16_backend": "auto",
|
| 116 |
+
"push_to_hub_model_id": null,
|
| 117 |
+
"push_to_hub_organization": null,
|
| 118 |
+
"push_to_hub_token": null,
|
| 119 |
+
"mp_parameters": "",
|
| 120 |
+
"auto_find_batch_size": false,
|
| 121 |
+
"full_determinism": false,
|
| 122 |
+
"torchdynamo": null,
|
| 123 |
+
"ray_scope": "last",
|
| 124 |
+
"ddp_timeout": 18000000,
|
| 125 |
+
"torch_compile": false,
|
| 126 |
+
"torch_compile_backend": null,
|
| 127 |
+
"torch_compile_mode": null,
|
| 128 |
+
"include_tokens_per_second": false,
|
| 129 |
+
"include_num_input_tokens_seen": false,
|
| 130 |
+
"neftune_noise_alpha": null,
|
| 131 |
+
"optim_target_modules": null,
|
| 132 |
+
"batch_eval_metrics": false,
|
| 133 |
+
"eval_on_start": false,
|
| 134 |
+
"use_liger_kernel": false,
|
| 135 |
+
"liger_kernel_config": null,
|
| 136 |
+
"eval_use_gather_object": false,
|
| 137 |
+
"average_tokens_across_devices": true,
|
| 138 |
+
"sortish_sampler": false,
|
| 139 |
+
"predict_with_generate": false,
|
| 140 |
+
"generation_max_length": null,
|
| 141 |
+
"generation_num_beams": null,
|
| 142 |
+
"generation_config": null,
|
| 143 |
+
"tuner_backend": "peft",
|
| 144 |
+
"vit_gradient_checkpointing": null,
|
| 145 |
+
"router_aux_loss_coef": 0.0,
|
| 146 |
+
"enable_dft_loss": false,
|
| 147 |
+
"enable_channel_loss": false,
|
| 148 |
+
"check_model": true,
|
| 149 |
+
"acc_strategy": "token",
|
| 150 |
+
"train_dataloader_shuffle": true,
|
| 151 |
+
"max_epochs": null,
|
| 152 |
+
"aligner_lr": null,
|
| 153 |
+
"vit_lr": null,
|
| 154 |
+
"use_logits_to_keep": null,
|
| 155 |
+
"ds3_gather_for_generation": true,
|
| 156 |
+
"resume_only_model": false,
|
| 157 |
+
"optimizer": null,
|
| 158 |
+
"loss_type": null,
|
| 159 |
+
"metric": null,
|
| 160 |
+
"eval_use_evalscope": false,
|
| 161 |
+
"eval_dataset": [],
|
| 162 |
+
"eval_dataset_args": null,
|
| 163 |
+
"eval_limit": null,
|
| 164 |
+
"eval_generation_config": null,
|
| 165 |
+
"extra_eval_args": null,
|
| 166 |
+
"use_flash_ckpt": false,
|
| 167 |
+
"use_ray": false,
|
| 168 |
+
"ray_exp_name": null,
|
| 169 |
+
"device_groups": null,
|
| 170 |
+
"model": "nanonets/Nanonets-OCR2-3B",
|
| 171 |
+
"model_type": "qwen2_5_vl",
|
| 172 |
+
"model_revision": null,
|
| 173 |
+
"task_type": "causal_lm",
|
| 174 |
+
"torch_dtype": "bfloat16",
|
| 175 |
+
"attn_impl": null,
|
| 176 |
+
"new_special_tokens": [],
|
| 177 |
+
"num_labels": null,
|
| 178 |
+
"problem_type": null,
|
| 179 |
+
"rope_scaling": null,
|
| 180 |
+
"device_map": null,
|
| 181 |
+
"max_memory": {},
|
| 182 |
+
"max_model_len": null,
|
| 183 |
+
"local_repo_path": null,
|
| 184 |
+
"init_strategy": null,
|
| 185 |
+
"template": "qwen2_5_vl",
|
| 186 |
+
"system": null,
|
| 187 |
+
"max_length": 8192,
|
| 188 |
+
"truncation_strategy": "delete",
|
| 189 |
+
"max_pixels": null,
|
| 190 |
+
"agent_template": null,
|
| 191 |
+
"norm_bbox": null,
|
| 192 |
+
"use_chat_template": true,
|
| 193 |
+
"padding_side": "right",
|
| 194 |
+
"padding_free": false,
|
| 195 |
+
"loss_scale": "default",
|
| 196 |
+
"sequence_parallel_size": 1,
|
| 197 |
+
"template_backend": "swift",
|
| 198 |
+
"response_prefix": null,
|
| 199 |
+
"enable_thinking": null,
|
| 200 |
+
"add_non_thinking_prefix": true,
|
| 201 |
+
"dataset": [
|
| 202 |
+
"/home/ab/document-parsing/output/datasets/train.jsonl"
|
| 203 |
+
],
|
| 204 |
+
"val_dataset": [],
|
| 205 |
+
"cached_dataset": [],
|
| 206 |
+
"cached_val_dataset": [],
|
| 207 |
+
"split_dataset_ratio": 0.0,
|
| 208 |
+
"dataset_num_proc": 1,
|
| 209 |
+
"load_from_cache_file": false,
|
| 210 |
+
"dataset_shuffle": true,
|
| 211 |
+
"val_dataset_shuffle": false,
|
| 212 |
+
"streaming": false,
|
| 213 |
+
"interleave_prob": null,
|
| 214 |
+
"stopping_strategy": "first_exhausted",
|
| 215 |
+
"shuffle_buffer_size": 1000,
|
| 216 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 217 |
+
"columns": {},
|
| 218 |
+
"strict": false,
|
| 219 |
+
"model_name": null,
|
| 220 |
+
"model_author": null,
|
| 221 |
+
"custom_dataset_info": [],
|
| 222 |
+
"quant_method": null,
|
| 223 |
+
"quant_bits": null,
|
| 224 |
+
"hqq_axis": null,
|
| 225 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 226 |
+
"bnb_4bit_quant_type": "nf4",
|
| 227 |
+
"bnb_4bit_use_double_quant": true,
|
| 228 |
+
"bnb_4bit_quant_storage": null,
|
| 229 |
+
"max_new_tokens": 64,
|
| 230 |
+
"temperature": 0.0,
|
| 231 |
+
"top_k": null,
|
| 232 |
+
"top_p": null,
|
| 233 |
+
"repetition_penalty": null,
|
| 234 |
+
"num_beams": 1,
|
| 235 |
+
"stream": false,
|
| 236 |
+
"stop_words": [],
|
| 237 |
+
"logprobs": false,
|
| 238 |
+
"top_logprobs": null,
|
| 239 |
+
"structured_outputs_regex": null,
|
| 240 |
+
"ckpt_dir": null,
|
| 241 |
+
"lora_modules": [],
|
| 242 |
+
"train_type": "lora",
|
| 243 |
+
"adapters": [],
|
| 244 |
+
"external_plugins": [],
|
| 245 |
+
"model_kwargs": {},
|
| 246 |
+
"load_args": false,
|
| 247 |
+
"load_data_args": false,
|
| 248 |
+
"packing": false,
|
| 249 |
+
"packing_length": null,
|
| 250 |
+
"packing_num_proc": 1,
|
| 251 |
+
"lazy_tokenize": true,
|
| 252 |
+
"custom_register_path": [],
|
| 253 |
+
"use_hf": false,
|
| 254 |
+
"ignore_args_error": false,
|
| 255 |
+
"use_swift_lora": false,
|
| 256 |
+
"freeze_parameters": [],
|
| 257 |
+
"freeze_parameters_regex": null,
|
| 258 |
+
"freeze_parameters_ratio": 0.0,
|
| 259 |
+
"trainable_parameters": [],
|
| 260 |
+
"trainable_parameters_regex": null,
|
| 261 |
+
"freeze_llm": false,
|
| 262 |
+
"freeze_vit": false,
|
| 263 |
+
"freeze_aligner": true,
|
| 264 |
+
"target_modules": [
|
| 265 |
+
"all-linear"
|
| 266 |
+
],
|
| 267 |
+
"target_regex": null,
|
| 268 |
+
"target_parameters": null,
|
| 269 |
+
"modules_to_save": [],
|
| 270 |
+
"lora_rank": 64,
|
| 271 |
+
"lora_alpha": 16,
|
| 272 |
+
"lora_dropout": 0.05,
|
| 273 |
+
"lora_bias": "none",
|
| 274 |
+
"lora_dtype": null,
|
| 275 |
+
"lorap_lr_ratio": null,
|
| 276 |
+
"use_rslora": false,
|
| 277 |
+
"use_dora": false,
|
| 278 |
+
"lora_ga_batch_size": 2,
|
| 279 |
+
"lora_ga_iters": 2,
|
| 280 |
+
"lora_ga_max_length": 1024,
|
| 281 |
+
"lora_ga_direction": "ArB2r",
|
| 282 |
+
"lora_ga_scale": "stable",
|
| 283 |
+
"lora_ga_stable_gamma": 16,
|
| 284 |
+
"init_weights": true,
|
| 285 |
+
"fourier_n_frequency": 2000,
|
| 286 |
+
"fourier_scaling": 300.0,
|
| 287 |
+
"boft_block_size": 4,
|
| 288 |
+
"boft_block_num": 0,
|
| 289 |
+
"boft_n_butterfly_factor": 1,
|
| 290 |
+
"boft_dropout": 0.0,
|
| 291 |
+
"vera_rank": 256,
|
| 292 |
+
"vera_projection_prng_key": 0,
|
| 293 |
+
"vera_dropout": 0.0,
|
| 294 |
+
"vera_d_initial": 0.1,
|
| 295 |
+
"adapter_act": "gelu",
|
| 296 |
+
"adapter_length": 128,
|
| 297 |
+
"use_galore": false,
|
| 298 |
+
"galore_target_modules": null,
|
| 299 |
+
"galore_rank": 128,
|
| 300 |
+
"galore_update_proj_gap": 50,
|
| 301 |
+
"galore_scale": 1.0,
|
| 302 |
+
"galore_proj_type": "std",
|
| 303 |
+
"galore_optim_per_parameter": false,
|
| 304 |
+
"galore_with_embedding": false,
|
| 305 |
+
"galore_quantization": false,
|
| 306 |
+
"galore_proj_quant": false,
|
| 307 |
+
"galore_proj_bits": 4,
|
| 308 |
+
"galore_proj_group_size": 256,
|
| 309 |
+
"galore_cos_threshold": 0.4,
|
| 310 |
+
"galore_gamma_proj": 2,
|
| 311 |
+
"galore_queue_size": 5,
|
| 312 |
+
"adalora_target_r": 8,
|
| 313 |
+
"adalora_init_r": 12,
|
| 314 |
+
"adalora_tinit": 0,
|
| 315 |
+
"adalora_tfinal": 0,
|
| 316 |
+
"adalora_deltaT": 1,
|
| 317 |
+
"adalora_beta1": 0.85,
|
| 318 |
+
"adalora_beta2": 0.85,
|
| 319 |
+
"adalora_orth_reg_weight": 0.5,
|
| 320 |
+
"llamapro_num_new_blocks": 4,
|
| 321 |
+
"llamapro_num_groups": null,
|
| 322 |
+
"lisa_activated_layers": 0,
|
| 323 |
+
"lisa_step_interval": 20,
|
| 324 |
+
"reft_layer_key": null,
|
| 325 |
+
"reft_layers": null,
|
| 326 |
+
"reft_rank": 4,
|
| 327 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 328 |
+
"reft_args": null,
|
| 329 |
+
"swanlab_token": null,
|
| 330 |
+
"swanlab_project": "ms-swift",
|
| 331 |
+
"swanlab_workspace": null,
|
| 332 |
+
"swanlab_exp_name": null,
|
| 333 |
+
"swanlab_notification_method": null,
|
| 334 |
+
"swanlab_webhook_url": null,
|
| 335 |
+
"swanlab_secret": null,
|
| 336 |
+
"swanlab_mode": "cloud",
|
| 337 |
+
"add_version": true,
|
| 338 |
+
"create_checkpoint_symlink": false,
|
| 339 |
+
"zero_hpz_partition_size": null,
|
| 340 |
+
"deepspeed_autotp_size": null,
|
| 341 |
+
"early_stop_interval": null,
|
| 342 |
+
"rank": -1,
|
| 343 |
+
"global_world_size": 1,
|
| 344 |
+
"local_world_size": 1,
|
| 345 |
+
"model_suffix": "Nanonets-OCR2-3B",
|
| 346 |
+
"model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'mrope_section': [16, 24, 24], 'rope_type': 'default', 'type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
|
| 347 |
+
"model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7c76215fac00>, model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=['vision', 'video'])",
|
| 348 |
+
"model_dir": "/home/ab/.cache/modelscope/hub/models/nanonets/Nanonets-OCR2-3B",
|
| 349 |
+
"_val_dataset_exists": [],
|
| 350 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 351 |
+
"evaluation_strategy": "steps",
|
| 352 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/home/ab/document-parsing/output/training/v1-20260117-010840', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=0.0001, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ab/document-parsing/output/training/v1-20260117-010840/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=100, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=100.0, dataloader_num_workers=4, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ab/document-parsing/output/training/v1-20260117-010840', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='lora', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
|
| 353 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-580/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5739078c612d9264a84be014dcee923bd18a89769ff3d54d05e7bf6c600c656a
|
| 3 |
+
size 1315426955
|
output/training/v1-20260117-010840-10e/checkpoint-580/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b97d67bbbd39fd5a492faaf39d45ee3dddc989273d366f3048f720147cbb4b3
|
| 3 |
+
size 14645
|
output/training/v1-20260117-010840-10e/checkpoint-580/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a6112fa30c5dbad7af1b976693a28071346fb21ab769e7a2fde80a53c550ea0
|
| 3 |
+
size 1465
|
output/training/v1-20260117-010840-10e/checkpoint-580/trainer_state.json
ADDED
|
@@ -0,0 +1,506 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 10.0,
|
| 6 |
+
"eval_steps": 100.0,
|
| 7 |
+
"global_step": 580,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.017316017316017316,
|
| 14 |
+
"grad_norm": 0.4092565178871155,
|
| 15 |
+
"learning_rate": 3.448275862068966e-06,
|
| 16 |
+
"loss": 1.4861114025115967,
|
| 17 |
+
"step": 1,
|
| 18 |
+
"token_acc": 0.6811960725974412
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.17316017316017315,
|
| 22 |
+
"grad_norm": 0.3977337181568146,
|
| 23 |
+
"learning_rate": 3.4482758620689657e-05,
|
| 24 |
+
"loss": 1.4343115488688152,
|
| 25 |
+
"step": 10,
|
| 26 |
+
"token_acc": 0.6920024476626676
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.3463203463203463,
|
| 30 |
+
"grad_norm": 0.2495131641626358,
|
| 31 |
+
"learning_rate": 6.896551724137931e-05,
|
| 32 |
+
"loss": 1.3693717956542968,
|
| 33 |
+
"step": 20,
|
| 34 |
+
"token_acc": 0.7011260365349897
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.5194805194805194,
|
| 38 |
+
"grad_norm": 0.24984458088874817,
|
| 39 |
+
"learning_rate": 9.999918729041868e-05,
|
| 40 |
+
"loss": 1.1922229766845702,
|
| 41 |
+
"step": 30,
|
| 42 |
+
"token_acc": 0.726987948088823
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.6926406926406926,
|
| 46 |
+
"grad_norm": 0.3221384584903717,
|
| 47 |
+
"learning_rate": 9.990169410465536e-05,
|
| 48 |
+
"loss": 1.0192347526550294,
|
| 49 |
+
"step": 40,
|
| 50 |
+
"token_acc": 0.7609010955099522
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.8658008658008658,
|
| 54 |
+
"grad_norm": 0.40206295251846313,
|
| 55 |
+
"learning_rate": 9.964202208175834e-05,
|
| 56 |
+
"loss": 0.9150349617004394,
|
| 57 |
+
"step": 50,
|
| 58 |
+
"token_acc": 0.7773335965518376
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 1.0346320346320346,
|
| 62 |
+
"grad_norm": 0.20406530797481537,
|
| 63 |
+
"learning_rate": 9.922101514711866e-05,
|
| 64 |
+
"loss": 0.7742667198181152,
|
| 65 |
+
"step": 60,
|
| 66 |
+
"token_acc": 0.8123942631570925
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 1.2077922077922079,
|
| 70 |
+
"grad_norm": 1.4768069982528687,
|
| 71 |
+
"learning_rate": 9.864004155919543e-05,
|
| 72 |
+
"loss": 0.6983946800231934,
|
| 73 |
+
"step": 70,
|
| 74 |
+
"token_acc": 0.8248333138378757
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 1.380952380952381,
|
| 78 |
+
"grad_norm": 0.611409604549408,
|
| 79 |
+
"learning_rate": 9.790098946272177e-05,
|
| 80 |
+
"loss": 0.6138243198394775,
|
| 81 |
+
"step": 80,
|
| 82 |
+
"token_acc": 0.8442561143531572
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 1.554112554112554,
|
| 86 |
+
"grad_norm": 0.3051394820213318,
|
| 87 |
+
"learning_rate": 9.700626075229738e-05,
|
| 88 |
+
"loss": 0.5975491523742675,
|
| 89 |
+
"step": 90,
|
| 90 |
+
"token_acc": 0.8483123092893768
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 1.7272727272727273,
|
| 94 |
+
"grad_norm": 0.3783220648765564,
|
| 95 |
+
"learning_rate": 9.595876326631154e-05,
|
| 96 |
+
"loss": 0.5410520553588867,
|
| 97 |
+
"step": 100,
|
| 98 |
+
"token_acc": 0.8605094145609629
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 1.9004329004329006,
|
| 102 |
+
"grad_norm": 0.6039865612983704,
|
| 103 |
+
"learning_rate": 9.476190133656548e-05,
|
| 104 |
+
"loss": 0.5531170845031739,
|
| 105 |
+
"step": 110,
|
| 106 |
+
"token_acc": 0.8547892544963617
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 2.069264069264069,
|
| 110 |
+
"grad_norm": 0.5374985337257385,
|
| 111 |
+
"learning_rate": 9.341956472430801e-05,
|
| 112 |
+
"loss": 0.5079349040985107,
|
| 113 |
+
"step": 120,
|
| 114 |
+
"token_acc": 0.864488826645558
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 2.242424242424242,
|
| 118 |
+
"grad_norm": 0.364619642496109,
|
| 119 |
+
"learning_rate": 9.193611597864139e-05,
|
| 120 |
+
"loss": 0.44995865821838377,
|
| 121 |
+
"step": 130,
|
| 122 |
+
"token_acc": 0.8797397710240138
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 2.4155844155844157,
|
| 126 |
+
"grad_norm": 1.59947669506073,
|
| 127 |
+
"learning_rate": 9.031637625838265e-05,
|
| 128 |
+
"loss": 0.429323148727417,
|
| 129 |
+
"step": 140,
|
| 130 |
+
"token_acc": 0.8858490566037736
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"epoch": 2.588744588744589,
|
| 134 |
+
"grad_norm": 0.46518200635910034,
|
| 135 |
+
"learning_rate": 8.856560966345877e-05,
|
| 136 |
+
"loss": 0.4315037727355957,
|
| 137 |
+
"step": 150,
|
| 138 |
+
"token_acc": 0.8819307344821817
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"epoch": 2.761904761904762,
|
| 142 |
+
"grad_norm": 0.691148579120636,
|
| 143 |
+
"learning_rate": 8.668950612675785e-05,
|
| 144 |
+
"loss": 0.40119166374206544,
|
| 145 |
+
"step": 160,
|
| 146 |
+
"token_acc": 0.8896224924972358
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"epoch": 2.935064935064935,
|
| 150 |
+
"grad_norm": 0.3540444076061249,
|
| 151 |
+
"learning_rate": 8.469416292203747e-05,
|
| 152 |
+
"loss": 0.40500435829162595,
|
| 153 |
+
"step": 170,
|
| 154 |
+
"token_acc": 0.8917646715924161
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"epoch": 3.103896103896104,
|
| 158 |
+
"grad_norm": 0.3412817418575287,
|
| 159 |
+
"learning_rate": 8.258606484798897e-05,
|
| 160 |
+
"loss": 0.37092483043670654,
|
| 161 |
+
"step": 180,
|
| 162 |
+
"token_acc": 0.8977291233149371
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"epoch": 3.277056277056277,
|
| 166 |
+
"grad_norm": 0.34155094623565674,
|
| 167 |
+
"learning_rate": 8.037206315285843e-05,
|
| 168 |
+
"loss": 0.344103741645813,
|
| 169 |
+
"step": 190,
|
| 170 |
+
"token_acc": 0.9065206570433051
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"epoch": 3.45021645021645,
|
| 174 |
+
"grad_norm": 0.3627335727214813,
|
| 175 |
+
"learning_rate": 7.805935326811912e-05,
|
| 176 |
+
"loss": 0.3504387140274048,
|
| 177 |
+
"step": 200,
|
| 178 |
+
"token_acc": 0.9002762340096682
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 3.6233766233766236,
|
| 182 |
+
"grad_norm": 0.8141089677810669,
|
| 183 |
+
"learning_rate": 7.565545142355971e-05,
|
| 184 |
+
"loss": 0.3558197498321533,
|
| 185 |
+
"step": 210,
|
| 186 |
+
"token_acc": 0.8999160043936163
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"epoch": 3.7965367965367967,
|
| 190 |
+
"grad_norm": 0.6176502108573914,
|
| 191 |
+
"learning_rate": 7.316817021978884e-05,
|
| 192 |
+
"loss": 0.33676347732543943,
|
| 193 |
+
"step": 220,
|
| 194 |
+
"token_acc": 0.904816147992892
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"epoch": 3.9696969696969697,
|
| 198 |
+
"grad_norm": 0.49287620186805725,
|
| 199 |
+
"learning_rate": 7.060559323754435e-05,
|
| 200 |
+
"loss": 0.35226542949676515,
|
| 201 |
+
"step": 230,
|
| 202 |
+
"token_acc": 0.9020813028578615
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"epoch": 4.138528138528138,
|
| 206 |
+
"grad_norm": 0.6057422161102295,
|
| 207 |
+
"learning_rate": 6.797604876632633e-05,
|
| 208 |
+
"loss": 0.3057840585708618,
|
| 209 |
+
"step": 240,
|
| 210 |
+
"token_acc": 0.9123896645803242
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"epoch": 4.311688311688312,
|
| 214 |
+
"grad_norm": 12.585014343261719,
|
| 215 |
+
"learning_rate": 6.528808273773461e-05,
|
| 216 |
+
"loss": 0.301344108581543,
|
| 217 |
+
"step": 250,
|
| 218 |
+
"token_acc": 0.9142363149996737
|
| 219 |
+
},
|
| 220 |
+
{
|
| 221 |
+
"epoch": 4.484848484848484,
|
| 222 |
+
"grad_norm": 0.32902830839157104,
|
| 223 |
+
"learning_rate": 6.255043095147679e-05,
|
| 224 |
+
"loss": 0.2898148775100708,
|
| 225 |
+
"step": 260,
|
| 226 |
+
"token_acc": 0.9177889157552563
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"epoch": 4.658008658008658,
|
| 230 |
+
"grad_norm": 0.39732787013053894,
|
| 231 |
+
"learning_rate": 5.9771990684311544e-05,
|
| 232 |
+
"loss": 0.29072208404541017,
|
| 233 |
+
"step": 270,
|
| 234 |
+
"token_acc": 0.917258875717698
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 4.8311688311688314,
|
| 238 |
+
"grad_norm": 0.44461533427238464,
|
| 239 |
+
"learning_rate": 5.6961791774196424e-05,
|
| 240 |
+
"loss": 0.2852530241012573,
|
| 241 |
+
"step": 280,
|
| 242 |
+
"token_acc": 0.9166775180675826
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"epoch": 5.0,
|
| 246 |
+
"grad_norm": 0.35245048999786377,
|
| 247 |
+
"learning_rate": 5.4128967273616625e-05,
|
| 248 |
+
"loss": 0.3020582675933838,
|
| 249 |
+
"step": 290,
|
| 250 |
+
"token_acc": 0.9138208862720794
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"epoch": 5.1731601731601735,
|
| 254 |
+
"grad_norm": 0.36154425144195557,
|
| 255 |
+
"learning_rate": 5.128272376746972e-05,
|
| 256 |
+
"loss": 0.23758175373077392,
|
| 257 |
+
"step": 300,
|
| 258 |
+
"token_acc": 0.9282945419454031
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"epoch": 5.346320346320346,
|
| 262 |
+
"grad_norm": 0.40296199917793274,
|
| 263 |
+
"learning_rate": 4.8432311451972665e-05,
|
| 264 |
+
"loss": 0.27498042583465576,
|
| 265 |
+
"step": 310,
|
| 266 |
+
"token_acc": 0.9217681765679143
|
| 267 |
+
},
|
| 268 |
+
{
|
| 269 |
+
"epoch": 5.51948051948052,
|
| 270 |
+
"grad_norm": 0.9700812697410583,
|
| 271 |
+
"learning_rate": 4.558699407183338e-05,
|
| 272 |
+
"loss": 0.2576076745986938,
|
| 273 |
+
"step": 320,
|
| 274 |
+
"token_acc": 0.9252093233763294
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"epoch": 5.692640692640692,
|
| 278 |
+
"grad_norm": 0.4304976761341095,
|
| 279 |
+
"learning_rate": 4.2756018813390274e-05,
|
| 280 |
+
"loss": 0.2424612522125244,
|
| 281 |
+
"step": 330,
|
| 282 |
+
"token_acc": 0.9276378041152792
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"epoch": 5.865800865800866,
|
| 286 |
+
"grad_norm": 0.4652138650417328,
|
| 287 |
+
"learning_rate": 3.9948586251565825e-05,
|
| 288 |
+
"loss": 0.259202766418457,
|
| 289 |
+
"step": 340,
|
| 290 |
+
"token_acc": 0.9240967292621122
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 6.034632034632034,
|
| 294 |
+
"grad_norm": 0.37480419874191284,
|
| 295 |
+
"learning_rate": 3.7173820448305755e-05,
|
| 296 |
+
"loss": 0.2334808111190796,
|
| 297 |
+
"step": 350,
|
| 298 |
+
"token_acc": 0.9299400823867182
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"epoch": 6.207792207792208,
|
| 302 |
+
"grad_norm": 0.5389286279678345,
|
| 303 |
+
"learning_rate": 3.444073929968284e-05,
|
| 304 |
+
"loss": 0.23487865924835205,
|
| 305 |
+
"step": 360,
|
| 306 |
+
"token_acc": 0.9300512852684243
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"epoch": 6.380952380952381,
|
| 310 |
+
"grad_norm": 0.4614177942276001,
|
| 311 |
+
"learning_rate": 3.175822522803623e-05,
|
| 312 |
+
"loss": 0.21724979877471923,
|
| 313 |
+
"step": 370,
|
| 314 |
+
"token_acc": 0.9360088365243004
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"epoch": 6.554112554112554,
|
| 318 |
+
"grad_norm": 0.3773002326488495,
|
| 319 |
+
"learning_rate": 2.9134996314395818e-05,
|
| 320 |
+
"loss": 0.20992758274078369,
|
| 321 |
+
"step": 380,
|
| 322 |
+
"token_acc": 0.9362415581566618
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"epoch": 6.7272727272727275,
|
| 326 |
+
"grad_norm": 1.1898497343063354,
|
| 327 |
+
"learning_rate": 2.65795779650105e-05,
|
| 328 |
+
"loss": 0.2153007745742798,
|
| 329 |
+
"step": 390,
|
| 330 |
+
"token_acc": 0.9367496189220204
|
| 331 |
+
},
|
| 332 |
+
{
|
| 333 |
+
"epoch": 6.9004329004329,
|
| 334 |
+
"grad_norm": 0.8586929440498352,
|
| 335 |
+
"learning_rate": 2.41002752040629e-05,
|
| 336 |
+
"loss": 0.22280852794647216,
|
| 337 |
+
"step": 400,
|
| 338 |
+
"token_acc": 0.9341588229918669
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"epoch": 7.06926406926407,
|
| 342 |
+
"grad_norm": 0.5149306058883667,
|
| 343 |
+
"learning_rate": 2.1705145682618505e-05,
|
| 344 |
+
"loss": 0.21320977210998535,
|
| 345 |
+
"step": 410,
|
| 346 |
+
"token_acc": 0.9383294431477159
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 7.242424242424242,
|
| 350 |
+
"grad_norm": 0.4976541996002197,
|
| 351 |
+
"learning_rate": 1.940197349152923e-05,
|
| 352 |
+
"loss": 0.1985553979873657,
|
| 353 |
+
"step": 420,
|
| 354 |
+
"token_acc": 0.9401391309809833
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"epoch": 7.415584415584416,
|
| 358 |
+
"grad_norm": 0.4779481589794159,
|
| 359 |
+
"learning_rate": 1.7198243863398273e-05,
|
| 360 |
+
"loss": 0.20875980854034423,
|
| 361 |
+
"step": 430,
|
| 362 |
+
"token_acc": 0.9373778262148182
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"epoch": 7.588744588744589,
|
| 366 |
+
"grad_norm": 0.6022359132766724,
|
| 367 |
+
"learning_rate": 1.510111884582463e-05,
|
| 368 |
+
"loss": 0.19188997745513917,
|
| 369 |
+
"step": 440,
|
| 370 |
+
"token_acc": 0.942989444333798
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"epoch": 7.761904761904762,
|
| 374 |
+
"grad_norm": 0.497090607881546,
|
| 375 |
+
"learning_rate": 1.3117414024987823e-05,
|
| 376 |
+
"loss": 0.1933382511138916,
|
| 377 |
+
"step": 450,
|
| 378 |
+
"token_acc": 0.9423271204556436
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"epoch": 7.935064935064935,
|
| 382 |
+
"grad_norm": 0.488971084356308,
|
| 383 |
+
"learning_rate": 1.125357637522072e-05,
|
| 384 |
+
"loss": 0.1843361496925354,
|
| 385 |
+
"step": 460,
|
| 386 |
+
"token_acc": 0.9436703366987985
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"epoch": 8.103896103896103,
|
| 390 |
+
"grad_norm": 0.767144501209259,
|
| 391 |
+
"learning_rate": 9.51566330655857e-06,
|
| 392 |
+
"loss": 0.19610201120376586,
|
| 393 |
+
"step": 470,
|
| 394 |
+
"token_acc": 0.9421800227876946
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"epoch": 8.277056277056277,
|
| 398 |
+
"grad_norm": 0.4893112778663635,
|
| 399 |
+
"learning_rate": 7.909322978358913e-06,
|
| 400 |
+
"loss": 0.170158052444458,
|
| 401 |
+
"step": 480,
|
| 402 |
+
"token_acc": 0.9497098970386021
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 8.45021645021645,
|
| 406 |
+
"grad_norm": 0.5407018661499023,
|
| 407 |
+
"learning_rate": 6.439775942972609e-06,
|
| 408 |
+
"loss": 0.1650066614151001,
|
| 409 |
+
"step": 490,
|
| 410 |
+
"token_acc": 0.9508892299359032
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"epoch": 8.623376623376624,
|
| 414 |
+
"grad_norm": 0.41522547602653503,
|
| 415 |
+
"learning_rate": 5.111798179123173e-06,
|
| 416 |
+
"loss": 0.1943192720413208,
|
| 417 |
+
"step": 500,
|
| 418 |
+
"token_acc": 0.9430037937960277
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"epoch": 8.796536796536797,
|
| 422 |
+
"grad_norm": 0.5257052183151245,
|
| 423 |
+
"learning_rate": 3.929705570135711e-06,
|
| 424 |
+
"loss": 0.16702849864959718,
|
| 425 |
+
"step": 510,
|
| 426 |
+
"token_acc": 0.9501815248083905
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"epoch": 8.969696969696969,
|
| 430 |
+
"grad_norm": 0.48933619260787964,
|
| 431 |
+
"learning_rate": 2.897339877460398e-06,
|
| 432 |
+
"loss": 0.19309405088424683,
|
| 433 |
+
"step": 520,
|
| 434 |
+
"token_acc": 0.9438778813778814
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 9.13852813852814,
|
| 438 |
+
"grad_norm": 0.6073329448699951,
|
| 439 |
+
"learning_rate": 2.018056255076256e-06,
|
| 440 |
+
"loss": 0.17578216791152954,
|
| 441 |
+
"step": 530,
|
| 442 |
+
"token_acc": 0.949875481814
|
| 443 |
+
},
|
| 444 |
+
{
|
| 445 |
+
"epoch": 9.311688311688311,
|
| 446 |
+
"grad_norm": 5.020083427429199,
|
| 447 |
+
"learning_rate": 1.2947123453528886e-06,
|
| 448 |
+
"loss": 0.18189191818237305,
|
| 449 |
+
"step": 540,
|
| 450 |
+
"token_acc": 0.9454742254092816
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"epoch": 9.484848484848484,
|
| 454 |
+
"grad_norm": 0.6125385761260986,
|
| 455 |
+
"learning_rate": 7.296589918083685e-07,
|
| 456 |
+
"loss": 0.16662927865982055,
|
| 457 |
+
"step": 550,
|
| 458 |
+
"token_acc": 0.9502154609558632
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 9.658008658008658,
|
| 462 |
+
"grad_norm": 0.4245486259460449,
|
| 463 |
+
"learning_rate": 3.2473259894640894e-07,
|
| 464 |
+
"loss": 0.16942204236984254,
|
| 465 |
+
"step": 560,
|
| 466 |
+
"token_acc": 0.9508478741705578
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"epoch": 9.831168831168831,
|
| 470 |
+
"grad_norm": 1.1829816102981567,
|
| 471 |
+
"learning_rate": 8.124916400311655e-08,
|
| 472 |
+
"loss": 0.17350658178329467,
|
| 473 |
+
"step": 570,
|
| 474 |
+
"token_acc": 0.9498697127620894
|
| 475 |
+
},
|
| 476 |
+
{
|
| 477 |
+
"epoch": 10.0,
|
| 478 |
+
"grad_norm": 0.5661698579788208,
|
| 479 |
+
"learning_rate": 0.0,
|
| 480 |
+
"loss": 0.1673359751701355,
|
| 481 |
+
"step": 580,
|
| 482 |
+
"token_acc": 0.9504393101204035
|
| 483 |
+
}
|
| 484 |
+
],
|
| 485 |
+
"logging_steps": 10,
|
| 486 |
+
"max_steps": 580,
|
| 487 |
+
"num_input_tokens_seen": 0,
|
| 488 |
+
"num_train_epochs": 10,
|
| 489 |
+
"save_steps": 100,
|
| 490 |
+
"stateful_callbacks": {
|
| 491 |
+
"TrainerControl": {
|
| 492 |
+
"args": {
|
| 493 |
+
"should_epoch_stop": false,
|
| 494 |
+
"should_evaluate": false,
|
| 495 |
+
"should_log": false,
|
| 496 |
+
"should_save": true,
|
| 497 |
+
"should_training_stop": true
|
| 498 |
+
},
|
| 499 |
+
"attributes": {}
|
| 500 |
+
}
|
| 501 |
+
},
|
| 502 |
+
"total_flos": 2.229367912955904e+17,
|
| 503 |
+
"train_batch_size": 1,
|
| 504 |
+
"trial_name": null,
|
| 505 |
+
"trial_params": null
|
| 506 |
+
}
|
output/training/v1-20260117-010840-10e/checkpoint-580/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e5274be8af993948bcfc3f1251ec27de22bce224d71e604e5b270f182b3aac2
|
| 3 |
+
size 6993
|
output/training/v1-20260117-010840-10e/images/train_epoch.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_grad_norm.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_learning_rate.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_loss.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_token_acc.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_total_flos.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_train_loss.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_train_runtime.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_train_samples_per_second.png
ADDED
|
output/training/v1-20260117-010840-10e/images/train_train_steps_per_second.png
ADDED
|
output/training/v1-20260117-010840-10e/logging.jsonl
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"loss": 1.4861114, "grad_norm": 0.40925652, "learning_rate": 3.45e-06, "token_acc": 0.68119607, "epoch": 0.01731602, "global_step/max_steps": "1/580", "percentage": "0.17%", "elapsed_time": "11s", "remaining_time": "1h 52m 13s", "memory(GiB)": 20.84, "train_speed(iter/s)": 0.085986}
|
| 2 |
+
{"loss": 1.43431155, "grad_norm": 0.39773372, "learning_rate": 3.448e-05, "token_acc": 0.69200245, "epoch": 0.17316017, "global_step/max_steps": "10/580", "percentage": "1.72%", "elapsed_time": "1m 22s", "remaining_time": "1h 18m 37s", "memory(GiB)": 20.95, "train_speed(iter/s)": 0.120821}
|
| 3 |
+
{"loss": 1.3693718, "grad_norm": 0.24951316, "learning_rate": 6.897e-05, "token_acc": 0.70112604, "epoch": 0.34632035, "global_step/max_steps": "20/580", "percentage": "3.45%", "elapsed_time": "2m 44s", "remaining_time": "1h 16m 46s", "memory(GiB)": 28.17, "train_speed(iter/s)": 0.121558}
|
| 4 |
+
{"loss": 1.19222298, "grad_norm": 0.24984458, "learning_rate": 0.0001, "token_acc": 0.72698795, "epoch": 0.51948052, "global_step/max_steps": "30/580", "percentage": "5.17%", "elapsed_time": "4m 3s", "remaining_time": "1h 14m 23s", "memory(GiB)": 28.17, "train_speed(iter/s)": 0.123231}
|
| 5 |
+
{"loss": 1.01923475, "grad_norm": 0.32213846, "learning_rate": 9.99e-05, "token_acc": 0.7609011, "epoch": 0.69264069, "global_step/max_steps": "40/580", "percentage": "6.90%", "elapsed_time": "5m 23s", "remaining_time": "1h 12m 49s", "memory(GiB)": 28.66, "train_speed(iter/s)": 0.123595}
|
| 6 |
+
{"loss": 0.91503496, "grad_norm": 0.40206295, "learning_rate": 9.964e-05, "token_acc": 0.7773336, "epoch": 0.86580087, "global_step/max_steps": "50/580", "percentage": "8.62%", "elapsed_time": "6m 41s", "remaining_time": "1h 10m 55s", "memory(GiB)": 28.67, "train_speed(iter/s)": 0.124536}
|
| 7 |
+
{"loss": 0.77426672, "grad_norm": 0.20406531, "learning_rate": 9.922e-05, "token_acc": 0.81239426, "epoch": 1.03463203, "global_step/max_steps": "60/580", "percentage": "10.34%", "elapsed_time": "8m 2s", "remaining_time": "1h 9m 38s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124458}
|
| 8 |
+
{"loss": 0.69839468, "grad_norm": 1.476807, "learning_rate": 9.864e-05, "token_acc": 0.82483331, "epoch": 1.20779221, "global_step/max_steps": "70/580", "percentage": "12.07%", "elapsed_time": "9m 22s", "remaining_time": "1h 8m 14s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12455}
|
| 9 |
+
{"loss": 0.61382432, "grad_norm": 0.6114096, "learning_rate": 9.79e-05, "token_acc": 0.84425611, "epoch": 1.38095238, "global_step/max_steps": "80/580", "percentage": "13.79%", "elapsed_time": "10m 42s", "remaining_time": "1h 6m 57s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124452}
|
| 10 |
+
{"loss": 0.59754915, "grad_norm": 0.30513948, "learning_rate": 9.701e-05, "token_acc": 0.84831231, "epoch": 1.55411255, "global_step/max_steps": "90/580", "percentage": "15.52%", "elapsed_time": "12m 4s", "remaining_time": "1h 5m 42s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124286}
|
| 11 |
+
{"loss": 0.54105206, "grad_norm": 0.37832206, "learning_rate": 9.596e-05, "token_acc": 0.86050941, "epoch": 1.72727273, "global_step/max_steps": "100/580", "percentage": "17.24%", "elapsed_time": "13m 22s", "remaining_time": "1h 4m 11s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124632}
|
| 12 |
+
{"loss": 0.55311708, "grad_norm": 0.60398656, "learning_rate": 9.476e-05, "token_acc": 0.85478925, "epoch": 1.9004329, "global_step/max_steps": "110/580", "percentage": "18.97%", "elapsed_time": "14m 43s", "remaining_time": "1h 2m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124471}
|
| 13 |
+
{"loss": 0.5079349, "grad_norm": 0.53749853, "learning_rate": 9.342e-05, "token_acc": 0.86448883, "epoch": 2.06926407, "global_step/max_steps": "120/580", "percentage": "20.69%", "elapsed_time": "16m 2s", "remaining_time": "1h 1m 30s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124655}
|
| 14 |
+
{"loss": 0.44995866, "grad_norm": 0.36461964, "learning_rate": 9.194e-05, "token_acc": 0.87973977, "epoch": 2.24242424, "global_step/max_steps": "130/580", "percentage": "22.41%", "elapsed_time": "17m 21s", "remaining_time": "1h 0m 5s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124797}
|
| 15 |
+
{"loss": 0.42932315, "grad_norm": 1.5994767, "learning_rate": 9.032e-05, "token_acc": 0.88584906, "epoch": 2.41558442, "global_step/max_steps": "140/580", "percentage": "24.14%", "elapsed_time": "18m 44s", "remaining_time": "58m 53s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124527}
|
| 16 |
+
{"loss": 0.43150377, "grad_norm": 0.46518201, "learning_rate": 8.857e-05, "token_acc": 0.88193073, "epoch": 2.58874459, "global_step/max_steps": "150/580", "percentage": "25.86%", "elapsed_time": "20m 3s", "remaining_time": "57m 31s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124597}
|
| 17 |
+
{"loss": 0.40119166, "grad_norm": 0.69114858, "learning_rate": 8.669e-05, "token_acc": 0.88962249, "epoch": 2.76190476, "global_step/max_steps": "160/580", "percentage": "27.59%", "elapsed_time": "21m 21s", "remaining_time": "56m 5s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124813}
|
| 18 |
+
{"loss": 0.40500436, "grad_norm": 0.35404441, "learning_rate": 8.469e-05, "token_acc": 0.89176467, "epoch": 2.93506494, "global_step/max_steps": "170/580", "percentage": "29.31%", "elapsed_time": "22m 42s", "remaining_time": "54m 47s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124732}
|
| 19 |
+
{"loss": 0.37092483, "grad_norm": 0.34128174, "learning_rate": 8.259e-05, "token_acc": 0.89772912, "epoch": 3.1038961, "global_step/max_steps": "180/580", "percentage": "31.03%", "elapsed_time": "23m 59s", "remaining_time": "53m 19s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125023}
|
| 20 |
+
{"loss": 0.34410374, "grad_norm": 0.34155095, "learning_rate": 8.037e-05, "token_acc": 0.90652066, "epoch": 3.27705628, "global_step/max_steps": "190/580", "percentage": "32.76%", "elapsed_time": "25m 22s", "remaining_time": "52m 5s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124775}
|
| 21 |
+
{"loss": 0.35043871, "grad_norm": 0.36273357, "learning_rate": 7.806e-05, "token_acc": 0.90027623, "epoch": 3.45021645, "global_step/max_steps": "200/580", "percentage": "34.48%", "elapsed_time": "26m 40s", "remaining_time": "50m 41s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124947}
|
| 22 |
+
{"loss": 0.35581975, "grad_norm": 0.81410897, "learning_rate": 7.566e-05, "token_acc": 0.899916, "epoch": 3.62337662, "global_step/max_steps": "210/580", "percentage": "36.21%", "elapsed_time": "28m 0s", "remaining_time": "49m 20s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124974}
|
| 23 |
+
{"loss": 0.33676348, "grad_norm": 0.61765021, "learning_rate": 7.317e-05, "token_acc": 0.90481615, "epoch": 3.7965368, "global_step/max_steps": "220/580", "percentage": "37.93%", "elapsed_time": "29m 21s", "remaining_time": "48m 1s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124928}
|
| 24 |
+
{"loss": 0.35226543, "grad_norm": 0.4928762, "learning_rate": 7.061e-05, "token_acc": 0.9020813, "epoch": 3.96969697, "global_step/max_steps": "230/580", "percentage": "39.66%", "elapsed_time": "30m 40s", "remaining_time": "46m 40s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124975}
|
| 25 |
+
{"loss": 0.30578406, "grad_norm": 0.60574222, "learning_rate": 6.798e-05, "token_acc": 0.91238966, "epoch": 4.13852814, "global_step/max_steps": "240/580", "percentage": "41.38%", "elapsed_time": "32m 0s", "remaining_time": "45m 21s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.124951}
|
| 26 |
+
{"loss": 0.30134411, "grad_norm": 12.58501434, "learning_rate": 6.529e-05, "token_acc": 0.91423631, "epoch": 4.31168831, "global_step/max_steps": "250/580", "percentage": "43.10%", "elapsed_time": "33m 18s", "remaining_time": "43m 58s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125065}
|
| 27 |
+
{"loss": 0.28981488, "grad_norm": 0.32902831, "learning_rate": 6.255e-05, "token_acc": 0.91778892, "epoch": 4.48484848, "global_step/max_steps": "260/580", "percentage": "44.83%", "elapsed_time": "34m 36s", "remaining_time": "42m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125195}
|
| 28 |
+
{"loss": 0.29072208, "grad_norm": 0.39732787, "learning_rate": 5.977e-05, "token_acc": 0.91725888, "epoch": 4.65800866, "global_step/max_steps": "270/580", "percentage": "46.55%", "elapsed_time": "35m 57s", "remaining_time": "41m 17s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12514}
|
| 29 |
+
{"loss": 0.28525302, "grad_norm": 0.44461533, "learning_rate": 5.696e-05, "token_acc": 0.91667752, "epoch": 4.83116883, "global_step/max_steps": "280/580", "percentage": "48.28%", "elapsed_time": "37m 17s", "remaining_time": "39m 56s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125165}
|
| 30 |
+
{"loss": 0.30205827, "grad_norm": 0.35245049, "learning_rate": 5.413e-05, "token_acc": 0.91382089, "epoch": 5.0, "global_step/max_steps": "290/580", "percentage": "50.00%", "elapsed_time": "38m 35s", "remaining_time": "38m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125245}
|
| 31 |
+
{"loss": 0.23758175, "grad_norm": 0.36154425, "learning_rate": 5.128e-05, "token_acc": 0.92829454, "epoch": 5.17316017, "global_step/max_steps": "300/580", "percentage": "51.72%", "elapsed_time": "39m 53s", "remaining_time": "37m 14s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12533}
|
| 32 |
+
{"loss": 0.27498043, "grad_norm": 0.402962, "learning_rate": 4.843e-05, "token_acc": 0.92176818, "epoch": 5.34632035, "global_step/max_steps": "310/580", "percentage": "53.45%", "elapsed_time": "41m 13s", "remaining_time": "35m 54s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125338}
|
| 33 |
+
{"loss": 0.25760767, "grad_norm": 0.97008127, "learning_rate": 4.559e-05, "token_acc": 0.92520932, "epoch": 5.51948052, "global_step/max_steps": "320/580", "percentage": "55.17%", "elapsed_time": "42m 36s", "remaining_time": "34m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125186}
|
| 34 |
+
{"loss": 0.24246125, "grad_norm": 0.43049768, "learning_rate": 4.276e-05, "token_acc": 0.9276378, "epoch": 5.69264069, "global_step/max_steps": "330/580", "percentage": "56.90%", "elapsed_time": "43m 55s", "remaining_time": "33m 16s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125224}
|
| 35 |
+
{"loss": 0.25920277, "grad_norm": 0.46521387, "learning_rate": 3.995e-05, "token_acc": 0.92409673, "epoch": 5.86580087, "global_step/max_steps": "340/580", "percentage": "58.62%", "elapsed_time": "45m 13s", "remaining_time": "31m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125279}
|
| 36 |
+
{"loss": 0.23348081, "grad_norm": 0.3748042, "learning_rate": 3.717e-05, "token_acc": 0.92994008, "epoch": 6.03463203, "global_step/max_steps": "350/580", "percentage": "60.34%", "elapsed_time": "46m 33s", "remaining_time": "30m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125308}
|
| 37 |
+
{"loss": 0.23487866, "grad_norm": 0.53892863, "learning_rate": 3.444e-05, "token_acc": 0.93005129, "epoch": 6.20779221, "global_step/max_steps": "360/580", "percentage": "62.07%", "elapsed_time": "47m 52s", "remaining_time": "29m 15s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125347}
|
| 38 |
+
{"loss": 0.2172498, "grad_norm": 0.46141779, "learning_rate": 3.176e-05, "token_acc": 0.93600884, "epoch": 6.38095238, "global_step/max_steps": "370/580", "percentage": "63.79%", "elapsed_time": "49m 13s", "remaining_time": "27m 56s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.12527}
|
| 39 |
+
{"loss": 0.20992758, "grad_norm": 0.37730023, "learning_rate": 2.913e-05, "token_acc": 0.93624156, "epoch": 6.55411255, "global_step/max_steps": "380/580", "percentage": "65.52%", "elapsed_time": "50m 31s", "remaining_time": "26m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125335}
|
| 40 |
+
{"loss": 0.21530077, "grad_norm": 1.18984973, "learning_rate": 2.658e-05, "token_acc": 0.93674962, "epoch": 6.72727273, "global_step/max_steps": "390/580", "percentage": "67.24%", "elapsed_time": "51m 50s", "remaining_time": "25m 15s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125393}
|
| 41 |
+
{"loss": 0.22280853, "grad_norm": 0.85869294, "learning_rate": 2.41e-05, "token_acc": 0.93415882, "epoch": 6.9004329, "global_step/max_steps": "400/580", "percentage": "68.97%", "elapsed_time": "53m 10s", "remaining_time": "23m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125373}
|
| 42 |
+
{"loss": 0.21320977, "grad_norm": 0.51493061, "learning_rate": 2.171e-05, "token_acc": 0.93832944, "epoch": 7.06926407, "global_step/max_steps": "410/580", "percentage": "70.69%", "elapsed_time": "54m 30s", "remaining_time": "22m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125348}
|
| 43 |
+
{"loss": 0.1985554, "grad_norm": 0.4976542, "learning_rate": 1.94e-05, "token_acc": 0.94013913, "epoch": 7.24242424, "global_step/max_steps": "420/580", "percentage": "72.41%", "elapsed_time": "55m 49s", "remaining_time": "21m 15s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125401}
|
| 44 |
+
{"loss": 0.20875981, "grad_norm": 0.47794816, "learning_rate": 1.72e-05, "token_acc": 0.93737783, "epoch": 7.41558442, "global_step/max_steps": "430/580", "percentage": "74.14%", "elapsed_time": "57m 8s", "remaining_time": "19m 55s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125431}
|
| 45 |
+
{"loss": 0.19188998, "grad_norm": 0.60223591, "learning_rate": 1.51e-05, "token_acc": 0.94298944, "epoch": 7.58874459, "global_step/max_steps": "440/580", "percentage": "75.86%", "elapsed_time": "58m 26s", "remaining_time": "18m 35s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125498}
|
| 46 |
+
{"loss": 0.19333825, "grad_norm": 0.49709061, "learning_rate": 1.312e-05, "token_acc": 0.94232712, "epoch": 7.76190476, "global_step/max_steps": "450/580", "percentage": "77.59%", "elapsed_time": "59m 48s", "remaining_time": "17m 16s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125417}
|
| 47 |
+
{"loss": 0.18433615, "grad_norm": 0.48897108, "learning_rate": 1.125e-05, "token_acc": 0.94367034, "epoch": 7.93506494, "global_step/max_steps": "460/580", "percentage": "79.31%", "elapsed_time": "1h 1m 6s", "remaining_time": "15m 56s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125455}
|
| 48 |
+
{"loss": 0.19610201, "grad_norm": 0.7671445, "learning_rate": 9.52e-06, "token_acc": 0.94218002, "epoch": 8.1038961, "global_step/max_steps": "470/580", "percentage": "81.03%", "elapsed_time": "1h 2m 26s", "remaining_time": "14m 36s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125464}
|
| 49 |
+
{"loss": 0.17015805, "grad_norm": 0.48931128, "learning_rate": 7.91e-06, "token_acc": 0.9497099, "epoch": 8.27705628, "global_step/max_steps": "480/580", "percentage": "82.76%", "elapsed_time": "1h 3m 44s", "remaining_time": "13m 16s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125511}
|
| 50 |
+
{"loss": 0.16500666, "grad_norm": 0.54070187, "learning_rate": 6.44e-06, "token_acc": 0.95088923, "epoch": 8.45021645, "global_step/max_steps": "490/580", "percentage": "84.48%", "elapsed_time": "1h 5m 4s", "remaining_time": "11m 57s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125482}
|
| 51 |
+
{"loss": 0.19431927, "grad_norm": 0.41522548, "learning_rate": 5.11e-06, "token_acc": 0.94300379, "epoch": 8.62337662, "global_step/max_steps": "500/580", "percentage": "86.21%", "elapsed_time": "1h 6m 26s", "remaining_time": "10m 37s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125411}
|
| 52 |
+
{"loss": 0.1670285, "grad_norm": 0.52570522, "learning_rate": 3.93e-06, "token_acc": 0.95018152, "epoch": 8.7965368, "global_step/max_steps": "510/580", "percentage": "87.93%", "elapsed_time": "1h 7m 47s", "remaining_time": "9m 18s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125396}
|
| 53 |
+
{"loss": 0.19309405, "grad_norm": 0.48933619, "learning_rate": 2.9e-06, "token_acc": 0.94387788, "epoch": 8.96969697, "global_step/max_steps": "520/580", "percentage": "89.66%", "elapsed_time": "1h 9m 6s", "remaining_time": "7m 58s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125407}
|
| 54 |
+
{"loss": 0.17578217, "grad_norm": 0.60733294, "learning_rate": 2.02e-06, "token_acc": 0.94987548, "epoch": 9.13852814, "global_step/max_steps": "530/580", "percentage": "91.38%", "elapsed_time": "1h 10m 29s", "remaining_time": "6m 39s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125309}
|
| 55 |
+
{"loss": 0.18189192, "grad_norm": 5.02008343, "learning_rate": 1.29e-06, "token_acc": 0.94547423, "epoch": 9.31168831, "global_step/max_steps": "540/580", "percentage": "93.10%", "elapsed_time": "1h 11m 48s", "remaining_time": "5m 19s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125319}
|
| 56 |
+
{"loss": 0.16662928, "grad_norm": 0.61253858, "learning_rate": 7.3e-07, "token_acc": 0.95021546, "epoch": 9.48484848, "global_step/max_steps": "550/580", "percentage": "94.83%", "elapsed_time": "1h 13m 7s", "remaining_time": "3m 59s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125367}
|
| 57 |
+
{"loss": 0.16942204, "grad_norm": 0.42454863, "learning_rate": 3.2e-07, "token_acc": 0.95084787, "epoch": 9.65800866, "global_step/max_steps": "560/580", "percentage": "96.55%", "elapsed_time": "1h 14m 25s", "remaining_time": "2m 39s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125401}
|
| 58 |
+
{"loss": 0.17350658, "grad_norm": 1.18298161, "learning_rate": 8e-08, "token_acc": 0.94986971, "epoch": 9.83116883, "global_step/max_steps": "570/580", "percentage": "98.28%", "elapsed_time": "1h 15m 44s", "remaining_time": "1m 19s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125429}
|
| 59 |
+
{"loss": 0.16733598, "grad_norm": 0.56616986, "learning_rate": 0.0, "token_acc": 0.95043931, "epoch": 10.0, "global_step/max_steps": "580/580", "percentage": "100.00%", "elapsed_time": "1h 17m 0s", "remaining_time": "0s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125516}
|
| 60 |
+
{"train_runtime": 4622.8006, "train_samples_per_second": 0.999, "train_steps_per_second": 0.125, "total_flos": 2.229367912955904e+17, "train_loss": 0.3817175, "epoch": 10.0, "global_step/max_steps": "580/580", "percentage": "100.00%", "elapsed_time": "1h 17m 2s", "remaining_time": "0s", "memory(GiB)": 29.8, "train_speed(iter/s)": 0.125465}
|
| 61 |
+
{"model_parameter_info": "PeftModelForCausalLM: 3918.9627M Params (164.3397M Trainable [4.1934%]), 0.0024M Buffers.", "last_model_checkpoint": "/home/ab/document-parsing/output/training/v1-20260117-010840/checkpoint-580", "best_model_checkpoint": null, "best_metric": null, "global_step": 580, "log_history": [{"loss": 1.4861114025115967, "grad_norm": 0.4092565178871155, "learning_rate": 3.448275862068966e-06, "token_acc": 0.6811960725974412, "epoch": 0.017316017316017316, "step": 1}, {"loss": 1.4343115488688152, "grad_norm": 0.3977337181568146, "learning_rate": 3.4482758620689657e-05, "token_acc": 0.6920024476626676, "epoch": 0.17316017316017315, "step": 10}, {"loss": 1.3693717956542968, "grad_norm": 0.2495131641626358, "learning_rate": 6.896551724137931e-05, "token_acc": 0.7011260365349897, "epoch": 0.3463203463203463, "step": 20}, {"loss": 1.1922229766845702, "grad_norm": 0.24984458088874817, "learning_rate": 9.999918729041868e-05, "token_acc": 0.726987948088823, "epoch": 0.5194805194805194, "step": 30}, {"loss": 1.0192347526550294, "grad_norm": 0.3221384584903717, "learning_rate": 9.990169410465536e-05, "token_acc": 0.7609010955099522, "epoch": 0.6926406926406926, "step": 40}, {"loss": 0.9150349617004394, "grad_norm": 0.40206295251846313, "learning_rate": 9.964202208175834e-05, "token_acc": 0.7773335965518376, "epoch": 0.8658008658008658, "step": 50}, {"loss": 0.7742667198181152, "grad_norm": 0.20406530797481537, "learning_rate": 9.922101514711866e-05, "token_acc": 0.8123942631570925, "epoch": 1.0346320346320346, "step": 60}, {"loss": 0.6983946800231934, "grad_norm": 1.4768069982528687, "learning_rate": 9.864004155919543e-05, "token_acc": 0.8248333138378757, "epoch": 1.2077922077922079, "step": 70}, {"loss": 0.6138243198394775, "grad_norm": 0.611409604549408, "learning_rate": 9.790098946272177e-05, "token_acc": 0.8442561143531572, "epoch": 1.380952380952381, "step": 80}, {"loss": 0.5975491523742675, "grad_norm": 0.3051394820213318, "learning_rate": 9.700626075229738e-05, "token_acc": 0.8483123092893768, "epoch": 1.554112554112554, "step": 90}, {"loss": 0.5410520553588867, "grad_norm": 0.3783220648765564, "learning_rate": 9.595876326631154e-05, "token_acc": 0.8605094145609629, "epoch": 1.7272727272727273, "step": 100}, {"loss": 0.5531170845031739, "grad_norm": 0.6039865612983704, "learning_rate": 9.476190133656548e-05, "token_acc": 0.8547892544963617, "epoch": 1.9004329004329006, "step": 110}, {"loss": 0.5079349040985107, "grad_norm": 0.5374985337257385, "learning_rate": 9.341956472430801e-05, "token_acc": 0.864488826645558, "epoch": 2.069264069264069, "step": 120}, {"loss": 0.44995865821838377, "grad_norm": 0.364619642496109, "learning_rate": 9.193611597864139e-05, "token_acc": 0.8797397710240138, "epoch": 2.242424242424242, "step": 130}, {"loss": 0.429323148727417, "grad_norm": 1.59947669506073, "learning_rate": 9.031637625838265e-05, "token_acc": 0.8858490566037736, "epoch": 2.4155844155844157, "step": 140}, {"loss": 0.4315037727355957, "grad_norm": 0.46518200635910034, "learning_rate": 8.856560966345877e-05, "token_acc": 0.8819307344821817, "epoch": 2.588744588744589, "step": 150}, {"loss": 0.40119166374206544, "grad_norm": 0.691148579120636, "learning_rate": 8.668950612675785e-05, "token_acc": 0.8896224924972358, "epoch": 2.761904761904762, "step": 160}, {"loss": 0.40500435829162595, "grad_norm": 0.3540444076061249, "learning_rate": 8.469416292203747e-05, "token_acc": 0.8917646715924161, "epoch": 2.935064935064935, "step": 170}, {"loss": 0.37092483043670654, "grad_norm": 0.3412817418575287, "learning_rate": 8.258606484798897e-05, "token_acc": 0.8977291233149371, "epoch": 3.103896103896104, "step": 180}, {"loss": 0.344103741645813, "grad_norm": 0.34155094623565674, "learning_rate": 8.037206315285843e-05, "token_acc": 0.9065206570433051, "epoch": 3.277056277056277, "step": 190}, {"loss": 0.3504387140274048, "grad_norm": 0.3627335727214813, "learning_rate": 7.805935326811912e-05, "token_acc": 0.9002762340096682, "epoch": 3.45021645021645, "step": 200}, {"loss": 0.3558197498321533, "grad_norm": 0.8141089677810669, "learning_rate": 7.565545142355971e-05, "token_acc": 0.8999160043936163, "epoch": 3.6233766233766236, "step": 210}, {"loss": 0.33676347732543943, "grad_norm": 0.6176502108573914, "learning_rate": 7.316817021978884e-05, "token_acc": 0.904816147992892, "epoch": 3.7965367965367967, "step": 220}, {"loss": 0.35226542949676515, "grad_norm": 0.49287620186805725, "learning_rate": 7.060559323754435e-05, "token_acc": 0.9020813028578615, "epoch": 3.9696969696969697, "step": 230}, {"loss": 0.3057840585708618, "grad_norm": 0.6057422161102295, "learning_rate": 6.797604876632633e-05, "token_acc": 0.9123896645803242, "epoch": 4.138528138528138, "step": 240}, {"loss": 0.301344108581543, "grad_norm": 12.585014343261719, "learning_rate": 6.528808273773461e-05, "token_acc": 0.9142363149996737, "epoch": 4.311688311688312, "step": 250}, {"loss": 0.2898148775100708, "grad_norm": 0.32902830839157104, "learning_rate": 6.255043095147679e-05, "token_acc": 0.9177889157552563, "epoch": 4.484848484848484, "step": 260}, {"loss": 0.29072208404541017, "grad_norm": 0.39732787013053894, "learning_rate": 5.9771990684311544e-05, "token_acc": 0.917258875717698, "epoch": 4.658008658008658, "step": 270}, {"loss": 0.2852530241012573, "grad_norm": 0.44461533427238464, "learning_rate": 5.6961791774196424e-05, "token_acc": 0.9166775180675826, "epoch": 4.8311688311688314, "step": 280}, {"loss": 0.3020582675933838, "grad_norm": 0.35245048999786377, "learning_rate": 5.4128967273616625e-05, "token_acc": 0.9138208862720794, "epoch": 5.0, "step": 290}, {"loss": 0.23758175373077392, "grad_norm": 0.36154425144195557, "learning_rate": 5.128272376746972e-05, "token_acc": 0.9282945419454031, "epoch": 5.1731601731601735, "step": 300}, {"loss": 0.27498042583465576, "grad_norm": 0.40296199917793274, "learning_rate": 4.8432311451972665e-05, "token_acc": 0.9217681765679143, "epoch": 5.346320346320346, "step": 310}, {"loss": 0.2576076745986938, "grad_norm": 0.9700812697410583, "learning_rate": 4.558699407183338e-05, "token_acc": 0.9252093233763294, "epoch": 5.51948051948052, "step": 320}, {"loss": 0.2424612522125244, "grad_norm": 0.4304976761341095, "learning_rate": 4.2756018813390274e-05, "token_acc": 0.9276378041152792, "epoch": 5.692640692640692, "step": 330}, {"loss": 0.259202766418457, "grad_norm": 0.4652138650417328, "learning_rate": 3.9948586251565825e-05, "token_acc": 0.9240967292621122, "epoch": 5.865800865800866, "step": 340}, {"loss": 0.2334808111190796, "grad_norm": 0.37480419874191284, "learning_rate": 3.7173820448305755e-05, "token_acc": 0.9299400823867182, "epoch": 6.034632034632034, "step": 350}, {"loss": 0.23487865924835205, "grad_norm": 0.5389286279678345, "learning_rate": 3.444073929968284e-05, "token_acc": 0.9300512852684243, "epoch": 6.207792207792208, "step": 360}, {"loss": 0.21724979877471923, "grad_norm": 0.4614177942276001, "learning_rate": 3.175822522803623e-05, "token_acc": 0.9360088365243004, "epoch": 6.380952380952381, "step": 370}, {"loss": 0.20992758274078369, "grad_norm": 0.3773002326488495, "learning_rate": 2.9134996314395818e-05, "token_acc": 0.9362415581566618, "epoch": 6.554112554112554, "step": 380}, {"loss": 0.2153007745742798, "grad_norm": 1.1898497343063354, "learning_rate": 2.65795779650105e-05, "token_acc": 0.9367496189220204, "epoch": 6.7272727272727275, "step": 390}, {"loss": 0.22280852794647216, "grad_norm": 0.8586929440498352, "learning_rate": 2.41002752040629e-05, "token_acc": 0.9341588229918669, "epoch": 6.9004329004329, "step": 400}, {"loss": 0.21320977210998535, "grad_norm": 0.5149306058883667, "learning_rate": 2.1705145682618505e-05, "token_acc": 0.9383294431477159, "epoch": 7.06926406926407, "step": 410}, {"loss": 0.1985553979873657, "grad_norm": 0.4976541996002197, "learning_rate": 1.940197349152923e-05, "token_acc": 0.9401391309809833, "epoch": 7.242424242424242, "step": 420}, {"loss": 0.20875980854034423, "grad_norm": 0.4779481589794159, "learning_rate": 1.7198243863398273e-05, "token_acc": 0.9373778262148182, "epoch": 7.415584415584416, "step": 430}, {"loss": 0.19188997745513917, "grad_norm": 0.6022359132766724, "learning_rate": 1.510111884582463e-05, "token_acc": 0.942989444333798, "epoch": 7.588744588744589, "step": 440}, {"loss": 0.1933382511138916, "grad_norm": 0.497090607881546, "learning_rate": 1.3117414024987823e-05, "token_acc": 0.9423271204556436, "epoch": 7.761904761904762, "step": 450}, {"loss": 0.1843361496925354, "grad_norm": 0.488971084356308, "learning_rate": 1.125357637522072e-05, "token_acc": 0.9436703366987985, "epoch": 7.935064935064935, "step": 460}, {"loss": 0.19610201120376586, "grad_norm": 0.767144501209259, "learning_rate": 9.51566330655857e-06, "token_acc": 0.9421800227876946, "epoch": 8.103896103896103, "step": 470}, {"loss": 0.170158052444458, "grad_norm": 0.4893112778663635, "learning_rate": 7.909322978358913e-06, "token_acc": 0.9497098970386021, "epoch": 8.277056277056277, "step": 480}, {"loss": 0.1650066614151001, "grad_norm": 0.5407018661499023, "learning_rate": 6.439775942972609e-06, "token_acc": 0.9508892299359032, "epoch": 8.45021645021645, "step": 490}, {"loss": 0.1943192720413208, "grad_norm": 0.41522547602653503, "learning_rate": 5.111798179123173e-06, "token_acc": 0.9430037937960277, "epoch": 8.623376623376624, "step": 500}, {"loss": 0.16702849864959718, "grad_norm": 0.5257052183151245, "learning_rate": 3.929705570135711e-06, "token_acc": 0.9501815248083905, "epoch": 8.796536796536797, "step": 510}, {"loss": 0.19309405088424683, "grad_norm": 0.48933619260787964, "learning_rate": 2.897339877460398e-06, "token_acc": 0.9438778813778814, "epoch": 8.969696969696969, "step": 520}, {"loss": 0.17578216791152954, "grad_norm": 0.6073329448699951, "learning_rate": 2.018056255076256e-06, "token_acc": 0.949875481814, "epoch": 9.13852813852814, "step": 530}, {"loss": 0.18189191818237305, "grad_norm": 5.020083427429199, "learning_rate": 1.2947123453528886e-06, "token_acc": 0.9454742254092816, "epoch": 9.311688311688311, "step": 540}, {"loss": 0.16662927865982055, "grad_norm": 0.6125385761260986, "learning_rate": 7.296589918083685e-07, "token_acc": 0.9502154609558632, "epoch": 9.484848484848484, "step": 550}, {"loss": 0.16942204236984254, "grad_norm": 0.4245486259460449, "learning_rate": 3.2473259894640894e-07, "token_acc": 0.9508478741705578, "epoch": 9.658008658008658, "step": 560}, {"loss": 0.17350658178329467, "grad_norm": 1.1829816102981567, "learning_rate": 8.124916400311655e-08, "token_acc": 0.9498697127620894, "epoch": 9.831168831168831, "step": 570}, {"loss": 0.1673359751701355, "grad_norm": 0.5661698579788208, "learning_rate": 0.0, "token_acc": 0.9504393101204035, "epoch": 10.0, "step": 580}, {"train_runtime": 4622.8006, "train_samples_per_second": 0.999, "train_steps_per_second": 0.125, "total_flos": 2.229367912955904e+17, "train_loss": 0.3817174964937671, "epoch": 10.0, "step": 580}], "memory": 29.8046875}
|
output/training/v1-20260117-010840-10e/runs/events.out.tfevents.1768612131.5090.2113421.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b70de2b89c6fd0f0eae0667654df5c6c822d5e1f96e7052470d4c55216928190
|
| 3 |
+
size 25008
|