JacksonBrune committed 12 days ago

Commit

fc34e18

verified ·

1 Parent(s): 58b5a67

Upload folder using huggingface_hub

Browse files

Files changed (21) hide show

.gitattributes +1 -0
README.md +159 -0
added_tokens.json +28 -0
args.json +353 -0
chat_template.jinja +61 -0
config.json +68 -0
generation_config.json +13 -0
merges.txt +0 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +406 -0
optimizer.pt +3 -0
rng_state_0.pth +3 -0
rng_state_1.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +239 -0
trainer_state.json +1394 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,159 @@

+---
+language:
+- en
+license: apache-2.0
+tags:
+- finance
+- earnings-call
+- evasion-detection
+- qwen3
+- text-classification
+base_model: Qwen/Qwen3-4B-Instruct-2507
+datasets:
+- earnings-call-qa
+metrics:
+- accuracy
+- f1
+model-index:
+- name: Qwen3-4B-Evasion
+  results:
+  - task:
+      type: text-classification
+      name: Evasion Classification
+    metrics:
+    - type: accuracy
+      value: 0.7508
+      name: Accuracy
+    - type: f1
+      value: 0.7475
+      name: Weighted F1
+library_name: transformers
+# The checkpoint is a causal LM (Qwen3ForCausalLM) that emits its label as generated
+# JSON text, so it must be loaded through the text-generation pipeline; the
+# "text-classification" tag above is kept for discoverability only.
+pipeline_tag: text-generation
+---
+# Qwen3-4B-Evasion
+A fine-tuned model for detecting evasion levels in earnings call Q&A responses.
+## Model Description
+**Qwen3-4B-Evasion** is a specialized model fine-tuned from [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507) for analyzing executive responses during earnings call Q&A sessions. The model classifies responses into three evasion categories based on the Rasiah taxonomy.
+## Intended Use
+### Primary Use Case
+- Analyze transparency and directness of executive responses in earnings calls
+- Financial discourse analysis
+- Corporate communication research
+### Classification Categories
+- **direct**: Clear, on-topic resolution to the question
+- **intermediate**: Partially responsive, incomplete, or softened answer
+- **fully_evasive**: Does not provide requested information
+## Training Details
+### Training Data
+- **Dataset**: 27,097 earnings call Q&A pairs
+- **Source**: Annotated by DeepSeek-V3.2 and Qwen3-Max models
+- **Label Distribution**:
+  - intermediate: 45.4%
+  - direct: 29.8%
+  - fully_evasive: 24.9%
+### Training Configuration
+- **Base Model**: Qwen/Qwen3-4B-Instruct-2507
+- **Training Type**: Full parameter fine-tuning
+- **Hardware**: 2x NVIDIA B200 GPUs
+- **Epochs**: 2
+- **Batch Size**: 32 (effective)
+- **Learning Rate**: 2e-5
+- **Framework**: MS-SWIFT
+## Performance
+Evaluated on 297 human-annotated benchmark samples:
+| Metric | Score |
+|--------|-------|
+| **Overall Accuracy** | 75.08% |
+| **Weighted F1** | 74.75% |
+| **Weighted Precision** | 77.56% |
+| **Weighted Recall** | 75.08% |
+### Per-Class Performance
+| Class | Precision | Recall | F1-Score | Support |
+|-------|-----------|--------|----------|---------|
+| direct | 86.67% | 54.74% | 67.10% | 95 |
+| intermediate | 63.12% | 80.91% | 70.92% | 110 |
+| fully_evasive | 85.42% | 89.13% | 87.23% | 92 |
+## Usage
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "FutureMa/Qwen3-4B-Evasion"
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Prepare input
+question = "What are your revenue projections for next quarter?"
+answer = "We don't provide specific guidance on that."
+prompt = f"""You are a financial discourse analyst. Classify the evasion level of this executive response.
+Question: {question}
+Answer: {answer}
+Return JSON: {{"rasiah":"direct|intermediate|fully_evasive","confidence":0.00}}"""
+messages = [{"role": "user", "content": prompt}]
+text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+inputs = tokenizer([text], return_tensors="pt").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=128, temperature=1)
+response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(response)
+```
+## Limitations
+- **Direct Class Recall**: Lower recall (54.74%) for direct responses despite high precision (86.67%) - the model under-predicts the direct class, so many direct answers are assigned a different label
+- **Domain Specific**: Optimized for earnings call context, may not generalize to other domains
+- **English Only**: Trained exclusively on English text
+- **Confidence Calibration**: Model confidence scores may require further calibration
+## Bias and Ethical Considerations
+- Training data derived from corporate earnings calls may reflect existing biases in financial communication
+- Model should not be used as sole determinant for investment decisions
+- Human oversight recommended for critical applications
+## Citation
+```bibtex
+@misc{qwen3-4b-evasion,
+  author = {Shijian Ma},
+  title = {Qwen3-4B-Evasion: Earnings Call Evasion Detection Model},
+  year = {2025},
+  publisher = {HuggingFace},
+  howpublished = {\url{https://huggingface.co/FutureMa/Qwen3-4B-Evasion}}
+}
+```
+## License
+Apache 2.0
+## Acknowledgments
+- Base model: [Qwen Team](https://huggingface.co/Qwen)
+- Training framework: [MS-SWIFT](https://github.com/modelscope/ms-swift)
+- Evasion taxonomy: Rasiah et al.
+## Contact
+For questions or issues, please open an issue on the model repository.

added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

args.json ADDED Viewed

	@@ -0,0 +1,353 @@

+{
+  "output_dir": "/home/ubuntu/ms-swift/output/qwen3-4b-evasion-full/v1-20251231-073918",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "no",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 8,
+  "per_device_eval_batch_size": 1,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 2,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 2e-05,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 2.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.03,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/home/ubuntu/ms-swift/output/qwen3-4b-evasion-full/v1-20251231-073918/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 10,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 500.0,
+  "save_total_limit": 3,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": false,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "seed": 42,
+  "data_seed": 42,
+  "jit_mode_eval": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 500.0,
+  "dataloader_num_workers": 8,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "/home/ubuntu/ms-swift/output/qwen3-4b-evasion-full/v1-20251231-073918",
+  "disable_tqdm": null,
+  "remove_unused_columns": true,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": [],
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "parallelism_config": null,
+  "deepspeed": null,
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch_fused",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "project": "huggingface",
+  "trackio_space_id": "trackio",
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_token": null,
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "hub_revision": null,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": "{\"use_reentrant\": false}",
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 18000000,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "liger_kernel_config": null,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": true,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "tuner_backend": "peft",
+  "vit_gradient_checkpointing": null,
+  "router_aux_loss_coef": 0.0,
+  "enable_dft_loss": false,
+  "enable_channel_loss": false,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "max_epochs": null,
+  "aligner_lr": null,
+  "vit_lr": null,
+  "use_logits_to_keep": null,
+  "ds3_gather_for_generation": true,
+  "resume_only_model": false,
+  "optimizer": null,
+  "loss_type": null,
+  "metric": null,
+  "eval_use_evalscope": false,
+  "eval_dataset": [],
+  "eval_dataset_args": null,
+  "eval_limit": null,
+  "eval_generation_config": null,
+  "extra_eval_args": null,
+  "use_flash_ckpt": false,
+  "use_ray": false,
+  "ray_exp_name": null,
+  "device_groups": null,
+  "model": "Qwen/Qwen3-4B-Instruct-2507",
+  "model_type": "qwen3_nothinking",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "qwen3_nothinking",
+  "system": null,
+  "max_length": 2048,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": true,
+  "padding_side": "right",
+  "padding_free": false,
+  "loss_scale": "default",
+  "sequence_parallel_size": 1,
+  "template_backend": "swift",
+  "response_prefix": null,
+  "enable_thinking": null,
+  "add_non_thinking_prefix": true,
+  "dataset": [
+    "/home/ubuntu/ms-swift/data/evasion_qa27k_msswift.jsonl"
+  ],
+  "val_dataset": [],
+  "cached_dataset": [],
+  "cached_val_dataset": [],
+  "split_dataset_ratio": 0.0,
+  "dataset_num_proc": 1,
+  "load_from_cache_file": false,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "structured_outputs_regex": null,
+  "ckpt_dir": null,
+  "lora_modules": [],
+  "train_type": "full",
+  "adapters": [],
+  "external_plugins": [],
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": false,
+  "packing_length": null,
+  "packing_num_proc": 1,
+  "lazy_tokenize": false,
+  "custom_register_path": [],
+  "use_hf": false,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "target_parameters": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": "ms-swift",
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_notification_method": null,
+  "swanlab_webhook_url": null,
+  "swanlab_secret": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "create_checkpoint_symlink": false,
+  "zero_hpz_partition_size": null,
+  "deepspeed_autotp_size": null,
+  "early_stop_interval": null,
+  "rank": 0,
+  "global_world_size": 2,
+  "local_world_size": 2,
+  "model_suffix": "Qwen3-4B-Instruct-2507",
+  "model_info": "ModelInfo(model_type='qwen3_nothinking', model_dir='/home/ubuntu/.cache/modelscope/hub/models/Qwen/Qwen3-4B-Instruct-2507', torch_dtype=torch.bfloat16, max_model_len=262144, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen3_nothinking', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507', hf_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-30B-A3B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507', hf_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-235B-A22B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='swift/Qwen3-235B-A22B-Instruct-2507-AWQ', hf_model_id=None, model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-4B-Instruct-2507', hf_model_id='Qwen/Qwen3-4B-Instruct-2507', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Instruct-2507-FP8', hf_model_id='Qwen/Qwen3-4B-Instruct-2507-FP8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3_nothinking', get_function=<function get_model_tokenizer_with_flash_attn at 0x70c718bef490>, model_arch=None, architectures=['Qwen3MoeForCausalLM', 'Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])",
+  "model_dir": "/home/ubuntu/.cache/modelscope/hub/models/Qwen/Qwen3-4B-Instruct-2507",
+  "_val_dataset_exists": [],
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "evaluation_strategy": "steps",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/home/ubuntu/ms-swift/output/qwen3-4b-evasion-full/v1-20251231-073918', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=2e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.03, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/home/ubuntu/ms-swift/output/qwen3-4b-evasion-full/v1-20251231-073918/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=10, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=3, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=500.0, dataloader_num_workers=8, dataloader_prefetch_factor=2, past_index=-1, run_name='/home/ubuntu/ms-swift/output/qwen3-4b-evasion-full/v1-20251231-073918', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, 
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], project='huggingface', trackio_space_id='trackio', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': False}, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', 
vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, chord_sft_dataset=[], chord_sft_per_device_train_batch_size=None, chord_enable_phi_function=False, chord_mu_warmup_steps=None, chord_mu_decay_steps=None, chord_mu_peak=None, chord_mu_valley=None, train_type='full', local_repo_path=None, galore_config=None, task_type='causal_lm', problem_type=None)"
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,61 @@

+{#- System turn. With tools: emit the optional leading system message, then the tool signatures and the tool_call calling convention. Without tools: emit the leading system message only if there is one. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool definition per line. #}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Render every message as an im_start / im_end delimited turn. #}
+{%- for message in messages %}
+    {#- Non-string content is rendered as an empty string. #}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {#- A leading system message was already emitted above, so only later system messages are rendered here. #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {#- Separate tool calls from preceding content and from each other with a newline. #}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both the wrapped form (tool_call.function) and the flat form with name and arguments at top level. #}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {#- String arguments are emitted verbatim; anything else is JSON-encoded. #}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one user turn: open it on the first, close it after the last, and wrap each result in tool_response tags. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Optionally open an assistant turn for the model to complete. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,68 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 5000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85c6ac7361f9fa1628b95f7a5462511d9a711d38adef3ab0bfc5de92ad4d0c27
+size 4967215360

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e552b551f490e5c0fd54c7d0ef914e512c79e7e30481370c948678352673b03c
+size 3077766632

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,406 @@

+{
+  "metadata": {
+    "total_parameters": 4022468096,
+    "total_size": 8044936192
+  },
+  "weight_map": {
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00002-of-00002.safetensors"
+  }
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39ac89e56fec4dacd012d8108324c37dc5238afe141f68f6c8dea7b7222bbe6e
+size 16090225449

rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:95e5fc2074c0df31522a514f862c86cb00d71c946a7f15cc9ec0e53a69fb28a7
+size 14917

rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e7153eae67b6c9232a41bc996a2bf5b83229b8c7230d61911ac0fd40e64154e
+size 14917

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cebc4fd0a8a189aa9b371f3e058580335b13df782675613f6af7a02d0799151b
+size 1465

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1394 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500.0,
+  "global_step": 1690,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0011841326228537595,
+      "grad_norm": 219.0,
+      "learning_rate": 3.921568627450981e-07,
+      "loss": 1.2013294696807861,
+      "step": 1,
+      "token_acc": 0.8954758190327613
+    },
+    {
+      "epoch": 0.011841326228537596,
+      "grad_norm": 50.0,
+      "learning_rate": 3.92156862745098e-06,
+      "loss": 0.8140333493550619,
+      "step": 10,
+      "token_acc": 0.9158278375564041
+    },
+    {
+      "epoch": 0.023682652457075192,
+      "grad_norm": 15.0625,
+      "learning_rate": 7.84313725490196e-06,
+      "loss": 0.25884711742401123,
+      "step": 20,
+      "token_acc": 0.9479379018347185
+    },
+    {
+      "epoch": 0.035523978685612786,
+      "grad_norm": 4.78125,
+      "learning_rate": 1.1764705882352942e-05,
+      "loss": 0.13083882331848146,
+      "step": 30,
+      "token_acc": 0.9458783043954325
+    },
+    {
+      "epoch": 0.047365304914150384,
+      "grad_norm": 4.65625,
+      "learning_rate": 1.568627450980392e-05,
+      "loss": 0.11950666904449463,
+      "step": 40,
+      "token_acc": 0.953747256193164
+    },
+    {
+      "epoch": 0.05920663114268798,
+      "grad_norm": 3.921875,
+      "learning_rate": 1.9607843137254903e-05,
+      "loss": 0.10617152452468873,
+      "step": 50,
+      "token_acc": 0.9590588235294117
+    },
+    {
+      "epoch": 0.07104795737122557,
+      "grad_norm": 2.25,
+      "learning_rate": 1.9998512057697314e-05,
+      "loss": 0.10807085037231445,
+      "step": 60,
+      "token_acc": 0.9589895524715422
+    },
+    {
+      "epoch": 0.08288928359976318,
+      "grad_norm": 2.375,
+      "learning_rate": 1.9993369121919753e-05,
+      "loss": 0.10784111022949219,
+      "step": 70,
+      "token_acc": 0.9572008747266479
+    },
+    {
+      "epoch": 0.09473060982830077,
+      "grad_norm": 2.9375,
+      "learning_rate": 1.998455471202776e-05,
+      "loss": 0.10691288709640503,
+      "step": 80,
+      "token_acc": 0.9570600219401347
+    },
+    {
+      "epoch": 0.10657193605683836,
+      "grad_norm": 1.578125,
+      "learning_rate": 1.9972072066356417e-05,
+      "loss": 0.11526317596435547,
+      "step": 90,
+      "token_acc": 0.9496324104489285
+    },
+    {
+      "epoch": 0.11841326228537596,
+      "grad_norm": 3.09375,
+      "learning_rate": 1.995592577091769e-05,
+      "loss": 0.10205183029174805,
+      "step": 100,
+      "token_acc": 0.9572502348888193
+    },
+    {
+      "epoch": 0.13025458851391356,
+      "grad_norm": 3.1875,
+      "learning_rate": 1.9936121757715598e-05,
+      "loss": 0.10735645294189453,
+      "step": 110,
+      "token_acc": 0.9611528822055138
+    },
+    {
+      "epoch": 0.14209591474245115,
+      "grad_norm": 1.796875,
+      "learning_rate": 1.991266730256683e-05,
+      "loss": 0.10336060523986816,
+      "step": 120,
+      "token_acc": 0.9576736165543188
+    },
+    {
+      "epoch": 0.15393724097098876,
+      "grad_norm": 2.078125,
+      "learning_rate": 1.9885571022427676e-05,
+      "loss": 0.09967223405838013,
+      "step": 130,
+      "token_acc": 0.959868317918169
+    },
+    {
+      "epoch": 0.16577856719952636,
+      "grad_norm": 1.140625,
+      "learning_rate": 1.9854842872228247e-05,
+      "loss": 0.09939006567001343,
+      "step": 140,
+      "token_acc": 0.9603572547790661
+    },
+    {
+      "epoch": 0.17761989342806395,
+      "grad_norm": 1.2890625,
+      "learning_rate": 1.98204941412151e-05,
+      "loss": 0.0902411937713623,
+      "step": 150,
+      "token_acc": 0.9632122730118973
+    },
+    {
+      "epoch": 0.18946121965660154,
+      "grad_norm": 2.984375,
+      "learning_rate": 1.9782537448803707e-05,
+      "loss": 0.10655044317245484,
+      "step": 160,
+      "token_acc": 0.9559263340154258
+    },
+    {
+      "epoch": 0.20130254588513913,
+      "grad_norm": 2.65625,
+      "learning_rate": 1.9740986739942146e-05,
+      "loss": 0.10265426635742188,
+      "step": 170,
+      "token_acc": 0.9573600877880546
+    },
+    {
+      "epoch": 0.21314387211367672,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.9695857279987897e-05,
+      "loss": 0.09765652418136597,
+      "step": 180,
+      "token_acc": 0.9597620165962111
+    },
+    {
+      "epoch": 0.22498519834221434,
+      "grad_norm": 1.4140625,
+      "learning_rate": 1.9647165649099465e-05,
+      "loss": 0.09450024366378784,
+      "step": 190,
+      "token_acc": 0.963653454488485
+    },
+    {
+      "epoch": 0.23682652457075193,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.9594929736144978e-05,
+      "loss": 0.10988011360168456,
+      "step": 200,
+      "token_acc": 0.9540840231141652
+    },
+    {
+      "epoch": 0.24866785079928952,
+      "grad_norm": 1.828125,
+      "learning_rate": 1.9539168732129977e-05,
+      "loss": 0.09797856211662292,
+      "step": 210,
+      "token_acc": 0.9617614793919448
+    },
+    {
+      "epoch": 0.2605091770278271,
+      "grad_norm": 4.15625,
+      "learning_rate": 1.9479903123146835e-05,
+      "loss": 0.09065916538238525,
+      "step": 220,
+      "token_acc": 0.9650382394256282
+    },
+    {
+      "epoch": 0.27235050325636473,
+      "grad_norm": 1.9765625,
+      "learning_rate": 1.9417154682848314e-05,
+      "loss": 0.10036060810089112,
+      "step": 230,
+      "token_acc": 0.961611076148521
+    },
+    {
+      "epoch": 0.2841918294849023,
+      "grad_norm": 2.234375,
+      "learning_rate": 1.935094646444815e-05,
+      "loss": 0.09578206539154052,
+      "step": 240,
+      "token_acc": 0.9624119028974158
+    },
+    {
+      "epoch": 0.2960331557134399,
+      "grad_norm": 2.171875,
+      "learning_rate": 1.928130279225149e-05,
+      "loss": 0.09263083934783936,
+      "step": 250,
+      "token_acc": 0.963653454488485
+    },
+    {
+      "epoch": 0.30787448194197753,
+      "grad_norm": 1.6640625,
+      "learning_rate": 1.920824925271838e-05,
+      "loss": 0.09710139036178589,
+      "step": 260,
+      "token_acc": 0.9595754643358826
+    },
+    {
+      "epoch": 0.3197158081705151,
+      "grad_norm": 3.40625,
+      "learning_rate": 1.9131812685063512e-05,
+      "loss": 0.10172030925750733,
+      "step": 270,
+      "token_acc": 0.957680250783699
+    },
+    {
+      "epoch": 0.3315571343990527,
+      "grad_norm": 1.9609375,
+      "learning_rate": 1.9052021171395742e-05,
+      "loss": 0.10712752342224122,
+      "step": 280,
+      "token_acc": 0.9577840552416823
+    },
+    {
+      "epoch": 0.3433984606275903,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.896890402640098e-05,
+      "loss": 0.09744402766227722,
+      "step": 290,
+      "token_acc": 0.9596054485674025
+    },
+    {
+      "epoch": 0.3552397868561279,
+      "grad_norm": 2.359375,
+      "learning_rate": 1.8882491786572226e-05,
+      "loss": 0.09446089267730713,
+      "step": 300,
+      "token_acc": 0.9636648394675019
+    },
+    {
+      "epoch": 0.36708111308466546,
+      "grad_norm": 1.828125,
+      "learning_rate": 1.8792816198990768e-05,
+      "loss": 0.09970860481262207,
+      "step": 310,
+      "token_acc": 0.9583398590446358
+    },
+    {
+      "epoch": 0.3789224393132031,
+      "grad_norm": 1.5390625,
+      "learning_rate": 1.8699910209662536e-05,
+      "loss": 0.09670261144638062,
+      "step": 320,
+      "token_acc": 0.9606150949317432
+    },
+    {
+      "epoch": 0.3907637655417407,
+      "grad_norm": 2.828125,
+      "learning_rate": 1.8603807951414093e-05,
+      "loss": 0.09714120626449585,
+      "step": 330,
+      "token_acc": 0.9602938877598874
+    },
+    {
+      "epoch": 0.40260509177027826,
+      "grad_norm": 1.890625,
+      "learning_rate": 1.850454473135249e-05,
+      "loss": 0.09373531341552735,
+      "step": 340,
+      "token_acc": 0.9619166536600593
+    },
+    {
+      "epoch": 0.4144464179988159,
+      "grad_norm": 2.25,
+      "learning_rate": 1.8402157017893795e-05,
+      "loss": 0.09355499744415283,
+      "step": 350,
+      "token_acc": 0.9667919799498746
+    },
+    {
+      "epoch": 0.42628774422735344,
+      "grad_norm": 0.8828125,
+      "learning_rate": 1.829668242736489e-05,
+      "loss": 0.08944010734558105,
+      "step": 360,
+      "token_acc": 0.9638327853452325
+    },
+    {
+      "epoch": 0.43812907045589106,
+      "grad_norm": 1.2265625,
+      "learning_rate": 1.8188159710183595e-05,
+      "loss": 0.09383893013000488,
+      "step": 370,
+      "token_acc": 0.9663642052565707
+    },
+    {
+      "epoch": 0.4499703966844287,
+      "grad_norm": 2.953125,
+      "learning_rate": 1.807662873662209e-05,
+      "loss": 0.09152829647064209,
+      "step": 380,
+      "token_acc": 0.9641403069213905
+    },
+    {
+      "epoch": 0.46181172291296624,
+      "grad_norm": 1.6953125,
+      "learning_rate": 1.796213048215896e-05,
+      "loss": 0.10058202743530273,
+      "step": 390,
+      "token_acc": 0.961363279409455
+    },
+    {
+      "epoch": 0.47365304914150386,
+      "grad_norm": 1.7421875,
+      "learning_rate": 1.7844707012425155e-05,
+      "loss": 0.0878696620464325,
+      "step": 400,
+      "token_acc": 0.9662956576265872
+    },
+    {
+      "epoch": 0.4854943753700414,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.772440146774945e-05,
+      "loss": 0.09355847835540772,
+      "step": 410,
+      "token_acc": 0.9618928627205997
+    },
+    {
+      "epoch": 0.49733570159857904,
+      "grad_norm": 1.3828125,
+      "learning_rate": 1.7601258047309096e-05,
+      "loss": 0.09457954168319702,
+      "step": 420,
+      "token_acc": 0.9631430363864492
+    },
+    {
+      "epoch": 0.5091770278271166,
+      "grad_norm": 1.0234375,
+      "learning_rate": 1.7475321992891417e-05,
+      "loss": 0.09055821895599366,
+      "step": 430,
+      "token_acc": 0.9654251139399654
+    },
+    {
+      "epoch": 0.5210183540556542,
+      "grad_norm": 1.9921875,
+      "learning_rate": 1.73466395722724e-05,
+      "loss": 0.09674708843231201,
+      "step": 440,
+      "token_acc": 0.9611041405269761
+    },
+    {
+      "epoch": 0.5328596802841918,
+      "grad_norm": 1.671875,
+      "learning_rate": 1.7215258062218323e-05,
+      "loss": 0.10127317905426025,
+      "step": 450,
+      "token_acc": 0.9612791973663584
+    },
+    {
+      "epoch": 0.5447010065127295,
+      "grad_norm": 2.28125,
+      "learning_rate": 1.708122573111669e-05,
+      "loss": 0.08792918920516968,
+      "step": 460,
+      "token_acc": 0.9650962591954922
+    },
+    {
+      "epoch": 0.5565423327412671,
+      "grad_norm": 2.171875,
+      "learning_rate": 1.6944591821242867e-05,
+      "loss": 0.09947954416275025,
+      "step": 470,
+      "token_acc": 0.9605057758351545
+    },
+    {
+      "epoch": 0.5683836589698046,
+      "grad_norm": 1.46875,
+      "learning_rate": 1.680540653066891e-05,
+      "loss": 0.0963528037071228,
+      "step": 480,
+      "token_acc": 0.9614842649131048
+    },
+    {
+      "epoch": 0.5802249851983422,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.6663720994821246e-05,
+      "loss": 0.0961789608001709,
+      "step": 490,
+      "token_acc": 0.9619599248591109
+    },
+    {
+      "epoch": 0.5920663114268798,
+      "grad_norm": 2.109375,
+      "learning_rate": 1.651958726769396e-05,
+      "loss": 0.090640389919281,
+      "step": 500,
+      "token_acc": 0.963166144200627
+    },
+    {
+      "epoch": 0.6039076376554174,
+      "grad_norm": 0.9140625,
+      "learning_rate": 1.6373058302724655e-05,
+      "loss": 0.08862148523330689,
+      "step": 510,
+      "token_acc": 0.9642521166509878
+    },
+    {
+      "epoch": 0.6157489638839551,
+      "grad_norm": 1.7421875,
+      "learning_rate": 1.6224187933339808e-05,
+      "loss": 0.08748204708099365,
+      "step": 520,
+      "token_acc": 0.9620749098887321
+    },
+    {
+      "epoch": 0.6275902901124926,
+      "grad_norm": 1.5546875,
+      "learning_rate": 1.6073030853176862e-05,
+      "loss": 0.09252775907516479,
+      "step": 530,
+      "token_acc": 0.9616528408201597
+    },
+    {
+      "epoch": 0.6394316163410302,
+      "grad_norm": 1.296875,
+      "learning_rate": 1.5919642595990275e-05,
+      "loss": 0.08904544115066529,
+      "step": 540,
+      "token_acc": 0.9668594653743943
+    },
+    {
+      "epoch": 0.6512729425695678,
+      "grad_norm": 1.8203125,
+      "learning_rate": 1.5764079515248922e-05,
+      "loss": 0.08241082429885864,
+      "step": 550,
+      "token_acc": 0.9658628249295333
+    },
+    {
+      "epoch": 0.6631142687981054,
+      "grad_norm": 1.6015625,
+      "learning_rate": 1.5606398763432318e-05,
+      "loss": 0.0839945912361145,
+      "step": 560,
+      "token_acc": 0.9672131147540983
+    },
+    {
+      "epoch": 0.6749555950266429,
+      "grad_norm": 1.5,
+      "learning_rate": 1.5446658271033336e-05,
+      "loss": 0.09018040895462036,
+      "step": 570,
+      "token_acc": 0.9658574784651527
+    },
+    {
+      "epoch": 0.6867969212551805,
+      "grad_norm": 1.40625,
+      "learning_rate": 1.528491672527504e-05,
+      "loss": 0.08107317686080932,
+      "step": 580,
+      "token_acc": 0.9681967726774244
+    },
+    {
+      "epoch": 0.6986382474837182,
+      "grad_norm": 1.453125,
+      "learning_rate": 1.512123354854955e-05,
+      "loss": 0.08852046132087707,
+      "step": 590,
+      "token_acc": 0.9663957486714598
+    },
+    {
+      "epoch": 0.7104795737122558,
+      "grad_norm": 1.1875,
+      "learning_rate": 1.4955668876586763e-05,
+      "loss": 0.07870029807090759,
+      "step": 600,
+      "token_acc": 0.9683862849952816
+    },
+    {
+      "epoch": 0.7223208999407934,
+      "grad_norm": 1.4609375,
+      "learning_rate": 1.4788283536361036e-05,
+      "loss": 0.0841621994972229,
+      "step": 610,
+      "token_acc": 0.9685781618224666
+    },
+    {
+      "epoch": 0.7341622261693309,
+      "grad_norm": 1.6171875,
+      "learning_rate": 1.4619139023743916e-05,
+      "loss": 0.08564043045043945,
+      "step": 620,
+      "token_acc": 0.9654417513682565
+    },
+    {
+      "epoch": 0.7460035523978685,
+      "grad_norm": 1.203125,
+      "learning_rate": 1.4448297480911086e-05,
+      "loss": 0.09037463665008545,
+      "step": 630,
+      "token_acc": 0.963363081258807
+    },
+    {
+      "epoch": 0.7578448786264061,
+      "grad_norm": 0.9609375,
+      "learning_rate": 1.4275821673511903e-05,
+      "loss": 0.09671027660369873,
+      "step": 640,
+      "token_acc": 0.959305055564251
+    },
+    {
+      "epoch": 0.7696862048549438,
+      "grad_norm": 1.5078125,
+      "learning_rate": 1.4101774967609854e-05,
+      "loss": 0.09160791039466858,
+      "step": 650,
+      "token_acc": 0.9654741446648961
+    },
+    {
+      "epoch": 0.7815275310834814,
+      "grad_norm": 1.421875,
+      "learning_rate": 1.392622130640243e-05,
+      "loss": 0.095394766330719,
+      "step": 660,
+      "token_acc": 0.9619956208945887
+    },
+    {
+      "epoch": 0.7933688573120189,
+      "grad_norm": 1.78125,
+      "learning_rate": 1.3749225186728991e-05,
+      "loss": 0.08577767610549927,
+      "step": 670,
+      "token_acc": 0.966750313676286
+    },
+    {
+      "epoch": 0.8052101835405565,
+      "grad_norm": 2.0625,
+      "learning_rate": 1.357085163537517e-05,
+      "loss": 0.09209753274917602,
+      "step": 680,
+      "token_acc": 0.9620608899297424
+    },
+    {
+      "epoch": 0.8170515097690941,
+      "grad_norm": 2.390625,
+      "learning_rate": 1.3391166185182651e-05,
+      "loss": 0.0821334183216095,
+      "step": 690,
+      "token_acc": 0.9690383111806099
+    },
+    {
+      "epoch": 0.8288928359976317,
+      "grad_norm": 1.2109375,
+      "learning_rate": 1.3210234850972966e-05,
+      "loss": 0.09119898080825806,
+      "step": 700,
+      "token_acc": 0.9637817497648166
+    },
+    {
+      "epoch": 0.8407341622261694,
+      "grad_norm": 1.8671875,
+      "learning_rate": 1.3028124105294255e-05,
+      "loss": 0.0862145483493805,
+      "step": 710,
+      "token_acc": 0.9672259683236631
+    },
+    {
+      "epoch": 0.8525754884547069,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.2844900853999847e-05,
+      "loss": 0.08162487745285034,
+      "step": 720,
+      "token_acc": 0.9676405906377631
+    },
+    {
+      "epoch": 0.8644168146832445,
+      "grad_norm": 1.7890625,
+      "learning_rate": 1.2660632411667648e-05,
+      "loss": 0.08193669319152833,
+      "step": 730,
+      "token_acc": 0.9653278945716975
+    },
+    {
+      "epoch": 0.8762581409117821,
+      "grad_norm": 1.6015625,
+      "learning_rate": 1.2475386476869364e-05,
+      "loss": 0.09078997969627381,
+      "step": 740,
+      "token_acc": 0.9639045825486503
+    },
+    {
+      "epoch": 0.8880994671403197,
+      "grad_norm": 1.671875,
+      "learning_rate": 1.2289231107298672e-05,
+      "loss": 0.09944761395454407,
+      "step": 750,
+      "token_acc": 0.9596546310832025
+    },
+    {
+      "epoch": 0.8999407933688574,
+      "grad_norm": 1.1171875,
+      "learning_rate": 1.2102234694767401e-05,
+      "loss": 0.0917394757270813,
+      "step": 760,
+      "token_acc": 0.9615505335844319
+    },
+    {
+      "epoch": 0.9117821195973949,
+      "grad_norm": 1.609375,
+      "learning_rate": 1.1914465940079036e-05,
+      "loss": 0.08656581044197083,
+      "step": 770,
+      "token_acc": 0.9671951028096061
+    },
+    {
+      "epoch": 0.9236234458259325,
+      "grad_norm": 1.28125,
+      "learning_rate": 1.1725993827788625e-05,
+      "loss": 0.08798307180404663,
+      "step": 780,
+      "token_acc": 0.9632065132299984
+    },
+    {
+      "epoch": 0.9354647720544701,
+      "grad_norm": 1.4765625,
+      "learning_rate": 1.1536887600858487e-05,
+      "loss": 0.08726394176483154,
+      "step": 790,
+      "token_acc": 0.9665934755332497
+    },
+    {
+      "epoch": 0.9473060982830077,
+      "grad_norm": 0.89453125,
+      "learning_rate": 1.134721673521897e-05,
+      "loss": 0.0808544933795929,
+      "step": 800,
+      "token_acc": 0.9646211646837821
+    },
+    {
+      "epoch": 0.9591474245115453,
+      "grad_norm": 1.3671875,
+      "learning_rate": 1.1157050914243614e-05,
+      "loss": 0.08560880422592163,
+      "step": 810,
+      "token_acc": 0.9667189952904238
+    },
+    {
+      "epoch": 0.9709887507400828,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.0966460003148115e-05,
+      "loss": 0.0828078031539917,
+      "step": 820,
+      "token_acc": 0.9668499607227022
+    },
+    {
+      "epoch": 0.9828300769686205,
+      "grad_norm": 1.921875,
+      "learning_rate": 1.0775514023322444e-05,
+      "loss": 0.09345529675483703,
+      "step": 830,
+      "token_acc": 0.9608886107634543
+    },
+    {
+      "epoch": 0.9946714031971581,
+      "grad_norm": 1.2578125,
+      "learning_rate": 1.058428312660566e-05,
+      "loss": 0.08514059782028198,
+      "step": 840,
+      "token_acc": 0.9657169693174703
+    },
+    {
+      "epoch": 1.0059206631142688,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.0392837569512715e-05,
+      "loss": 0.08234425187110901,
+      "step": 850,
+      "token_acc": 0.9645318540931249
+    },
+    {
+      "epoch": 1.0177619893428065,
+      "grad_norm": 1.8359375,
+      "learning_rate": 1.020124768742286e-05,
+      "loss": 0.07545605897903443,
+      "step": 860,
+      "token_acc": 0.9709147771696638
+    },
+    {
+      "epoch": 1.029603315571344,
+      "grad_norm": 1.1640625,
+      "learning_rate": 1.0009583868739053e-05,
+      "loss": 0.07274842262268066,
+      "step": 870,
+      "token_acc": 0.9721873035826524
+    },
+    {
+      "epoch": 1.0414446417998815,
+      "grad_norm": 1.3359375,
+      "learning_rate": 9.817916529027898e-06,
+      "loss": 0.07491129636764526,
+      "step": 880,
+      "token_acc": 0.9713480507280413
+    },
+    {
+      "epoch": 1.0532859680284192,
+      "grad_norm": 1.15625,
+      "learning_rate": 9.626316085149588e-06,
+      "loss": 0.07744649052619934,
+      "step": 890,
+      "token_acc": 0.9709102283390679
+    },
+    {
+      "epoch": 1.0651272942569567,
+      "grad_norm": 1.1953125,
+      "learning_rate": 9.43485292938739e-06,
+      "loss": 0.07794994711875916,
+      "step": 900,
+      "token_acc": 0.970647931303669
+    },
+    {
+      "epoch": 1.0769686204854945,
+      "grad_norm": 1.0234375,
+      "learning_rate": 9.243597403586145e-06,
+      "loss": 0.0824435293674469,
+      "step": 910,
+      "token_acc": 0.9683633516053249
+    },
+    {
+      "epoch": 1.088809946714032,
+      "grad_norm": 0.91796875,
+      "learning_rate": 9.052619773309318e-06,
+      "loss": 0.07359167337417602,
+      "step": 920,
+      "token_acc": 0.9754111198120595
+    },
+    {
+      "epoch": 1.1006512729425695,
+      "grad_norm": 1.0625,
+      "learning_rate": 8.861990202024046e-06,
+      "loss": 0.07806094288825989,
+      "step": 930,
+      "token_acc": 0.9696922355881894
+    },
+    {
+      "epoch": 1.1124925991711072,
+      "grad_norm": 2.0,
+      "learning_rate": 8.67177872532372e-06,
+      "loss": 0.07662028670310975,
+      "step": 940,
+      "token_acc": 0.9707960433349034
+    },
+    {
+      "epoch": 1.1243339253996447,
+      "grad_norm": 1.5,
+      "learning_rate": 8.482055225197532e-06,
+      "loss": 0.07939339876174926,
+      "step": 950,
+      "token_acc": 0.9700156985871271
+    },
+    {
+      "epoch": 1.1361752516281824,
+      "grad_norm": 2.046875,
+      "learning_rate": 8.292889404356461e-06,
+      "loss": 0.07178534269332885,
+      "step": 960,
+      "token_acc": 0.9713704630788486
+    },
+    {
+      "epoch": 1.14801657785672,
+      "grad_norm": 1.453125,
+      "learning_rate": 8.104350760625122e-06,
+      "loss": 0.07578552961349487,
+      "step": 970,
+      "token_acc": 0.9700093720712277
+    },
+    {
+      "epoch": 1.1598579040852575,
+      "grad_norm": 1.3828125,
+      "learning_rate": 7.916508561408892e-06,
+      "loss": 0.07551709413528443,
+      "step": 980,
+      "token_acc": 0.9736513875896476
+    },
+    {
+      "epoch": 1.1716992303137952,
+      "grad_norm": 1.0625,
+      "learning_rate": 7.729431818245678e-06,
+      "loss": 0.06962672472000123,
+      "step": 990,
+      "token_acc": 0.9749726263100266
+    },
+    {
+      "epoch": 1.1835405565423327,
+      "grad_norm": 1.546875,
+      "learning_rate": 7.543189261451716e-06,
+      "loss": 0.07484488487243653,
+      "step": 1000,
+      "token_acc": 0.9705790297339593
+    },
+    {
+      "epoch": 1.1953818827708704,
+      "grad_norm": 1.328125,
+      "learning_rate": 7.35784931487064e-06,
+      "loss": 0.07622098922729492,
+      "step": 1010,
+      "token_acc": 0.970372680492749
+    },
+    {
+      "epoch": 1.207223208999408,
+      "grad_norm": 2.390625,
+      "learning_rate": 7.173480070735209e-06,
+      "loss": 0.07499848604202271,
+      "step": 1020,
+      "token_acc": 0.9686574146265399
+    },
+    {
+      "epoch": 1.2190645352279454,
+      "grad_norm": 1.2109375,
+      "learning_rate": 6.990149264650814e-06,
+      "loss": 0.07203071117401123,
+      "step": 1030,
+      "token_acc": 0.972574831531108
+    },
+    {
+      "epoch": 1.2309058614564832,
+      "grad_norm": 1.375,
+      "learning_rate": 6.807924250710019e-06,
+      "loss": 0.07002646923065185,
+      "step": 1040,
+      "token_acc": 0.9741379310344828
+    },
+    {
+      "epoch": 1.2427471876850207,
+      "grad_norm": 1.328125,
+      "learning_rate": 6.626871976747289e-06,
+      "loss": 0.07481561303138733,
+      "step": 1050,
+      "token_acc": 0.9709576138147566
+    },
+    {
+      "epoch": 1.2545885139135584,
+      "grad_norm": 1.2734375,
+      "learning_rate": 6.44705895974294e-06,
+      "loss": 0.06933027505874634,
+      "step": 1060,
+      "token_acc": 0.9734443746071653
+    },
+    {
+      "epoch": 1.266429840142096,
+      "grad_norm": 1.5625,
+      "learning_rate": 6.268551261385414e-06,
+      "loss": 0.0675657868385315,
+      "step": 1070,
+      "token_acc": 0.9746320075164422
+    },
+    {
+      "epoch": 1.2782711663706334,
+      "grad_norm": 1.5546875,
+      "learning_rate": 6.091414463800789e-06,
+      "loss": 0.07069060802459717,
+      "step": 1080,
+      "token_acc": 0.973655323819978
+    },
+    {
+      "epoch": 1.2901124925991712,
+      "grad_norm": 1.125,
+      "learning_rate": 5.915713645458514e-06,
+      "loss": 0.07225958108901978,
+      "step": 1090,
+      "token_acc": 0.9728201099764336
+    },
+    {
+      "epoch": 1.3019538188277087,
+      "grad_norm": 1.6171875,
+      "learning_rate": 5.741513357262147e-06,
+      "loss": 0.07490838170051575,
+      "step": 1100,
+      "token_acc": 0.970542149796302
+    },
+    {
+      "epoch": 1.3137951450562464,
+      "grad_norm": 1.3359375,
+      "learning_rate": 5.568877598833935e-06,
+      "loss": 0.07528679370880127,
+      "step": 1110,
+      "token_acc": 0.970496409615985
+    },
+    {
+      "epoch": 1.325636471284784,
+      "grad_norm": 1.453125,
+      "learning_rate": 5.3978697950019484e-06,
+      "loss": 0.07579593658447266,
+      "step": 1120,
+      "token_acc": 0.9716936625255543
+    },
+    {
+      "epoch": 1.3374777975133214,
+      "grad_norm": 1.6640625,
+      "learning_rate": 5.228552772498335e-06,
+      "loss": 0.06750929355621338,
+      "step": 1130,
+      "token_acc": 0.9741029641185648
+    },
+    {
+      "epoch": 1.3493191237418591,
+      "grad_norm": 1.6875,
+      "learning_rate": 5.060988736877366e-06,
+      "loss": 0.07841302156448364,
+      "step": 1140,
+      "token_acc": 0.9696400625978091
+    },
+    {
+      "epoch": 1.3611604499703966,
+      "grad_norm": 1.3671875,
+      "learning_rate": 4.895239249661662e-06,
+      "loss": 0.08451638221740723,
+      "step": 1150,
+      "token_acc": 0.967736883320282
+    },
+    {
+      "epoch": 1.3730017761989344,
+      "grad_norm": 1.0234375,
+      "learning_rate": 4.731365205725056e-06,
+      "loss": 0.074539315700531,
+      "step": 1160,
+      "token_acc": 0.9703715315880233
+    },
+    {
+      "epoch": 1.3848431024274719,
+      "grad_norm": 1.3359375,
+      "learning_rate": 4.569426810920347e-06,
+      "loss": 0.068775475025177,
+      "step": 1170,
+      "token_acc": 0.9716523101018011
+    },
+    {
+      "epoch": 1.3966844286560094,
+      "grad_norm": 1.2265625,
+      "learning_rate": 4.409483559960221e-06,
+      "loss": 0.07150940299034118,
+      "step": 1180,
+      "token_acc": 0.9737005913476502
+    },
+    {
+      "epoch": 1.4085257548845471,
+      "grad_norm": 1.890625,
+      "learning_rate": 4.251594214559416e-06,
+      "loss": 0.08267040252685547,
+      "step": 1190,
+      "token_acc": 0.9680350987151363
+    },
+    {
+      "epoch": 1.4203670811130846,
+      "grad_norm": 1.46875,
+      "learning_rate": 4.095816781846219e-06,
+      "loss": 0.0697063684463501,
+      "step": 1200,
+      "token_acc": 0.9751095804633688
+    },
+    {
+      "epoch": 1.4322084073416224,
+      "grad_norm": 1.203125,
+      "learning_rate": 3.942208493051137e-06,
+      "loss": 0.07361778020858764,
+      "step": 1210,
+      "token_acc": 0.9734901960784313
+    },
+    {
+      "epoch": 1.4440497335701599,
+      "grad_norm": 1.4609375,
+      "learning_rate": 3.7908257824806814e-06,
+      "loss": 0.07019197940826416,
+      "step": 1220,
+      "token_acc": 0.9710122218740207
+    },
+    {
+      "epoch": 1.4558910597986974,
+      "grad_norm": 1.5859375,
+      "learning_rate": 3.6417242667838917e-06,
+      "loss": 0.07444216012954712,
+      "step": 1230,
+      "token_acc": 0.9728040012503908
+    },
+    {
+      "epoch": 1.467732386027235,
+      "grad_norm": 1.4375,
+      "learning_rate": 3.4949587245192983e-06,
+      "loss": 0.06847925186157226,
+      "step": 1240,
+      "token_acc": 0.9746320075164422
+    },
+    {
+      "epoch": 1.4795737122557726,
+      "grad_norm": 1.625,
+      "learning_rate": 3.3505830760297543e-06,
+      "loss": 0.0696124255657196,
+      "step": 1250,
+      "token_acc": 0.9730534231552561
+    },
+    {
+      "epoch": 1.4914150384843103,
+      "grad_norm": 1.3203125,
+      "learning_rate": 3.2086503636325895e-06,
+      "loss": 0.07145707607269287,
+      "step": 1260,
+      "token_acc": 0.9749294891883422
+    },
+    {
+      "epoch": 1.5032563647128478,
+      "grad_norm": 1.8515625,
+      "learning_rate": 3.069212732132345e-06,
+      "loss": 0.07296675443649292,
+      "step": 1270,
+      "token_acc": 0.9725662329518734
+    },
+    {
+      "epoch": 1.5150976909413854,
+      "grad_norm": 2.625,
+      "learning_rate": 2.9323214096632335e-06,
+      "loss": 0.07637610435485839,
+      "step": 1280,
+      "token_acc": 0.9721566776781501
+    },
+    {
+      "epoch": 1.526939017169923,
+      "grad_norm": 1.4375,
+      "learning_rate": 2.798026688868386e-06,
+      "loss": 0.07028791308403015,
+      "step": 1290,
+      "token_acc": 0.9726801695713613
+    },
+    {
+      "epoch": 1.5387803433984606,
+      "grad_norm": 1.7578125,
+      "learning_rate": 2.6663779084227926e-06,
+      "loss": 0.0738570511341095,
+      "step": 1300,
+      "token_acc": 0.9717247879359096
+    },
+    {
+      "epoch": 1.5506216696269983,
+      "grad_norm": 2.046875,
+      "learning_rate": 2.5374234349066985e-06,
+      "loss": 0.07539566755294799,
+      "step": 1310,
+      "token_acc": 0.9680968096809681
+    },
+    {
+      "epoch": 1.5624629958555358,
+      "grad_norm": 1.09375,
+      "learning_rate": 2.411210645036173e-06,
+      "loss": 0.07291572093963623,
+      "step": 1320,
+      "token_acc": 0.972758405977584
+    },
+    {
+      "epoch": 1.5743043220840733,
+      "grad_norm": 1.6484375,
+      "learning_rate": 2.2877859082573194e-06,
+      "loss": 0.07078194618225098,
+      "step": 1330,
+      "token_acc": 0.9733229329173166
+    },
+    {
+      "epoch": 1.586145648312611,
+      "grad_norm": 1.53125,
+      "learning_rate": 2.16719456971057e-06,
+      "loss": 0.07727055549621582,
+      "step": 1340,
+      "token_acc": 0.9690154136520919
+    },
+    {
+      "epoch": 1.5979869745411486,
+      "grad_norm": 1.125,
+      "learning_rate": 2.0494809335712697e-06,
+      "loss": 0.06905415058135986,
+      "step": 1350,
+      "token_acc": 0.9750783699059561
+    },
+    {
+      "epoch": 1.6098283007696863,
+      "grad_norm": 1.8046875,
+      "learning_rate": 1.9346882467727323e-06,
+      "loss": 0.07434183359146118,
+      "step": 1360,
+      "token_acc": 0.9726091720143998
+    },
+    {
+      "epoch": 1.6216696269982238,
+      "grad_norm": 0.96875,
+      "learning_rate": 1.8228586831177032e-06,
+      "loss": 0.06618231534957886,
+      "step": 1370,
+      "token_acc": 0.9750900830330566
+    },
+    {
+      "epoch": 1.6335109532267613,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.7140333277840837e-06,
+      "loss": 0.07258784770965576,
+      "step": 1380,
+      "token_acc": 0.9727699530516432
+    },
+    {
+      "epoch": 1.6453522794552988,
+      "grad_norm": 1.1875,
+      "learning_rate": 1.6082521622306003e-06,
+      "loss": 0.0752481460571289,
+      "step": 1390,
+      "token_acc": 0.9715364050951407
+    },
+    {
+      "epoch": 1.6571936056838366,
+      "grad_norm": 1.3984375,
+      "learning_rate": 1.5055540495079802e-06,
+      "loss": 0.06541621685028076,
+      "step": 1400,
+      "token_acc": 0.9767806714778788
+    },
+    {
+      "epoch": 1.6690349319123743,
+      "grad_norm": 1.90625,
+      "learning_rate": 1.4059767199810125e-06,
+      "loss": 0.0707894206047058,
+      "step": 1410,
+      "token_acc": 0.9731301068510371
+    },
+    {
+      "epoch": 1.6808762581409118,
+      "grad_norm": 1.3203125,
+      "learning_rate": 1.3095567574667589e-06,
+      "loss": 0.07458854913711548,
+      "step": 1420,
+      "token_acc": 0.9726630007855459
+    },
+    {
+      "epoch": 1.6927175843694493,
+      "grad_norm": 1.65625,
+      "learning_rate": 1.216329585793975e-06,
+      "loss": 0.06724110841751099,
+      "step": 1430,
+      "token_acc": 0.9734000938820216
+    },
+    {
+      "epoch": 1.7045589105979868,
+      "grad_norm": 1.46875,
+      "learning_rate": 1.1263294557887216e-06,
+      "loss": 0.07588486671447754,
+      "step": 1440,
+      "token_acc": 0.9710873664362036
+    },
+    {
+      "epoch": 1.7164002368265245,
+      "grad_norm": 2.046875,
+      "learning_rate": 1.0395894326909163e-06,
+      "loss": 0.07099611163139344,
+      "step": 1450,
+      "token_acc": 0.9723091364205256
+    },
+    {
+      "epoch": 1.7282415630550623,
+      "grad_norm": 1.921875,
+      "learning_rate": 9.561413840064637e-07,
+      "loss": 0.06974682807922364,
+      "step": 1460,
+      "token_acc": 0.9720609009574636
+    },
+    {
+      "epoch": 1.7400828892835998,
+      "grad_norm": 1.2578125,
+      "learning_rate": 8.760159677994174e-07,
+      "loss": 0.06880149841308594,
+      "step": 1470,
+      "token_acc": 0.9749019607843137
+    },
+    {
+      "epoch": 1.7519242155121373,
+      "grad_norm": 1.90625,
+      "learning_rate": 7.992426214284787e-07,
+      "loss": 0.07654795646667481,
+      "step": 1480,
+      "token_acc": 0.969967151572032
+    },
+    {
+      "epoch": 1.7637655417406748,
+      "grad_norm": 1.3671875,
+      "learning_rate": 7.258495507319885e-07,
+      "loss": 0.06865710020065308,
+      "step": 1490,
+      "token_acc": 0.9735068192506663
+    },
+    {
+      "epoch": 1.7756068679692125,
+      "grad_norm": 1.34375,
+      "learning_rate": 6.558637196653372e-07,
+      "loss": 0.06818960905075074,
+      "step": 1500,
+      "token_acc": 0.9739225484072455
+    },
+    {
+      "epoch": 1.7874481941977503,
+      "grad_norm": 1.7578125,
+      "learning_rate": 5.893108403946634e-07,
+      "loss": 0.07731307148933411,
+      "step": 1510,
+      "token_acc": 0.9705836332342357
+    },
+    {
+      "epoch": 1.7992895204262878,
+      "grad_norm": 1.1484375,
+      "learning_rate": 5.262153638504286e-07,
+      "loss": 0.07072955965995789,
+      "step": 1520,
+      "token_acc": 0.9747514596812372
+    },
+    {
+      "epoch": 1.8111308466548253,
+      "grad_norm": 1.40625,
+      "learning_rate": 4.6660047074436945e-07,
+      "loss": 0.07091631889343261,
+      "step": 1530,
+      "token_acc": 0.9746914544602406
+    },
+    {
+      "epoch": 1.8229721728833628,
+      "grad_norm": 1.6328125,
+      "learning_rate": 4.10488063053105e-07,
+      "loss": 0.062443327903747556,
+      "step": 1540,
+      "token_acc": 0.976577139287945
+    },
+    {
+      "epoch": 1.8348134991119005,
+      "grad_norm": 1.6015625,
+      "learning_rate": 3.57898755971553e-07,
+      "loss": 0.07588485479354859,
+      "step": 1550,
+      "token_acc": 0.973754100921731
+    },
+    {
+      "epoch": 1.8466548253404382,
+      "grad_norm": 1.4375,
+      "learning_rate": 3.088518703390908e-07,
+      "loss": 0.07371261715888977,
+      "step": 1560,
+      "token_acc": 0.9696590553644041
+    },
+    {
+      "epoch": 1.8584961515689757,
+      "grad_norm": 1.1484375,
+      "learning_rate": 2.633654255412554e-07,
+      "loss": 0.06826964616775513,
+      "step": 1570,
+      "token_acc": 0.9750783699059561
+    },
+    {
+      "epoch": 1.8703374777975132,
+      "grad_norm": 1.5078125,
+      "learning_rate": 2.214561328895748e-07,
+      "loss": 0.06952533721923829,
+      "step": 1580,
+      "token_acc": 0.9716478696741855
+    },
+    {
+      "epoch": 1.8821788040260508,
+      "grad_norm": 1.7890625,
+      "learning_rate": 1.8313938948198884e-07,
+      "loss": 0.07293472290039063,
+      "step": 1590,
+      "token_acc": 0.9714820009350164
+    },
+    {
+      "epoch": 1.8940201302545885,
+      "grad_norm": 1.625,
+      "learning_rate": 1.484292725460934e-07,
+      "loss": 0.07688854336738586,
+      "step": 1600,
+      "token_acc": 0.9702054257487847
+    },
+    {
+      "epoch": 1.9058614564831262,
+      "grad_norm": 1.3359375,
+      "learning_rate": 1.173385342672917e-07,
+      "loss": 0.07143334150314332,
+      "step": 1610,
+      "token_acc": 0.970491288651703
+    },
+    {
+      "epoch": 1.9177027827116637,
+      "grad_norm": 1.921875,
+      "learning_rate": 8.987859710375524e-08,
+      "loss": 0.081912100315094,
+      "step": 1620,
+      "token_acc": 0.9685150375939849
+    },
+    {
+      "epoch": 1.9295441089402012,
+      "grad_norm": 1.8203125,
+      "learning_rate": 6.605954958991523e-08,
+      "loss": 0.07874792218208312,
+      "step": 1630,
+      "token_acc": 0.9696588586700204
+    },
+    {
+      "epoch": 1.9413854351687387,
+      "grad_norm": 1.5546875,
+      "learning_rate": 4.5890142630027336e-08,
+      "loss": 0.0735186517238617,
+      "step": 1640,
+      "token_acc": 0.9709894934922377
+    },
+    {
+      "epoch": 1.9532267613972765,
+      "grad_norm": 1.9765625,
+      "learning_rate": 2.9377786283167897e-08,
+      "loss": 0.0773587942123413,
+      "step": 1650,
+      "token_acc": 0.9692741809060982
+    },
+    {
+      "epoch": 1.9650680876258142,
+      "grad_norm": 1.796875,
+      "learning_rate": 1.6528547040842724e-08,
+      "loss": 0.06999446153640747,
+      "step": 1660,
+      "token_acc": 0.9743669896842764
+    },
+    {
+      "epoch": 1.9769094138543517,
+      "grad_norm": 1.5859375,
+      "learning_rate": 7.3471455982143665e-09,
+      "loss": 0.07299281358718872,
+      "step": 1670,
+      "token_acc": 0.9729179711959924
+    },
+    {
+      "epoch": 1.9887507400828892,
+      "grad_norm": 1.34375,
+      "learning_rate": 1.8369551197594538e-09,
+      "loss": 0.067216557264328,
+      "step": 1680,
+      "token_acc": 0.9730407523510972
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 2.65625,
+      "learning_rate": 0.0,
+      "loss": 0.07405292987823486,
+      "step": 1690,
+      "token_acc": 0.9716838024608124
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1690,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.7276889282044232e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e7fbd87625cc50123815e7aff873d4789ca3015538e01899b3d2b9db87882bd
+size 7057

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff