Complete Spark-TTS with Arabic fine-tuned LLM
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
- .gitattributes +16 -0
- BiCodec/config.yaml +60 -0
- BiCodec/model.safetensors +3 -0
- LLM/README.md +132 -0
- LLM/added_tokens.json +0 -0
- LLM/chat_template.jinja +54 -0
- LLM/checkpoint-1000/added_tokens.json +0 -0
- LLM/checkpoint-1000/chat_template.jinja +54 -0
- LLM/checkpoint-1000/config.json +54 -0
- LLM/checkpoint-1000/generation_config.json +10 -0
- LLM/checkpoint-1000/merges.txt +0 -0
- LLM/checkpoint-1000/model.safetensors +3 -0
- LLM/checkpoint-1000/optimizer.pt +3 -0
- LLM/checkpoint-1000/rng_state.pth +3 -0
- LLM/checkpoint-1000/scheduler.pt +3 -0
- LLM/checkpoint-1000/special_tokens_map.json +31 -0
- LLM/checkpoint-1000/tokenizer.json +3 -0
- LLM/checkpoint-1000/tokenizer_config.json +0 -0
- LLM/checkpoint-1000/trainer_state.json +307 -0
- LLM/checkpoint-1000/training_args.bin +3 -0
- LLM/checkpoint-1000/vocab.json +0 -0
- LLM/checkpoint-1016/added_tokens.json +0 -0
- LLM/checkpoint-1016/chat_template.jinja +54 -0
- LLM/checkpoint-1016/config.json +54 -0
- LLM/checkpoint-1016/generation_config.json +10 -0
- LLM/checkpoint-1016/merges.txt +0 -0
- LLM/checkpoint-1016/model.safetensors +3 -0
- LLM/checkpoint-1016/optimizer.pt +3 -0
- LLM/checkpoint-1016/rng_state.pth +3 -0
- LLM/checkpoint-1016/scheduler.pt +3 -0
- LLM/checkpoint-1016/special_tokens_map.json +31 -0
- LLM/checkpoint-1016/tokenizer.json +3 -0
- LLM/checkpoint-1016/tokenizer_config.json +0 -0
- LLM/checkpoint-1016/trainer_state.json +307 -0
- LLM/checkpoint-1016/training_args.bin +3 -0
- LLM/checkpoint-1016/vocab.json +0 -0
- LLM/checkpoint-600/added_tokens.json +0 -0
- LLM/checkpoint-600/chat_template.jinja +54 -0
- LLM/checkpoint-600/config.json +54 -0
- LLM/checkpoint-600/generation_config.json +10 -0
- LLM/checkpoint-600/merges.txt +0 -0
- LLM/checkpoint-600/model.safetensors +3 -0
- LLM/checkpoint-600/optimizer.pt +3 -0
- LLM/checkpoint-600/rng_state.pth +3 -0
- LLM/checkpoint-600/scheduler.pt +3 -0
- LLM/checkpoint-600/special_tokens_map.json +31 -0
- LLM/checkpoint-600/tokenizer.json +3 -0
- LLM/checkpoint-600/tokenizer_config.json +0 -0
- LLM/checkpoint-600/trainer_state.json +200 -0
- LLM/checkpoint-600/training_args.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+LLM/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+wav2vec2-large-xlsr-53/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+LLM/model.safetensors filter=lfs diff=lfs merge=lfs -text
+BiCodec/model.safetensors filter=lfs diff=lfs merge=lfs -text
+LLM/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+LLM/checkpoint-1016/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+LLM/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+LLM/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+src/figures/infer_control.png filter=lfs diff=lfs merge=lfs -text
+src/figures/infer_voice_cloning.png filter=lfs diff=lfs merge=lfs -text
+src/logo/HKUST.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/NPU.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/SJU.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/SparkTTS.png filter=lfs diff=lfs merge=lfs -text
+src/logo/mobvoi.jpg filter=lfs diff=lfs merge=lfs -text
+src/logo/mobvoi.png filter=lfs diff=lfs merge=lfs -text
BiCodec/config.yaml
ADDED
@@ -0,0 +1,60 @@
audio_tokenizer:
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    mel_fmax: null
    num_mels: 128

  encoder:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    sample_ratios: [1, 1]

  decoder:
    input_channel: 1024
    channels: 1536
    rates: [8, 5, 4, 2]
    kernel_sizes: [16, 11, 8, 4]

  quantizer:
    input_dim: 1024
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    use_l2_normlize: True
    threshold_ema_dead_code: 0.2

  speaker_encoder:
    input_dim: 128
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1

  prenet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    condition_dim: 1024
    sample_ratios: [1, 1]
    use_tanh_at_final: False

  postnet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: False
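For orientation, the mel and decoder settings above fix the BiCodec frame rate (hop 320 at 16 kHz, i.e. 50 frames per second, matched by the decoder's 8·5·4·2 upsampling). A minimal sketch, assuming PyYAML is available and this file sits at `BiCodec/config.yaml`, that derives those numbers:

```python
# Sketch: inspect BiCodec/config.yaml and derive a few basic quantities.
# Key names follow the file above; PyYAML is assumed to be installed.
import yaml

with open("BiCodec/config.yaml") as f:
    cfg = yaml.safe_load(f)["audio_tokenizer"]

mel = cfg["mel_params"]
frame_rate = mel["sample_rate"] / mel["hop_length"]   # 16000 / 320 = 50 frames/s

upsample = 1
for r in cfg["decoder"]["rates"]:                     # 8 * 5 * 4 * 2 = 320
    upsample *= r

print(f"mel frame rate     : {frame_rate:.0f} Hz")
print(f"decoder upsampling : x{upsample} (matches hop_length={mel['hop_length']})")
print(f"semantic codebook  : {cfg['quantizer']['codebook_size']} entries")
print(f"speaker tokens     : {cfg['speaker_encoder']['token_num']} FSQ tokens")
```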
BiCodec/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e9940cd48d4446e4340ced82d234bf5618350dd9f5db900ebe47a4fdb03867ec
size 625518756
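The weight files in this commit are stored as Git LFS pointer stubs like the three lines above. A small, repo-agnostic sketch for checking whether a local checkout still holds pointers (i.e. `git lfs pull` has not yet materialized the weights) before trying to load them:

```python
# Sketch: detect whether a file is still a Git LFS pointer stub.
import os

def is_lfs_pointer(path: str) -> bool:
    # Pointer files are tiny and start with the LFS spec line shown above.
    if os.path.getsize(path) > 1024:
        return False
    with open(path, "rb") as f:
        return f.read(40).startswith(b"version https://git-lfs.github.com/spec")

for p in ["BiCodec/model.safetensors", "LLM/model.safetensors"]:
    state = "pointer only - run `git lfs pull`" if is_lfs_pointer(p) else "real weights"
    print(p, "->", state)
```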
LLM/README.md
ADDED
@@ -0,0 +1,132 @@
---
library_name: transformers
tags:
- generated_from_trainer
datasets:
- /content/processed_output/clartts_data.jsonl
model-index:
- name: content/finetuned_model
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.13.0.dev0`
```yaml
base_model: /content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM
load_in_4bit: false
load_in_8bit: false

trust_remote_code: true
strict: false

datasets:
  - path: /content/processed_output/clartts_data.jsonl
    type: completion

dataset_prepared_path:
val_set_size: 0.05
output_dir: /content/finetuned_model

sequence_len: 1024
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 50
xformers_attention:
flash_attention: false

warmup_steps: 10
evals_per_epoch: 1
save_steps: 200
debug:
deepspeed:
weight_decay: 0.0
```

</details><br>

# content/finetuned_model

This model was fine-tuned from the Spark-TTS-0.5B LLM (`/content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM`) on the /content/processed_output/clartts_data.jsonl dataset.
It achieves the following results on the evaluation set:
- Loss: 4.4637
- Memory/max active (GiB): 7.2
- Memory/max allocated (GiB): 7.2
- Memory/device reserved (GiB): 7.62

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.0002
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 8
- optimizer: adamw_torch_fused with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 10
- training_steps: 1016

### Training results

| Training Loss | Epoch | Step | Validation Loss | Active (GiB) | Allocated (GiB) | Reserved (GiB) |
|:-------------:|:-----:|:----:|:---------------:|:------------:|:---------------:|:--------------:|
| No log        | 0     | 0    | 11.8503         | 3.1          | 3.1             | 3.2            |
| 4.7248        | 1.0   | 339  | 4.6423          | 7.2          | 7.2             | 7.67           |
| 4.3688        | 2.0   | 678  | 4.4637          | 7.2          | 7.2             | 7.62           |

### Framework versions

- Transformers 4.57.1
- Pytorch 2.7.1+cu118
- Datasets 4.4.1
- Tokenizers 0.22.1
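As a quick usage note, the fine-tuned LLM directory added in this commit can be loaded directly with `transformers`; full Spark-TTS synthesis additionally needs the BiCodec and wav2vec2 components, so this is only a sketch for inspecting the language-model part (paths are relative to the repository root):

```python
# Sketch: load the Arabic fine-tuned LLM component with transformers.
# Assumes the repo has been cloned and `git lfs pull` has fetched the weights.
from transformers import AutoModelForCausalLM, AutoTokenizer

llm_dir = "LLM"  # directory added in this commit
tokenizer = AutoTokenizer.from_pretrained(llm_dir)
model = AutoModelForCausalLM.from_pretrained(llm_dir)

print(model.config.model_type, model.config.vocab_size)  # qwen2 166000
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")
```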
LLM/added_tokens.json
ADDED
The diff for this file is too large to render. See raw diff.
LLM/chat_template.jinja
ADDED
@@ -0,0 +1,54 @@
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\n<tool_call>\n{"name": "' }}
            {{- tool_call.name }}
            {{- '", "arguments": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}
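This is the stock Qwen2.5-style chat template that ships with the tokenizer. A short sketch of rendering it through `transformers` (Spark-TTS builds its own TTS prompt with control and audio tokens, so this is mainly useful for inspecting the template):

```python
# Sketch: render the chat template above via apply_chat_template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("LLM")
messages = [{"role": "user", "content": "مرحبا"}]  # placeholder content
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# ...<|im_end|>
# <|im_start|>assistant
```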
LLM/checkpoint-1000/added_tokens.json
ADDED
The diff for this file is too large to render. See raw diff.

LLM/checkpoint-1000/chat_template.jinja
ADDED
(Identical to LLM/chat_template.jinja above; 54 lines.)
LLM/checkpoint-1000/config.json
ADDED
@@ -0,0 +1,54 @@
{
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "dtype": "float32",
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention",
    "full_attention", "full_attention", "full_attention", "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 166000
}
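Compared with the base Qwen2-0.5B-class configuration, the notable field is `vocab_size: 166000`, presumably enlarged by the audio-token entries in `added_tokens.json`. A quick sketch for inspecting the checkpoint's architecture:

```python
# Sketch: inspect the checkpoint architecture recorded in config.json.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("LLM/checkpoint-1000")
print(cfg.model_type)                                    # qwen2
print(cfg.num_hidden_layers, cfg.hidden_size)            # 24, 896
print(cfg.num_attention_heads, cfg.num_key_value_heads)  # 14, 2 (grouped-query attention)
print(cfg.vocab_size)                                    # 166000 (extended tokenizer)
```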
LLM/checkpoint-1000/generation_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "_from_model_config": true,
  "do_sample": true,
  "eos_token_id": [
    151645
  ],
  "pad_token_id": 151643,
  "transformers_version": "4.57.1",
  "use_cache": false
}
LLM/checkpoint-1000/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.

LLM/checkpoint-1000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:50d2144b8b38b8ad0d8ce257050b2284d2665d63f400eb53f9c9a142380fc9c1
size 1310860488

LLM/checkpoint-1000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c0acaa72f9ba986f14d742ef24a41421ec717c825dbbc267497d61261816edd2
size 2621903691

LLM/checkpoint-1000/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
size 14645

LLM/checkpoint-1000/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:16fef0806330d17ac0adc016c87f30d80c9a7ff3ea6e21a41ba6f8b2c605e2e4
size 1465

LLM/checkpoint-1000/special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}

LLM/checkpoint-1000/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
size 14129172

LLM/checkpoint-1000/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
LLM/checkpoint-1000/trainer_state.json
ADDED
@@ -0,0 +1,307 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9516069449575175,
  "eval_steps": 339,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0, "eval_loss": 11.850272178649902, "eval_runtime": 7.1659, "eval_samples_per_second": 19.956, "eval_steps_per_second": 19.956, "memory/device_reserved (GiB)": 3.2, "memory/max_active (GiB)": 3.1, "memory/max_allocated (GiB)": 3.1, "step": 0},
    {"epoch": 0.1477650535648319, "grad_norm": 9.9116792678833, "learning_rate": 0.00019925925947187668, "loss": 7.2742, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 50, "tokens_per_second_per_gpu": 2082.96, "total_tokens": 164572},
    {"epoch": 0.2955301071296638, "grad_norm": 12.538771629333496, "learning_rate": 0.0001961624298837552, "loss": 5.6623, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 100, "tokens_per_second_per_gpu": 1546.73, "total_tokens": 287922},
    {"epoch": 0.44329516069449576, "grad_norm": 25.31467628479004, "learning_rate": 0.00019072586525126637, "loss": 5.3752, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 150, "tokens_per_second_per_gpu": 1524.55, "total_tokens": 410915},
    {"epoch": 0.5910602142593276, "grad_norm": 25.589689254760742, "learning_rate": 0.00018308184302213046, "loss": 5.0362, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 200, "tokens_per_second_per_gpu": 1390.99, "total_tokens": 533959},
    {"epoch": 0.7388252678241596, "grad_norm": 26.41189956665039, "learning_rate": 0.00017341635045468791, "loss": 4.8371, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 250, "tokens_per_second_per_gpu": 1552.5, "total_tokens": 656838},
    {"epoch": 0.8865903213889915, "grad_norm": 23.636552810668945, "learning_rate": 0.00016196455934844978, "loss": 4.7248, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 300, "tokens_per_second_per_gpu": 1489.9, "total_tokens": 779045},
    {"epoch": 1.0, "eval_loss": 4.642312526702881, "eval_runtime": 7.0402, "eval_samples_per_second": 20.312, "eval_steps_per_second": 20.312, "memory/device_reserved (GiB)": 7.67, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 339},
    {"epoch": 1.032508311784263, "grad_norm": 29.976333618164062, "learning_rate": 0.00014900510406201564, "loss": 4.6412, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 350, "tokens_per_second_per_gpu": 346.62, "total_tokens": 945548},
    {"epoch": 1.1802733653490949, "grad_norm": 28.489376068115234, "learning_rate": 0.00013485330204031937, "loss": 4.4916, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 400, "tokens_per_second_per_gpu": 1511.66, "total_tokens": 1067762},
    {"epoch": 1.328038418913927, "grad_norm": 25.01222038269043, "learning_rate": 0.0001198534818030452, "loss": 4.4404, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 450, "tokens_per_second_per_gpu": 1375.01, "total_tokens": 1188747},
    {"epoch": 1.4758034724787588, "grad_norm": 19.93057632446289, "learning_rate": 0.00010437060506248341, "loss": 4.4182, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 500, "tokens_per_second_per_gpu": 1540.13, "total_tokens": 1312791},
    {"epoch": 1.6235685260435906, "grad_norm": 20.955829620361328, "learning_rate": 8.878138681368239e-05, "loss": 4.3869, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 550, "tokens_per_second_per_gpu": 1544.52, "total_tokens": 1437766},
    {"epoch": 1.7713335796084226, "grad_norm": 27.51922035217285, "learning_rate": 7.346512945462767e-05, "loss": 4.359, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 600, "tokens_per_second_per_gpu": 1525.07, "total_tokens": 1560469},
    {"epoch": 1.9190986331732547, "grad_norm": 18.944068908691406, "learning_rate": 5.879449395213175e-05, "loss": 4.3688, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 650, "tokens_per_second_per_gpu": 1526.23, "total_tokens": 1681459},
    {"epoch": 2.0, "eval_loss": 4.463651657104492, "eval_runtime": 6.8792, "eval_samples_per_second": 20.787, "eval_steps_per_second": 20.787, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 678},
    {"epoch": 2.065016623568526, "grad_norm": 18.91043472290039, "learning_rate": 4.512643260086751e-05, "loss": 4.294, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 700, "tokens_per_second_per_gpu": 492.11, "total_tokens": 1847089},
    {"epoch": 2.212781677133358, "grad_norm": 22.407855987548828, "learning_rate": 3.279350399124066e-05, "loss": 4.2329, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 750, "tokens_per_second_per_gpu": 1508.42, "total_tokens": 1969141},
    {"epoch": 2.3605467306981898, "grad_norm": 22.668535232543945, "learning_rate": 2.209578150224645e-05, "loss": 4.2312, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 800, "tokens_per_second_per_gpu": 1531.3, "total_tokens": 2090972},
    {"epoch": 2.5083117842630216, "grad_norm": 18.202590942382812, "learning_rate": 1.3293552194358238e-05, "loss": 4.222, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 850, "tokens_per_second_per_gpu": 1544.96, "total_tokens": 2213097},
    {"epoch": 2.656076837827854, "grad_norm": 18.34627914428711, "learning_rate": 6.600983746212319e-06, "loss": 4.2271, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 900, "tokens_per_second_per_gpu": 1515.29, "total_tokens": 2335026},
    {"epoch": 2.8038418913926857, "grad_norm": 21.71125030517578, "learning_rate": 2.1809135253115565e-06, "loss": 4.2248, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 950, "tokens_per_second_per_gpu": 1450.1, "total_tokens": 2458575},
    {"epoch": 2.9516069449575175, "grad_norm": 16.12077522277832, "learning_rate": 1.4088658024622448e-07, "loss": 4.2186, "memory/device_reserved (GiB)": 7.62, "memory/max_active (GiB)": 7.2, "memory/max_allocated (GiB)": 7.2, "step": 1000, "tokens_per_second_per_gpu": 1481.33, "total_tokens": 2582586}
  ],
  "logging_steps": 50,
  "max_steps": 1016,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.756941874102272e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
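Each checkpoint's `trainer_state.json` carries the full step-level training log, so the loss curve can be recovered without any external logging service. A minimal sketch using only the standard library:

```python
# Sketch: extract the train/eval loss curve from trainer_state.json.
import json

with open("LLM/checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "loss" in entry:          # training log, every 50 optimizer steps
        print(f"step {entry['step']:>4}  train loss {entry['loss']:.4f}")
    elif "eval_loss" in entry:   # end-of-epoch evaluation
        print(f"step {entry['step']:>4}  eval loss  {entry['eval_loss']:.4f}")
```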
LLM/checkpoint-1000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f63f7b81172feef0cd47466795e7fc796dfcf0be86e5c24d2a09091a1a3fa40
size 7313

LLM/checkpoint-1000/vocab.json
ADDED
The diff for this file is too large to render. See raw diff.

LLM/checkpoint-1016/added_tokens.json
ADDED
The diff for this file is too large to render. See raw diff.
LLM/checkpoint-1016/chat_template.jinja
ADDED
(Identical to LLM/chat_template.jinja above; 54 lines.)

LLM/checkpoint-1016/config.json
ADDED
(Identical to LLM/checkpoint-1000/config.json above; 54 lines.)

LLM/checkpoint-1016/generation_config.json
ADDED
(Identical to LLM/checkpoint-1000/generation_config.json above; 10 lines.)

LLM/checkpoint-1016/merges.txt
ADDED
The diff for this file is too large to render. See raw diff.

LLM/checkpoint-1016/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95dd3a1efd82dccec71c77afc4ea9ef7f3abba3c36924e908befd7a41e80b4c1
size 1310860488

LLM/checkpoint-1016/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0ee34fd8e9385820c523c29eb897b957e11f945026c7afeef701ca873fcb2338
size 2621903691

LLM/checkpoint-1016/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
size 14645

LLM/checkpoint-1016/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f59a21b592da0aad96690e7481b3b551321c1c689bb0d9bf4bf5be9fa29637a
size 1465

LLM/checkpoint-1016/special_tokens_map.json
ADDED
(Identical to LLM/checkpoint-1000/special_tokens_map.json above; 31 lines.)

LLM/checkpoint-1016/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
size 14129172

LLM/checkpoint-1016/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
LLM/checkpoint-1016/trainer_state.json
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.998891762098264,
|
| 6 |
+
"eval_steps": 339,
|
| 7 |
+
"global_step": 1016,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0,
|
| 14 |
+
"eval_loss": 11.850272178649902,
|
| 15 |
+
"eval_runtime": 7.1659,
|
| 16 |
+
"eval_samples_per_second": 19.956,
|
| 17 |
+
"eval_steps_per_second": 19.956,
|
| 18 |
+
"memory/device_reserved (GiB)": 3.2,
|
| 19 |
+
"memory/max_active (GiB)": 3.1,
|
| 20 |
+
"memory/max_allocated (GiB)": 3.1,
|
| 21 |
+
"step": 0
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"epoch": 0.1477650535648319,
|
| 25 |
+
"grad_norm": 9.9116792678833,
|
| 26 |
+
"learning_rate": 0.00019925925947187668,
|
| 27 |
+
"loss": 7.2742,
|
| 28 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 29 |
+
"memory/max_active (GiB)": 7.2,
|
| 30 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 31 |
+
"step": 50,
|
| 32 |
+
"tokens_per_second_per_gpu": 2082.96,
|
| 33 |
+
"total_tokens": 164572
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"epoch": 0.2955301071296638,
|
| 37 |
+
"grad_norm": 12.538771629333496,
|
| 38 |
+
"learning_rate": 0.0001961624298837552,
|
| 39 |
+
"loss": 5.6623,
|
| 40 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 41 |
+
"memory/max_active (GiB)": 7.2,
|
| 42 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 43 |
+
"step": 100,
|
| 44 |
+
"tokens_per_second_per_gpu": 1546.73,
|
| 45 |
+
"total_tokens": 287922
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.44329516069449576,
|
| 49 |
+
"grad_norm": 25.31467628479004,
|
| 50 |
+
"learning_rate": 0.00019072586525126637,
|
| 51 |
+
"loss": 5.3752,
|
| 52 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 53 |
+
"memory/max_active (GiB)": 7.2,
|
| 54 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 55 |
+
"step": 150,
|
| 56 |
+
"tokens_per_second_per_gpu": 1524.55,
|
| 57 |
+
"total_tokens": 410915
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"epoch": 0.5910602142593276,
|
| 61 |
+
"grad_norm": 25.589689254760742,
|
| 62 |
+
"learning_rate": 0.00018308184302213046,
|
| 63 |
+
"loss": 5.0362,
|
| 64 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 65 |
+
"memory/max_active (GiB)": 7.2,
|
| 66 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 67 |
+
"step": 200,
|
| 68 |
+
"tokens_per_second_per_gpu": 1390.99,
|
| 69 |
+
"total_tokens": 533959
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"epoch": 0.7388252678241596,
|
| 73 |
+
"grad_norm": 26.41189956665039,
|
| 74 |
+
"learning_rate": 0.00017341635045468791,
|
| 75 |
+
"loss": 4.8371,
|
| 76 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 77 |
+
"memory/max_active (GiB)": 7.2,
|
| 78 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 79 |
+
"step": 250,
|
| 80 |
+
"tokens_per_second_per_gpu": 1552.5,
|
| 81 |
+
"total_tokens": 656838
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"epoch": 0.8865903213889915,
|
| 85 |
+
"grad_norm": 23.636552810668945,
|
| 86 |
+
"learning_rate": 0.00016196455934844978,
|
| 87 |
+
"loss": 4.7248,
|
| 88 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 89 |
+
"memory/max_active (GiB)": 7.2,
|
| 90 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 91 |
+
"step": 300,
|
| 92 |
+
"tokens_per_second_per_gpu": 1489.9,
|
| 93 |
+
"total_tokens": 779045
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"epoch": 1.0,
|
| 97 |
+
"eval_loss": 4.642312526702881,
|
| 98 |
+
"eval_runtime": 7.0402,
|
| 99 |
+
"eval_samples_per_second": 20.312,
|
| 100 |
+
"eval_steps_per_second": 20.312,
|
| 101 |
+
"memory/device_reserved (GiB)": 7.67,
|
| 102 |
+
"memory/max_active (GiB)": 7.2,
|
| 103 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 104 |
+
"step": 339
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"epoch": 1.032508311784263,
|
| 108 |
+
"grad_norm": 29.976333618164062,
|
| 109 |
+
"learning_rate": 0.00014900510406201564,
|
| 110 |
+
"loss": 4.6412,
|
| 111 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 112 |
+
"memory/max_active (GiB)": 7.2,
|
| 113 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 114 |
+
"step": 350,
|
| 115 |
+
"tokens_per_second_per_gpu": 346.62,
|
| 116 |
+
"total_tokens": 945548
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"epoch": 1.1802733653490949,
|
| 120 |
+
"grad_norm": 28.489376068115234,
|
| 121 |
+
"learning_rate": 0.00013485330204031937,
|
| 122 |
+
"loss": 4.4916,
|
| 123 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 124 |
+
"memory/max_active (GiB)": 7.2,
|
| 125 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 126 |
+
"step": 400,
|
| 127 |
+
"tokens_per_second_per_gpu": 1511.66,
|
| 128 |
+
"total_tokens": 1067762
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"epoch": 1.328038418913927,
|
| 132 |
+
"grad_norm": 25.01222038269043,
|
| 133 |
+
"learning_rate": 0.0001198534818030452,
|
| 134 |
+
"loss": 4.4404,
|
| 135 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 136 |
+
"memory/max_active (GiB)": 7.2,
|
| 137 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 138 |
+
"step": 450,
|
| 139 |
+
"tokens_per_second_per_gpu": 1375.01,
|
| 140 |
+
"total_tokens": 1188747
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"epoch": 1.4758034724787588,
|
| 144 |
+
"grad_norm": 19.93057632446289,
|
| 145 |
+
"learning_rate": 0.00010437060506248341,
|
| 146 |
+
"loss": 4.4182,
|
| 147 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 148 |
+
"memory/max_active (GiB)": 7.2,
|
| 149 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 150 |
+
"step": 500,
|
| 151 |
+
"tokens_per_second_per_gpu": 1540.13,
|
| 152 |
+
"total_tokens": 1312791
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"epoch": 1.6235685260435906,
|
| 156 |
+
"grad_norm": 20.955829620361328,
|
| 157 |
+
"learning_rate": 8.878138681368239e-05,
|
| 158 |
+
"loss": 4.3869,
|
| 159 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 160 |
+
"memory/max_active (GiB)": 7.2,
|
| 161 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 162 |
+
"step": 550,
|
| 163 |
+
"tokens_per_second_per_gpu": 1544.52,
|
| 164 |
+
"total_tokens": 1437766
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 1.7713335796084226,
|
| 168 |
+
"grad_norm": 27.51922035217285,
|
| 169 |
+
"learning_rate": 7.346512945462767e-05,
|
| 170 |
+
"loss": 4.359,
|
| 171 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 172 |
+
"memory/max_active (GiB)": 7.2,
|
| 173 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 174 |
+
"step": 600,
|
| 175 |
+
"tokens_per_second_per_gpu": 1525.07,
|
| 176 |
+
"total_tokens": 1560469
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"epoch": 1.9190986331732547,
|
| 180 |
+
"grad_norm": 18.944068908691406,
|
| 181 |
+
"learning_rate": 5.879449395213175e-05,
|
| 182 |
+
"loss": 4.3688,
|
| 183 |
+
"memory/device_reserved (GiB)": 7.62,
|
| 184 |
+
"memory/max_active (GiB)": 7.2,
|
| 185 |
+
"memory/max_allocated (GiB)": 7.2,
|
| 186 |
+
"step": 650,
|
      "tokens_per_second_per_gpu": 1526.23,
      "total_tokens": 1681459
    },
    {
      "epoch": 2.0,
      "eval_loss": 4.463651657104492,
      "eval_runtime": 6.8792,
      "eval_samples_per_second": 20.787,
      "eval_steps_per_second": 20.787,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 678
    },
    {
      "epoch": 2.065016623568526,
      "grad_norm": 18.91043472290039,
      "learning_rate": 4.512643260086751e-05,
      "loss": 4.294,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 700,
      "tokens_per_second_per_gpu": 492.11,
      "total_tokens": 1847089
    },
    {
      "epoch": 2.212781677133358,
      "grad_norm": 22.407855987548828,
      "learning_rate": 3.279350399124066e-05,
      "loss": 4.2329,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 750,
      "tokens_per_second_per_gpu": 1508.42,
      "total_tokens": 1969141
    },
    {
      "epoch": 2.3605467306981898,
      "grad_norm": 22.668535232543945,
      "learning_rate": 2.209578150224645e-05,
      "loss": 4.2312,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 800,
      "tokens_per_second_per_gpu": 1531.3,
      "total_tokens": 2090972
    },
    {
      "epoch": 2.5083117842630216,
      "grad_norm": 18.202590942382812,
      "learning_rate": 1.3293552194358238e-05,
      "loss": 4.222,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 850,
      "tokens_per_second_per_gpu": 1544.96,
      "total_tokens": 2213097
    },
    {
      "epoch": 2.656076837827854,
      "grad_norm": 18.34627914428711,
      "learning_rate": 6.600983746212319e-06,
      "loss": 4.2271,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 900,
      "tokens_per_second_per_gpu": 1515.29,
      "total_tokens": 2335026
    },
    {
      "epoch": 2.8038418913926857,
      "grad_norm": 21.71125030517578,
      "learning_rate": 2.1809135253115565e-06,
      "loss": 4.2248,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 950,
      "tokens_per_second_per_gpu": 1450.1,
      "total_tokens": 2458575
    },
    {
      "epoch": 2.9516069449575175,
      "grad_norm": 16.12077522277832,
      "learning_rate": 1.4088658024622448e-07,
      "loss": 4.2186,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 1000,
      "tokens_per_second_per_gpu": 1481.33,
      "total_tokens": 2582586
    }
  ],
  "logging_steps": 50,
  "max_steps": 1016,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7850881269039104e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
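The tail of this log shows the training loss flattening around 4.22 by step 1000 while eval_loss reaches 4.46 at the end of epoch 2. A minimal sketch (not part of this repo) for plotting the loss curve from the saved state, assuming the checkpoint directory has been downloaded locally and matplotlib is installed:

# Sketch: plot the training-loss curve recorded in trainer_state.json.
import json
import matplotlib.pyplot as plt

with open("LLM/checkpoint-1016/trainer_state.json") as f:
    state = json.load(f)

# Training logs carry "loss"; evaluation entries carry "eval_loss" instead.
train_logs = [e for e in state["log_history"] if "loss" in e]
plt.plot([e["step"] for e in train_logs], [e["loss"] for e in train_logs], marker="o")
plt.xlabel("optimizer step")
plt.ylabel("training loss")
plt.savefig("loss_curve.png")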
LLM/checkpoint-1016/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f63f7b81172feef0cd47466795e7fc796dfcf0be86e5c24d2a09091a1a3fa40
size 7313
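training_args.bin is the TrainingArguments object pickled by the Hugging Face Trainer, stored here only as a Git LFS pointer. A minimal sketch (not part of this repo) for inspecting it once the real file has been fetched; it needs torch plus a transformers version compatible with 4.57.1 so the class can be unpickled:

# Sketch: read back the hyperparameters the run was launched with.
import torch

args = torch.load("LLM/checkpoint-1016/training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)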
LLM/checkpoint-1016/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
LLM/checkpoint-600/added_tokens.json
ADDED
The diff for this file is too large to render.
See raw diff
LLM/checkpoint-600/chat_template.jinja
ADDED
@@ -0,0 +1,54 @@
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0]['role'] == 'system' %}
{{- messages[0]['content'] }}
{%- else %}
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
{%- endif %}
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- else %}
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + message.content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
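The template above is inherited unchanged from the Qwen2.5 base tokenizer; Spark-TTS drives the LLM with its own TTS prompt format rather than chat turns, but the template can still be rendered for inspection. A minimal sketch, assuming the checkpoint directory is available locally:

# Sketch: render the Qwen-style chat template shipped with this checkpoint.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("LLM/checkpoint-600")
text = tok.apply_chat_template(
    [{"role": "user", "content": "مرحبا"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)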
LLM/checkpoint-600/config.json
ADDED
@@ -0,0 +1,54 @@
{
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "dtype": "float32",
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.1",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 166000
}
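The config keeps the Qwen2 architecture (24 layers, hidden size 896, GQA with 2 key/value heads) but enlarges the vocabulary to 166,000 entries, consistent with the extra audio tokens Spark-TTS adds on top of the base Qwen vocabulary. A minimal sketch (not part of this repo) for loading the checkpoint and reporting its size:

# Sketch: load the fine-tuned LLM from this intermediate checkpoint.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("LLM/checkpoint-600")  # float32 per config
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters, vocab_size={model.config.vocab_size}")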
LLM/checkpoint-600/generation_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "_from_model_config": true,
  "do_sample": true,
  "eos_token_id": [
    151645
  ],
  "pad_token_id": 151643,
  "transformers_version": "4.57.1",
  "use_cache": false
}
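These defaults (sampling enabled, <|im_end|> as EOS, <|endoftext|> as padding) are what model.generate() falls back to when no explicit generation settings are passed; TTS-style decoding would normally still override temperature and top-k per call. A quick sketch (not part of this repo) for reading them back:

# Sketch: inspect the generation defaults saved with the checkpoint.
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("LLM/checkpoint-600")
print(gen.do_sample, gen.eos_token_id, gen.pad_token_id)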
LLM/checkpoint-600/merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
LLM/checkpoint-600/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:70fb204eebbb536a9032eed626725567d794ab815ba77c68f406abfbfb845787
size 1310860488
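As with the other binary files in this commit, only the Git LFS pointer (version / oid / size) is versioned; the oid is the SHA-256 of the actual payload. A minimal sketch (not part of this repo) for verifying a fetched file against its pointer:

# Sketch: check a downloaded LFS file against the pointer's size and SHA-256.
import hashlib
import os

path = "LLM/checkpoint-600/model.safetensors"
expected_oid = "70fb204eebbb536a9032eed626725567d794ab815ba77c68f406abfbfb845787"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(os.path.getsize(path) == 1310860488 and h.hexdigest() == expected_oid)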
LLM/checkpoint-600/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:496d81412b4f7a334382ab3097f680ec70fcde428178df68fd67f85eac426699
size 2621903691
LLM/checkpoint-600/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
size 14645
LLM/checkpoint-600/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b36d8f9cb621c24f8daf0837efdbf71e7a01fe5399fae9de6152151def69447
size 1465
LLM/checkpoint-600/special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
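A quick sketch (not part of this repo) to confirm that this special-token map lines up with the ids used in config.json and generation_config.json (<|im_end|> = 151645 for EOS, <|endoftext|> = 151643 for padding):

# Sketch: cross-check special tokens against the ids in the model config.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("LLM/checkpoint-600")
print(tok.eos_token, tok.convert_tokens_to_ids("<|im_end|>"))     # expect 151645
print(tok.pad_token, tok.convert_tokens_to_ids("<|endoftext|>"))  # expect 151643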
LLM/checkpoint-600/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
size 14129172
LLM/checkpoint-600/tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
LLM/checkpoint-600/trainer_state.json
ADDED
@@ -0,0 +1,200 @@
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7713335796084226,
  "eval_steps": 339,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 11.850272178649902,
      "eval_runtime": 7.1659,
      "eval_samples_per_second": 19.956,
      "eval_steps_per_second": 19.956,
      "memory/device_reserved (GiB)": 3.2,
      "memory/max_active (GiB)": 3.1,
      "memory/max_allocated (GiB)": 3.1,
      "step": 0
    },
    {
      "epoch": 0.1477650535648319,
      "grad_norm": 9.9116792678833,
      "learning_rate": 0.00019925925947187668,
      "loss": 7.2742,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 50,
      "tokens_per_second_per_gpu": 2082.96,
      "total_tokens": 164572
    },
    {
      "epoch": 0.2955301071296638,
      "grad_norm": 12.538771629333496,
      "learning_rate": 0.0001961624298837552,
      "loss": 5.6623,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 100,
      "tokens_per_second_per_gpu": 1546.73,
      "total_tokens": 287922
    },
    {
      "epoch": 0.44329516069449576,
      "grad_norm": 25.31467628479004,
      "learning_rate": 0.00019072586525126637,
      "loss": 5.3752,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 150,
      "tokens_per_second_per_gpu": 1524.55,
      "total_tokens": 410915
    },
    {
      "epoch": 0.5910602142593276,
      "grad_norm": 25.589689254760742,
      "learning_rate": 0.00018308184302213046,
      "loss": 5.0362,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 200,
      "tokens_per_second_per_gpu": 1390.99,
      "total_tokens": 533959
    },
    {
      "epoch": 0.7388252678241596,
      "grad_norm": 26.41189956665039,
      "learning_rate": 0.00017341635045468791,
      "loss": 4.8371,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 250,
      "tokens_per_second_per_gpu": 1552.5,
      "total_tokens": 656838
    },
    {
      "epoch": 0.8865903213889915,
      "grad_norm": 23.636552810668945,
      "learning_rate": 0.00016196455934844978,
      "loss": 4.7248,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 300,
      "tokens_per_second_per_gpu": 1489.9,
      "total_tokens": 779045
    },
    {
      "epoch": 1.0,
      "eval_loss": 4.642312526702881,
      "eval_runtime": 7.0402,
      "eval_samples_per_second": 20.312,
      "eval_steps_per_second": 20.312,
      "memory/device_reserved (GiB)": 7.67,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 339
    },
    {
      "epoch": 1.032508311784263,
      "grad_norm": 29.976333618164062,
      "learning_rate": 0.00014900510406201564,
      "loss": 4.6412,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 350,
      "tokens_per_second_per_gpu": 346.62,
      "total_tokens": 945548
    },
    {
      "epoch": 1.1802733653490949,
      "grad_norm": 28.489376068115234,
      "learning_rate": 0.00013485330204031937,
      "loss": 4.4916,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 400,
      "tokens_per_second_per_gpu": 1511.66,
      "total_tokens": 1067762
    },
    {
      "epoch": 1.328038418913927,
      "grad_norm": 25.01222038269043,
      "learning_rate": 0.0001198534818030452,
      "loss": 4.4404,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 450,
      "tokens_per_second_per_gpu": 1375.01,
      "total_tokens": 1188747
    },
    {
      "epoch": 1.4758034724787588,
      "grad_norm": 19.93057632446289,
      "learning_rate": 0.00010437060506248341,
      "loss": 4.4182,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 500,
      "tokens_per_second_per_gpu": 1540.13,
      "total_tokens": 1312791
    },
    {
      "epoch": 1.6235685260435906,
      "grad_norm": 20.955829620361328,
      "learning_rate": 8.878138681368239e-05,
      "loss": 4.3869,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 550,
      "tokens_per_second_per_gpu": 1544.52,
      "total_tokens": 1437766
    },
    {
      "epoch": 1.7713335796084226,
      "grad_norm": 27.51922035217285,
      "learning_rate": 7.346512945462767e-05,
      "loss": 4.359,
      "memory/device_reserved (GiB)": 7.62,
      "memory/max_active (GiB)": 7.2,
      "memory/max_allocated (GiB)": 7.2,
      "step": 600,
      "tokens_per_second_per_gpu": 1525.07,
      "total_tokens": 1560469
    }
  ],
  "logging_steps": 50,
  "max_steps": 1016,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.054385017061376e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
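This checkpoint stopped mid-run (global_step 600 of max_steps 1016, should_training_stop still false), so it can serve as a resume point for the Trainer. A minimal sketch (not part of this repo) that checks the files a resume needs and reads the resume position:

# Sketch: verify the resumable state saved alongside the weights.
import json
import os

ckpt = "LLM/checkpoint-600"
for name in ("model.safetensors", "optimizer.pt", "scheduler.pt",
             "rng_state.pth", "trainer_state.json", "training_args.bin"):
    print(name, os.path.exists(os.path.join(ckpt, name)))

with open(os.path.join(ckpt, "trainer_state.json")) as f:
    state = json.load(f)
print("resume at step", state["global_step"], "of", state["max_steps"])
# With a reconstructed Trainer: trainer.train(resume_from_checkpoint=ckpt)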
LLM/checkpoint-600/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f63f7b81172feef0cd47466795e7fc796dfcf0be86e5c24d2a09091a1a3fa40
size 7313