azeddinShr committed
Commit 03401b7 · verified · 1 parent: 2a65632

Complete Spark-TTS with Arabic fine-tuned LLM

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50):
  1. .gitattributes +16 -0
  2. BiCodec/config.yaml +60 -0
  3. BiCodec/model.safetensors +3 -0
  4. LLM/README.md +132 -0
  5. LLM/added_tokens.json +0 -0
  6. LLM/chat_template.jinja +54 -0
  7. LLM/checkpoint-1000/added_tokens.json +0 -0
  8. LLM/checkpoint-1000/chat_template.jinja +54 -0
  9. LLM/checkpoint-1000/config.json +54 -0
  10. LLM/checkpoint-1000/generation_config.json +10 -0
  11. LLM/checkpoint-1000/merges.txt +0 -0
  12. LLM/checkpoint-1000/model.safetensors +3 -0
  13. LLM/checkpoint-1000/optimizer.pt +3 -0
  14. LLM/checkpoint-1000/rng_state.pth +3 -0
  15. LLM/checkpoint-1000/scheduler.pt +3 -0
  16. LLM/checkpoint-1000/special_tokens_map.json +31 -0
  17. LLM/checkpoint-1000/tokenizer.json +3 -0
  18. LLM/checkpoint-1000/tokenizer_config.json +0 -0
  19. LLM/checkpoint-1000/trainer_state.json +307 -0
  20. LLM/checkpoint-1000/training_args.bin +3 -0
  21. LLM/checkpoint-1000/vocab.json +0 -0
  22. LLM/checkpoint-1016/added_tokens.json +0 -0
  23. LLM/checkpoint-1016/chat_template.jinja +54 -0
  24. LLM/checkpoint-1016/config.json +54 -0
  25. LLM/checkpoint-1016/generation_config.json +10 -0
  26. LLM/checkpoint-1016/merges.txt +0 -0
  27. LLM/checkpoint-1016/model.safetensors +3 -0
  28. LLM/checkpoint-1016/optimizer.pt +3 -0
  29. LLM/checkpoint-1016/rng_state.pth +3 -0
  30. LLM/checkpoint-1016/scheduler.pt +3 -0
  31. LLM/checkpoint-1016/special_tokens_map.json +31 -0
  32. LLM/checkpoint-1016/tokenizer.json +3 -0
  33. LLM/checkpoint-1016/tokenizer_config.json +0 -0
  34. LLM/checkpoint-1016/trainer_state.json +307 -0
  35. LLM/checkpoint-1016/training_args.bin +3 -0
  36. LLM/checkpoint-1016/vocab.json +0 -0
  37. LLM/checkpoint-600/added_tokens.json +0 -0
  38. LLM/checkpoint-600/chat_template.jinja +54 -0
  39. LLM/checkpoint-600/config.json +54 -0
  40. LLM/checkpoint-600/generation_config.json +10 -0
  41. LLM/checkpoint-600/merges.txt +0 -0
  42. LLM/checkpoint-600/model.safetensors +3 -0
  43. LLM/checkpoint-600/optimizer.pt +3 -0
  44. LLM/checkpoint-600/rng_state.pth +3 -0
  45. LLM/checkpoint-600/scheduler.pt +3 -0
  46. LLM/checkpoint-600/special_tokens_map.json +31 -0
  47. LLM/checkpoint-600/tokenizer.json +3 -0
  48. LLM/checkpoint-600/tokenizer_config.json +0 -0
  49. LLM/checkpoint-600/trainer_state.json +200 -0
  50. LLM/checkpoint-600/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ LLM/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ wav2vec2-large-xlsr-53/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+ LLM/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ BiCodec/model.safetensors filter=lfs diff=lfs merge=lfs -text
+ LLM/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ LLM/checkpoint-1016/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ LLM/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ LLM/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ src/figures/infer_control.png filter=lfs diff=lfs merge=lfs -text
+ src/figures/infer_voice_cloning.png filter=lfs diff=lfs merge=lfs -text
+ src/logo/HKUST.jpg filter=lfs diff=lfs merge=lfs -text
+ src/logo/NPU.jpg filter=lfs diff=lfs merge=lfs -text
+ src/logo/SJU.jpg filter=lfs diff=lfs merge=lfs -text
+ src/logo/SparkTTS.png filter=lfs diff=lfs merge=lfs -text
+ src/logo/mobvoi.jpg filter=lfs diff=lfs merge=lfs -text
+ src/logo/mobvoi.png filter=lfs diff=lfs merge=lfs -text
BiCodec/config.yaml ADDED
@@ -0,0 +1,60 @@
+ audio_tokenizer:
+   mel_params:
+     sample_rate: 16000
+     n_fft: 1024
+     win_length: 640
+     hop_length: 320
+     mel_fmin: 10
+     mel_fmax: null
+     num_mels: 128
+
+   encoder:
+     input_channels: 1024
+     vocos_dim: 384
+     vocos_intermediate_dim: 2048
+     vocos_num_layers: 12
+     out_channels: 1024
+     sample_ratios: [1,1]
+
+   decoder:
+     input_channel: 1024
+     channels: 1536
+     rates: [8, 5, 4, 2]
+     kernel_sizes: [16,11,8,4]
+
+   quantizer:
+     input_dim: 1024
+     codebook_size: 8192
+     codebook_dim: 8
+     commitment: 0.25
+     codebook_loss_weight: 2.0
+     use_l2_normlize: True
+     threshold_ema_dead_code: 0.2
+
+   speaker_encoder:
+     input_dim: 128
+     out_dim: 1024
+     latent_dim: 128
+     token_num: 32
+     fsq_levels: [4, 4, 4, 4, 4, 4]
+     fsq_num_quantizers: 1
+
+   prenet:
+     input_channels: 1024
+     vocos_dim: 384
+     vocos_intermediate_dim: 2048
+     vocos_num_layers: 12
+     out_channels: 1024
+     condition_dim: 1024
+     sample_ratios: [1,1]
+     use_tanh_at_final: False
+
+   postnet:
+     input_channels: 1024
+     vocos_dim: 384
+     vocos_intermediate_dim: 2048
+     vocos_num_layers: 6
+     out_channels: 1024
+     use_tanh_at_final: False
+
+
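The mel front end above fixes the codec's time resolution: a 320-sample hop at a 16 kHz sample rate is 50 frames per second. A minimal sketch of reading the file (assuming PyYAML, a local clone, and the nesting shown above):

```python
import yaml  # PyYAML, assumed installed

# Path assumes the repo is cloned and the script runs from its root.
with open("BiCodec/config.yaml") as f:
    cfg = yaml.safe_load(f)

mel = cfg["audio_tokenizer"]["mel_params"]
# 16000 Hz sample rate / 320-sample hop -> 50 mel frames (codec steps) per second.
print(mel["sample_rate"] / mel["hop_length"])  # 50.0
```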
BiCodec/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9940cd48d4446e4340ced82d234bf5618350dd9f5db900ebe47a4fdb03867ec
+ size 625518756
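This three-line body is a Git LFS pointer stub, not the ~625 MB weights file itself; `git lfs pull` swaps in the real payload. A hypothetical helper (not part of the repo) that reads such a stub:

```python
from pathlib import Path

def parse_lfs_pointer(path):
    """Split a Git LFS pointer stub ('key value' per line) into a dict."""
    return dict(
        line.split(" ", 1)
        for line in Path(path).read_text().splitlines()
        if line.strip()
    )

# Before `git lfs pull`, the checkout contains only the stub shown above:
ptr = parse_lfs_pointer("BiCodec/model.safetensors")
print(ptr["oid"], int(ptr["size"]))  # sha256:e9940cd4... 625518756
```

The same stub format covers every `filter=lfs` path registered in .gitattributes above.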
LLM/README.md ADDED
@@ -0,0 +1,132 @@
+ ---
+ library_name: transformers
+ tags:
+ - generated_from_trainer
+ datasets:
+ - /content/processed_output/clartts_data.jsonl
+ model-index:
+ - name: content/finetuned_model
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.13.0.dev0`
+ ```yaml
+ base_model: /content/SparkTTS-Finetune/pretrained_models/Spark-TTS-0.5B/LLM
+ load_in_4bit: false
+ load_in_8bit: false
+
+ trust_remote_code: true
+ strict: false
+
+ datasets:
+   - path: /content/processed_output/clartts_data.jsonl
+     type: completion
+
+ dataset_prepared_path:
+ val_set_size: 0.05
+ output_dir: /content/finetuned_model
+
+ sequence_len: 1024
+ sample_packing: false
+ eval_sample_packing: false
+ pad_to_sequence_len: true
+
+ wandb_project:
+ wandb_entity:
+ wandb_watch:
+ wandb_name:
+ wandb_log_model:
+
+ gradient_accumulation_steps: 8
+ micro_batch_size: 1
+ num_epochs: 3
+ optimizer: adamw_torch_fused
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: true
+ fp16: false
+ tf32: false
+
+ gradient_checkpointing: true
+ gradient_checkpointing_kwargs:
+   use_reentrant: false
+
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 50
+ xformers_attention:
+ flash_attention: false
+
+ warmup_steps: 10
+ evals_per_epoch: 1
+ save_steps: 200
+ debug:
+ deepspeed:
+ weight_decay: 0.0
+
+ ```
+
+ </details><br>
+
+ # content/finetuned_model
+
+ This model was fine-tuned from the Spark-TTS-0.5B LLM (see `base_model` above) on the /content/processed_output/clartts_data.jsonl dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 4.4637
+ - Memory/max active (GiB): 7.2
+ - Memory/max allocated (GiB): 7.2
+ - Memory/device reserved (GiB): 7.62
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 8
+ - optimizer: adamw_torch_fused with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 10
+ - training_steps: 1016
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Active (GiB) | Allocated (GiB) | Reserved (GiB) |
+ |:-------------:|:-----:|:----:|:---------------:|:------------:|:---------------:|:--------------:|
+ | No log        | 0     | 0    | 11.8503         | 3.1          | 3.1             | 3.2            |
+ | 4.7248        | 1.0   | 339  | 4.6423          | 7.2          | 7.2             | 7.67           |
+ | 4.3688        | 2.0   | 678  | 4.4637          | 7.2          | 7.2             | 7.62           |
+
+ ### Framework versions
+
+ - Transformers 4.57.1
+ - Pytorch 2.7.1+cu118
+ - Datasets 4.4.1
+ - Tokenizers 0.22.1
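Since the card's usage sections are still stubs, here is a minimal, non-authoritative loading sketch. It assumes a local clone with `git lfs pull` already run and that `LLM/` is a complete transformers causal-LM directory (the checkpoint copies below carry the matching config.json); turning generated tokens into audio additionally requires the Spark-TTS pipeline and the BiCodec decoder, which this sketch does not cover:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "LLM"  # local path into this repo; an assumption, not a hub id

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Smoke test only: Spark-TTS drives this LM with its own prompt format and
# decodes the resulting audio tokens with BiCodec, so free-running
# generation here merely confirms the checkpoint loads and runs.
inputs = tokenizer("مرحبا", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=16, do_sample=True)
print(tokenizer.decode(out[0]))
```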
LLM/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] }}
+ {%- else %}
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+ {%- endif %}
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+ {%- else %}
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content %}
+ {{- '\n' + message.content }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {{- tool_call.arguments | tojson }}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
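This is the stock Qwen2-style ChatML template inherited from the base model, not anything TTS-specific. A small sketch of exercising it directly, assuming the `LLM/` tokenizer from the loading sketch above:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("LLM")  # local clone, as above

messages = [{"role": "user", "content": "Hello"}]
# Renders the Jinja template in this diff: a default <|im_start|>system
# preamble, the user turn, then an opening assistant header.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```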
LLM/checkpoint-1000/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
(54 added lines, identical to LLM/chat_template.jinja above)
LLM/checkpoint-1000/config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "dtype": "float32",
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 896,
+   "initializer_range": 0.02,
+   "intermediate_size": 4864,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 32768,
+   "max_window_layers": 21,
+   "model_type": "qwen2",
+   "num_attention_heads": 14,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 2,
+   "pad_token_id": 151643,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "transformers_version": "4.57.1",
+   "use_cache": false,
+   "use_sliding_window": false,
+   "vocab_size": 166000
+ }
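This is the Qwen2-0.5B geometry (hidden_size 896, 24 layers) with vocab_size grown to 166,000: Qwen2's stock 151,936 plus roughly 14k extra entries, presumably the Spark-TTS audio/control tokens in added_tokens.json above. A hypothetical back-of-the-envelope check of what the widened table costs:

```python
# Numbers come from the config.json above; this is a check script, not repo code.
vocab_size, hidden_size = 166_000, 896

embed_params = vocab_size * hidden_size  # 148,736,000 weights
print(f"{embed_params:,} embedding parameters")

# "dtype": "float32" and "tie_word_embeddings": true, i.e. one shared
# input/output table at 4 bytes per weight:
print(f"{embed_params * 4 / 2**20:.0f} MiB of the checkpoint")  # ~567 MiB
```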
LLM/checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_from_model_config": true,
+   "do_sample": true,
+   "eos_token_id": [
+     151645
+   ],
+   "pad_token_id": 151643,
+   "transformers_version": "4.57.1",
+   "use_cache": false
+ }
LLM/checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50d2144b8b38b8ad0d8ce257050b2284d2665d63f400eb53f9c9a142380fc9c1
+ size 1310860488
LLM/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0acaa72f9ba986f14d742ef24a41421ec717c825dbbc267497d61261816edd2
+ size 2621903691
LLM/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
+ size 14645
LLM/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16fef0806330d17ac0adc016c87f30d80c9a7ff3ea6e21a41ba6f8b2c605e2e4
+ size 1465
LLM/checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
LLM/checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
+ size 14129172
LLM/checkpoint-1000/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,307 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.9516069449575175,
+   "eval_steps": 339,
+   "global_step": 1000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0,
+       "eval_loss": 11.850272178649902,
+       "eval_runtime": 7.1659,
+       "eval_samples_per_second": 19.956,
+       "eval_steps_per_second": 19.956,
+       "memory/device_reserved (GiB)": 3.2,
+       "memory/max_active (GiB)": 3.1,
+       "memory/max_allocated (GiB)": 3.1,
+       "step": 0
+     },
+     {
+       "epoch": 0.1477650535648319,
+       "grad_norm": 9.9116792678833,
+       "learning_rate": 0.00019925925947187668,
+       "loss": 7.2742,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 50,
+       "tokens_per_second_per_gpu": 2082.96,
+       "total_tokens": 164572
+     },
+     {
+       "epoch": 0.2955301071296638,
+       "grad_norm": 12.538771629333496,
+       "learning_rate": 0.0001961624298837552,
+       "loss": 5.6623,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 100,
+       "tokens_per_second_per_gpu": 1546.73,
+       "total_tokens": 287922
+     },
+     {
+       "epoch": 0.44329516069449576,
+       "grad_norm": 25.31467628479004,
+       "learning_rate": 0.00019072586525126637,
+       "loss": 5.3752,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 150,
+       "tokens_per_second_per_gpu": 1524.55,
+       "total_tokens": 410915
+     },
+     {
+       "epoch": 0.5910602142593276,
+       "grad_norm": 25.589689254760742,
+       "learning_rate": 0.00018308184302213046,
+       "loss": 5.0362,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 200,
+       "tokens_per_second_per_gpu": 1390.99,
+       "total_tokens": 533959
+     },
+     {
+       "epoch": 0.7388252678241596,
+       "grad_norm": 26.41189956665039,
+       "learning_rate": 0.00017341635045468791,
+       "loss": 4.8371,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 250,
+       "tokens_per_second_per_gpu": 1552.5,
+       "total_tokens": 656838
+     },
+     {
+       "epoch": 0.8865903213889915,
+       "grad_norm": 23.636552810668945,
+       "learning_rate": 0.00016196455934844978,
+       "loss": 4.7248,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 300,
+       "tokens_per_second_per_gpu": 1489.9,
+       "total_tokens": 779045
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 4.642312526702881,
+       "eval_runtime": 7.0402,
+       "eval_samples_per_second": 20.312,
+       "eval_steps_per_second": 20.312,
+       "memory/device_reserved (GiB)": 7.67,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 339
+     },
+     {
+       "epoch": 1.032508311784263,
+       "grad_norm": 29.976333618164062,
+       "learning_rate": 0.00014900510406201564,
+       "loss": 4.6412,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 350,
+       "tokens_per_second_per_gpu": 346.62,
+       "total_tokens": 945548
+     },
+     {
+       "epoch": 1.1802733653490949,
+       "grad_norm": 28.489376068115234,
+       "learning_rate": 0.00013485330204031937,
+       "loss": 4.4916,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 400,
+       "tokens_per_second_per_gpu": 1511.66,
+       "total_tokens": 1067762
+     },
+     {
+       "epoch": 1.328038418913927,
+       "grad_norm": 25.01222038269043,
+       "learning_rate": 0.0001198534818030452,
+       "loss": 4.4404,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 450,
+       "tokens_per_second_per_gpu": 1375.01,
+       "total_tokens": 1188747
+     },
+     {
+       "epoch": 1.4758034724787588,
+       "grad_norm": 19.93057632446289,
+       "learning_rate": 0.00010437060506248341,
+       "loss": 4.4182,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 500,
+       "tokens_per_second_per_gpu": 1540.13,
+       "total_tokens": 1312791
+     },
+     {
+       "epoch": 1.6235685260435906,
+       "grad_norm": 20.955829620361328,
+       "learning_rate": 8.878138681368239e-05,
+       "loss": 4.3869,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 550,
+       "tokens_per_second_per_gpu": 1544.52,
+       "total_tokens": 1437766
+     },
+     {
+       "epoch": 1.7713335796084226,
+       "grad_norm": 27.51922035217285,
+       "learning_rate": 7.346512945462767e-05,
+       "loss": 4.359,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 600,
+       "tokens_per_second_per_gpu": 1525.07,
+       "total_tokens": 1560469
+     },
+     {
+       "epoch": 1.9190986331732547,
+       "grad_norm": 18.944068908691406,
+       "learning_rate": 5.879449395213175e-05,
+       "loss": 4.3688,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 650,
+       "tokens_per_second_per_gpu": 1526.23,
+       "total_tokens": 1681459
+     },
+     {
+       "epoch": 2.0,
+       "eval_loss": 4.463651657104492,
+       "eval_runtime": 6.8792,
+       "eval_samples_per_second": 20.787,
+       "eval_steps_per_second": 20.787,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 678
+     },
+     {
+       "epoch": 2.065016623568526,
+       "grad_norm": 18.91043472290039,
+       "learning_rate": 4.512643260086751e-05,
+       "loss": 4.294,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 700,
+       "tokens_per_second_per_gpu": 492.11,
+       "total_tokens": 1847089
+     },
+     {
+       "epoch": 2.212781677133358,
+       "grad_norm": 22.407855987548828,
+       "learning_rate": 3.279350399124066e-05,
+       "loss": 4.2329,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 750,
+       "tokens_per_second_per_gpu": 1508.42,
+       "total_tokens": 1969141
+     },
+     {
+       "epoch": 2.3605467306981898,
+       "grad_norm": 22.668535232543945,
+       "learning_rate": 2.209578150224645e-05,
+       "loss": 4.2312,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 800,
+       "tokens_per_second_per_gpu": 1531.3,
+       "total_tokens": 2090972
+     },
+     {
+       "epoch": 2.5083117842630216,
+       "grad_norm": 18.202590942382812,
+       "learning_rate": 1.3293552194358238e-05,
+       "loss": 4.222,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 850,
+       "tokens_per_second_per_gpu": 1544.96,
+       "total_tokens": 2213097
+     },
+     {
+       "epoch": 2.656076837827854,
+       "grad_norm": 18.34627914428711,
+       "learning_rate": 6.600983746212319e-06,
+       "loss": 4.2271,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 900,
+       "tokens_per_second_per_gpu": 1515.29,
+       "total_tokens": 2335026
+     },
+     {
+       "epoch": 2.8038418913926857,
+       "grad_norm": 21.71125030517578,
+       "learning_rate": 2.1809135253115565e-06,
+       "loss": 4.2248,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 950,
+       "tokens_per_second_per_gpu": 1450.1,
+       "total_tokens": 2458575
+     },
+     {
+       "epoch": 2.9516069449575175,
+       "grad_norm": 16.12077522277832,
+       "learning_rate": 1.4088658024622448e-07,
+       "loss": 4.2186,
+       "memory/device_reserved (GiB)": 7.62,
+       "memory/max_active (GiB)": 7.2,
+       "memory/max_allocated (GiB)": 7.2,
+       "step": 1000,
+       "tokens_per_second_per_gpu": 1481.33,
+       "total_tokens": 2582586
+     }
+   ],
+   "logging_steps": 50,
+   "max_steps": 1016,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 200,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.756941874102272e+16,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
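trainer_state.json is plain JSON, so the loss curve can be pulled straight out of log_history. A small standard-library sketch (path assumed relative to a local clone):

```python
import json

with open("LLM/checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes train entries ("loss") and eval entries ("eval_loss").
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train[-1])  # (1000, 4.2186), the last logged training loss
print(evals)      # [(0, 11.85...), (339, 4.642...), (678, 4.463...)]
```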
LLM/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f63f7b81172feef0cd47466795e7fc796dfcf0be86e5c24d2a09091a1a3fa40
+ size 7313
LLM/checkpoint-1000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1016/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1016/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
(54 added lines, identical to LLM/chat_template.jinja above)
LLM/checkpoint-1016/config.json ADDED
@@ -0,0 +1,54 @@
(54 added lines, identical to LLM/checkpoint-1000/config.json above)
LLM/checkpoint-1016/generation_config.json ADDED
@@ -0,0 +1,10 @@
(10 added lines, identical to LLM/checkpoint-1000/generation_config.json above)
LLM/checkpoint-1016/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1016/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95dd3a1efd82dccec71c77afc4ea9ef7f3abba3c36924e908befd7a41e80b4c1
+ size 1310860488
LLM/checkpoint-1016/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee34fd8e9385820c523c29eb897b957e11f945026c7afeef701ca873fcb2338
+ size 2621903691
LLM/checkpoint-1016/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
+ size 14645
LLM/checkpoint-1016/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f59a21b592da0aad96690e7481b3b551321c1c689bb0d9bf4bf5be9fa29637a
+ size 1465
LLM/checkpoint-1016/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
(31 added lines, identical to LLM/checkpoint-1000/special_tokens_map.json above)
LLM/checkpoint-1016/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
+ size 14129172
LLM/checkpoint-1016/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-1016/trainer_state.json ADDED
@@ -0,0 +1,307 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.998891762098264,
+   "eval_steps": 339,
+   "global_step": 1016,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [ ... 23 entries, identical to LLM/checkpoint-1000/trainer_state.json above ... ],
+   "logging_steps": 50,
+   "max_steps": 1016,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 200,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.7850881269039104e+16,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
LLM/checkpoint-1016/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f63f7b81172feef0cd47466795e7fc796dfcf0be86e5c24d2a09091a1a3fa40
+ size 7313
LLM/checkpoint-1016/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-600/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-600/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
(54 added lines, identical to LLM/chat_template.jinja above)
LLM/checkpoint-600/config.json ADDED
@@ -0,0 +1,54 @@
(54 added lines, identical to LLM/checkpoint-1000/config.json above)
LLM/checkpoint-600/generation_config.json ADDED
@@ -0,0 +1,10 @@
(10 added lines, identical to LLM/checkpoint-1000/generation_config.json above)
LLM/checkpoint-600/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-600/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70fb204eebbb536a9032eed626725567d794ab815ba77c68f406abfbfb845787
+ size 1310860488
LLM/checkpoint-600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:496d81412b4f7a334382ab3097f680ec70fcde428178df68fd67f85eac426699
+ size 2621903691
LLM/checkpoint-600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
+ size 14645
LLM/checkpoint-600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b36d8f9cb621c24f8daf0837efdbf71e7a01fe5399fae9de6152151def69447
+ size 1465
LLM/checkpoint-600/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
(31 added lines, identical to LLM/checkpoint-1000/special_tokens_map.json above)
LLM/checkpoint-600/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c8b057d6ca205a429cc3428b9fc815f0d6ee1d53106dd5e5b129ef9db2ff057
+ size 14129172
LLM/checkpoint-600/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/checkpoint-600/trainer_state.json ADDED
@@ -0,0 +1,200 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.7713335796084226,
+   "eval_steps": 339,
+   "global_step": 600,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [ ... the first 14 entries of LLM/checkpoint-1000/trainer_state.json above (through step 600), verbatim ... ],
+   "logging_steps": 50,
+   "max_steps": 1016,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 200,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.054385017061376e+16,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
LLM/checkpoint-600/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f63f7b81172feef0cd47466795e7fc796dfcf0be86e5c24d2a09091a1a3fa40
+ size 7313