upload
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/chat_template.jinja +1 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/config.json +29 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/generation_config.json +6 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_10/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_12/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_14/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_16/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_18/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_2/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_20/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_4/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_6/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_8/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_final/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00001-of-00002.safetensors +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00002-of-00002.safetensors +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model.safetensors.index.json +191 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/run_args.txt +9 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/special_tokens_map.json +24 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer.json +0 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer_config.json +43 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/train_loss.txt +188 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/chat_template.jinja +1 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/config.json +29 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/generation_config.json +6 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_10/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_12/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_14/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_16/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_18/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_2/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_20/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_4/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_6/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_8/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_final/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00001-of-00003.safetensors +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00002-of-00003.safetensors +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00003-of-00003.safetensors +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model.safetensors.index.json +263 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/run_args.txt +9 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/special_tokens_map.json +24 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer.json +0 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer_config.json +43 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/train_loss.txt +188 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/chat_template.jinja +1 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/config.json +29 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/generation_config.json +6 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_10/checkpoint.pt +3 -0
- llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_12/checkpoint.pt +3 -0
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/chat_template.jinja
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"max_position_embeddings": 4096,
|
| 16 |
+
"mlp_bias": false,
|
| 17 |
+
"model_type": "llama",
|
| 18 |
+
"num_attention_heads": 32,
|
| 19 |
+
"num_hidden_layers": 20,
|
| 20 |
+
"num_key_value_heads": 32,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"transformers_version": "4.57.6",
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"vocab_size": 32000
|
| 29 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"transformers_version": "4.57.6"
|
| 6 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_10/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edf2236e50020cce86f4f961088ae71e2d73a9cf2496271dadb8ee1abf90804b
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_12/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9d8fb39fa3b02e1c27310b551386ef89f391ea128447e8106abb333a6dab8c0
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_14/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ff59abbfa03c076fd6e9cf5342cda76163d02ea34caf2834cd174d9ec633fc3
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_16/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9540fc5273199f6d400750d8344bb14c84334a5ad88c065e1eb6997e6905a6d3
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_18/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f5bd4561b98752d53b2a5823bc2044e18ca44760c320a65a1fe9cea77247444
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_2/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37b9012c0a3cf22b15b27d760b0d4530e40d37913022a3d6db70c9253a89a8b9
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_20/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d85e8e0a6209f57cdb4fc04ea43d4f7815954de55bbbaca4e75bf9de3eb59617
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_4/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc2a4143e0cd80d0a136f5d75ae81bb43b9f001b6c37a92fbe5a41732411e9cc
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_6/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dec7706c23617d4175b0604867af2e4c39c9c96f71bb5d84aee5ec10f1f2dfa0
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_8/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edc38b2eaf470d7b53bc69cd7a4a750e1e4dcfe7fecf73f17533db6c47a7c1ab
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_final/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d85e8e0a6209f57cdb4fc04ea43d4f7815954de55bbbaca4e75bf9de3eb59617
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00001-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a025ba2f39370557992c0a88819e9ddcb82110c0456f17d7069c1a87135dc31
|
| 3 |
+
size 4938985352
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00002-of-00002.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a03e07568c537b7de77f83fe586e68f1be1dfe7f75f913acc3d5af64197e1d5
|
| 3 |
+
size 3680666464
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 4309815296,
|
| 4 |
+
"total_size": 8619630592
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"lm_head.weight": "model-00002-of-00002.safetensors",
|
| 8 |
+
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 37 |
+
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 38 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 40 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 41 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 46 |
+
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 47 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 48 |
+
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 49 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 50 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 51 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 52 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 53 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 54 |
+
"model.layers.13.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 55 |
+
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 56 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 57 |
+
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 58 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 59 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 60 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 61 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 62 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 63 |
+
"model.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 64 |
+
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 65 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 66 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 67 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 68 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 69 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 70 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 71 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 72 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 73 |
+
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 74 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 75 |
+
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 76 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 77 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 78 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 79 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 80 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 81 |
+
"model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 82 |
+
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 83 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 84 |
+
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 85 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 86 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 87 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 88 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 89 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 90 |
+
"model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 91 |
+
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 92 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 93 |
+
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 94 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 95 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 96 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 97 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 98 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 99 |
+
"model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 100 |
+
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 101 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 102 |
+
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 103 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 104 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 105 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 106 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 107 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 108 |
+
"model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 109 |
+
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
|
| 110 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
|
| 111 |
+
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
|
| 112 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
|
| 113 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
|
| 114 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
|
| 115 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
|
| 116 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
|
| 117 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 118 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 119 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 120 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 121 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 122 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 123 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 124 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 125 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 126 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 127 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 128 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 129 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 130 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 131 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 132 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 133 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 134 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 135 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 136 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 137 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 138 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 139 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 140 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 141 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 142 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 143 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 144 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 145 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 146 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 147 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 148 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 149 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 150 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 151 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 152 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 153 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 154 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 155 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 156 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 157 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 158 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 159 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 160 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 161 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 162 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 163 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 164 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 165 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 166 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 167 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 168 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 169 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 170 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 171 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 172 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 173 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 174 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 175 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 176 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 177 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 178 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 179 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 180 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 181 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 182 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 183 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 184 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 185 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 186 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 187 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 188 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 189 |
+
"model.norm.weight": "model-00002-of-00002.safetensors"
|
| 190 |
+
}
|
| 191 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/run_args.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
git_commit=unknown
|
| 2 |
+
dataset=Open-Orca/SlimOrca
|
| 3 |
+
dataset_split=train
|
| 4 |
+
collect_batch_size=8
|
| 5 |
+
train_batch_size=32
|
| 6 |
+
gradient_accumulation_step=8
|
| 7 |
+
target_effective_batch=256
|
| 8 |
+
command:
|
| 9 |
+
python /workspace/here/abprune_update/compare_model/LLM-Streamline/mseloss_entry.py --model_name meta-llama/Llama-2-7b-chat-hf --output_dir /workspace/here/abprune_update/results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18 --dataset Open-Orca/SlimOrca --dataset_split train --layer_intervals 12 --best_layer 18 --cosine_num_data 300 --train_num_data 30000 --epoches 20 --batch_size 8 --collect_batch_size 8 --train_batch_size 32 --dtype bfloat16 --gradient_accumulation_step 8 --lr 1e-5 --min_lr 5e-5 --wd 1e-3
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 37 |
+
"pad_token": "</s>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/train_loss.txt
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Train loss logged every 100 global steps
|
| 2 |
+
epoch=0 global_step=100 optimizer_step=12 loss=4.5312500000
|
| 3 |
+
epoch=0 global_step=200 optimizer_step=24 loss=4.6250000000
|
| 4 |
+
epoch=0 global_step=300 optimizer_step=37 loss=4.6250000000
|
| 5 |
+
epoch=0 global_step=400 optimizer_step=49 loss=4.4375000000
|
| 6 |
+
epoch=0 global_step=500 optimizer_step=62 loss=4.4062500000
|
| 7 |
+
epoch=0 global_step=600 optimizer_step=74 loss=4.3437500000
|
| 8 |
+
epoch=0 global_step=700 optimizer_step=87 loss=3.9218750000
|
| 9 |
+
epoch=0 global_step=800 optimizer_step=99 loss=3.4687500000
|
| 10 |
+
epoch=0 global_step=900 optimizer_step=112 loss=3.1562500000
|
| 11 |
+
epoch=1 global_step=1000 optimizer_step=124 loss=2.6406250000
|
| 12 |
+
epoch=1 global_step=1100 optimizer_step=137 loss=2.3906250000
|
| 13 |
+
epoch=1 global_step=1200 optimizer_step=149 loss=2.3437500000
|
| 14 |
+
epoch=1 global_step=1300 optimizer_step=162 loss=2.0781250000
|
| 15 |
+
epoch=1 global_step=1400 optimizer_step=174 loss=2.3437500000
|
| 16 |
+
epoch=1 global_step=1500 optimizer_step=187 loss=2.2187500000
|
| 17 |
+
epoch=1 global_step=1600 optimizer_step=199 loss=1.9921875000
|
| 18 |
+
epoch=1 global_step=1700 optimizer_step=212 loss=2.0468750000
|
| 19 |
+
epoch=1 global_step=1800 optimizer_step=224 loss=2.0156250000
|
| 20 |
+
epoch=2 global_step=1900 optimizer_step=236 loss=2.0781250000
|
| 21 |
+
epoch=2 global_step=2000 optimizer_step=249 loss=2.0468750000
|
| 22 |
+
epoch=2 global_step=2100 optimizer_step=261 loss=2.0468750000
|
| 23 |
+
epoch=2 global_step=2200 optimizer_step=274 loss=2.0781250000
|
| 24 |
+
epoch=2 global_step=2300 optimizer_step=286 loss=2.1718750000
|
| 25 |
+
epoch=2 global_step=2400 optimizer_step=299 loss=2.1406250000
|
| 26 |
+
epoch=2 global_step=2500 optimizer_step=311 loss=1.8125000000
|
| 27 |
+
epoch=2 global_step=2600 optimizer_step=324 loss=1.9687500000
|
| 28 |
+
epoch=2 global_step=2700 optimizer_step=336 loss=2.0468750000
|
| 29 |
+
epoch=2 global_step=2800 optimizer_step=349 loss=2.1250000000
|
| 30 |
+
epoch=3 global_step=2900 optimizer_step=361 loss=2.0625000000
|
| 31 |
+
epoch=3 global_step=3000 optimizer_step=374 loss=2.2187500000
|
| 32 |
+
epoch=3 global_step=3100 optimizer_step=386 loss=2.0781250000
|
| 33 |
+
epoch=3 global_step=3200 optimizer_step=399 loss=2.0468750000
|
| 34 |
+
epoch=3 global_step=3300 optimizer_step=411 loss=2.0312500000
|
| 35 |
+
epoch=3 global_step=3400 optimizer_step=424 loss=2.0156250000
|
| 36 |
+
epoch=3 global_step=3500 optimizer_step=436 loss=1.8906250000
|
| 37 |
+
epoch=3 global_step=3600 optimizer_step=449 loss=2.0156250000
|
| 38 |
+
epoch=3 global_step=3700 optimizer_step=461 loss=2.0625000000
|
| 39 |
+
epoch=4 global_step=3800 optimizer_step=473 loss=2.0312500000
|
| 40 |
+
epoch=4 global_step=3900 optimizer_step=486 loss=2.0625000000
|
| 41 |
+
epoch=4 global_step=4000 optimizer_step=498 loss=2.0468750000
|
| 42 |
+
epoch=4 global_step=4100 optimizer_step=511 loss=2.1562500000
|
| 43 |
+
epoch=4 global_step=4200 optimizer_step=523 loss=1.8593750000
|
| 44 |
+
epoch=4 global_step=4300 optimizer_step=536 loss=1.9296875000
|
| 45 |
+
epoch=4 global_step=4400 optimizer_step=548 loss=2.0468750000
|
| 46 |
+
epoch=4 global_step=4500 optimizer_step=561 loss=1.9453125000
|
| 47 |
+
epoch=4 global_step=4600 optimizer_step=573 loss=1.9375000000
|
| 48 |
+
epoch=5 global_step=4700 optimizer_step=586 loss=1.9609375000
|
| 49 |
+
epoch=5 global_step=4800 optimizer_step=598 loss=2.1093750000
|
| 50 |
+
epoch=5 global_step=4900 optimizer_step=611 loss=2.0156250000
|
| 51 |
+
epoch=5 global_step=5000 optimizer_step=623 loss=1.8750000000
|
| 52 |
+
epoch=5 global_step=5100 optimizer_step=636 loss=1.9296875000
|
| 53 |
+
epoch=5 global_step=5200 optimizer_step=648 loss=2.0156250000
|
| 54 |
+
epoch=5 global_step=5300 optimizer_step=661 loss=1.9062500000
|
| 55 |
+
epoch=5 global_step=5400 optimizer_step=673 loss=2.0937500000
|
| 56 |
+
epoch=5 global_step=5500 optimizer_step=686 loss=2.0000000000
|
| 57 |
+
epoch=5 global_step=5600 optimizer_step=698 loss=1.9140625000
|
| 58 |
+
epoch=6 global_step=5700 optimizer_step=710 loss=1.8828125000
|
| 59 |
+
epoch=6 global_step=5800 optimizer_step=723 loss=2.0468750000
|
| 60 |
+
epoch=6 global_step=5900 optimizer_step=735 loss=1.9687500000
|
| 61 |
+
epoch=6 global_step=6000 optimizer_step=748 loss=1.9609375000
|
| 62 |
+
epoch=6 global_step=6100 optimizer_step=760 loss=1.7968750000
|
| 63 |
+
epoch=6 global_step=6200 optimizer_step=773 loss=1.9843750000
|
| 64 |
+
epoch=6 global_step=6300 optimizer_step=785 loss=1.9921875000
|
| 65 |
+
epoch=6 global_step=6400 optimizer_step=798 loss=2.0781250000
|
| 66 |
+
epoch=6 global_step=6500 optimizer_step=810 loss=2.0156250000
|
| 67 |
+
epoch=7 global_step=6600 optimizer_step=823 loss=1.9843750000
|
| 68 |
+
epoch=7 global_step=6700 optimizer_step=835 loss=2.0937500000
|
| 69 |
+
epoch=7 global_step=6800 optimizer_step=848 loss=2.0000000000
|
| 70 |
+
epoch=7 global_step=6900 optimizer_step=860 loss=1.8906250000
|
| 71 |
+
epoch=7 global_step=7000 optimizer_step=873 loss=1.9140625000
|
| 72 |
+
epoch=7 global_step=7100 optimizer_step=885 loss=1.9453125000
|
| 73 |
+
epoch=7 global_step=7200 optimizer_step=898 loss=1.8750000000
|
| 74 |
+
epoch=7 global_step=7300 optimizer_step=910 loss=2.0781250000
|
| 75 |
+
epoch=7 global_step=7400 optimizer_step=923 loss=1.9609375000
|
| 76 |
+
epoch=7 global_step=7500 optimizer_step=935 loss=1.9765625000
|
| 77 |
+
epoch=8 global_step=7600 optimizer_step=947 loss=1.8593750000
|
| 78 |
+
epoch=8 global_step=7700 optimizer_step=960 loss=1.9609375000
|
| 79 |
+
epoch=8 global_step=7800 optimizer_step=972 loss=1.8906250000
|
| 80 |
+
epoch=8 global_step=7900 optimizer_step=985 loss=1.9296875000
|
| 81 |
+
epoch=8 global_step=8000 optimizer_step=997 loss=1.8125000000
|
| 82 |
+
epoch=8 global_step=8100 optimizer_step=1010 loss=1.8046875000
|
| 83 |
+
epoch=8 global_step=8200 optimizer_step=1022 loss=1.9687500000
|
| 84 |
+
epoch=8 global_step=8300 optimizer_step=1035 loss=1.9765625000
|
| 85 |
+
epoch=8 global_step=8400 optimizer_step=1047 loss=2.0625000000
|
| 86 |
+
epoch=9 global_step=8500 optimizer_step=1060 loss=1.8359375000
|
| 87 |
+
epoch=9 global_step=8600 optimizer_step=1072 loss=1.9296875000
|
| 88 |
+
epoch=9 global_step=8700 optimizer_step=1085 loss=1.9687500000
|
| 89 |
+
epoch=9 global_step=8800 optimizer_step=1097 loss=2.0000000000
|
| 90 |
+
epoch=9 global_step=8900 optimizer_step=1110 loss=2.1093750000
|
| 91 |
+
epoch=9 global_step=9000 optimizer_step=1122 loss=1.9218750000
|
| 92 |
+
epoch=9 global_step=9100 optimizer_step=1135 loss=2.0000000000
|
| 93 |
+
epoch=9 global_step=9200 optimizer_step=1147 loss=1.8593750000
|
| 94 |
+
epoch=9 global_step=9300 optimizer_step=1160 loss=1.8984375000
|
| 95 |
+
epoch=10 global_step=9400 optimizer_step=1172 loss=2.0156250000
|
| 96 |
+
epoch=10 global_step=9500 optimizer_step=1184 loss=1.9687500000
|
| 97 |
+
epoch=10 global_step=9600 optimizer_step=1197 loss=2.0312500000
|
| 98 |
+
epoch=10 global_step=9700 optimizer_step=1209 loss=2.0156250000
|
| 99 |
+
epoch=10 global_step=9800 optimizer_step=1222 loss=2.0468750000
|
| 100 |
+
epoch=10 global_step=9900 optimizer_step=1234 loss=1.9843750000
|
| 101 |
+
epoch=10 global_step=10000 optimizer_step=1247 loss=2.0312500000
|
| 102 |
+
epoch=10 global_step=10100 optimizer_step=1259 loss=2.0468750000
|
| 103 |
+
epoch=10 global_step=10200 optimizer_step=1272 loss=1.9843750000
|
| 104 |
+
epoch=10 global_step=10300 optimizer_step=1284 loss=1.9140625000
|
| 105 |
+
epoch=11 global_step=10400 optimizer_step=1297 loss=2.1093750000
|
| 106 |
+
epoch=11 global_step=10500 optimizer_step=1309 loss=1.9921875000
|
| 107 |
+
epoch=11 global_step=10600 optimizer_step=1322 loss=1.8593750000
|
| 108 |
+
epoch=11 global_step=10700 optimizer_step=1334 loss=1.9140625000
|
| 109 |
+
epoch=11 global_step=10800 optimizer_step=1347 loss=1.8515625000
|
| 110 |
+
epoch=11 global_step=10900 optimizer_step=1359 loss=2.0625000000
|
| 111 |
+
epoch=11 global_step=11000 optimizer_step=1372 loss=1.9453125000
|
| 112 |
+
epoch=11 global_step=11100 optimizer_step=1384 loss=2.0000000000
|
| 113 |
+
epoch=11 global_step=11200 optimizer_step=1397 loss=2.1406250000
|
| 114 |
+
epoch=12 global_step=11300 optimizer_step=1409 loss=1.9218750000
|
| 115 |
+
epoch=12 global_step=11400 optimizer_step=1421 loss=1.9375000000
|
| 116 |
+
epoch=12 global_step=11500 optimizer_step=1434 loss=2.0625000000
|
| 117 |
+
epoch=12 global_step=11600 optimizer_step=1446 loss=1.8281250000
|
| 118 |
+
epoch=12 global_step=11700 optimizer_step=1459 loss=1.8437500000
|
| 119 |
+
epoch=12 global_step=11800 optimizer_step=1471 loss=1.7265625000
|
| 120 |
+
epoch=12 global_step=11900 optimizer_step=1484 loss=1.8359375000
|
| 121 |
+
epoch=12 global_step=12000 optimizer_step=1496 loss=1.9609375000
|
| 122 |
+
epoch=12 global_step=12100 optimizer_step=1509 loss=2.0312500000
|
| 123 |
+
epoch=13 global_step=12200 optimizer_step=1521 loss=1.9375000000
|
| 124 |
+
epoch=13 global_step=12300 optimizer_step=1534 loss=1.9218750000
|
| 125 |
+
epoch=13 global_step=12400 optimizer_step=1546 loss=1.8671875000
|
| 126 |
+
epoch=13 global_step=12500 optimizer_step=1559 loss=2.0156250000
|
| 127 |
+
epoch=13 global_step=12600 optimizer_step=1571 loss=2.0312500000
|
| 128 |
+
epoch=13 global_step=12700 optimizer_step=1584 loss=1.9609375000
|
| 129 |
+
epoch=13 global_step=12800 optimizer_step=1596 loss=1.8828125000
|
| 130 |
+
epoch=13 global_step=12900 optimizer_step=1609 loss=1.8828125000
|
| 131 |
+
epoch=13 global_step=13000 optimizer_step=1621 loss=1.8515625000
|
| 132 |
+
epoch=13 global_step=13100 optimizer_step=1634 loss=1.9218750000
|
| 133 |
+
epoch=14 global_step=13200 optimizer_step=1646 loss=1.8203125000
|
| 134 |
+
epoch=14 global_step=13300 optimizer_step=1658 loss=1.9921875000
|
| 135 |
+
epoch=14 global_step=13400 optimizer_step=1671 loss=1.8203125000
|
| 136 |
+
epoch=14 global_step=13500 optimizer_step=1683 loss=2.0156250000
|
| 137 |
+
epoch=14 global_step=13600 optimizer_step=1696 loss=2.0312500000
|
| 138 |
+
epoch=14 global_step=13700 optimizer_step=1708 loss=1.7968750000
|
| 139 |
+
epoch=14 global_step=13800 optimizer_step=1721 loss=1.8906250000
|
| 140 |
+
epoch=14 global_step=13900 optimizer_step=1733 loss=1.7890625000
|
| 141 |
+
epoch=14 global_step=14000 optimizer_step=1746 loss=1.8906250000
|
| 142 |
+
epoch=15 global_step=14100 optimizer_step=1758 loss=2.0468750000
|
| 143 |
+
epoch=15 global_step=14200 optimizer_step=1771 loss=1.8906250000
|
| 144 |
+
epoch=15 global_step=14300 optimizer_step=1783 loss=1.8750000000
|
| 145 |
+
epoch=15 global_step=14400 optimizer_step=1796 loss=1.8906250000
|
| 146 |
+
epoch=15 global_step=14500 optimizer_step=1808 loss=1.8281250000
|
| 147 |
+
epoch=15 global_step=14600 optimizer_step=1821 loss=1.9921875000
|
| 148 |
+
epoch=15 global_step=14700 optimizer_step=1833 loss=1.9609375000
|
| 149 |
+
epoch=15 global_step=14800 optimizer_step=1846 loss=2.0781250000
|
| 150 |
+
epoch=15 global_step=14900 optimizer_step=1858 loss=1.9921875000
|
| 151 |
+
epoch=15 global_step=15000 optimizer_step=1871 loss=2.0156250000
|
| 152 |
+
epoch=16 global_step=15100 optimizer_step=1883 loss=1.8515625000
|
| 153 |
+
epoch=16 global_step=15200 optimizer_step=1895 loss=1.9140625000
|
| 154 |
+
epoch=16 global_step=15300 optimizer_step=1908 loss=1.9453125000
|
| 155 |
+
epoch=16 global_step=15400 optimizer_step=1920 loss=1.9375000000
|
| 156 |
+
epoch=16 global_step=15500 optimizer_step=1933 loss=2.0312500000
|
| 157 |
+
epoch=16 global_step=15600 optimizer_step=1945 loss=2.0781250000
|
| 158 |
+
epoch=16 global_step=15700 optimizer_step=1958 loss=2.0000000000
|
| 159 |
+
epoch=16 global_step=15800 optimizer_step=1970 loss=1.9687500000
|
| 160 |
+
epoch=16 global_step=15900 optimizer_step=1983 loss=2.0781250000
|
| 161 |
+
epoch=17 global_step=16000 optimizer_step=1995 loss=2.0468750000
|
| 162 |
+
epoch=17 global_step=16100 optimizer_step=2008 loss=1.9218750000
|
| 163 |
+
epoch=17 global_step=16200 optimizer_step=2020 loss=1.8125000000
|
| 164 |
+
epoch=17 global_step=16300 optimizer_step=2033 loss=1.8593750000
|
| 165 |
+
epoch=17 global_step=16400 optimizer_step=2045 loss=1.9140625000
|
| 166 |
+
epoch=17 global_step=16500 optimizer_step=2058 loss=1.9765625000
|
| 167 |
+
epoch=17 global_step=16600 optimizer_step=2070 loss=1.9375000000
|
| 168 |
+
epoch=17 global_step=16700 optimizer_step=2083 loss=1.8828125000
|
| 169 |
+
epoch=17 global_step=16800 optimizer_step=2095 loss=1.8906250000
|
| 170 |
+
epoch=18 global_step=16900 optimizer_step=2107 loss=1.9375000000
|
| 171 |
+
epoch=18 global_step=17000 optimizer_step=2120 loss=1.9687500000
|
| 172 |
+
epoch=18 global_step=17100 optimizer_step=2132 loss=2.0312500000
|
| 173 |
+
epoch=18 global_step=17200 optimizer_step=2145 loss=1.9453125000
|
| 174 |
+
epoch=18 global_step=17300 optimizer_step=2157 loss=2.0625000000
|
| 175 |
+
epoch=18 global_step=17400 optimizer_step=2170 loss=1.8906250000
|
| 176 |
+
epoch=18 global_step=17500 optimizer_step=2182 loss=2.0156250000
|
| 177 |
+
epoch=18 global_step=17600 optimizer_step=2195 loss=1.8906250000
|
| 178 |
+
epoch=18 global_step=17700 optimizer_step=2207 loss=1.7421875000
|
| 179 |
+
epoch=18 global_step=17800 optimizer_step=2220 loss=1.9843750000
|
| 180 |
+
epoch=19 global_step=17900 optimizer_step=2232 loss=1.9296875000
|
| 181 |
+
epoch=19 global_step=18000 optimizer_step=2245 loss=1.9375000000
|
| 182 |
+
epoch=19 global_step=18100 optimizer_step=2257 loss=1.7500000000
|
| 183 |
+
epoch=19 global_step=18200 optimizer_step=2270 loss=1.8437500000
|
| 184 |
+
epoch=19 global_step=18300 optimizer_step=2282 loss=1.9453125000
|
| 185 |
+
epoch=19 global_step=18400 optimizer_step=2295 loss=1.8906250000
|
| 186 |
+
epoch=19 global_step=18500 optimizer_step=2307 loss=1.8828125000
|
| 187 |
+
epoch=19 global_step=18600 optimizer_step=2320 loss=1.9062500000
|
| 188 |
+
epoch=19 global_step=18700 optimizer_step=2332 loss=1.9609375000
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/chat_template.jinja
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"max_position_embeddings": 4096,
|
| 16 |
+
"mlp_bias": false,
|
| 17 |
+
"model_type": "llama",
|
| 18 |
+
"num_attention_heads": 32,
|
| 19 |
+
"num_hidden_layers": 28,
|
| 20 |
+
"num_key_value_heads": 32,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"transformers_version": "4.57.6",
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"vocab_size": 32000
|
| 29 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"transformers_version": "4.57.6"
|
| 6 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_10/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76b06b4883cce06d8b5fe6d1f6b418c759d2de57c675d742cebc4317d4ca44af
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_12/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:77aa7049cf9654a0bf05d86b486aa50858ba7c73b423635ebfbf224cb1fb26d7
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_14/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bddc194fb3bcd579b8e0bd5556e4e34f2f9925eca405d0c6cec2ceed93882246
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_16/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc275eb5eec9fb91b3b83af6746395147ccb707935dc18d76f3482a49b9c2bf8
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_18/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f58f901ba3eb8125645144144567e522fcae9559283027af3db873d8cb840d2
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_2/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9e00ff43fe1738a20cff031579b2a8a49e5805de3c871003cb7c91b0ca48920
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_20/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fb83f53338e41c8a8ecf6bb039c130d3abfd38ead896ce4bc9b831babf291b0
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_4/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e9025fedfaa18ae9040261961f145443040ccbee8337d09d1d588117fb3d51e
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_6/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16f7adc8334a56fe18150ab8d2c6a9d3182b414191b8b792a97bc03a5d5eb5d2
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_8/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e2bdf21bc36f02516d4bc9534e45dcdeb503843ebb8fbe2c2f78a36911a4d43
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_final/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fb83f53338e41c8a8ecf6bb039c130d3abfd38ead896ce4bc9b831babf291b0
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00001-of-00003.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a025ba2f39370557992c0a88819e9ddcb82110c0456f17d7069c1a87135dc31
|
| 3 |
+
size 4938985352
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00002-of-00003.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1e59c2fe2fc83cd719a4233d13f9a04d45d1de815d0ae4bf56932068d8028ce
|
| 3 |
+
size 4947390880
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00003-of-00003.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c7e902bd779a07d9135271f0db2a1c5fbef4d01da478adb0baf5bdc08284367
|
| 3 |
+
size 1971417736
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_parameters": 5928882176,
|
| 4 |
+
"total_size": 11857764352
|
| 5 |
+
},
|
| 6 |
+
"weight_map": {
|
| 7 |
+
"lm_head.weight": "model-00003-of-00003.safetensors",
|
| 8 |
+
"model.embed_tokens.weight": "model-00001-of-00003.safetensors",
|
| 9 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 10 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 11 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 12 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 13 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 18 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 19 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 20 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 21 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 22 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 23 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 24 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 27 |
+
"model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 28 |
+
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 29 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 30 |
+
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 31 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 32 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 33 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 34 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 35 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 36 |
+
"model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 37 |
+
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 38 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 39 |
+
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 40 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 41 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 42 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 43 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 44 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 45 |
+
"model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 46 |
+
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 47 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 48 |
+
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 49 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 50 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 51 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 52 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 53 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 54 |
+
"model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 55 |
+
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 56 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 57 |
+
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 58 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 59 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 60 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 61 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 62 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 63 |
+
"model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 64 |
+
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 65 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 66 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 67 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 68 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 69 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 70 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 71 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 72 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 73 |
+
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 74 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 75 |
+
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 76 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 77 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 78 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 79 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 80 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 81 |
+
"model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 82 |
+
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 83 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 84 |
+
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 85 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 86 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 87 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 88 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 89 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 90 |
+
"model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 91 |
+
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 92 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 93 |
+
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 94 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 95 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 96 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 97 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 98 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 99 |
+
"model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 100 |
+
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 101 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 102 |
+
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 103 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 104 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 105 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 106 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 107 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 108 |
+
"model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 109 |
+
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 110 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 111 |
+
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 112 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 113 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 114 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 115 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 116 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 117 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 118 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 119 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 120 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 121 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 122 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 123 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 124 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 125 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 126 |
+
"model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 127 |
+
"model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 128 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 129 |
+
"model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 130 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 131 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 132 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 133 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 134 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 135 |
+
"model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 136 |
+
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 137 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 138 |
+
"model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 139 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 140 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 141 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 142 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 143 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 144 |
+
"model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 145 |
+
"model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
|
| 146 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 147 |
+
"model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 148 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
|
| 149 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 150 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 151 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 152 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 153 |
+
"model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 154 |
+
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
| 155 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
|
| 156 |
+
"model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
|
| 157 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 158 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
|
| 159 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
|
| 160 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
|
| 161 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
|
| 162 |
+
"model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 163 |
+
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
| 164 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
| 165 |
+
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
| 166 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 167 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
| 168 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
| 169 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
| 170 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
| 171 |
+
"model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 172 |
+
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
| 173 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
| 174 |
+
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
| 175 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 176 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
| 177 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
| 178 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
| 179 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
| 180 |
+
"model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 181 |
+
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
| 182 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
| 183 |
+
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
| 184 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 185 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
| 186 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
| 187 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
| 188 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
| 189 |
+
"model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 190 |
+
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
|
| 191 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
|
| 192 |
+
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
|
| 193 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
|
| 194 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
|
| 195 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
|
| 196 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
|
| 197 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
|
| 198 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 199 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 200 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 201 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 202 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 203 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 204 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 205 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 206 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 207 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 208 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 209 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 210 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 211 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 212 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 213 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 214 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 215 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 216 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 217 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 218 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 219 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 220 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 221 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 222 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 223 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 224 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 225 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 226 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 227 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 228 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 229 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 230 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 231 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 232 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 233 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 234 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 235 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 236 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 237 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 238 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 239 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 240 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 241 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 242 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 243 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 244 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 245 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 246 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 247 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 248 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 249 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 250 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 251 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 252 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 253 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
|
| 254 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
|
| 255 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
|
| 256 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
|
| 257 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
|
| 258 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
|
| 259 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
|
| 260 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
|
| 261 |
+
"model.norm.weight": "model-00003-of-00003.safetensors"
|
| 262 |
+
}
|
| 263 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/run_args.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
git_commit=unknown
|
| 2 |
+
dataset=Open-Orca/SlimOrca
|
| 3 |
+
dataset_split=train
|
| 4 |
+
collect_batch_size=8
|
| 5 |
+
train_batch_size=32
|
| 6 |
+
gradient_accumulation_step=8
|
| 7 |
+
target_effective_batch=256
|
| 8 |
+
command:
|
| 9 |
+
python /workspace/here/abprune_update/compare_model/LLM-Streamline/mseloss_entry.py --model_name meta-llama/Llama-2-7b-chat-hf --output_dir /workspace/here/abprune_update/results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24 --dataset Open-Orca/SlimOrca --dataset_split train --layer_intervals 4 --best_layer 24 --cosine_num_data 300 --train_num_data 30000 --epoches 20 --batch_size 8 --collect_batch_size 8 --train_batch_size 32 --dtype bfloat16 --gradient_accumulation_step 8 --lr 1e-5 --min_lr 5e-5 --wd 1e-3
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "</s>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 37 |
+
"pad_token": "</s>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/train_loss.txt
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Train loss logged every 100 global steps
|
| 2 |
+
epoch=0 global_step=100 optimizer_step=12 loss=0.4257812500
|
| 3 |
+
epoch=0 global_step=200 optimizer_step=24 loss=0.3769531250
|
| 4 |
+
epoch=0 global_step=300 optimizer_step=37 loss=0.4121093750
|
| 5 |
+
epoch=0 global_step=400 optimizer_step=49 loss=0.4042968750
|
| 6 |
+
epoch=0 global_step=500 optimizer_step=62 loss=0.3652343750
|
| 7 |
+
epoch=0 global_step=600 optimizer_step=74 loss=0.3769531250
|
| 8 |
+
epoch=0 global_step=700 optimizer_step=87 loss=0.3417968750
|
| 9 |
+
epoch=0 global_step=800 optimizer_step=99 loss=0.3671875000
|
| 10 |
+
epoch=0 global_step=900 optimizer_step=112 loss=0.3085937500
|
| 11 |
+
epoch=1 global_step=1000 optimizer_step=124 loss=0.3183593750
|
| 12 |
+
epoch=1 global_step=1100 optimizer_step=137 loss=0.2519531250
|
| 13 |
+
epoch=1 global_step=1200 optimizer_step=149 loss=0.2333984375
|
| 14 |
+
epoch=1 global_step=1300 optimizer_step=162 loss=0.2490234375
|
| 15 |
+
epoch=1 global_step=1400 optimizer_step=174 loss=0.2187500000
|
| 16 |
+
epoch=1 global_step=1500 optimizer_step=187 loss=0.2167968750
|
| 17 |
+
epoch=1 global_step=1600 optimizer_step=199 loss=0.2265625000
|
| 18 |
+
epoch=1 global_step=1700 optimizer_step=212 loss=0.2099609375
|
| 19 |
+
epoch=1 global_step=1800 optimizer_step=224 loss=0.2294921875
|
| 20 |
+
epoch=2 global_step=1900 optimizer_step=236 loss=0.1894531250
|
| 21 |
+
epoch=2 global_step=2000 optimizer_step=249 loss=0.2382812500
|
| 22 |
+
epoch=2 global_step=2100 optimizer_step=261 loss=0.2236328125
|
| 23 |
+
epoch=2 global_step=2200 optimizer_step=274 loss=0.1992187500
|
| 24 |
+
epoch=2 global_step=2300 optimizer_step=286 loss=0.1865234375
|
| 25 |
+
epoch=2 global_step=2400 optimizer_step=299 loss=0.2041015625
|
| 26 |
+
epoch=2 global_step=2500 optimizer_step=311 loss=0.1796875000
|
| 27 |
+
epoch=2 global_step=2600 optimizer_step=324 loss=0.2275390625
|
| 28 |
+
epoch=2 global_step=2700 optimizer_step=336 loss=0.1923828125
|
| 29 |
+
epoch=2 global_step=2800 optimizer_step=349 loss=0.1855468750
|
| 30 |
+
epoch=3 global_step=2900 optimizer_step=361 loss=0.1904296875
|
| 31 |
+
epoch=3 global_step=3000 optimizer_step=374 loss=0.1796875000
|
| 32 |
+
epoch=3 global_step=3100 optimizer_step=386 loss=0.1972656250
|
| 33 |
+
epoch=3 global_step=3200 optimizer_step=399 loss=0.1757812500
|
| 34 |
+
epoch=3 global_step=3300 optimizer_step=411 loss=0.1884765625
|
| 35 |
+
epoch=3 global_step=3400 optimizer_step=424 loss=0.1562500000
|
| 36 |
+
epoch=3 global_step=3500 optimizer_step=436 loss=0.2138671875
|
| 37 |
+
epoch=3 global_step=3600 optimizer_step=449 loss=0.1748046875
|
| 38 |
+
epoch=3 global_step=3700 optimizer_step=461 loss=0.2070312500
|
| 39 |
+
epoch=4 global_step=3800 optimizer_step=473 loss=0.1630859375
|
| 40 |
+
epoch=4 global_step=3900 optimizer_step=486 loss=0.1621093750
|
| 41 |
+
epoch=4 global_step=4000 optimizer_step=498 loss=0.1777343750
|
| 42 |
+
epoch=4 global_step=4100 optimizer_step=511 loss=0.1953125000
|
| 43 |
+
epoch=4 global_step=4200 optimizer_step=523 loss=0.2207031250
|
| 44 |
+
epoch=4 global_step=4300 optimizer_step=536 loss=0.1591796875
|
| 45 |
+
epoch=4 global_step=4400 optimizer_step=548 loss=0.2226562500
|
| 46 |
+
epoch=4 global_step=4500 optimizer_step=561 loss=0.1650390625
|
| 47 |
+
epoch=4 global_step=4600 optimizer_step=573 loss=0.2050781250
|
| 48 |
+
epoch=5 global_step=4700 optimizer_step=586 loss=0.2167968750
|
| 49 |
+
epoch=5 global_step=4800 optimizer_step=598 loss=0.1650390625
|
| 50 |
+
epoch=5 global_step=4900 optimizer_step=611 loss=0.1669921875
|
| 51 |
+
epoch=5 global_step=5000 optimizer_step=623 loss=0.1494140625
|
| 52 |
+
epoch=5 global_step=5100 optimizer_step=636 loss=0.1767578125
|
| 53 |
+
epoch=5 global_step=5200 optimizer_step=648 loss=0.1806640625
|
| 54 |
+
epoch=5 global_step=5300 optimizer_step=661 loss=0.1562500000
|
| 55 |
+
epoch=5 global_step=5400 optimizer_step=673 loss=0.1757812500
|
| 56 |
+
epoch=5 global_step=5500 optimizer_step=686 loss=0.1787109375
|
| 57 |
+
epoch=5 global_step=5600 optimizer_step=698 loss=0.1679687500
|
| 58 |
+
epoch=6 global_step=5700 optimizer_step=710 loss=0.1787109375
|
| 59 |
+
epoch=6 global_step=5800 optimizer_step=723 loss=0.1933593750
|
| 60 |
+
epoch=6 global_step=5900 optimizer_step=735 loss=0.1845703125
|
| 61 |
+
epoch=6 global_step=6000 optimizer_step=748 loss=0.1875000000
|
| 62 |
+
epoch=6 global_step=6100 optimizer_step=760 loss=0.1796875000
|
| 63 |
+
epoch=6 global_step=6200 optimizer_step=773 loss=0.1767578125
|
| 64 |
+
epoch=6 global_step=6300 optimizer_step=785 loss=0.1826171875
|
| 65 |
+
epoch=6 global_step=6400 optimizer_step=798 loss=0.1542968750
|
| 66 |
+
epoch=6 global_step=6500 optimizer_step=810 loss=0.1816406250
|
| 67 |
+
epoch=7 global_step=6600 optimizer_step=823 loss=0.1562500000
|
| 68 |
+
epoch=7 global_step=6700 optimizer_step=835 loss=0.1738281250
|
| 69 |
+
epoch=7 global_step=6800 optimizer_step=848 loss=0.1669921875
|
| 70 |
+
epoch=7 global_step=6900 optimizer_step=860 loss=0.1953125000
|
| 71 |
+
epoch=7 global_step=7000 optimizer_step=873 loss=0.1523437500
|
| 72 |
+
epoch=7 global_step=7100 optimizer_step=885 loss=0.1708984375
|
| 73 |
+
epoch=7 global_step=7200 optimizer_step=898 loss=0.1904296875
|
| 74 |
+
epoch=7 global_step=7300 optimizer_step=910 loss=0.1591796875
|
| 75 |
+
epoch=7 global_step=7400 optimizer_step=923 loss=0.1494140625
|
| 76 |
+
epoch=7 global_step=7500 optimizer_step=935 loss=0.1445312500
|
| 77 |
+
epoch=8 global_step=7600 optimizer_step=947 loss=0.2148437500
|
| 78 |
+
epoch=8 global_step=7700 optimizer_step=960 loss=0.1972656250
|
| 79 |
+
epoch=8 global_step=7800 optimizer_step=972 loss=0.1689453125
|
| 80 |
+
epoch=8 global_step=7900 optimizer_step=985 loss=0.2050781250
|
| 81 |
+
epoch=8 global_step=8000 optimizer_step=997 loss=0.1279296875
|
| 82 |
+
epoch=8 global_step=8100 optimizer_step=1010 loss=0.1718750000
|
| 83 |
+
epoch=8 global_step=8200 optimizer_step=1022 loss=0.1513671875
|
| 84 |
+
epoch=8 global_step=8300 optimizer_step=1035 loss=0.1552734375
|
| 85 |
+
epoch=8 global_step=8400 optimizer_step=1047 loss=0.1699218750
|
| 86 |
+
epoch=9 global_step=8500 optimizer_step=1060 loss=0.2080078125
|
| 87 |
+
epoch=9 global_step=8600 optimizer_step=1072 loss=0.2226562500
|
| 88 |
+
epoch=9 global_step=8700 optimizer_step=1085 loss=0.1816406250
|
| 89 |
+
epoch=9 global_step=8800 optimizer_step=1097 loss=0.1611328125
|
| 90 |
+
epoch=9 global_step=8900 optimizer_step=1110 loss=0.1621093750
|
| 91 |
+
epoch=9 global_step=9000 optimizer_step=1122 loss=0.1923828125
|
| 92 |
+
epoch=9 global_step=9100 optimizer_step=1135 loss=0.1425781250
|
| 93 |
+
epoch=9 global_step=9200 optimizer_step=1147 loss=0.1679687500
|
| 94 |
+
epoch=9 global_step=9300 optimizer_step=1160 loss=0.1845703125
|
| 95 |
+
epoch=10 global_step=9400 optimizer_step=1172 loss=0.1630859375
|
| 96 |
+
epoch=10 global_step=9500 optimizer_step=1184 loss=0.1757812500
|
| 97 |
+
epoch=10 global_step=9600 optimizer_step=1197 loss=0.1611328125
|
| 98 |
+
epoch=10 global_step=9700 optimizer_step=1209 loss=0.1464843750
|
| 99 |
+
epoch=10 global_step=9800 optimizer_step=1222 loss=0.2089843750
|
| 100 |
+
epoch=10 global_step=9900 optimizer_step=1234 loss=0.1787109375
|
| 101 |
+
epoch=10 global_step=10000 optimizer_step=1247 loss=0.1787109375
|
| 102 |
+
epoch=10 global_step=10100 optimizer_step=1259 loss=0.1689453125
|
| 103 |
+
epoch=10 global_step=10200 optimizer_step=1272 loss=0.2119140625
|
| 104 |
+
epoch=10 global_step=10300 optimizer_step=1284 loss=0.1503906250
|
| 105 |
+
epoch=11 global_step=10400 optimizer_step=1297 loss=0.1572265625
|
| 106 |
+
epoch=11 global_step=10500 optimizer_step=1309 loss=0.2089843750
|
| 107 |
+
epoch=11 global_step=10600 optimizer_step=1322 loss=0.1318359375
|
| 108 |
+
epoch=11 global_step=10700 optimizer_step=1334 loss=0.1884765625
|
| 109 |
+
epoch=11 global_step=10800 optimizer_step=1347 loss=0.1474609375
|
| 110 |
+
epoch=11 global_step=10900 optimizer_step=1359 loss=0.2011718750
|
| 111 |
+
epoch=11 global_step=11000 optimizer_step=1372 loss=0.1425781250
|
| 112 |
+
epoch=11 global_step=11100 optimizer_step=1384 loss=0.1708984375
|
| 113 |
+
epoch=11 global_step=11200 optimizer_step=1397 loss=0.1679687500
|
| 114 |
+
epoch=12 global_step=11300 optimizer_step=1409 loss=0.1318359375
|
| 115 |
+
epoch=12 global_step=11400 optimizer_step=1421 loss=0.1484375000
|
| 116 |
+
epoch=12 global_step=11500 optimizer_step=1434 loss=0.1464843750
|
| 117 |
+
epoch=12 global_step=11600 optimizer_step=1446 loss=0.1582031250
|
| 118 |
+
epoch=12 global_step=11700 optimizer_step=1459 loss=0.1894531250
|
| 119 |
+
epoch=12 global_step=11800 optimizer_step=1471 loss=0.1542968750
|
| 120 |
+
epoch=12 global_step=11900 optimizer_step=1484 loss=0.1386718750
|
| 121 |
+
epoch=12 global_step=12000 optimizer_step=1496 loss=0.1699218750
|
| 122 |
+
epoch=12 global_step=12100 optimizer_step=1509 loss=0.1767578125
|
| 123 |
+
epoch=13 global_step=12200 optimizer_step=1521 loss=0.1757812500
|
| 124 |
+
epoch=13 global_step=12300 optimizer_step=1534 loss=0.1621093750
|
| 125 |
+
epoch=13 global_step=12400 optimizer_step=1546 loss=0.2060546875
|
| 126 |
+
epoch=13 global_step=12500 optimizer_step=1559 loss=0.1601562500
|
| 127 |
+
epoch=13 global_step=12600 optimizer_step=1571 loss=0.1367187500
|
| 128 |
+
epoch=13 global_step=12700 optimizer_step=1584 loss=0.2060546875
|
| 129 |
+
epoch=13 global_step=12800 optimizer_step=1596 loss=0.1396484375
|
| 130 |
+
epoch=13 global_step=12900 optimizer_step=1609 loss=0.1875000000
|
| 131 |
+
epoch=13 global_step=13000 optimizer_step=1621 loss=0.2187500000
|
| 132 |
+
epoch=13 global_step=13100 optimizer_step=1634 loss=0.1591796875
|
| 133 |
+
epoch=14 global_step=13200 optimizer_step=1646 loss=0.1523437500
|
| 134 |
+
epoch=14 global_step=13300 optimizer_step=1658 loss=0.1640625000
|
| 135 |
+
epoch=14 global_step=13400 optimizer_step=1671 loss=0.1435546875
|
| 136 |
+
epoch=14 global_step=13500 optimizer_step=1683 loss=0.1660156250
|
| 137 |
+
epoch=14 global_step=13600 optimizer_step=1696 loss=0.1884765625
|
| 138 |
+
epoch=14 global_step=13700 optimizer_step=1708 loss=0.1445312500
|
| 139 |
+
epoch=14 global_step=13800 optimizer_step=1721 loss=0.1406250000
|
| 140 |
+
epoch=14 global_step=13900 optimizer_step=1733 loss=0.1689453125
|
| 141 |
+
epoch=14 global_step=14000 optimizer_step=1746 loss=0.1494140625
|
| 142 |
+
epoch=15 global_step=14100 optimizer_step=1758 loss=0.1884765625
|
| 143 |
+
epoch=15 global_step=14200 optimizer_step=1771 loss=0.1894531250
|
| 144 |
+
epoch=15 global_step=14300 optimizer_step=1783 loss=0.1708984375
|
| 145 |
+
epoch=15 global_step=14400 optimizer_step=1796 loss=0.1650390625
|
| 146 |
+
epoch=15 global_step=14500 optimizer_step=1808 loss=0.1357421875
|
| 147 |
+
epoch=15 global_step=14600 optimizer_step=1821 loss=0.1689453125
|
| 148 |
+
epoch=15 global_step=14700 optimizer_step=1833 loss=0.1748046875
|
| 149 |
+
epoch=15 global_step=14800 optimizer_step=1846 loss=0.1757812500
|
| 150 |
+
epoch=15 global_step=14900 optimizer_step=1858 loss=0.1572265625
|
| 151 |
+
epoch=15 global_step=15000 optimizer_step=1871 loss=0.1630859375
|
| 152 |
+
epoch=16 global_step=15100 optimizer_step=1883 loss=0.1416015625
|
| 153 |
+
epoch=16 global_step=15200 optimizer_step=1895 loss=0.1474609375
|
| 154 |
+
epoch=16 global_step=15300 optimizer_step=1908 loss=0.1640625000
|
| 155 |
+
epoch=16 global_step=15400 optimizer_step=1920 loss=0.1406250000
|
| 156 |
+
epoch=16 global_step=15500 optimizer_step=1933 loss=0.1308593750
|
| 157 |
+
epoch=16 global_step=15600 optimizer_step=1945 loss=0.1865234375
|
| 158 |
+
epoch=16 global_step=15700 optimizer_step=1958 loss=0.1669921875
|
| 159 |
+
epoch=16 global_step=15800 optimizer_step=1970 loss=0.1953125000
|
| 160 |
+
epoch=16 global_step=15900 optimizer_step=1983 loss=0.1835937500
|
| 161 |
+
epoch=17 global_step=16000 optimizer_step=1995 loss=0.1767578125
|
| 162 |
+
epoch=17 global_step=16100 optimizer_step=2008 loss=0.1875000000
|
| 163 |
+
epoch=17 global_step=16200 optimizer_step=2020 loss=0.1376953125
|
| 164 |
+
epoch=17 global_step=16300 optimizer_step=2033 loss=0.1513671875
|
| 165 |
+
epoch=17 global_step=16400 optimizer_step=2045 loss=0.1455078125
|
| 166 |
+
epoch=17 global_step=16500 optimizer_step=2058 loss=0.1660156250
|
| 167 |
+
epoch=17 global_step=16600 optimizer_step=2070 loss=0.1503906250
|
| 168 |
+
epoch=17 global_step=16700 optimizer_step=2083 loss=0.1406250000
|
| 169 |
+
epoch=17 global_step=16800 optimizer_step=2095 loss=0.1806640625
|
| 170 |
+
epoch=18 global_step=16900 optimizer_step=2107 loss=0.1953125000
|
| 171 |
+
epoch=18 global_step=17000 optimizer_step=2120 loss=0.1562500000
|
| 172 |
+
epoch=18 global_step=17100 optimizer_step=2132 loss=0.1435546875
|
| 173 |
+
epoch=18 global_step=17200 optimizer_step=2145 loss=0.1396484375
|
| 174 |
+
epoch=18 global_step=17300 optimizer_step=2157 loss=0.1630859375
|
| 175 |
+
epoch=18 global_step=17400 optimizer_step=2170 loss=0.1933593750
|
| 176 |
+
epoch=18 global_step=17500 optimizer_step=2182 loss=0.1835937500
|
| 177 |
+
epoch=18 global_step=17600 optimizer_step=2195 loss=0.1474609375
|
| 178 |
+
epoch=18 global_step=17700 optimizer_step=2207 loss=0.1718750000
|
| 179 |
+
epoch=18 global_step=17800 optimizer_step=2220 loss=0.1562500000
|
| 180 |
+
epoch=19 global_step=17900 optimizer_step=2232 loss=0.1689453125
|
| 181 |
+
epoch=19 global_step=18000 optimizer_step=2245 loss=0.1220703125
|
| 182 |
+
epoch=19 global_step=18100 optimizer_step=2257 loss=0.1748046875
|
| 183 |
+
epoch=19 global_step=18200 optimizer_step=2270 loss=0.1796875000
|
| 184 |
+
epoch=19 global_step=18300 optimizer_step=2282 loss=0.1679687500
|
| 185 |
+
epoch=19 global_step=18400 optimizer_step=2295 loss=0.1347656250
|
| 186 |
+
epoch=19 global_step=18500 optimizer_step=2307 loss=0.1542968750
|
| 187 |
+
epoch=19 global_step=18600 optimizer_step=2320 loss=0.1699218750
|
| 188 |
+
epoch=19 global_step=18700 optimizer_step=2332 loss=0.1748046875
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/chat_template.jinja
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 1,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 4096,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 11008,
|
| 15 |
+
"max_position_embeddings": 4096,
|
| 16 |
+
"mlp_bias": false,
|
| 17 |
+
"model_type": "llama",
|
| 18 |
+
"num_attention_heads": 32,
|
| 19 |
+
"num_hidden_layers": 24,
|
| 20 |
+
"num_key_value_heads": 32,
|
| 21 |
+
"pretraining_tp": 1,
|
| 22 |
+
"rms_norm_eps": 1e-05,
|
| 23 |
+
"rope_scaling": null,
|
| 24 |
+
"rope_theta": 10000.0,
|
| 25 |
+
"tie_word_embeddings": false,
|
| 26 |
+
"transformers_version": "4.57.6",
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"vocab_size": 32000
|
| 29 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": 2,
|
| 5 |
+
"transformers_version": "4.57.6"
|
| 6 |
+
}
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_10/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d01fa85aed8435f89c1b48e9fb573cd0951ae509dc27bc449725181c4c98a312
|
| 3 |
+
size 404771510
|
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_12/checkpoint.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68c8be8aa9d170c7b36138a3165ff9d72064967945481bbd032529c53a5a48a2
|
| 3 |
+
size 404771510
|