LJYAI committed on
Commit
6d9e0b9
·
verified ·
1 Parent(s): ec45673
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/chat_template.jinja +1 -0
  2. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/config.json +29 -0
  3. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/generation_config.json +6 -0
  4. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_10/checkpoint.pt +3 -0
  5. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_12/checkpoint.pt +3 -0
  6. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_14/checkpoint.pt +3 -0
  7. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_16/checkpoint.pt +3 -0
  8. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_18/checkpoint.pt +3 -0
  9. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_2/checkpoint.pt +3 -0
  10. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_20/checkpoint.pt +3 -0
  11. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_4/checkpoint.pt +3 -0
  12. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_6/checkpoint.pt +3 -0
  13. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_8/checkpoint.pt +3 -0
  14. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_final/checkpoint.pt +3 -0
  15. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00001-of-00002.safetensors +3 -0
  16. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00002-of-00002.safetensors +3 -0
  17. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model.safetensors.index.json +191 -0
  18. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/run_args.txt +9 -0
  19. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/special_tokens_map.json +24 -0
  20. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer.json +0 -0
  21. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer_config.json +43 -0
  22. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/train_loss.txt +188 -0
  23. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/chat_template.jinja +1 -0
  24. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/config.json +29 -0
  25. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/generation_config.json +6 -0
  26. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_10/checkpoint.pt +3 -0
  27. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_12/checkpoint.pt +3 -0
  28. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_14/checkpoint.pt +3 -0
  29. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_16/checkpoint.pt +3 -0
  30. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_18/checkpoint.pt +3 -0
  31. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_2/checkpoint.pt +3 -0
  32. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_20/checkpoint.pt +3 -0
  33. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_4/checkpoint.pt +3 -0
  34. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_6/checkpoint.pt +3 -0
  35. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_8/checkpoint.pt +3 -0
  36. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_final/checkpoint.pt +3 -0
  37. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00001-of-00003.safetensors +3 -0
  38. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00002-of-00003.safetensors +3 -0
  39. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00003-of-00003.safetensors +3 -0
  40. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model.safetensors.index.json +263 -0
  41. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/run_args.txt +9 -0
  42. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/special_tokens_map.json +24 -0
  43. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer.json +0 -0
  44. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer_config.json +43 -0
  45. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/train_loss.txt +188 -0
  46. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/chat_template.jinja +1 -0
  47. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/config.json +29 -0
  48. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/generation_config.json +6 -0
  49. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_10/checkpoint.pt +3 -0
  50. llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_12/checkpoint.pt +3 -0
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 20,
20
+ "num_key_value_heads": 32,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "transformers_version": "4.57.6",
27
+ "use_cache": true,
28
+ "vocab_size": 32000
29
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.6"
6
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_10/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edf2236e50020cce86f4f961088ae71e2d73a9cf2496271dadb8ee1abf90804b
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_12/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9d8fb39fa3b02e1c27310b551386ef89f391ea128447e8106abb333a6dab8c0
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_14/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ff59abbfa03c076fd6e9cf5342cda76163d02ea34caf2834cd174d9ec633fc3
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_16/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9540fc5273199f6d400750d8344bb14c84334a5ad88c065e1eb6997e6905a6d3
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_18/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f5bd4561b98752d53b2a5823bc2044e18ca44760c320a65a1fe9cea77247444
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_2/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37b9012c0a3cf22b15b27d760b0d4530e40d37913022a3d6db70c9253a89a8b9
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_20/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d85e8e0a6209f57cdb4fc04ea43d4f7815954de55bbbaca4e75bf9de3eb59617
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_4/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc2a4143e0cd80d0a136f5d75ae81bb43b9f001b6c37a92fbe5a41732411e9cc
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_6/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dec7706c23617d4175b0604867af2e4c39c9c96f71bb5d84aee5ec10f1f2dfa0
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_epoch_8/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edc38b2eaf470d7b53bc69cd7a4a750e1e4dcfe7fecf73f17533db6c47a7c1ab
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/lightweight_checkpoint_final/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d85e8e0a6209f57cdb4fc04ea43d4f7815954de55bbbaca4e75bf9de3eb59617
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a025ba2f39370557992c0a88819e9ddcb82110c0456f17d7069c1a87135dc31
3
+ size 4938985352
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a03e07568c537b7de77f83fe586e68f1be1dfe7f75f913acc3d5af64197e1d5
3
+ size 3680666464
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/model.safetensors.index.json ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4309815296,
4
+ "total_size": 8619630592
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00002-of-00002.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00002.safetensors",
37
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
38
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
40
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
41
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
46
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
47
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
48
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
49
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
50
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
51
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
52
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
53
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
54
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00002.safetensors",
55
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
56
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
57
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
58
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
59
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
60
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
61
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
62
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
63
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
64
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
65
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
66
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
67
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
68
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
69
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
70
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
71
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
72
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
73
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
74
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
75
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
76
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
77
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
78
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
79
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
80
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
81
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00002.safetensors",
82
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
83
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
84
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
85
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
86
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
87
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
88
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
89
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
90
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
91
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
92
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
93
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
94
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
95
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
96
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
97
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
98
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
99
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
100
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
101
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
102
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
103
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
104
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
105
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
106
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
107
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
108
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
109
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
110
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
111
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
112
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
113
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
114
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
115
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
116
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
117
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
189
+ "model.norm.weight": "model-00002-of-00002.safetensors"
190
+ }
191
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/run_args.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ git_commit=unknown
2
+ dataset=Open-Orca/SlimOrca
3
+ dataset_split=train
4
+ collect_batch_size=8
5
+ train_batch_size=32
6
+ gradient_accumulation_step=8
7
+ target_effective_batch=256
8
+ command:
9
+ python /workspace/here/abprune_update/compare_model/LLM-Streamline/mseloss_entry.py --model_name meta-llama/Llama-2-7b-chat-hf --output_dir /workspace/here/abprune_update/results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18 --dataset Open-Orca/SlimOrca --dataset_split train --layer_intervals 12 --best_layer 18 --cosine_num_data 300 --train_num_data 30000 --epoches 20 --batch_size 8 --collect_batch_size 8 --train_batch_size 32 --dtype bfloat16 --gradient_accumulation_step 8 --lr 1e-5 --min_lr 5e-5 --wd 1e-3
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune12_start18/train_loss.txt ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Train loss logged every 100 global steps
2
+ epoch=0 global_step=100 optimizer_step=12 loss=4.5312500000
3
+ epoch=0 global_step=200 optimizer_step=24 loss=4.6250000000
4
+ epoch=0 global_step=300 optimizer_step=37 loss=4.6250000000
5
+ epoch=0 global_step=400 optimizer_step=49 loss=4.4375000000
6
+ epoch=0 global_step=500 optimizer_step=62 loss=4.4062500000
7
+ epoch=0 global_step=600 optimizer_step=74 loss=4.3437500000
8
+ epoch=0 global_step=700 optimizer_step=87 loss=3.9218750000
9
+ epoch=0 global_step=800 optimizer_step=99 loss=3.4687500000
10
+ epoch=0 global_step=900 optimizer_step=112 loss=3.1562500000
11
+ epoch=1 global_step=1000 optimizer_step=124 loss=2.6406250000
12
+ epoch=1 global_step=1100 optimizer_step=137 loss=2.3906250000
13
+ epoch=1 global_step=1200 optimizer_step=149 loss=2.3437500000
14
+ epoch=1 global_step=1300 optimizer_step=162 loss=2.0781250000
15
+ epoch=1 global_step=1400 optimizer_step=174 loss=2.3437500000
16
+ epoch=1 global_step=1500 optimizer_step=187 loss=2.2187500000
17
+ epoch=1 global_step=1600 optimizer_step=199 loss=1.9921875000
18
+ epoch=1 global_step=1700 optimizer_step=212 loss=2.0468750000
19
+ epoch=1 global_step=1800 optimizer_step=224 loss=2.0156250000
20
+ epoch=2 global_step=1900 optimizer_step=236 loss=2.0781250000
21
+ epoch=2 global_step=2000 optimizer_step=249 loss=2.0468750000
22
+ epoch=2 global_step=2100 optimizer_step=261 loss=2.0468750000
23
+ epoch=2 global_step=2200 optimizer_step=274 loss=2.0781250000
24
+ epoch=2 global_step=2300 optimizer_step=286 loss=2.1718750000
25
+ epoch=2 global_step=2400 optimizer_step=299 loss=2.1406250000
26
+ epoch=2 global_step=2500 optimizer_step=311 loss=1.8125000000
27
+ epoch=2 global_step=2600 optimizer_step=324 loss=1.9687500000
28
+ epoch=2 global_step=2700 optimizer_step=336 loss=2.0468750000
29
+ epoch=2 global_step=2800 optimizer_step=349 loss=2.1250000000
30
+ epoch=3 global_step=2900 optimizer_step=361 loss=2.0625000000
31
+ epoch=3 global_step=3000 optimizer_step=374 loss=2.2187500000
32
+ epoch=3 global_step=3100 optimizer_step=386 loss=2.0781250000
33
+ epoch=3 global_step=3200 optimizer_step=399 loss=2.0468750000
34
+ epoch=3 global_step=3300 optimizer_step=411 loss=2.0312500000
35
+ epoch=3 global_step=3400 optimizer_step=424 loss=2.0156250000
36
+ epoch=3 global_step=3500 optimizer_step=436 loss=1.8906250000
37
+ epoch=3 global_step=3600 optimizer_step=449 loss=2.0156250000
38
+ epoch=3 global_step=3700 optimizer_step=461 loss=2.0625000000
39
+ epoch=4 global_step=3800 optimizer_step=473 loss=2.0312500000
40
+ epoch=4 global_step=3900 optimizer_step=486 loss=2.0625000000
41
+ epoch=4 global_step=4000 optimizer_step=498 loss=2.0468750000
42
+ epoch=4 global_step=4100 optimizer_step=511 loss=2.1562500000
43
+ epoch=4 global_step=4200 optimizer_step=523 loss=1.8593750000
44
+ epoch=4 global_step=4300 optimizer_step=536 loss=1.9296875000
45
+ epoch=4 global_step=4400 optimizer_step=548 loss=2.0468750000
46
+ epoch=4 global_step=4500 optimizer_step=561 loss=1.9453125000
47
+ epoch=4 global_step=4600 optimizer_step=573 loss=1.9375000000
48
+ epoch=5 global_step=4700 optimizer_step=586 loss=1.9609375000
49
+ epoch=5 global_step=4800 optimizer_step=598 loss=2.1093750000
50
+ epoch=5 global_step=4900 optimizer_step=611 loss=2.0156250000
51
+ epoch=5 global_step=5000 optimizer_step=623 loss=1.8750000000
52
+ epoch=5 global_step=5100 optimizer_step=636 loss=1.9296875000
53
+ epoch=5 global_step=5200 optimizer_step=648 loss=2.0156250000
54
+ epoch=5 global_step=5300 optimizer_step=661 loss=1.9062500000
55
+ epoch=5 global_step=5400 optimizer_step=673 loss=2.0937500000
56
+ epoch=5 global_step=5500 optimizer_step=686 loss=2.0000000000
57
+ epoch=5 global_step=5600 optimizer_step=698 loss=1.9140625000
58
+ epoch=6 global_step=5700 optimizer_step=710 loss=1.8828125000
59
+ epoch=6 global_step=5800 optimizer_step=723 loss=2.0468750000
60
+ epoch=6 global_step=5900 optimizer_step=735 loss=1.9687500000
61
+ epoch=6 global_step=6000 optimizer_step=748 loss=1.9609375000
62
+ epoch=6 global_step=6100 optimizer_step=760 loss=1.7968750000
63
+ epoch=6 global_step=6200 optimizer_step=773 loss=1.9843750000
64
+ epoch=6 global_step=6300 optimizer_step=785 loss=1.9921875000
65
+ epoch=6 global_step=6400 optimizer_step=798 loss=2.0781250000
66
+ epoch=6 global_step=6500 optimizer_step=810 loss=2.0156250000
67
+ epoch=7 global_step=6600 optimizer_step=823 loss=1.9843750000
68
+ epoch=7 global_step=6700 optimizer_step=835 loss=2.0937500000
69
+ epoch=7 global_step=6800 optimizer_step=848 loss=2.0000000000
70
+ epoch=7 global_step=6900 optimizer_step=860 loss=1.8906250000
71
+ epoch=7 global_step=7000 optimizer_step=873 loss=1.9140625000
72
+ epoch=7 global_step=7100 optimizer_step=885 loss=1.9453125000
73
+ epoch=7 global_step=7200 optimizer_step=898 loss=1.8750000000
74
+ epoch=7 global_step=7300 optimizer_step=910 loss=2.0781250000
75
+ epoch=7 global_step=7400 optimizer_step=923 loss=1.9609375000
76
+ epoch=7 global_step=7500 optimizer_step=935 loss=1.9765625000
77
+ epoch=8 global_step=7600 optimizer_step=947 loss=1.8593750000
78
+ epoch=8 global_step=7700 optimizer_step=960 loss=1.9609375000
79
+ epoch=8 global_step=7800 optimizer_step=972 loss=1.8906250000
80
+ epoch=8 global_step=7900 optimizer_step=985 loss=1.9296875000
81
+ epoch=8 global_step=8000 optimizer_step=997 loss=1.8125000000
82
+ epoch=8 global_step=8100 optimizer_step=1010 loss=1.8046875000
83
+ epoch=8 global_step=8200 optimizer_step=1022 loss=1.9687500000
84
+ epoch=8 global_step=8300 optimizer_step=1035 loss=1.9765625000
85
+ epoch=8 global_step=8400 optimizer_step=1047 loss=2.0625000000
86
+ epoch=9 global_step=8500 optimizer_step=1060 loss=1.8359375000
87
+ epoch=9 global_step=8600 optimizer_step=1072 loss=1.9296875000
88
+ epoch=9 global_step=8700 optimizer_step=1085 loss=1.9687500000
89
+ epoch=9 global_step=8800 optimizer_step=1097 loss=2.0000000000
90
+ epoch=9 global_step=8900 optimizer_step=1110 loss=2.1093750000
91
+ epoch=9 global_step=9000 optimizer_step=1122 loss=1.9218750000
92
+ epoch=9 global_step=9100 optimizer_step=1135 loss=2.0000000000
93
+ epoch=9 global_step=9200 optimizer_step=1147 loss=1.8593750000
94
+ epoch=9 global_step=9300 optimizer_step=1160 loss=1.8984375000
95
+ epoch=10 global_step=9400 optimizer_step=1172 loss=2.0156250000
96
+ epoch=10 global_step=9500 optimizer_step=1184 loss=1.9687500000
97
+ epoch=10 global_step=9600 optimizer_step=1197 loss=2.0312500000
98
+ epoch=10 global_step=9700 optimizer_step=1209 loss=2.0156250000
99
+ epoch=10 global_step=9800 optimizer_step=1222 loss=2.0468750000
100
+ epoch=10 global_step=9900 optimizer_step=1234 loss=1.9843750000
101
+ epoch=10 global_step=10000 optimizer_step=1247 loss=2.0312500000
102
+ epoch=10 global_step=10100 optimizer_step=1259 loss=2.0468750000
103
+ epoch=10 global_step=10200 optimizer_step=1272 loss=1.9843750000
104
+ epoch=10 global_step=10300 optimizer_step=1284 loss=1.9140625000
105
+ epoch=11 global_step=10400 optimizer_step=1297 loss=2.1093750000
106
+ epoch=11 global_step=10500 optimizer_step=1309 loss=1.9921875000
107
+ epoch=11 global_step=10600 optimizer_step=1322 loss=1.8593750000
108
+ epoch=11 global_step=10700 optimizer_step=1334 loss=1.9140625000
109
+ epoch=11 global_step=10800 optimizer_step=1347 loss=1.8515625000
110
+ epoch=11 global_step=10900 optimizer_step=1359 loss=2.0625000000
111
+ epoch=11 global_step=11000 optimizer_step=1372 loss=1.9453125000
112
+ epoch=11 global_step=11100 optimizer_step=1384 loss=2.0000000000
113
+ epoch=11 global_step=11200 optimizer_step=1397 loss=2.1406250000
114
+ epoch=12 global_step=11300 optimizer_step=1409 loss=1.9218750000
115
+ epoch=12 global_step=11400 optimizer_step=1421 loss=1.9375000000
116
+ epoch=12 global_step=11500 optimizer_step=1434 loss=2.0625000000
117
+ epoch=12 global_step=11600 optimizer_step=1446 loss=1.8281250000
118
+ epoch=12 global_step=11700 optimizer_step=1459 loss=1.8437500000
119
+ epoch=12 global_step=11800 optimizer_step=1471 loss=1.7265625000
120
+ epoch=12 global_step=11900 optimizer_step=1484 loss=1.8359375000
121
+ epoch=12 global_step=12000 optimizer_step=1496 loss=1.9609375000
122
+ epoch=12 global_step=12100 optimizer_step=1509 loss=2.0312500000
123
+ epoch=13 global_step=12200 optimizer_step=1521 loss=1.9375000000
124
+ epoch=13 global_step=12300 optimizer_step=1534 loss=1.9218750000
125
+ epoch=13 global_step=12400 optimizer_step=1546 loss=1.8671875000
126
+ epoch=13 global_step=12500 optimizer_step=1559 loss=2.0156250000
127
+ epoch=13 global_step=12600 optimizer_step=1571 loss=2.0312500000
128
+ epoch=13 global_step=12700 optimizer_step=1584 loss=1.9609375000
129
+ epoch=13 global_step=12800 optimizer_step=1596 loss=1.8828125000
130
+ epoch=13 global_step=12900 optimizer_step=1609 loss=1.8828125000
131
+ epoch=13 global_step=13000 optimizer_step=1621 loss=1.8515625000
132
+ epoch=13 global_step=13100 optimizer_step=1634 loss=1.9218750000
133
+ epoch=14 global_step=13200 optimizer_step=1646 loss=1.8203125000
134
+ epoch=14 global_step=13300 optimizer_step=1658 loss=1.9921875000
135
+ epoch=14 global_step=13400 optimizer_step=1671 loss=1.8203125000
136
+ epoch=14 global_step=13500 optimizer_step=1683 loss=2.0156250000
137
+ epoch=14 global_step=13600 optimizer_step=1696 loss=2.0312500000
138
+ epoch=14 global_step=13700 optimizer_step=1708 loss=1.7968750000
139
+ epoch=14 global_step=13800 optimizer_step=1721 loss=1.8906250000
140
+ epoch=14 global_step=13900 optimizer_step=1733 loss=1.7890625000
141
+ epoch=14 global_step=14000 optimizer_step=1746 loss=1.8906250000
142
+ epoch=15 global_step=14100 optimizer_step=1758 loss=2.0468750000
143
+ epoch=15 global_step=14200 optimizer_step=1771 loss=1.8906250000
144
+ epoch=15 global_step=14300 optimizer_step=1783 loss=1.8750000000
145
+ epoch=15 global_step=14400 optimizer_step=1796 loss=1.8906250000
146
+ epoch=15 global_step=14500 optimizer_step=1808 loss=1.8281250000
147
+ epoch=15 global_step=14600 optimizer_step=1821 loss=1.9921875000
148
+ epoch=15 global_step=14700 optimizer_step=1833 loss=1.9609375000
149
+ epoch=15 global_step=14800 optimizer_step=1846 loss=2.0781250000
150
+ epoch=15 global_step=14900 optimizer_step=1858 loss=1.9921875000
151
+ epoch=15 global_step=15000 optimizer_step=1871 loss=2.0156250000
152
+ epoch=16 global_step=15100 optimizer_step=1883 loss=1.8515625000
153
+ epoch=16 global_step=15200 optimizer_step=1895 loss=1.9140625000
154
+ epoch=16 global_step=15300 optimizer_step=1908 loss=1.9453125000
155
+ epoch=16 global_step=15400 optimizer_step=1920 loss=1.9375000000
156
+ epoch=16 global_step=15500 optimizer_step=1933 loss=2.0312500000
157
+ epoch=16 global_step=15600 optimizer_step=1945 loss=2.0781250000
158
+ epoch=16 global_step=15700 optimizer_step=1958 loss=2.0000000000
159
+ epoch=16 global_step=15800 optimizer_step=1970 loss=1.9687500000
160
+ epoch=16 global_step=15900 optimizer_step=1983 loss=2.0781250000
161
+ epoch=17 global_step=16000 optimizer_step=1995 loss=2.0468750000
162
+ epoch=17 global_step=16100 optimizer_step=2008 loss=1.9218750000
163
+ epoch=17 global_step=16200 optimizer_step=2020 loss=1.8125000000
164
+ epoch=17 global_step=16300 optimizer_step=2033 loss=1.8593750000
165
+ epoch=17 global_step=16400 optimizer_step=2045 loss=1.9140625000
166
+ epoch=17 global_step=16500 optimizer_step=2058 loss=1.9765625000
167
+ epoch=17 global_step=16600 optimizer_step=2070 loss=1.9375000000
168
+ epoch=17 global_step=16700 optimizer_step=2083 loss=1.8828125000
169
+ epoch=17 global_step=16800 optimizer_step=2095 loss=1.8906250000
170
+ epoch=18 global_step=16900 optimizer_step=2107 loss=1.9375000000
171
+ epoch=18 global_step=17000 optimizer_step=2120 loss=1.9687500000
172
+ epoch=18 global_step=17100 optimizer_step=2132 loss=2.0312500000
173
+ epoch=18 global_step=17200 optimizer_step=2145 loss=1.9453125000
174
+ epoch=18 global_step=17300 optimizer_step=2157 loss=2.0625000000
175
+ epoch=18 global_step=17400 optimizer_step=2170 loss=1.8906250000
176
+ epoch=18 global_step=17500 optimizer_step=2182 loss=2.0156250000
177
+ epoch=18 global_step=17600 optimizer_step=2195 loss=1.8906250000
178
+ epoch=18 global_step=17700 optimizer_step=2207 loss=1.7421875000
179
+ epoch=18 global_step=17800 optimizer_step=2220 loss=1.9843750000
180
+ epoch=19 global_step=17900 optimizer_step=2232 loss=1.9296875000
181
+ epoch=19 global_step=18000 optimizer_step=2245 loss=1.9375000000
182
+ epoch=19 global_step=18100 optimizer_step=2257 loss=1.7500000000
183
+ epoch=19 global_step=18200 optimizer_step=2270 loss=1.8437500000
184
+ epoch=19 global_step=18300 optimizer_step=2282 loss=1.9453125000
185
+ epoch=19 global_step=18400 optimizer_step=2295 loss=1.8906250000
186
+ epoch=19 global_step=18500 optimizer_step=2307 loss=1.8828125000
187
+ epoch=19 global_step=18600 optimizer_step=2320 loss=1.9062500000
188
+ epoch=19 global_step=18700 optimizer_step=2332 loss=1.9609375000
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 28,
20
+ "num_key_value_heads": 32,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "transformers_version": "4.57.6",
27
+ "use_cache": true,
28
+ "vocab_size": 32000
29
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.6"
6
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_10/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76b06b4883cce06d8b5fe6d1f6b418c759d2de57c675d742cebc4317d4ca44af
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_12/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77aa7049cf9654a0bf05d86b486aa50858ba7c73b423635ebfbf224cb1fb26d7
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_14/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bddc194fb3bcd579b8e0bd5556e4e34f2f9925eca405d0c6cec2ceed93882246
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_16/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc275eb5eec9fb91b3b83af6746395147ccb707935dc18d76f3482a49b9c2bf8
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_18/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f58f901ba3eb8125645144144567e522fcae9559283027af3db873d8cb840d2
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_2/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e00ff43fe1738a20cff031579b2a8a49e5805de3c871003cb7c91b0ca48920
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_20/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fb83f53338e41c8a8ecf6bb039c130d3abfd38ead896ce4bc9b831babf291b0
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_4/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e9025fedfaa18ae9040261961f145443040ccbee8337d09d1d588117fb3d51e
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_6/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16f7adc8334a56fe18150ab8d2c6a9d3182b414191b8b792a97bc03a5d5eb5d2
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_epoch_8/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e2bdf21bc36f02516d4bc9534e45dcdeb503843ebb8fbe2c2f78a36911a4d43
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/lightweight_checkpoint_final/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fb83f53338e41c8a8ecf6bb039c130d3abfd38ead896ce4bc9b831babf291b0
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a025ba2f39370557992c0a88819e9ddcb82110c0456f17d7069c1a87135dc31
3
+ size 4938985352
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e59c2fe2fc83cd719a4233d13f9a04d45d1de815d0ae4bf56932068d8028ce
3
+ size 4947390880
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c7e902bd779a07d9135271f0db2a1c5fbef4d01da478adb0baf5bdc08284367
3
+ size 1971417736
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/model.safetensors.index.json ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 5928882176,
4
+ "total_size": 11857764352
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00003-of-00003.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
126
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
146
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
153
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
157
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
160
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
162
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
199
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
200
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
201
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
202
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
203
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
204
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
205
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
206
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
207
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
208
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
209
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
210
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
211
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
212
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
213
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
214
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
215
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
225
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
226
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
227
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
228
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
229
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
230
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
231
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
232
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
233
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
234
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
235
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
236
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
237
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
238
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
239
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
240
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
241
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
242
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
261
+ "model.norm.weight": "model-00003-of-00003.safetensors"
262
+ }
263
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/run_args.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ git_commit=unknown
2
+ dataset=Open-Orca/SlimOrca
3
+ dataset_split=train
4
+ collect_batch_size=8
5
+ train_batch_size=32
6
+ gradient_accumulation_step=8
7
+ target_effective_batch=256
8
+ command:
9
+ python /workspace/here/abprune_update/compare_model/LLM-Streamline/mseloss_entry.py --model_name meta-llama/Llama-2-7b-chat-hf --output_dir /workspace/here/abprune_update/results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24 --dataset Open-Orca/SlimOrca --dataset_split train --layer_intervals 4 --best_layer 24 --cosine_num_data 300 --train_num_data 30000 --epoches 20 --batch_size 8 --collect_batch_size 8 --train_batch_size 32 --dtype bfloat16 --gradient_accumulation_step 8 --lr 1e-5 --min_lr 5e-5 --wd 1e-3
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune4_start24/train_loss.txt ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Train loss logged every 100 global steps
2
+ epoch=0 global_step=100 optimizer_step=12 loss=0.4257812500
3
+ epoch=0 global_step=200 optimizer_step=24 loss=0.3769531250
4
+ epoch=0 global_step=300 optimizer_step=37 loss=0.4121093750
5
+ epoch=0 global_step=400 optimizer_step=49 loss=0.4042968750
6
+ epoch=0 global_step=500 optimizer_step=62 loss=0.3652343750
7
+ epoch=0 global_step=600 optimizer_step=74 loss=0.3769531250
8
+ epoch=0 global_step=700 optimizer_step=87 loss=0.3417968750
9
+ epoch=0 global_step=800 optimizer_step=99 loss=0.3671875000
10
+ epoch=0 global_step=900 optimizer_step=112 loss=0.3085937500
11
+ epoch=1 global_step=1000 optimizer_step=124 loss=0.3183593750
12
+ epoch=1 global_step=1100 optimizer_step=137 loss=0.2519531250
13
+ epoch=1 global_step=1200 optimizer_step=149 loss=0.2333984375
14
+ epoch=1 global_step=1300 optimizer_step=162 loss=0.2490234375
15
+ epoch=1 global_step=1400 optimizer_step=174 loss=0.2187500000
16
+ epoch=1 global_step=1500 optimizer_step=187 loss=0.2167968750
17
+ epoch=1 global_step=1600 optimizer_step=199 loss=0.2265625000
18
+ epoch=1 global_step=1700 optimizer_step=212 loss=0.2099609375
19
+ epoch=1 global_step=1800 optimizer_step=224 loss=0.2294921875
20
+ epoch=2 global_step=1900 optimizer_step=236 loss=0.1894531250
21
+ epoch=2 global_step=2000 optimizer_step=249 loss=0.2382812500
22
+ epoch=2 global_step=2100 optimizer_step=261 loss=0.2236328125
23
+ epoch=2 global_step=2200 optimizer_step=274 loss=0.1992187500
24
+ epoch=2 global_step=2300 optimizer_step=286 loss=0.1865234375
25
+ epoch=2 global_step=2400 optimizer_step=299 loss=0.2041015625
26
+ epoch=2 global_step=2500 optimizer_step=311 loss=0.1796875000
27
+ epoch=2 global_step=2600 optimizer_step=324 loss=0.2275390625
28
+ epoch=2 global_step=2700 optimizer_step=336 loss=0.1923828125
29
+ epoch=2 global_step=2800 optimizer_step=349 loss=0.1855468750
30
+ epoch=3 global_step=2900 optimizer_step=361 loss=0.1904296875
31
+ epoch=3 global_step=3000 optimizer_step=374 loss=0.1796875000
32
+ epoch=3 global_step=3100 optimizer_step=386 loss=0.1972656250
33
+ epoch=3 global_step=3200 optimizer_step=399 loss=0.1757812500
34
+ epoch=3 global_step=3300 optimizer_step=411 loss=0.1884765625
35
+ epoch=3 global_step=3400 optimizer_step=424 loss=0.1562500000
36
+ epoch=3 global_step=3500 optimizer_step=436 loss=0.2138671875
37
+ epoch=3 global_step=3600 optimizer_step=449 loss=0.1748046875
38
+ epoch=3 global_step=3700 optimizer_step=461 loss=0.2070312500
39
+ epoch=4 global_step=3800 optimizer_step=473 loss=0.1630859375
40
+ epoch=4 global_step=3900 optimizer_step=486 loss=0.1621093750
41
+ epoch=4 global_step=4000 optimizer_step=498 loss=0.1777343750
42
+ epoch=4 global_step=4100 optimizer_step=511 loss=0.1953125000
43
+ epoch=4 global_step=4200 optimizer_step=523 loss=0.2207031250
44
+ epoch=4 global_step=4300 optimizer_step=536 loss=0.1591796875
45
+ epoch=4 global_step=4400 optimizer_step=548 loss=0.2226562500
46
+ epoch=4 global_step=4500 optimizer_step=561 loss=0.1650390625
47
+ epoch=4 global_step=4600 optimizer_step=573 loss=0.2050781250
48
+ epoch=5 global_step=4700 optimizer_step=586 loss=0.2167968750
49
+ epoch=5 global_step=4800 optimizer_step=598 loss=0.1650390625
50
+ epoch=5 global_step=4900 optimizer_step=611 loss=0.1669921875
51
+ epoch=5 global_step=5000 optimizer_step=623 loss=0.1494140625
52
+ epoch=5 global_step=5100 optimizer_step=636 loss=0.1767578125
53
+ epoch=5 global_step=5200 optimizer_step=648 loss=0.1806640625
54
+ epoch=5 global_step=5300 optimizer_step=661 loss=0.1562500000
55
+ epoch=5 global_step=5400 optimizer_step=673 loss=0.1757812500
56
+ epoch=5 global_step=5500 optimizer_step=686 loss=0.1787109375
57
+ epoch=5 global_step=5600 optimizer_step=698 loss=0.1679687500
58
+ epoch=6 global_step=5700 optimizer_step=710 loss=0.1787109375
59
+ epoch=6 global_step=5800 optimizer_step=723 loss=0.1933593750
60
+ epoch=6 global_step=5900 optimizer_step=735 loss=0.1845703125
61
+ epoch=6 global_step=6000 optimizer_step=748 loss=0.1875000000
62
+ epoch=6 global_step=6100 optimizer_step=760 loss=0.1796875000
63
+ epoch=6 global_step=6200 optimizer_step=773 loss=0.1767578125
64
+ epoch=6 global_step=6300 optimizer_step=785 loss=0.1826171875
65
+ epoch=6 global_step=6400 optimizer_step=798 loss=0.1542968750
66
+ epoch=6 global_step=6500 optimizer_step=810 loss=0.1816406250
67
+ epoch=7 global_step=6600 optimizer_step=823 loss=0.1562500000
68
+ epoch=7 global_step=6700 optimizer_step=835 loss=0.1738281250
69
+ epoch=7 global_step=6800 optimizer_step=848 loss=0.1669921875
70
+ epoch=7 global_step=6900 optimizer_step=860 loss=0.1953125000
71
+ epoch=7 global_step=7000 optimizer_step=873 loss=0.1523437500
72
+ epoch=7 global_step=7100 optimizer_step=885 loss=0.1708984375
73
+ epoch=7 global_step=7200 optimizer_step=898 loss=0.1904296875
74
+ epoch=7 global_step=7300 optimizer_step=910 loss=0.1591796875
75
+ epoch=7 global_step=7400 optimizer_step=923 loss=0.1494140625
76
+ epoch=7 global_step=7500 optimizer_step=935 loss=0.1445312500
77
+ epoch=8 global_step=7600 optimizer_step=947 loss=0.2148437500
78
+ epoch=8 global_step=7700 optimizer_step=960 loss=0.1972656250
79
+ epoch=8 global_step=7800 optimizer_step=972 loss=0.1689453125
80
+ epoch=8 global_step=7900 optimizer_step=985 loss=0.2050781250
81
+ epoch=8 global_step=8000 optimizer_step=997 loss=0.1279296875
82
+ epoch=8 global_step=8100 optimizer_step=1010 loss=0.1718750000
83
+ epoch=8 global_step=8200 optimizer_step=1022 loss=0.1513671875
84
+ epoch=8 global_step=8300 optimizer_step=1035 loss=0.1552734375
85
+ epoch=8 global_step=8400 optimizer_step=1047 loss=0.1699218750
86
+ epoch=9 global_step=8500 optimizer_step=1060 loss=0.2080078125
87
+ epoch=9 global_step=8600 optimizer_step=1072 loss=0.2226562500
88
+ epoch=9 global_step=8700 optimizer_step=1085 loss=0.1816406250
89
+ epoch=9 global_step=8800 optimizer_step=1097 loss=0.1611328125
90
+ epoch=9 global_step=8900 optimizer_step=1110 loss=0.1621093750
91
+ epoch=9 global_step=9000 optimizer_step=1122 loss=0.1923828125
92
+ epoch=9 global_step=9100 optimizer_step=1135 loss=0.1425781250
93
+ epoch=9 global_step=9200 optimizer_step=1147 loss=0.1679687500
94
+ epoch=9 global_step=9300 optimizer_step=1160 loss=0.1845703125
95
+ epoch=10 global_step=9400 optimizer_step=1172 loss=0.1630859375
96
+ epoch=10 global_step=9500 optimizer_step=1184 loss=0.1757812500
97
+ epoch=10 global_step=9600 optimizer_step=1197 loss=0.1611328125
98
+ epoch=10 global_step=9700 optimizer_step=1209 loss=0.1464843750
99
+ epoch=10 global_step=9800 optimizer_step=1222 loss=0.2089843750
100
+ epoch=10 global_step=9900 optimizer_step=1234 loss=0.1787109375
101
+ epoch=10 global_step=10000 optimizer_step=1247 loss=0.1787109375
102
+ epoch=10 global_step=10100 optimizer_step=1259 loss=0.1689453125
103
+ epoch=10 global_step=10200 optimizer_step=1272 loss=0.2119140625
104
+ epoch=10 global_step=10300 optimizer_step=1284 loss=0.1503906250
105
+ epoch=11 global_step=10400 optimizer_step=1297 loss=0.1572265625
106
+ epoch=11 global_step=10500 optimizer_step=1309 loss=0.2089843750
107
+ epoch=11 global_step=10600 optimizer_step=1322 loss=0.1318359375
108
+ epoch=11 global_step=10700 optimizer_step=1334 loss=0.1884765625
109
+ epoch=11 global_step=10800 optimizer_step=1347 loss=0.1474609375
110
+ epoch=11 global_step=10900 optimizer_step=1359 loss=0.2011718750
111
+ epoch=11 global_step=11000 optimizer_step=1372 loss=0.1425781250
112
+ epoch=11 global_step=11100 optimizer_step=1384 loss=0.1708984375
113
+ epoch=11 global_step=11200 optimizer_step=1397 loss=0.1679687500
114
+ epoch=12 global_step=11300 optimizer_step=1409 loss=0.1318359375
115
+ epoch=12 global_step=11400 optimizer_step=1421 loss=0.1484375000
116
+ epoch=12 global_step=11500 optimizer_step=1434 loss=0.1464843750
117
+ epoch=12 global_step=11600 optimizer_step=1446 loss=0.1582031250
118
+ epoch=12 global_step=11700 optimizer_step=1459 loss=0.1894531250
119
+ epoch=12 global_step=11800 optimizer_step=1471 loss=0.1542968750
120
+ epoch=12 global_step=11900 optimizer_step=1484 loss=0.1386718750
121
+ epoch=12 global_step=12000 optimizer_step=1496 loss=0.1699218750
122
+ epoch=12 global_step=12100 optimizer_step=1509 loss=0.1767578125
123
+ epoch=13 global_step=12200 optimizer_step=1521 loss=0.1757812500
124
+ epoch=13 global_step=12300 optimizer_step=1534 loss=0.1621093750
125
+ epoch=13 global_step=12400 optimizer_step=1546 loss=0.2060546875
126
+ epoch=13 global_step=12500 optimizer_step=1559 loss=0.1601562500
127
+ epoch=13 global_step=12600 optimizer_step=1571 loss=0.1367187500
128
+ epoch=13 global_step=12700 optimizer_step=1584 loss=0.2060546875
129
+ epoch=13 global_step=12800 optimizer_step=1596 loss=0.1396484375
130
+ epoch=13 global_step=12900 optimizer_step=1609 loss=0.1875000000
131
+ epoch=13 global_step=13000 optimizer_step=1621 loss=0.2187500000
132
+ epoch=13 global_step=13100 optimizer_step=1634 loss=0.1591796875
133
+ epoch=14 global_step=13200 optimizer_step=1646 loss=0.1523437500
134
+ epoch=14 global_step=13300 optimizer_step=1658 loss=0.1640625000
135
+ epoch=14 global_step=13400 optimizer_step=1671 loss=0.1435546875
136
+ epoch=14 global_step=13500 optimizer_step=1683 loss=0.1660156250
137
+ epoch=14 global_step=13600 optimizer_step=1696 loss=0.1884765625
138
+ epoch=14 global_step=13700 optimizer_step=1708 loss=0.1445312500
139
+ epoch=14 global_step=13800 optimizer_step=1721 loss=0.1406250000
140
+ epoch=14 global_step=13900 optimizer_step=1733 loss=0.1689453125
141
+ epoch=14 global_step=14000 optimizer_step=1746 loss=0.1494140625
142
+ epoch=15 global_step=14100 optimizer_step=1758 loss=0.1884765625
143
+ epoch=15 global_step=14200 optimizer_step=1771 loss=0.1894531250
144
+ epoch=15 global_step=14300 optimizer_step=1783 loss=0.1708984375
145
+ epoch=15 global_step=14400 optimizer_step=1796 loss=0.1650390625
146
+ epoch=15 global_step=14500 optimizer_step=1808 loss=0.1357421875
147
+ epoch=15 global_step=14600 optimizer_step=1821 loss=0.1689453125
148
+ epoch=15 global_step=14700 optimizer_step=1833 loss=0.1748046875
149
+ epoch=15 global_step=14800 optimizer_step=1846 loss=0.1757812500
150
+ epoch=15 global_step=14900 optimizer_step=1858 loss=0.1572265625
151
+ epoch=15 global_step=15000 optimizer_step=1871 loss=0.1630859375
152
+ epoch=16 global_step=15100 optimizer_step=1883 loss=0.1416015625
153
+ epoch=16 global_step=15200 optimizer_step=1895 loss=0.1474609375
154
+ epoch=16 global_step=15300 optimizer_step=1908 loss=0.1640625000
155
+ epoch=16 global_step=15400 optimizer_step=1920 loss=0.1406250000
156
+ epoch=16 global_step=15500 optimizer_step=1933 loss=0.1308593750
157
+ epoch=16 global_step=15600 optimizer_step=1945 loss=0.1865234375
158
+ epoch=16 global_step=15700 optimizer_step=1958 loss=0.1669921875
159
+ epoch=16 global_step=15800 optimizer_step=1970 loss=0.1953125000
160
+ epoch=16 global_step=15900 optimizer_step=1983 loss=0.1835937500
161
+ epoch=17 global_step=16000 optimizer_step=1995 loss=0.1767578125
162
+ epoch=17 global_step=16100 optimizer_step=2008 loss=0.1875000000
163
+ epoch=17 global_step=16200 optimizer_step=2020 loss=0.1376953125
164
+ epoch=17 global_step=16300 optimizer_step=2033 loss=0.1513671875
165
+ epoch=17 global_step=16400 optimizer_step=2045 loss=0.1455078125
166
+ epoch=17 global_step=16500 optimizer_step=2058 loss=0.1660156250
167
+ epoch=17 global_step=16600 optimizer_step=2070 loss=0.1503906250
168
+ epoch=17 global_step=16700 optimizer_step=2083 loss=0.1406250000
169
+ epoch=17 global_step=16800 optimizer_step=2095 loss=0.1806640625
170
+ epoch=18 global_step=16900 optimizer_step=2107 loss=0.1953125000
171
+ epoch=18 global_step=17000 optimizer_step=2120 loss=0.1562500000
172
+ epoch=18 global_step=17100 optimizer_step=2132 loss=0.1435546875
173
+ epoch=18 global_step=17200 optimizer_step=2145 loss=0.1396484375
174
+ epoch=18 global_step=17300 optimizer_step=2157 loss=0.1630859375
175
+ epoch=18 global_step=17400 optimizer_step=2170 loss=0.1933593750
176
+ epoch=18 global_step=17500 optimizer_step=2182 loss=0.1835937500
177
+ epoch=18 global_step=17600 optimizer_step=2195 loss=0.1474609375
178
+ epoch=18 global_step=17700 optimizer_step=2207 loss=0.1718750000
179
+ epoch=18 global_step=17800 optimizer_step=2220 loss=0.1562500000
180
+ epoch=19 global_step=17900 optimizer_step=2232 loss=0.1689453125
181
+ epoch=19 global_step=18000 optimizer_step=2245 loss=0.1220703125
182
+ epoch=19 global_step=18100 optimizer_step=2257 loss=0.1748046875
183
+ epoch=19 global_step=18200 optimizer_step=2270 loss=0.1796875000
184
+ epoch=19 global_step=18300 optimizer_step=2282 loss=0.1679687500
185
+ epoch=19 global_step=18400 optimizer_step=2295 loss=0.1347656250
186
+ epoch=19 global_step=18500 optimizer_step=2307 loss=0.1542968750
187
+ epoch=19 global_step=18600 optimizer_step=2320 loss=0.1699218750
188
+ epoch=19 global_step=18700 optimizer_step=2332 loss=0.1748046875
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 24,
20
+ "num_key_value_heads": 32,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": false,
26
+ "transformers_version": "4.57.6",
27
+ "use_cache": true,
28
+ "vocab_size": 32000
29
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.6"
6
+ }
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_10/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d01fa85aed8435f89c1b48e9fb573cd0951ae509dc27bc449725181c4c98a312
3
+ size 404771510
llmstream_results/llama2_7b_chat_hf_streamline_slimorca_prune8_start22/lightweight_checkpoint_epoch_12/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68c8be8aa9d170c7b36138a3165ff9d72064967945481bbd032529c53a5a48a2
3
+ size 404771510