Neooooo committed
Commit 703fc3b · verified · 1 Parent(s): 8e9022b

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.

Files changed (50)
  1. .DS_Store +0 -0
  2. prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/description.txt +28 -0
  3. prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/train.sh +1 -0
  4. prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/training.log +551 -0
  5. prune_log/layerskip_1b_prune_0.25/description.txt +28 -0
  6. prune_log/layerskip_1b_prune_0.25/pytorch_model.bin +3 -0
  7. prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/description.txt +28 -0
  8. prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/train.sh +1 -0
  9. prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/training.log +1501 -0
  10. prune_log/vanilla_llama_1b_prune_0.25/description.txt +28 -0
  11. prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin +3 -0
  12. tune_log/.DS_Store +0 -0
  13. tune_log/layerskip_1b_0.25_tune/.DS_Store +0 -0
  14. tune_log/layerskip_1b_0.25_tune/adapter_config.json +22 -0
  15. tune_log/layerskip_1b_0.25_tune/adapter_model.bin +3 -0
  16. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/model.safetensors +3 -0
  17. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/optimizer.pt +3 -0
  18. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/rng_state.pth +3 -0
  19. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scaler.pt +3 -0
  20. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scheduler.pt +3 -0
  21. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/trainer_state.json +820 -0
  22. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/training_args.bin +3 -0
  23. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/model.safetensors +3 -0
  24. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/optimizer.pt +3 -0
  25. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/rng_state.pth +3 -0
  26. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scaler.pt +3 -0
  27. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scheduler.pt +3 -0
  28. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/trainer_state.json +976 -0
  29. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/training_args.bin +3 -0
  30. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/model.safetensors +3 -0
  31. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/optimizer.pt +3 -0
  32. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/rng_state.pth +3 -0
  33. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scaler.pt +3 -0
  34. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scheduler.pt +3 -0
  35. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/trainer_state.json +1132 -0
  36. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/training_args.bin +3 -0
  37. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/model.safetensors +3 -0
  38. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/optimizer.pt +3 -0
  39. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/rng_state.pth +3 -0
  40. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scaler.pt +3 -0
  41. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scheduler.pt +3 -0
  42. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/trainer_state.json +1245 -0
  43. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/training_args.bin +3 -0
  44. tune_log/layerskip_1b_0.25_tune/checkpoint-200/model.safetensors +3 -0
  45. tune_log/layerskip_1b_0.25_tune/checkpoint-200/optimizer.pt +3 -0
  46. tune_log/layerskip_1b_0.25_tune/checkpoint-200/rng_state.pth +3 -0
  47. tune_log/layerskip_1b_0.25_tune/checkpoint-200/scaler.pt +3 -0
  48. tune_log/layerskip_1b_0.25_tune/checkpoint-200/scheduler.pt +3 -0
  49. tune_log/layerskip_1b_0.25_tune/checkpoint-200/trainer_state.json +196 -0
  50. tune_log/layerskip_1b_0.25_tune/checkpoint-200/training_args.bin +3 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: facebook/layerskip-llama3.2-1B
+ - save_ckpt_log_name: layerskip_1b_prune_0.25
+ - pruning_ratio: 0.25
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 2
+ - block_attention_layer_end: 13
+ - block_mlp_layer_start: 2
+ - block_mlp_layer_end: 13
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: param_mix
+ - num_examples: 10
+ - device: cuda
+ - test_before_train: True
+ - eval_device: cuda
+ - test_after_train: True
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
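
The description.txt files in this commit all use the same `- key: value` layout, which looks like a dump of the run's argument namespace. As a rough sketch of how such a file could be produced (the `write_description` helper is hypothetical, not code from this repo; the values shown are copied from the file above):

```python
# Hypothetical sketch: write a description.txt in the same
# "- Training Parameters:" / " - key: value" layout as above.
import argparse

def write_description(args: argparse.Namespace, path: str) -> None:
    with open(path, "w") as f:
        f.write("- Training Parameters:\n")
        for key, value in vars(args).items():
            f.write(f" - {key}: {value}\n")

args = argparse.Namespace(
    base_model="facebook/layerskip-llama3.2-1B",
    save_ckpt_log_name="layerskip_1b_prune_0.25",
    pruning_ratio=0.25,
    pruner_type="taylor",
    max_seq_len=2048,
)
write_description(args, "description.txt")
```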
prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/train.sh ADDED
@@ -0,0 +1 @@
+ python llama3.py --pruning_ratio 0.25 --device cuda --eval_device cuda --base_model facebook/layerskip-llama3.2-1B --block_wise --block_mlp_layer_start 2 --block_mlp_layer_end 13 --block_attention_layer_start 2 --block_attention_layer_end 13 --save_ckpt_log_name layerskip_1b_prune_0.25 --pruner_type taylor --taylor param_mix --max_seq_len 2048 --test_after_train --test_before_train --save_model
prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/training.log ADDED
@@ -0,0 +1,551 @@
+ 2025-03-14 11:11:32 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-03-14 11:11:33 - INFO :
+ ==================Generation Results before Pruning================
+
+ 2025-03-14 11:11:33 - INFO : the shape of current input sequences is ===tensor([[128000, 40, 4510, 279, 7438, 315, 2324, 374]],
+ device='cuda:0')===
+ 2025-03-14 11:11:50 - INFO : <|begin_of_text|>I believe the meaning of life is to find the most delicious way to die. — Dorothea Lange
+ I am a 47-year-old woman living with bipolar disorder. I don't believe this to be the cause of my symptoms, but if I am not careful, it could have an effect on my treatment options.
+ My symptoms began to emerge when I was 21 years old. My father, an environmental scientist, was diagnosed with bipolar disorder in the 1950s. I grew up with him, watching him suffer. He was an extremely loving and nurturing man, with a gentle soul. Bipolar disorder took a toll on him, and it left him struggling for much of his life. Eventually, I was born and grew up in the same home.
+ At one point, I thought my father would live into my 20s. He was still relatively strong when he suffered a heart attack at age 40. I remember watching him die that day in my home with great clarity. I remember my mom was so frightened of what was about to happen. I remember hearing the nurses and doctors talking on the phone, anxiously waiting to learn whether my father was going to live through the heart attack or not.
+ Bipolar disorder was very much a part of my family history. Even though my symptoms emerged at a later age and I am not as close to bipolar disorder as I was to my father, I feel much sympathy for people who suffer from this disorder, because, as I mentioned, it was very hard on my father.
+ When I first learned that I had bipolar disorder, I was angry and upset. I had no clue how this new information would affect my life, so I was not expecting to see how much my father had suffered. I was hoping that by not having the same struggle, I wouldn't suffer the same level of pain, and I would just live my life without any knowledge of bipolar disorder, as though I hadn't had any symptoms at all. But, this hope quickly turned into anger. I was very angry that my father had suffered, not only from bipolar disorder, but the pain associated with his illness and his diagnosis. I was very angry that my parents had not been given the option to be treated by a doctor who understood what bipolar disorder was and how to treat it.
+ I do not think bipolar disorder is going to be eliminated from the human genome any time soon, but that doesn't mean I am going to die of suicide or alcoholism.
+ There is a whole new field of medicine called behavioral medicine, which focuses on the emotional and behavioral aspects of illness, and how these affect treatment outcomes. Because of the increased emphasis on the emotional and behavioral side of illness, it is more likely that these issues will be dealt with in a more holistic way and that you'll be able to get better treatment.
+ The good news is that there are many people who are already successful in treating their illness in a more holistic way. It's important to keep an open mind and not to judge someone for their choice of treatment. I think it's great to have options, and I think that there is nothing wrong with someone who chooses to go on a road that is not straight.
+ The bad news is that bipolar disorder is a very dangerous disease. It is very easy to die from it, and if you don't take care of yourself, it can do a lot of damage. It's important to take good care of yourself, and it's even more important to keep your diagnosis as simple as possible. If you are able to take care of yourself, you'll be much better off.
+ I will definitely make an appointment with the doctor to find out whether or not I have bipolar disorder. If I do, I will make an appointment with a psychiatrist to get help with my treatment.
+ It's interesting to note that when I was diagnosed with bipolar disorder, I wasn't really aware of what that means. For me, it's not even as though I'm diagnosed with bipolar disorder.
+ I am not as interested in the diagnosis as I am in the treatment. That's the way I'm gonna go with this. I am taking care of myself, and I am very much interested in seeing how I can be treated with this.
+ I'll keep you posted. — Dr. Oz
+ Bipolar disorder is a mental illness that affects the brain and behaviors.
+ I believe that the best way to treat bipolar disorder is with medication.
+ I was diagnosed with bipolar disorder at age 26.
+ I'm not sure that there's any such thing as 'being a good bipolar patient' — I think that there's a 'being good at bipolar disorder.' — Dr. Oz
+ I am bipolar. It's a great feeling. — Dr. Oz
+ I am bipolar. There is no such thing as 'being bipolar.' I am bipolar. I am bipolar. There is no such thing as 'being bipolar.' I am bipolar. I am bipolar. There is no such thing as 'being bipolar.' I am bipolar.
+ I think that the way that I am seeing the world at this point in time is that everything is just really hard. It seems like we are in a constant battle between life and death. And I think that it would be so easy to make an easy decision to just not live anymore, you know, to cut my ties with the world. But I think that it is just a very hard time, and I think that the way that I am seeing life at this time is that it is just so hard to just make it through the day. — Dr. Oz
+ I've been diagnosed as bipolar, and bipolar disorder has caused me to go through a lot of difficult times.
+ If you are diagnosed with bipolar disorder, you should also seek counseling to help manage your symptoms and make sure that you aren't suffering from any depression.
+ If you are suffering from bipolar disorder, you should seek out and receive therapy so that you can better manage your condition and stop the cycle from repeating. — Dr. Oz
+ I have bipolar disorder.
+ I'm a manic depressive.
+ I've been diagnosed with bipolar disorder, and bipolar disorder is a severe mental illness that is very hard to live with.
+ I've been diagnosed with bipolar disorder and bipolar disorder is a very serious mental illness that can be difficult to live with.<|end_of_text|>
+ 2025-03-14 11:11:50 - INFO : the shape of current input sequences is ===tensor([[128000, 61346, 2231, 11, 279, 10334, 315, 1375, 44515,
+ 5415, 430, 220]], device='cuda:0')===
+ 2025-03-14 11:11:55 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 1) the speed of light cannot be exceeded, and 2) space and time are curved for objects moving at high speeds. In other words, according to the Theory of Relativity, nothing can change unless you change something.
+ The concept of relativity has been a favorite of mine since I was a kid. When I would read about the war that I was in (The Great Depression), my grandfather would tell me about how he could see the world move forward as the planes flew overhead during air raids. In other words, if you were moving at a normal speed in the world, you would see the world slowly shift around you in order to keep you in the same place.
+ This is exactly what I mean by relativity. When you start moving at a high speed, your mind can no longer perceive space and time as being flat, and things will change as you move around.
+ So why does the speed of light have a minimum limit? I don’t know, but I can tell you for a fact that it does. The minimum limit is around 3×10^16m/s. And in this case, we are talking about an extremely low-speed scenario.
+ Let’s say that you had a speed limit of 10^16 m/s. And you are a regular person. In this case, you will not be able to travel at that speed for a very long time.
+ You may not be able to move faster than 10^16 m/s, but you will never be able to move that fast.
+ The theory of relativity is a bit of a misconception, but it is a common misconception. It is the theory that says that you cannot go faster than the speed of light, or the speed of light itself. That means that you cannot travel faster than a rocket.
+ It’s a misconception, but it is a common misconception, and a misconception that we all have.<|end_of_text|>
+ 2025-03-14 11:11:55 - INFO : the shape of current input sequences is ===tensor([[128000, 31233, 264, 3997, 649, 387, 2884, 304, 220,
+ 605, 4382, 7504, 512]], device='cuda:0')===
+ 2025-03-14 11:11:59 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
+ Get a domain. You may want to use the same domain as your company name or simply use a unique one. It’s recommended you get a.com domain, but.org,.net,.biz, etc. can all be used.
+ Get your hosting. The hosting company will allow you to create a website and it’s often included in the package. If you’re not sure where to start, check out our article on hosting.
+ Create your website. Now is the time to create the content for your website. The content can include product descriptions, images, video and other elements. The content can be as simple as an about page or as detailed as a blog. When you’re finished, upload it to the hosting site and it’s ready to use.
+ Test your website. Test your website by having a human browse it. This is a good way to check if the content is working and also get feedback about what people are thinking of your website.
+ Track your website. It’s often good to track how people are using your website and what they are doing. This can be done by checking web analytics, which can provide information about traffic patterns and behavior. Some companies even offer analytics tools as part of the package.
+ Promote your website. Promotion can be done through various channels, including social media, email marketing and advertising.<|end_of_text|>
+ 2025-03-14 11:11:59 - INFO : the shape of current input sequences is ===tensor([[128000, 49462, 25, 330, 40, 12491, 433, 994, 856,
+ 4641, 11863, 8898, 10246, 32458, 3904, 25, 51957, 198,
+ 27938, 49462, 25, 330, 5159, 1938, 706, 1027, 62904,
+ 235, 702, 32458, 3904, 25, 45003, 198, 27938, 49462,
+ 25, 330, 2028, 374, 279, 2723, 311, 279, 4652,
+ 702, 32458, 3904, 25, 59794, 198, 27938, 49462, 25,
+ 330, 2028, 502, 4731, 2835, 574, 9850, 42517, 702,
+ 32458, 3904, 25]], device='cuda:0')===
+ 2025-03-14 11:12:05 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
+ Sentiment: Negative
+ ###
+ Tweet: "My day has been 👍"
+ Sentiment: Positive
+ ###
+ Tweet: "This is the link to the article"
+ Sentiment: Neutral
+ ###
+ Tweet: "This new music video was incredibile"
+ Sentiment: Neutral
+ ###
+ Tweet: "Oh, that photo was so cute"
+ Sentiment: Positive
+ ###
+ Tweet: "These food are so so good"
+ Sentiment: Neutral
+ ###
+ Tweet: "I am so excited for this concert, I can't wait "
+ Sentiment: Negative
+ ###
+ Tweet: "OMG!!"
+ Sentiment: Neutral
+ ###
+ Tweet: "I cannot wait for the next album"
+ Sentiment: Neutral
+ ###
+ Tweet: "These clothes are so so good"
+ Sentiment: Positive
+ ###
+ Tweet: "These pictures are so cute"
+ Sentiment: Negative
+ ###
+ Tweet: "These photo are so cool"
+ Sentiment: Neutral
+ ###
+ Tweet: "Oh, the photos were so cute"
+ Sentiment: Neutral
+ ###
+ Tweet: "I am glad that you found the link. "
+ Sentiment: Neutral
+ ###
+ Tweet: "I think you should post the link to this article "
+ Sentiment: Neutral
+ ###
+ Tweet: "Oh, it was so cute to see your post. "
+ Sentiment: Negative
+ ###
+ Tweet: "I am surprised to find this link."
+ Sentiment: Negative
+ ###
+ Tweet: "Wow! It's so nice to read your comments. "
+ Sentiment: Neutral
+ ###<|end_of_text|>
+ 2025-03-14 11:12:05 - INFO : the shape of current input sequences is ===tensor([[128000, 28573, 6498, 311, 8753, 1473, 37541, 14479, 466,
+ 591, 326, 412, 265, 409, 4809, 271, 375, 604,
+ 94932, 591, 11540, 383, 3273, 58866, 8047, 271, 501,
+ 1136, 41389, 5763, 591, 41389, 5763, 12077, 34927, 271,
+ 1557, 2423, 591]], device='cuda:0')===
+ 2025-03-14 11:12:05 - INFO : <|begin_of_text|>Translate English to French:
+
+ sea otter => loutre de mer
+
+ peppermint => menthe poivrée
+
+ plush girafe => girafe peluche
+
+ cheese => fromage
+
+ coffee => café
+
+ water => eau
+
+ mash => cruche
+
+ salsa => salsa
+
+ cucumber => l'épée
+ <|end_of_text|>
+ 2025-03-14 11:13:57 - INFO : PPL before pruning: {'wikitext2': 10.877742727456024, 'ptb': 17.553166745968216}
+ 2025-03-14 11:13:57 - INFO : Use taylor pruner...
+ 2025-03-14 11:13:57 - INFO : Pruning Attention Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 11:13:57 - INFO : Pruning MLP Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 11:13:57 - INFO : Start Pruning
+ 2025-03-14 11:14:03 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-03-14 11:14:04 - INFO : Loss = 3.7338414192199707
+ 2025-03-14 11:14:04 - INFO : Loss = 4.748807907104492
+ 2025-03-14 11:14:04 - INFO : Loss = 4.232391834259033
+ 2025-03-14 11:14:05 - INFO : Loss = 3.6372835636138916
+ 2025-03-14 11:14:06 - INFO : Loss = 3.873014450073242
+ 2025-03-14 11:14:06 - INFO : Loss = 4.062342166900635
+ 2025-03-14 11:14:07 - INFO : Loss = 4.176964282989502
+ 2025-03-14 11:14:07 - INFO : Loss = 4.237691402435303
+ 2025-03-14 11:14:08 - INFO : Loss = 4.040452480316162
+ 2025-03-14 11:14:09 - INFO : Loss = 4.438660144805908
+ 2025-03-14 11:14:09 - INFO : Loss = 4.117751121520996
+ 2025-03-14 11:14:10 - INFO : After Iter 1/1, #parameters: 1068566528
+ 2025-03-14 11:14:10 - INFO : #Param before: 1235814400, #Param after: 1068566528, Ratio = 86.4666%
+ 2025-03-14 11:14:13 - INFO :
+ ==================Generation Results After Pruning================
+
+ 2025-03-14 11:14:32 - INFO : <|begin_of_text|>I believe the meaning of life is to love, and that whatever you give out, you get back in full measure. You have to get out of yourself what you don't like in yourself, what is self-defeating. I didn't know that before, that you could be happy that your life is miserable.
+ —Bill Cosby on Happiness, July 12, 1987
+ My attitude towards life is to appreciate what is, and enjoy it while it lasts. If you have to pay for a living, it's for a living. If you have to suffer for something, it is for a living.
+ —Bill Cosby on Life, March 16, 1969
+ My attitude toward life is to appreciate what is good and let it be, and to thank the universe that my life is so long, and that I am so ignorant of what is good and what is bad that, somehow, I should be happy always.
+ —Bill Cosby on Happiness, March 21, 1963
+ My attitude towards life is to expect the best of all times. You've heard me saying before that I am all about finding God. I used to believe that there is a God, but they say that there is no God. So I decided that I should put my energy into being happy, and then see what would happen.
+ —Bill Cosby on Happiness, March 29, 1967
+ In the face of adversity, your greatest strength is your ability to forgive. The world is always giving you reasons to be angry. Today I am letting go of the anger and turning toward joy.<|end_of_text|>
+ 2025-03-14 11:16:27 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 2 people are moving when they really are not, in a way known as quantum theory.
+ The idea of relativity was derived by physicist German physicist Johann Lorenz Otto von Arnaudt on April 19, 1895 in the basement of his home near Munich, Germany.
+ During the early afternoon, the sky was cloudy and still raining heavily when a mysterious man, dressed in white, wearing a grey hat appeared from the dark forest and stepped into the front room of the house.
+ Suddenly, a young man who was standing still in the dark forest suddenly moved, he turned his head towards the cloudless sky and raised his hand towards the cloud and began to yell, the cloud rolled up.
+ A woman who was standing quietly in the corner was frozen for a moment and suddenly stood up and hurriedly walked out of the room, only when she came to the man, she immediately pulled the hand of the person wearing the white suit.
+ An unspoken silence fell over the whole house.
+ After a while, a man who was walking on the side of the forest suddenly rushed into the room of the house, with the sky and the rain on his shoulders.
+ He was dressed in a long light blue coat and white shoes. With a mysterious expression on his face, a grey long beard and a white long beard, he looked very mysterious.
+ At first sight, the man was not surprised because the man was dressed in a white suit, black long socks and a long white beard. He walked quietly on the wet floor, and without any hesitation, he pulled back to the side of the room.
+ The man had a thin, white beard and was dressed in black clothes and gray shoes.
+ The man stopped in front of the little girl, who was standing at the edge of the room.
+ He looked at the little girl, who was leaning over the window. He looked at the little girl.
+ Then, a dark, cold and mysterious smile appeared on his face.
+ The little girl who was staring at the man was silent.
+ The man raised his eyebrows and pushed the little girl away.
+ A sudden thunder sounded.
+ The little girl's hair was shaking.
+ A huge, powerful force hit me in the chest.
+ All my mind was focused on the little girl.
+ I was standing there for a while.
+ I heard the noise of the earth.
+ It's the thunder of heaven.
+ Thundering, roaring and deafening in my ears.
+ The sun has come down.
+ I have not gone to bed.
+ It's already midnight.
+ I will have gone to sleep and have slept.
+ There is no more room for me.
+ [~310 lines of degenerate repetition omitted: the generation loops on variants of "There's no light.", "I'm lost.", "I can't see.", "I'm a victim.", and "We are victims." until it is cut off]
+ 2025-03-14 11:16:50 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
+ The way I found is in my own life and I'm grateful because this method helped me build a solid and prosperous career. I'm grateful for the method because it showed me how to live my life and be happy
+ I'm grateful because the method taught me how to be more independent and achieve success, so I want to thank the method because this helped me learn to appreciate my life and be happy.
+ I'm grateful because the method gave me the ability to be more independent and achieve more success, so I appreciate how to enjoy my life
+ You should be grateful because I have had the experience to know what real happiness is. It's not so happy to be in a place where you don't want to be because you don't want to lose your freedom. Happiness is an endless journey, the more I progress, the more happy I feel.
+ The man I'm grateful to is because I appreciate the fact that I have the ability to live a happy life. This is what motivates me to keep striving, no matter how difficult the work may be.
+ The method that helped me appreciate my life is that I realized how great it was to live my life and how happy I was.
+ There's always something new to learn. In my life, I learned how to be happy by being independent and finding success. As my life progressed, I grew happier as my job provided me with more success.
+ I appreciate because of the fact that I have the ability to achieve success, so I try to work hard on every project and be happy. I appreciate because it allows me to live my life.
+ I'm grateful because the ability to be independent makes me more confident and happy.
+ The method is the method to be happy. My happiness is one of the reasons. If I find myself in a place that I want to be in, I know I can do whatever it takes to feel happy.
+ Building Success in Today's World
+ No matter what we do, we're all independent and self-sufficient. Every project provides us with the power to build our dreams and be happy.<|end_of_text|>
+ 2025-03-14 11:16:50 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
+ Sentiment: Negative
+ ###
+ Tweet: "My day has been 👍"
+ Sentiment: Positive
+ ###
+ Tweet: "This is the link to the article"
+ Sentiment: Neutral
+ ###
+ Tweet: "This new music video was incredibile"
+ Sentiment: Positive
+ <|end_of_text|>
+ 2025-03-14 11:16:51 - INFO : <|begin_of_text|>Translate English to French:
+
+ sea otter => loutre de mer
+
+ peppermint => menthe poivrée
+
+ plush girafe => girafe peluche
+
+ cheese => poitré
+ <|end_of_text|>
+ 2025-03-14 11:16:51 - INFO :
+ ==================Finish================
+
+ 2025-03-14 11:29:00 - INFO : PPL after pruning: {'wikitext2': 17.864446345871784, 'ptb': 28.996858600549206}
+ 2025-03-14 11:29:00 - INFO : Memory Requirement: 6983.26513671875 MiB
+
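
A quick sanity check on the numbers in this log: 1068566528 / 1235814400 ≈ 0.864666, so the reported "Ratio = 86.4666%" is the fraction of parameters retained, meaning only about 13.5% of the weights were removed even though pruning_ratio is 0.25; this is consistent with pruning being restricted to the attention/MLP blocks of layers 2 through 12. Wikitext2 perplexity rises from 10.88 to 17.86 (ptb: 17.55 to 29.00). Since the log names a taylor pruner, here is a generic sketch of first-order Taylor importance scoring; it illustrates the idea only and is not the exact implementation behind this log:

```python
# Generic first-order Taylor importance for structured pruning (a sketch).
# Removing a weight group changes the loss by roughly |sum(W * dL/dW)|,
# the first-order term of a Taylor expansion around the trained weights.
import torch

def taylor_importance(weight: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
    """One importance score per output channel: sum |W * grad| over input dims."""
    salience = (weight.detach() * grad).abs()
    return salience.sum(dim=tuple(range(1, weight.dim())))

# Usage sketch: backprop a few calibration examples (the logged "Loss = ..."
# lines), then rank output channels and keep the top (1 - pruning_ratio).
layer = torch.nn.Linear(2048, 2048)
x = torch.randn(4, 2048)
layer(x).sum().backward()
scores = taylor_importance(layer.weight, layer.weight.grad)
keep = scores.topk(int(2048 * (1 - 0.25))).indices  # pruning_ratio = 0.25
```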
prune_log/layerskip_1b_prune_0.25/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: facebook/layerskip-llama3.2-1B
+ - save_ckpt_log_name: layerskip_1b_prune_0.25
+ - pruning_ratio: 0.25
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 2
+ - block_attention_layer_end: 13
+ - block_mlp_layer_start: 2
+ - block_mlp_layer_end: 13
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: param_mix
+ - num_examples: 10
+ - device: cuda
+ - test_before_train: True
+ - eval_device: cuda
+ - test_after_train: True
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
prune_log/layerskip_1b_prune_0.25/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc0d1e009dd1a98ad7f061ab770b30110938fdab53b7b310e01a8ad44adb5f95
+ size 3279882222
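
The three lines above are the file's Git LFS pointer rather than the binary itself: spec version, sha256 object id, and size in bytes (3279882222 bytes ≈ 3.05 GiB). A small sketch of parsing such a pointer (the helper is illustrative, not part of this repo):

```python
# Parse a Git LFS pointer file like the pytorch_model.bin entry above.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    return {
        "oid": fields["oid"].split(":", 1)[1],  # drop the "sha256:" prefix
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:cc0d1e009dd1a98ad7f061ab770b30110938fdab53b7b310e01a8ad44adb5f95
size 3279882222"""
print(parse_lfs_pointer(pointer))  # size_bytes / 2**30 ≈ 3.05 GiB
```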
prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: meta-llama/Llama-3.2-1B-Instruct
+ - save_ckpt_log_name: vanilla_llama_1b_prune_0.25
+ - pruning_ratio: 0.25
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 2
+ - block_attention_layer_end: 13
+ - block_mlp_layer_start: 2
+ - block_mlp_layer_end: 13
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: param_mix
+ - num_examples: 10
+ - device: cuda
+ - test_before_train: True
+ - eval_device: cuda
+ - test_after_train: True
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/train.sh ADDED
@@ -0,0 +1 @@
+ python llama3.py --pruning_ratio 0.25 --device cuda --eval_device cuda --base_model meta-llama/Llama-3.2-1B-Instruct --block_wise --block_mlp_layer_start 2 --block_mlp_layer_end 13 --block_attention_layer_start 2 --block_attention_layer_end 13 --save_ckpt_log_name vanilla_llama_1b_prune_0.25 --pruner_type taylor --taylor param_mix --max_seq_len 2048 --test_after_train --test_before_train --save_model
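
Both training.log files report perplexity on wikitext2 and ptb before and after pruning. A minimal sketch of a non-overlapping fixed-window perplexity evaluation in the same spirit (a generic recipe, not the repository's evaluation code; the 2048-token window follows max_seq_len above):

```python
# Sketch: wikitext-2 perplexity with non-overlapping 2048-token windows.
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # base_model from train.sh above
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16
).cuda().eval()

text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
ids = tok(text, return_tensors="pt").input_ids.cuda()

window, nlls, n_tokens = 2048, [], 0
for i in range(0, ids.size(1), window):
    chunk = ids[:, i : i + window]
    if chunk.size(1) < 2:  # need at least one shifted target token
        break
    with torch.no_grad():
        loss = model(chunk, labels=chunk).loss  # mean NLL over the chunk
    nlls.append(loss * (chunk.size(1) - 1))
    n_tokens += chunk.size(1) - 1
print(f"wikitext2 ppl: {torch.exp(torch.stack(nlls).sum() / n_tokens).item():.4f}")
```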
prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/training.log ADDED
@@ -0,0 +1,1501 @@
+ 2025-03-14 13:42:35 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-03-14 13:42:37 - INFO :
+ ==================Generation Results before Pruning================
+
+ 2025-03-14 13:42:37 - INFO : the shape of current input sequences is ===tensor([[128000, 40, 4510, 279, 7438, 315, 2324, 374]],
+ device='cuda:0')===
+ 2025-03-14 13:42:40 - INFO : <|begin_of_text|>I believe the meaning of life is to find the most delicious pizza to eat in my spare time.
+ This is a common expression used to encourage people to be happy with their life and to pursue their passions, but this particular expression seems to misunderstand the true meaning of life.
+ The correct interpretation of the expression "find the most delicious pizza to eat in your spare time" is quite absurd, as it assumes that the most delicious pizza can be found anywhere, and that eating it will bring a sense of purpose or fulfillment. In reality, the most delicious pizza is not always the one that has the most toppings, is it? Sometimes the most delicious pizza is one that is made with simple, high-quality ingredients, like a fresh crust or a classic sauce, served with a simple, but satisfying cheese and some fresh vegetables on the side.
+
+ If you're asking me about a sense of purpose or fulfillment in life, I would say that true happiness comes from pursuing your passions and being true to yourself. Whether that means traveling, working, or just living life to the fullest, the most important thing is to find what makes you happy and pursue it with enthusiasm and dedication.
+
+ So, in the words of the great philosopher, "Eat your pizza and remember, too much cheese will make you cry."<|eot_id|>
+ 2025-03-14 13:42:40 - INFO : the shape of current input sequences is ===tensor([[128000, 61346, 2231, 11, 279, 10334, 315, 1375, 44515,
+ 5415, 430, 220]], device='cuda:0')===
+ 2025-03-14 13:42:43 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 10% of the universe is in plain sight, 90% of the universe is hidden in plain sight, and 100% of the universe is invisible.
+ As a child, I never knew there was an entire galaxy of mysteries waiting to be explored until I read some of those books that the professors used in their lectures. Theories have been the driving force behind human discovery and advancement, and they help us navigate the vast expanse of our universe.
+ It seems that the human brain has an inherent ability to adapt to new information and information, but only if it's presented in a coherent manner. This is why we have experts in the field of science and discovery. They help us to better understand the world around us.
+ In the field of science, the theory of relativity is just one of the many theories that have been developed and tested by experts and scientists.<|eot_id|>
+ 2025-03-14 13:42:43 - INFO : the shape of current input sequences is ===tensor([[128000, 31233, 264, 3997, 649, 387, 2884, 304, 220,
+ 605, 4382, 7504, 512]], device='cuda:0')===
+ 2025-03-14 13:42:50 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
+ 1. **Define Your Goal**: Clearly define what you want to achieve with your website. What is your purpose? What do you want to communicate with your visitors? What are your goals? Write down your goal statement.
+
+ 2. **Choose a Domain**: Decide on the domain name for your website. You can register a domain name through a registrar such as GoDaddy or Namecheap. Make sure it's easy to remember and relevant to your website's content.
+
+ 3. **Select a Web Hosting**: Choose a web hosting service that suits your needs. You can start with a basic plan and upgrade as your website grows. Consider factors such as storage space, bandwidth, and customer support.
+
+ 4. **Plan Your Content**: Plan your website's content, including the structure, layout, and the types of content you will include. Decide on the types of pages you need, such as home, about us, contact, and services.
+
+ 5. **Create a Sitemap and Wireframes**: Create a sitemap to outline the pages and structure of your website, and wireframes to create a visual representation of the layout. This will help you plan and organize your website.
+
+ 6. **Design Your Website**: Choose a website builder or hire a web designer to create your website. Select a theme that matches your brand and use it as a starting point. Customize the template as needed to create a visually appealing and user-friendly website.
+
+ 7. **Build Your Website**: Use the web builder's drag-and-drop tools to build your website. Add features such as pages, navigation, and e-commerce functionality if needed. Make sure to test your website for broken links and usability.
+
+ 8. **Install Necessary Plugins and Tools**: Install plugins and tools that can enhance your website's functionality, such as SEO optimization, contact forms, and security features.
+
+ 9. **Test and Launch**: Test your website thoroughly for bugs and errors. Make sure it works correctly on all devices and browsers. Once you're satisfied with your website, proceed to launch it to the public.
+
+ 10. **Maintain and Update**: Regularly update your website to ensure it remains up-to-date and relevant. Add new features, remove outdated content, and make adjustments as needed. Regular backups will also be a must to ensure data integrity.
+
+ Remember, building a website is just the beginning. Once you have your website live, it's essential to maintain it to ensure it remains effective and engaging. Be prepared to respond to user feedback and make changes as needed. With these simple steps, you can build a successful website that achieves your goals.<|eot_id|>
+ 2025-03-14 13:42:50 - INFO : the shape of current input sequences is ===tensor([[128000, 49462, 25, 330, 40, 12491, 433, 994, 856,
+ 4641, 11863, 8898, 10246, 32458, 3904, 25, 51957, 198,
+ 27938, 49462, 25, 330, 5159, 1938, 706, 1027, 62904,
+ 235, 702, 32458, 3904, 25, 45003, 198, 27938, 49462,
+ 25, 330, 2028, 374, 279, 2723, 311, 279, 4652,
+ 702, 32458, 3904, 25, 59794, 198, 27938, 49462, 25,
+ 330, 2028, 502, 4731, 2835, 574, 9850, 42517, 702,
+ 32458, 3904, 25]], device='cuda:0')===
+ 2025-03-14 13:42:51 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
+ Sentiment: Negative
+ ###
+ Tweet: "My day has been 👍"
+ Sentiment: Positive
+ ###
+ Tweet: "This is the link to the article"
+ Sentiment: Neutral
+ ###
+ Tweet: "This new music video was incredibile"
+ Sentiment: Positive
+ ###
+ Tweet: "Good luck to my friend"
+ Sentiment: Positive
+ ###
+ Tweet: "I need help with a research paper"
+ Sentiment: Negative
+ ###
+ Tweet: "The best book I've read all year"
+ Sentiment: Positive
+ ###
+ Tweet: "I love my job"
+ Sentiment: Positive
+ ###<|eot_id|>
+ 2025-03-14 13:42:51 - INFO : the shape of current input sequences is ===tensor([[128000, 28573, 6498, 311, 8753, 1473, 37541, 14479, 466,
+ 591, 326, 412, 265, 409, 4809, 271, 375, 604,
+ 94932, 591, 11540, 383, 3273, 58866, 8047, 271, 501,
+ 1136, 41389, 5763, 591, 41389, 5763, 12077, 34927, 271,
+ 1557, 2423, 591]], device='cuda:0')===
+ 2025-03-14 13:43:19 - INFO : <|begin_of_text|>Translate English to French:
+
+ sea otter => loutre de mer
+
+ peppermint => menthe poivrée
+
+ plush girafe => girafe peluche
+
+ cheese => fromage
+
+ pineapple => pomme á la pinceille
+
+ peanut butter => fromage aux noisettes
+
+ cucumber => la couve
+
+ watermelon => la melon à la renverse
+
+ gloves => gants
+
+ tobacco => tabac
+
+ tea => thé
+
+ roses => roses
+
+ eucalyptus => éucalyptus
+
+ leather => cuir
+
+ paprika => piquant paprika
+
+ sunscreen => protégaient le maquillage
+
+ sugar => sucre
+
+ cinnamon => casson de cire
+
+ mango => manger à la mangue
+
+ pineapple => la pomme à la pinceille
+
+ olive oil => huile de olive
+
+ honey => miel
+
+ lemon => citron
+
+ tangerine => tangerine
+
+ peanut butter => fromage aux noisettes
+
+ pineapple => la pomme à la pinceille
+
+ peach => la pêche
+
+ pear => la pire
+
+ dandelion => dent d'or
+
+ blossom => fleur
+
+ spice (general) => sauce (general)
+
+ carrot juice => jus de carottes
+
+ mango puree => jus de mangue
+
+ chilli pepper => piquant poivre
+
+ blackberry => fraises noires
+
+ cherry => fraise
+
+ fig => figue
+
+ ginger => gingembre
+
+ lemon verbena => les verbaines à la citron
+
+ lemon balm => les balm à la citron
+
+ lavender => la lavande
+
+ lavender syrup => la miel à la lavande
+
+ licorice => la goutte à la licorice
+
+ licorice root => la racine à la licorice
+
+ licorice extract => l'extract de la licorice
+
+ licorice powder => le poussière de la licorice
+
+ licorice capsule => la capsule de la licorice
+
+ black pepper => poivre noir
+
+ saffron => la saffron
+
+ ginger ale => l'ale à linge
+
+ mugicha => la thé à l'érable
+
+ buckwheat => le millet bleu
+
+ black tea => thé noir
+
+ ginger beer => l'opium à l'ingrec
+
+ ginger liqueur => la liqueur à la gingembre
+
+ ginger syrup => la sucre à la gingembre
+
+ ginger juice => la jus de gingembre
+
+ ginger ale => l'ale à linge
+
+ ginger beer => l'opium à l'ingrec
+
+ ginger ale => l'ale à linge
+
+ ginger tea => la thé à la gingembre
+
+ geranium => geranium
+
+ herb tea => la thé des herbes
+
+ honey mint => le menthe à l'honey
+
+ ginger ale => l'ale à linge
+
+ ginger beer => l'opium à l'ingrec
+
+ ginger ale => l'ale à linge
+
+ ginger beer => l'opium à l'ingrec
+ [~250 lines of degenerate repetition omitted: the list loops on "ginger ale => l'ale à linge" until the generation is cut off]
+ 2025-03-14 13:44:48 - INFO : PPL before pruning: {'wikitext2': 13.172416709211404, 'ptb': 24.561296107667808}
+ 2025-03-14 13:44:48 - INFO : Use taylor pruner...
+ 2025-03-14 13:44:48 - INFO : Pruning Attention Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 13:44:48 - INFO : Pruning MLP Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 13:44:49 - INFO : Start Pruning
+ 2025-03-14 13:44:55 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-03-14 13:44:55 - INFO : Loss = 3.7342283725738525
+ 2025-03-14 13:44:56 - INFO : Loss = 4.780483245849609
+ 2025-03-14 13:44:56 - INFO : Loss = 4.577130317687988
+ 2025-03-14 13:44:56 - INFO : Loss = 3.8332598209381104
+ 2025-03-14 13:44:57 - INFO : Loss = 4.485759258270264
+ 2025-03-14 13:44:57 - INFO : Loss = 4.507223129272461
+ 2025-03-14 13:44:58 - INFO : Loss = 4.827515602111816
+ 2025-03-14 13:44:58 - INFO : Loss = 4.578823089599609
+ 2025-03-14 13:44:58 - INFO : Loss = 4.240462303161621
+ 2025-03-14 13:44:59 - INFO : Loss = 4.5878119468688965
+ 2025-03-14 13:44:59 - INFO : Loss = 4.415113925933838
+ 2025-03-14 13:45:00 - INFO : After Iter 1/1, #parameters: 1068566528
+ 2025-03-14 13:45:00 - INFO : #Param before: 1235814400, #Param after: 1068566528, Ratio = 86.4666%
+ 2025-03-14 13:45:03 - INFO :
+ ==================Generation Results After Pruning================
+
+ 2025-03-14 13:45:20 - INFO : <|begin_of_text|>I believe the meaning of life is not in the ability to find someone to go out on a beautiful day, but in the speed at which it is far from someone to whom you are connected.
+ — Paulo Wier
+ As a Christian, God has given us the gift of human beings with the purpose of experiencing life in God. God gave us the ability to experience life on Earth in two ways.
+ A) In one day we will be able to experience the beauty of God in all life and understand what they feel
+ A) God created us and gave us the ability to experience life on earth
+ B) We will experience life at an optimal frequency in order to understand who God has given us the ability to share our lives and experience the beauty of God with all life on Earth
+ A) They would be able to experience life on Earth and understand what God gave us
+ B) We have developed human beings with the purpose of sharing our lives with every life on Earth
+ B) God gave us the ability to share and find what we experience in the life on Earth
+ A) They understand God with all life on Earth
+ B) They would be able to find the experience of all life on Earth and all life on Earth in order to share
+ B) God made us human beings with the ability to give our life on Earth to be able to share
+ B) They would be able to share and find a life of God in all people of all life
+ B) God created us in order to find a life that we experience the beauty of life on Earth in order to share what God gave us.<|eot_id|>
+ 2025-03-14 13:47:09 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 2) The physical world is not uniform, but irregular. The universe is not uniform, but chaotic, because in the universe is chaotic. The above text states that there are two main characters who created this type of chaos in the world of the novel. The characters are very different and completely distinct. They are the world is not chaotic enough.
+ The first is: It is chaotic!
+ It is chaotic.
+ Here are the names: Ischa (A) : *S*: *: *:
+ * ::
+ : =
+ * The story has an ending like this: So far the story is chaotic because it is chaotic with an equalizing story ending with all chaos, so let's continue it.
+ Cha : *:
+ : *: : : :
+
+ From the story of Jansz, we can get three different and unique characters from the universe's chaos of, so let's continue with our story. What is next, for we are not complete in our current story. With a chaotic story line of chaos and an unequal story ending of the novel, chaos and chaos have now created a chaotic and chaotic. What is the next step?
+
+ Cha = *:
+ *:
+ *: : -cha Cha : : :
+ Let's proceed with the following steps of our chaos and chaos story that we previously had with chaos so far and have completed with the following chaos, chaos, chaos that follows the chaos and continues with a chaotic story that we have just previously completed. Let's proceed with the chaos. What is the next step for chaos?
+
+ Ch: *:
+ *:
+ *:
+ *:
+ *:
+ * *:
+
+ Cha : Chaos = *:
+ *:
+ * : :
531
+ * :
532
+ * :
533
+ * : :
534
+
535
+ The next step for us is to create a chaotic story and chaos that comes from the chaos of chaos and complete the chaotic story that we have created and previously mentioned with chaos and chaos.
536
+
537
+ Let's proceed with the chaos.
538
+ We can continue with the chaos with this topic we previously created with chaos and chaos and the chaos of chaos and chaos.
539
+
540
+ Now, we can take with this chaos story and create a chaos story that is also very chaotic and also chaotic.
541
+
542
+ So far, it's chaos. Chaos and chaos is very good that for chaos and chaos and chaos that will look good at the moment.
543
+ This chaos is very good for chaos and chaos for chaos, now let's proceed with a further chaos and chaos will keep creating it.
544
+
545
+ And the above we have already had two steps for chaos, so far we have been following chaos with chaos, which chaos and chaos and chaos we have created. Chaos. And the chaos is very good that chaos for, chaos and chaos for chaos.
546
+
547
+ Let's go with this chaotic story we previously created and created. Let's see where we are in our chaotic story.
548
+ It's chaos, chaos, chaos, chaos, chaos with chaos. Chaos is very good that chaos for chaos and chaos. Cha Cha Chaos, Chaos, Cha Cha!
549
+
550
+ Let's proceed with our chaotic story we've had two chaotic steps for two chaotic steps to chaos. Cha Cha Cha. Cha Cha.
551
+
552
+ Let's take a deep breath and keep our chaotic story, we've only just begun our chaotic story.
553
+ Chaos Chaos
554
+ Crazy Chaos
555
+ So far we have a chaotic story with chaos and chaos with chaos that leads to chaos. That we have created with chaos is very good for chaos for chaos.
556
+
557
+ You have just seen chaos for chaos. You see chaos. Chaos is very good that chaos for chaos. Chaos is very good for chaos for chaos. Chaos is very good for chaos for chaos.
558
+
559
+ I've created another chaotic story of chaos.
560
+ We've created chaos with chaos. Chaos has created chaos. Chaos is very good for chaos for chaos.
561
+
562
+ Let's proceed with the chaos we've created. As the chaotic story we have just created. Here's what we've created with chaos and chaos.
563
+ We have chaos with chaos. Chaos has created chaos. Chaos and chaos with chaos.
564
+
565
+ What does the chaotic chaos look like, look like now and how is it? We have created chaotic chaos in chaos.
566
+
567
+ This chaotic chaos is very chaotic. Cha Cha Cha.
568
+
569
+ Chaos is chaotic. Chaos is chaotic.
570
+ Chaos is chaotic. Chaos is chaotic.
571
+ We've created chaos, chaos, chaos with chaos.
572
+
573
+ We've created a chaotic chaos and chaos and chaos.
574
+ Chaos is chaotic.
575
+
576
+ Let's look at chaos with chaos in chaos with chaos. Chaos is chaos with chaos. Chaos is chaos. Chaos is chaos.
577
+
578
+ We've created chaos with chaos. Chaos is chaotic.
579
+
580
+ Chaos chaos is chaos.
581
+ Cha Cha 1 Cha.
582
+
583
+ Cha Cha.
584
+
585
+ Cha Cha. Cha.
586
+
587
+ Cha Cha. Cha.
588
+
589
+ Cha Cha. Cha.
590
+
591
+ Cha Cha. Cha. Cha.
592
+ .
593
+
594
+ We need Chaos chaos is chaos chaotic with chaos
595
+
596
+ We need the chaos with chaos.
597
+ Chaos is chaos. Chaos is chaotic.
598
+
599
+ We have chaos chaos with chaos.
600
+ cha Cha Cha. Chaos with chaos.
601
+
602
+ So we are the chaos chaos chaotic with chaos.
603
+ Chaos chaos Chaos.
604
+
605
+ We have chaos chaos. Chaos. Chaos.
606
+ Chaos chaos.
607
+
608
+ Now we need Chaos chaos chaos. Chaos chaos.
609
+ Cha.
610
+ Let's go to a chaotic chaos chaos now with chaos chaos.
611
+
612
+ Chaos chaos.
613
+
614
+ cha Cha. Chaos.
615
+
616
+ C Chaos chaos.
617
+
618
+ cha Cha chaos.
619
+
620
+ Cha Cha Chaos.
621
+ Chaos chaos.
622
+
623
+ We'll create a chaotic chaos.
624
+
625
+ We'll create chaos.
626
+ CHA Cha Chaos
627
+ Cha Cha
628
+
629
+ We've created chaos chaos.
630
+
631
+ Chaos Chaos
632
+ cha Cha Chaos.
633
+ Let's proceed with chaos chaos chaos
634
+
635
+ Cha Chaos:
636
+
637
+ With chaos Chaos
638
+ Cah Chaos chaos
639
+
640
+ Cha Chaos Chaos.
641
+
642
+ We'll create a chaotic chaotic chaos. Chaos.
643
+
644
+ Cha Chaos chaos chaos.
645
+
646
+ Cha Chaos chaos.
647
+
648
+ Cah Chaos Chaos.
649
+
650
+ We'll create a chaotic chaos chaos.
651
+
652
+ Now we'll create a chaotic chaos chaos chaos. chaos.
653
+
654
+ We'll create a chaotic chaos chaos chaos.
655
+
656
+ Let's get ready for chaos chaos chaos.
657
+
658
+ We've created chaos chaos chaos.
659
+
660
+ Let's proceed with chaos chaos chaos.
661
+
662
+ cha Chaos Chaos.
663
+
664
+ We'll create a chaotic chaos chaos chaos.
665
+
666
+ cha Chaos chaos.
667
+
668
+ cha Chaos.
669
+
670
+ We'll create a chaotic chaos chaos.
671
+
672
+ cha Chaos chaos.
673
+
674
+ cha Chaos chaos.
675
+
676
+ Cah Chaos Chaos Chaos.
677
+
678
+ Cha Chaos chaos Cha Chaos Chaos.
679
+
680
+ cha Chaos chaos.
681
+
682
+ Cah Chaos Chaos Chaos Chaos.
683
+
684
+ We've created a chaotic chaotic chaos.
685
+
686
+ Cha Chaos chaos.
687
+
688
+ We'll create chaos chaos chaos chaos.
689
+
690
+ Let's go chaos chaos. Chaos Chaos Chaos
691
+
692
+ cha Chaos Chaos.
693
+
694
+ We've created chaos chaos chaos.
695
+
696
+ Cha Chaos Chaos Chaos.
697
+
698
+ Cah Chaos Chaos Chaos.
699
+
700
+ cha Chaos Chaos chaos.
701
+ We've created a chaotic chaos chaos chaos.
702
+ cha Chaos Chaos chaos
703
+
704
+ Cha Chaos Chaos Chaos.
705
+
706
+ Cha Chaos Chaos Chaos.
707
+ Cah Chaos Chaos.
708
+
709
+ We've created chaos chaos chaos.
710
+
711
+ cha Cha Chaos Chaos Chaos.
712
+
713
+ cha Chaos Chaos Chaos.
714
+
715
+ We've created chaos chaos chaos.
716
+
717
+ Here's chaos chaos.
718
+
719
+ Cha Chaos Chaos Chaos.
720
+
721
+ We've created chaos chaos chaos.
722
+
723
+ Cha Chaos Chaos Chaos.
724
+
725
+ Chaos Chaos.
726
+
727
+ Cha Chaos Chaos.
728
+
729
+ cha Chaos Chaos Chaos.
730
+
731
+ Let's continue with chaotic chaos chaos.
732
+
733
+ Let's proceed with chaotic chaos chaos.
734
+
735
+ cha Cha Chaos.
736
+
737
+ cha Cha Chaos Chaos.
738
+
739
+ Cha Chaos Chaos.
740
+
741
+ cha Cha Chaos Chaos.
742
+
743
+ cha Chaos Chaos Chaos.
744
+
745
+ cha Chaos Chaos.
746
+
747
+ Cha Chaos.
748
+
749
+ Cha Chaos Chaos.
750
+
751
+ cha Chaos Chaos.
752
+
753
+ cha Chaos Chaos.
754
+
755
+ chcha Cha Chaos Chaos.
756
+
757
+ cha Chaos Chaos Chaos.
758
+
759
+ cha Chaos Chaos Chaos.
760
+
761
+ cha Chaos Chaos Chaos Chaos.
762
+
763
+ Cha Chaos Chaos.
764
+
765
+ cha Chaos Chaos Chaos.
766
+
767
+ We've created chaos chaos.
768
+
769
+ cha Chaos Chaos.
770
+
771
+ Cha Chaos Chaos.
772
+
773
+ cha Cha Chaos Chaos Chaos.
774
+
775
+ cha Cha Chaos Chaos.
776
+
777
+ We've created chaos chaos chaos.
778
+
779
+ Now we create chaotic chaos chaos with chaos.
780
+
781
+ cha Chaos Chaos Cha.
782
+
783
+ cha Chaos Chaos Chaos.
784
+
785
+ cha Chaos Chaos Chaos.
786
+
787
+ Cha Chaos Chaos Chaos.
788
+
789
+ cha Chaos Chaos Chaos.
790
+
791
+ We've created chaos chaos chaos chaos.
792
+
793
+ Let's see what we have created chaos chaos chaos now.
794
+ cha Chaos Chaos Chaos.
795
+
796
+ cha Chaos Chaos Chaos.
797
+
798
+ Let's create chaos chaos chaos.
799
+
800
+ Cha Chaos Chaos.
801
+
802
+ Let's create chaos chaos.
803
+
804
+ cha Cha Chaos Chaos.
805
+
806
+ cha Chaos Chaos Chaos.
807
+ Now's Chaos Chaos Chaos.
808
+
809
+ Cha Chaos Chaos Chaos.
810
+ cha Chaos Chaos Chaos.
811
+
812
+ Cha Chaos Chaos Chaos.
813
+
814
+ cha Chaos Chaos Chaos.
815
+
816
+ cha Chaos Chaos Chaos.
817
+
818
+ cha Chaos Chaos Chaos.
819
+ cha Chaos Chaos Chaos.
820
+
821
+ Here's our chaotic chaos chaos chaos.
822
+ cha Chaos Chaos Chaos.
823
+ cha Chaos Chaos Chaos.
824
+
825
+ Let's proceed with chaos chaos chaos.
826
+ cha Chaos Chaos Chaos.
827
+
828
+ Cha Chaos Chaos Chaos.
829
+
830
+ cha Chaos Chaos Chaos.
831
+
832
+ cha Chaos Chaos Chaos.
833
+ cha Chaos Chaos Chaos.
834
+ We've Chaos Chaos Chaos.
835
+ Ch Chaos Chaos Chaos.
836
+
837
+ Let's proceed with chaos chaos chaos.
838
+
839
+ cha Chaos Chaos Chaos.
840
+
841
+ Ch Chaos Chaos Chaos Chaos.
842
+
843
+ cha Chaos Chaos Chaos.
844
+
845
+ Let's see Chaos Chaos Chaos.
846
+
847
+ cha Chaos Chaos Chaos.
848
+ Cha Chaos Chaos Chaos Chaos.
849
+
850
+ Let's proceed to the chaos Chaos Chaos Chaos Chaos.
851
+
852
+ cha Chaos Chaos Chaos.
853
+ cha Chaos Chaos Chaos.
854
+ Let's Chaos Chaos Chaos.
855
+
856
+ cha Chaos Chaos Chaos.
857
+
858
+ cha Chaos Chaos Chaos.
859
+
860
+ cha Chaos Chaos Chaos.
861
+ cha Chaos Chaos Chaos.
862
+
863
+ Let's create a chaotic chaos chaos chaos.
864
+ cha Chaos Chaos Chaos.
865
+ cha Chaos Chaos Chaos.
866
+
867
+ cha Chaos Chaos Chaos.
868
+ Cha Chaos Chaos Chaos Chaos.
869
+
870
+ We'll create chaos chaos chaos.
871
+
872
+ cha Chaos Chaos Chaos.
873
+
874
+ Chaos Chaos Chaos Chaos.
875
+ Chaos Chaos Chaos.
876
+ cha Chaos Chaos Chaos Chaos.
877
+
878
+ Chaos Chaos Chaos.
879
+
880
+ Let's proceed with Chaos Chaos Chaos.
881
+ We've Chaos Chaos Chaos Chaos.
882
+
883
+ cha Chaos Chaos Chaos Chaos.
884
+
885
+ We'll create chaos chaos chaos.
886
+ cha Chaos Chaos Chaos.
887
+
888
+ We've created chaos chaos chaos.
889
+
890
+ Let's proceed with chaos chaos chaos.
891
+ cha Chaos Chaos Chaos.
892
+
893
+ We've created chaos chaos chaos.
894
+ Cha Chaos Chaos Chaos.
895
+
896
+ cha Chaos Chaos Chaos Chaos Chaos.
897
+ cha Chaos Chaos Chaos.
898
+ cha Chaos Chaos Chaos.
899
+
900
+ cha Chaos Chaos Chaos Chaos.
901
+
902
+ cha Chaos Chaos Chaos.
903
+
904
+ cha Chaos Chaos Chaos Chaos.
905
+
906
+ cha Chaos Chaos Chaos.
907
+
908
+ cha Chaos Chaos Chaos Chaos.
909
+
910
+ Cha Chaos Chaos Chaos.
911
+
912
+ We've created chaos chaos chaos.
913
+ cha Chaos Chaos Chaos.
914
+
915
+ cha Chaos Chaos Chaos.
916
+ cha Chaos Chaos Chaos Chaos.
917
+ cha Chaos Chaos Chaos.
918
+ we's Chaos Chaos Chaos.
919
+
920
+ Now Chaos Chaos Chaos.
921
+ Now
922
+ 2025-03-14 13:48:38 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
923
+ 1. Create a website or app with your desired level of sophistication.
924
+ 2. Obtain a license from third-party users.
925
+ 2. Obtain a copy of the lawsuit.
926
+ 2. Get information about the lawsuit that has been used to build the website.
927
+ 2. Get information about the lawsuit that has been built up to build the website.
928
+ 2. Get information about the lawsuit that has been used to build the website.
929
+ 3. Obtain the information about the lawsuit that has been used to build the website.
930
+ 2. Determine the lawsuit that has been used to build the website.
931
+ 2. Build information about the lawsuit that has been used to build the website.
932
+ 2. Determine the lawsuit that has been used to build the website.
933
+ 3. Choose the laws that has been used to build the website.
934
+ 3. Determine the lawsuit that has been used to build the website.
935
+ 3. Apply the lawsuit that has been used to build the website.
936
+ ## This process can be done quickly and efficiently as there are few barriers to success.
937
+
938
+ The process can be done in 10 simple steps:
939
+ 1. Create a website that is needed and is easily accessible.
940
+ 1. Obtain a copy of the website with the need to build an existing design.
941
+ 2. Obtain a copy of the lawsuit that has been used to build the website.
942
+ 2. Determine the lawsuit that has been used to build the website.
943
+ 2. Determine the lawsuit that has been used to build the website.
944
+ 2. Determine the lawsuit that has been used to build the website.
945
+ 2. Determine the lawsuit that has been used to build the website.
946
+
947
+ ## Step 4. Determination of a simple rule that will be used to build the website
948
+ When the website can build you in 3-4 steps of the building process.
949
+ ## Step 5. Lawsuit of the lawsuit that will be built by the user.
950
+ ## Step 6. Lawsuit the law that has been used.
951
+
952
+ ## Step 7. Determine that a simple law that will be used to build the website.
953
+ ## Step 8. Build and apply the law that has been used.
954
+ ## Step 9. Build the following law that will be used to build the website:
955
+ ## Step 10. Apply that has been used to build the website.
956
+ ## Step 11. Determine that a new lawsuit that has been used to build the website.
957
+ 2. Build and apply the lawsuit that has been used.
958
+ ## Step 13. Determine that has been used to build the website.
959
+ ## Step 15. Use the lawsuit that has been used to build the website.
960
+
961
+ ## Step 16. Apply the lawsuit that has been used.
962
+ ## Step 17. Build a new law that will be used to build the website.
963
+ ## Step 18. Determine that a new lawsuit that has been used.
964
+
965
+ ## Step 20. Determine that has been used to build the website.
966
+ ## Step 21. Use the lawsuit that has been used to build the website.
967
+
968
+ ## Step 32. Determine that the lawsuit has been used to build the website.
969
+ ## Step 33. Determine that the use of the website will be used to build the lawsuit.
970
+
971
+ ## Step 35. Determine that the use of the website will be used to build the lawsuit.
972
+
973
+ ## Step 37. Use the lawsuit that has been used to build the website.
974
+ ## Step 38. Determine that the lawsuit that has been used to build the website.
975
+ ## Step 39. Determine that the use of the lawsuit that has been used to build the website.
976
+
977
+ ## Step 41. Determine that the use of the lawsuit that has been used to build the website.
978
+
979
+ The final law is building the lawsuit that has been used to build the website.
980
+
981
+ ## Step 47. Determine that the use of the lawsuit that has been used to build the website.
982
+
983
+ ## Step 48. Build the use of the lawsuit that has been used to build the website. The lawsuit that has been used is building the lawsuit that has been used.
984
+
985
+ ## Step 52. Lawsuit 1, Building the use of the lawsuit that has been built by the building process.
986
+
987
+ ## Step 55. Determine that the lawsuit that has been used to build the website.
988
+
989
+ ## Step 58. Use the use of the lawsuit that has been used to build the website.
990
+ ## Step 59. Determine that the use of the lawsuit that has been used to build the website.
991
+
992
+ ## Step 60. Use the use of the lawsuit that has been used to build the website.
993
+
994
+ ## Step 65. Use the lawsuit that has been used to build the website.
995
+
996
+ ## Step 69. Determine that the use of the lawsuit that has been used to build the website.
997
+
998
+ ## Step 73. Determine that the use of the lawsuit that has been used to build the website.
999
+
1000
+ ## Step 76. Use the use of the lawsuit that has been used to build the website.
1001
+ ## Step 78. Determine that the use of the lawsuit that has been used to build the website.
1002
+ ## Step 80. Use the use of the lawsuit that has been used to build the website.
1003
+
1004
+ ## Step 82. Build the use of the lawsuit that has been used to build the website.
1005
+ ## Step 84. Determine that the use of the lawsuit that has been used to build the website.
1006
+ ## Step 86. Determine that the lawsuit that has been used to build the website.
1007
+
1008
+ ## Step 98. Use the use of the lawsuit that has been used to build the website.
1009
+ ## Step 102. Determine that the use of the lawsuit that has been used to build the website.
1010
+
1011
+ ## Step 102. Use the use of the lawsuit that has been used to build the website.
1012
+ ## Step 105. Determine that the use of the lawsuit that has been used to build the website.
1013
+
1014
+ ## Step 107. Use the use of the lawsuit that has been used to build the website.
1015
+
1016
+ ## Step 111. Build the use of the lawsuit that has been used to build the website.
1017
+ ## Step 113. Determine that the use of the lawsuit that has been used to build the website.
1018
+ ## Step 114. Determine that the use of the lawsuit that has been used to build the website.
1019
+
1020
+ ## Step 118. Use the lawsuit that has been used to build the website.
1021
+ ## Step 120. Use the use of the lawsuit that has been used to build the website.
1022
+ ## Step 123. Determine that the use of the lawsuit that has been used to build the website.
1023
+
1024
+ ## Step 127. Use the use of the lawsuit that has been used to build the website.
1025
+ ## Step 129. Determine that the use of the lawsuit that has been used to build the website.
1026
+ ## Step 131. Determine that the use of the lawsuit that has been used to build the website.
1027
+
1028
+ ## Step 136. Determine that the use of the lawsuit that has been used to build the website.
1029
+ ## Step 137. Determine that the use of the lawsuit that has been used to build the website.
1030
+
1031
+ ## Step 180. Build the use of the lawsuit that has been used to build the website.
1032
+ ## Step 182. Determine that the use of the lawsuit that has been used to build the website.
1033
+
1034
+ ## Step 184. Use the use of the lawsuit that has been used to build the website.
1035
+ ## Step 186. Use the use of the lawsuit that has been used to build the website.
1036
+
1037
+ ## Step 188. Determine that the use of the lawsuit that has been used to build the website.
1038
+
1039
+ ## Step 191. Determine that the use of the lawsuit that has been used to build the website.
1040
+ ## Step 193. Determine that the use of the lawsuit that has been used to build the website.
1041
+ ## Step 197. Determine that the lawsuit that has been used to build the website.
1042
+ ## Step 199. Use the use of the lawsuit that has been used to build the website.
1043
+ ## Step 201. Determine that the use of the lawsuit that has been used to build the website.<|eot_id|>
1044
+ 2025-03-14 13:48:38 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
1045
+ Sentiment: Negative
1046
+ ###
1047
+ Tweet: "My day has been 👍"
1048
+ Sentiment: Positive
1049
+ ###
1050
+ Tweet: "This is the link to the article"
1051
+ Sentiment: Neutral
1052
+ ###
1053
+ Tweet: "This new music video was incredibile"
1054
+ Sentiment: Positive
1055
+ ### Music: "The new music video"<|eot_id|>
1056
+ 2025-03-14 13:50:25 - INFO : <|begin_of_text|>Translate English to French:
1057
+
1058
+ sea otter => loutre de mer
1059
+
1060
+ peppermint => menthe poivrée
1061
+
1062
+ plush girafe => girafe peluche
1063
+
1064
+ cheese => poivre blancée
1065
+
1066
+ notebook => bouquinier de gréce
1067
+
1068
+ leather => cuir
1069
+
1070
+ The sun is shining brightly today.
1071
+
1072
+ Translation:
1073
+ sun = lune
1074
+ bright = puissant, élevé, élevé, fort
1075
+
1076
+ Translation
1077
+ in 3 steps, or better in french
1078
+ 3 days, or better in french
1079
+ 4 days, and less.
1080
+
1081
+ translation
1082
+ -
1083
+ sun = lune -
1084
+ day = -
1085
+ = -
1086
+
1087
+ day =
1088
+ lune ->lune
1089
+
1090
+ day =
1091
+ d'une -
1092
+ -une - -le - - de - - - de - - de
1093
+ -une - -une - -un - - -
1094
+ -une - -une - -de - - - - a - - a - - -a - -
1095
+
1096
+ day =
1097
+ le - de - - -
1098
+ d'une - - -lune - - -
1099
+ - - - - - -a - - -
1100
+
1101
+ -une - - -une - - -une - -
1102
+ le - -une - -une - -
1103
+ - -une - -un - - -une - -
1104
+ le - -une - -une -
1105
+ -une - -une - -un - -
1106
+
1107
+ -une - -une - -le - -une - - -
1108
+
1109
+ day =
1110
+
1111
+ 3 days, and less. This day has no meaning.
1112
+
1113
+ translation
1114
+ -
1115
+ -
1116
+ 3 -
1117
+ day =
1118
+ le - de - -une - -une - -
1119
+
1120
+ - - - -une -
1121
+ - -une - -de - - -un - -
1122
+
1123
+ - - -une - -une - -
1124
+
1125
+ - day =
1126
+ - day - -
1127
+
1128
+ - - - - - -
1129
+
1130
+ - -une - -une -une -
1131
+
1132
+ - - - -une - -
1133
+
1134
+ une -une -une -une
1135
+
1136
+ - - - - - -
1137
+
1138
+ - - - -
1139
+
1140
+ - - -
1141
+
1142
+ - - - -
1143
+
1144
+ - - - -
1145
+
1146
+ - - - - - -
1147
+
1148
+ - - - - - -
1149
+
1150
+ - - - -
1151
+
1152
+ une - - - - - - -
1153
+
1154
+ - day - - - - - - -
1155
+
1156
+ - - - - - - - -
1157
+
1158
+ - - - - - - - - -
1159
+
1160
+ - - - - - - - - - - -
1161
+
1162
+ day =
1163
+ - - - - - - - - -
1164
+
1165
+ - - - - - - - - -
1166
+
1167
+ - - - - - - -
1168
+
1169
+ - - - - - - - - -
1170
+
1171
+ - - - - - - -
1172
+
1173
+ - - - - - - - - -
1174
+
1175
+ 2 days, and less.
1176
+
1177
+ Note: Day = le jour
1178
+
1179
+ sun -> lune (lune)
1180
+
1181
+ The other day I went to the zoo.
1182
+
1183
+ Translation:
1184
+ The sun day, I
1185
+ The day, and one more.
1186
+
1187
+ Translation
1188
+ - day
1189
+
1190
+ translation : 3 days.
1191
+
1192
+ translation : 3 - -
1193
+
1194
+ - 3 days
1195
+
1196
+ translation: 4 days.
1197
+
1198
+ Translation:
1199
+ - day - - - - - - - -
1200
+
1201
+ day
1202
+
1203
+ = = (day) -
1204
+
1205
+ day
1206
+
1207
+ - - - - - - - -
1208
+
1209
+ I. I. Go. went. I.
1210
+
1211
+ translation:
1212
+ - 3 days.
1213
+ and less.
1214
+ and more.
1215
+ and less.
1216
+ and less.
1217
+
1218
+ 4 days
1219
+ and less.
1220
+ and less.
1221
+ and less.
1222
+
1223
+ translation:
1224
+ I.
1225
+
1226
+ I.
1227
+
1228
+ Go. 2. days.
1229
+ 1. go.
1230
+ Go.
1231
+ I.
1232
+
1233
+ 3
1234
+ 2.
1235
+
1236
+ 3. -
1237
+ 2 - - 1 - - - -
1238
+
1239
+ - - - - - - - -
1240
+
1241
+ - - - - - - -
1242
+
1243
+ - - - - - - - -
1244
+
1245
+ day - days
1246
+
1247
+ translation = 3
1248
+ Translation: the 3 days.
1249
+ translation
1250
+ Translation: day
1251
+ translation: I
1252
+
1253
+ 3 - day
1254
+
1255
+ translation: - - -
1256
+
1257
+ Go.
1258
+ 3.
1259
+ 1. 2. - - - - - -
1260
+ 1. - - - - - - - -
1261
+ translation:
1262
+
1263
+ 3. - - - - - - -
1264
+ - - - - - - -
1265
+ - 3 days.
1266
+ day. - - - - - - -
1267
+
1268
+ - - - - - - - -
1269
+ - - - - - - - - -
1270
+
1271
+ translation:
1272
+ I. day - - - - - - - -
1273
+ 3. - - - - - - -
1274
+
1275
+ - - - - - - - -
1276
+
1277
+ 3 days.
1278
+
1279
+ - - - - - - -
1280
+
1281
+ - - - - - - - - -
1282
+ I. - - - - - - - -
1283
+
1284
+ translated:
1285
+ I. day
1286
+ - - - - - - - - - -
1287
+
1288
+ - - - - - - - - -
1289
+
1290
+ - - - - - - - - - -
1291
+
1292
+ - - - - - - - - - -
1293
+
1294
+ day = translated:
1295
+
1296
+ sun.
1297
+
1298
+ The best way to write.
1299
+
1300
+ Translation
1301
+ the words:
1302
+
1303
+ 3 days, and less.
1304
+
1305
+ Translation:
1306
+ - 4 - days.
1307
+
1308
+ days.
1309
+
1310
+
1311
+ 3, -, 3 -
1312
+ - - - - - - -
1313
+
1314
+ - - - - - - - -
1315
+
1316
+ 3 days, and less.
1317
+
1318
+ Translation:
1319
+ - - - - - - - - - -
1320
+
1321
+ translation:
1322
+
1323
+ I. 2 days.
1324
+
1325
+ The best way to write.
1326
+ 3 - - - - - - -
1327
+
1328
+ 4.
1329
+
1330
+ Translation:
1331
+ - the 3.
1332
+
1333
+ 5 - - - - 3 -
1334
+
1335
+ translation:
1336
+ the other way to write.
1337
+ Translation:
1338
+ 5 translated: 3 days.
1339
+
1340
+ - - - - - - - - -
1341
+
1342
+ I. Go. 3.
1343
+
1344
+ translation: 3.
1345
+
1346
+ days.
1347
+ translation:
1348
+ translation:
1349
+ - 3 - - - - -
1350
+
1351
+ translation:
1352
+ translation:
1353
+ I.
1354
+
1355
+ translation:
1356
+
1357
+ translation:
1358
+ day translated: 3.
1359
+
1360
+ translation:
1361
+
1362
+ translation:
1363
+ days.
1364
+
1365
+
1366
+
1367
+ I translated:
1368
+
1369
+ 2. I day translated.
1370
+
1371
+ translation:
1372
+
1373
+ day
1374
+ - - - - - -
1375
+
1376
+ I.
1377
+
1378
+ translated:
1379
+
1380
+ three.
1381
+ translated: three
1382
+ three,
1383
+ translation:
1384
+
1385
+
1386
+ translation:
1387
+ - - - - -
1388
+
1389
+ - 3 day.
1390
+ - - - - -
1391
+ - - - - -
1392
+
1393
+ translation:
1394
+
1395
+ - - - - - - -
1396
+
1397
+ - - - - - - -
1398
+
1399
+ day.
1400
+ - - - - - - - -
1401
+
1402
+ - - - - - - - -
1403
+
1404
+ - - - - - - - -
1405
+
1406
+ - 3 days.
1407
+
1408
+ translated :
1409
+
1410
+ - - - - - - - -
1411
+
1412
+ translation:
1413
+ translation:
1414
+
1415
+ 3 days.
1416
+ translated:
1417
+ - 3 days.
1418
+
1419
+ translated: day
1420
+
1421
+ translated:
1422
+
1423
+ Translation:
1424
+ day. - - - - - -
1425
+
1426
+ translated:
1427
+
1428
+ - - - - - - - -
1429
+
1430
+ I day.
1431
+
1432
+ translation:
1433
+ - - - - - - - -
1434
+
1435
+ Translation:
1436
+
1437
+ translation:
1438
+ 3 days.
1439
+
1440
+ Translation:
1441
+ - - - - - - -
1442
+
1443
+ - - - - - - -
1444
+
1445
+ - - - - - - - -
1446
+
1447
+ - 3 days.
1448
+
1449
+ I.
1450
+
1451
+ - - - - - - - -
1452
+
1453
+ - - - - - - - -
1454
+
1455
+ Translation:
1456
+ The best way to write the words.
1457
+
1458
+
1459
+ The final translation.
1460
+
1461
+ The translated:
1462
+
1463
+ sun
1464
+ day.
1465
+ - - - - - - -
1466
+
1467
+ translated to 3 days.
1468
+
1469
+ translation:
1470
+
1471
+ days.
1472
+
1473
+
1474
+ The best way to write.
1475
+
1476
+ translation :
1477
+
1478
+
1479
+ I.
1480
+
1481
+ translation :
1482
+ - - - - - - - -
1483
+
1484
+ I.
1485
+
1486
+ translation :
1487
+
1488
+ 3.
1489
+
1490
+ translation:
1491
+
1492
+
1493
+ - - - - - - - -
1494
+
1495
+ - - -
1496
+ 2025-03-14 13:50:25 - INFO :
1497
+ ==================Finish================
1498
+
1499
+ 2025-03-14 14:01:26 - INFO : PPL after pruning: {'wikitext2': 33.245062173048, 'ptb': 58.119428177447986}
1500
+ 2025-03-14 14:01:26 - INFO : Memory Requirement: 6982.2802734375 MiB
1501
+
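The log above prunes with Taylor importance and then reports perplexity on wikitext2 and ptb. Below is a minimal sketch of how such a sliding-window perplexity number is typically computed; the 2048-token window matches max_seq_len in description.txt, while the model path, dtype, and the non-overlapping stride are assumptions, not taken from this repo's scripts:

```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # base model per description.txt; the pruned checkpoint is assumed loadable the same way
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).cuda().eval()

# Concatenate the wikitext-2 test split into one long token stream
text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
ids = tok(text, return_tensors="pt").input_ids.cuda()

seq_len, stride = 2048, 2048  # non-overlapping windows (assumption)
nll_sum, n_tokens = 0.0, 0
for begin in range(0, ids.size(1) - 1, stride):
    chunk = ids[:, begin:begin + seq_len]
    with torch.no_grad():
        # labels=chunk makes the model compute mean next-token NLL over the chunk
        loss = model(chunk, labels=chunk).loss
    nll_sum += loss.item() * (chunk.size(1) - 1)
    n_tokens += chunk.size(1) - 1
print("wikitext2 PPL:", torch.exp(torch.tensor(nll_sum / n_tokens)).item())
```

The jump from 13.17 to 33.25 on wikitext2 is the quality cost of removing roughly 13.5% of the parameters before any recovery fine-tuning, and it matches the degenerate generations recorded above.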
prune_log/vanilla_llama_1b_prune_0.25/description.txt ADDED
@@ -0,0 +1,28 @@
1
+ - Training Parameters:
2
+ - base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ - save_ckpt_log_name: vanilla_llama_1b_prune_0.25
4
+ - pruning_ratio: 0.25
5
+ - pruner_type: taylor
6
+ - temperature: 1.0
7
+ - top_p: 0.95
8
+ - max_seq_len: 2048
9
+ - channel_wise: False
10
+ - block_wise: True
11
+ - layer_wise: False
12
+ - layer: 12
13
+ - block_attention_layer_start: 2
14
+ - block_attention_layer_end: 13
15
+ - block_mlp_layer_start: 2
16
+ - block_mlp_layer_end: 13
17
+ - iterative_steps: 1
18
+ - grouping_strategy: sum
19
+ - global_pruning: False
20
+ - taylor: param_mix
21
+ - num_examples: 10
22
+ - device: cuda
23
+ - test_before_train: True
24
+ - eval_device: cuda
25
+ - test_after_train: True
26
+ - seed: 42
27
+ - save_model: True
28
+ - torch_version: 2.6
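These parameters say: prune 25% of the attention and MLP widths in blocks 2 through 12 (start 2, end 13 exclusive) using Taylor importance, in a single iterative step. A quick arithmetic check against the parameter counts in training.log above shows why only about 13.5% of the weights disappear even at a 0.25 ratio: the embeddings, the LM head, and the blocks outside [2, 13) are left whole.

```python
# Numbers taken from training.log above; nothing else is assumed.
total_before = 1235814400  # full Llama-3.2-1B parameter count
total_after = 1068566528   # after pruning blocks 2-12 at ratio 0.25
print(f"retained: {total_after / total_before:.4%}")  # -> 86.4666%, as logged
```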
prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f074f4aafdecfbdd50a31e261ba583da8ad0dc5eea766db0c87505c0c36b39fb
3
+ size 3279886126
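The three lines above are a Git LFS pointer, not the checkpoint itself; the roughly 3.3 GB blob is fetched on checkout. A small sketch (path relative to the repo root) for verifying a downloaded blob against the recorded oid and size:

```python
import hashlib
import os

path = "prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin"
h = hashlib.sha256()
with open(path, "rb") as f:
    # Hash in 1 MiB blocks to avoid loading the whole file into memory
    for block in iter(lambda: f.read(1 << 20), b""):
        h.update(block)
assert os.path.getsize(path) == 3279886126
assert h.hexdigest() == "f074f4aafdecfbdd50a31e261ba583da8ad0dc5eea766db0c87505c0c36b39fb"
print("LFS pointer matches the downloaded blob")
```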
tune_log/.DS_Store ADDED
Binary file (6.15 kB).
 
tune_log/layerskip_1b_0.25_tune/.DS_Store ADDED
Binary file (6.15 kB).
 
tune_log/layerskip_1b_0.25_tune/adapter_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "base_model_name_or_path": "facebook/layerskip-llama3.2-1B",
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": true,
6
+ "init_lora_weights": true,
7
+ "lora_alpha": 16,
8
+ "lora_dropout": 0.05,
9
+ "modules_to_save": null,
10
+ "peft_type": "LORA",
11
+ "r": 8,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "k_proj",
15
+ "v_proj",
16
+ "o_proj",
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj"
20
+ ],
21
+ "task_type": "CAUSAL_LM"
22
+ }
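This adapter config applies rank-8 LoRA (alpha 16, dropout 0.05) to every attention and MLP projection of the LayerSkip base. A minimal loading sketch with peft follows; note the assumption that the base model must first be pruned exactly as in prune_log, since the vanilla facebook/layerskip-llama3.2-1B shapes will not match adapters trained on the pruned network:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Assumption: this checkpoint must already have the 0.25 structured pruning
# applied, or the LoRA weight shapes in blocks 2-12 will not line up.
base = AutoModelForCausalLM.from_pretrained("facebook/layerskip-llama3.2-1B")
model = PeftModel.from_pretrained(base, "tune_log/layerskip_1b_0.25_tune")
model = model.merge_and_unload()  # fold the LoRA deltas back into the base weights
```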
tune_log/layerskip_1b_0.25_tune/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5754c9ac51484d077d6009e59971b7e5f20883ad4842df5b433f5c413676b9
3
+ size 20011658
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dbd4619d97f3ac622bddf6ea1e5343cb99a03e214ecc3de23a6db65e54de672
3
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10ab7a9b44bef50593b4235e9ee859a78d15399f3d91f66cfa5c47b5c8f31a3c
3
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0c7c0ad96e7c8ff682517153986c0f80d1df307934042b11129380f22d7d7bf
3
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8fdcd0311eba9854fff738038ed4c1a269832665b4d88ba4e4e3d02a1a7e0e
3
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1e58cb0c4d82aeb47dd1f5d659b10c9964a21f4c4d82846e16ee4f619325194
3
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,820 @@
1
+ {
2
+ "best_metric": 1.5820817947387695,
3
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1000",
4
+ "epoch": 1.2855305466237943,
5
+ "eval_steps": 100,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0012861736334405145,
13
+ "grad_norm": 0.39783015847206116,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 2.0835,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.012861736334405145,
20
+ "grad_norm": 0.45549583435058594,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.1408,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02572347266881029,
27
+ "grad_norm": 0.4594053626060486,
28
+ "learning_rate": 2e-05,
29
+ "loss": 2.0894,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.03858520900321544,
34
+ "grad_norm": 0.49020764231681824,
35
+ "learning_rate": 3e-05,
36
+ "loss": 2.1037,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.05144694533762058,
41
+ "grad_norm": 0.37993305921554565,
42
+ "learning_rate": 4e-05,
43
+ "loss": 1.9716,
44
+ "step": 40
45
+ },
46
+ {
47
+ "epoch": 0.06430868167202572,
48
+ "grad_norm": 0.38231977820396423,
49
+ "learning_rate": 5e-05,
50
+ "loss": 1.9349,
51
+ "step": 50
52
+ },
53
+ {
54
+ "epoch": 0.07717041800643087,
55
+ "grad_norm": 0.2922589182853699,
56
+ "learning_rate": 6e-05,
57
+ "loss": 1.906,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.09003215434083602,
62
+ "grad_norm": 0.34647658467292786,
63
+ "learning_rate": 7e-05,
64
+ "loss": 1.8246,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.10289389067524116,
69
+ "grad_norm": 0.31930026412010193,
70
+ "learning_rate": 8e-05,
71
+ "loss": 1.8057,
72
+ "step": 80
73
+ },
74
+ {
75
+ "epoch": 0.1157556270096463,
76
+ "grad_norm": 0.34028756618499756,
77
+ "learning_rate": 9e-05,
78
+ "loss": 1.7546,
79
+ "step": 90
80
+ },
81
+ {
82
+ "epoch": 0.12861736334405144,
83
+ "grad_norm": 0.3878991901874542,
84
+ "learning_rate": 0.0001,
85
+ "loss": 1.7543,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.12861736334405144,
90
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
91
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
92
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
93
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
94
+ "step": 100
95
+ },
96
+ {
97
+ "epoch": 0.1414790996784566,
98
+ "grad_norm": 0.35599613189697266,
99
+ "learning_rate": 9.931224209078405e-05,
100
+ "loss": 1.7309,
101
+ "step": 110
102
+ },
103
+ {
104
+ "epoch": 0.15434083601286175,
105
+ "grad_norm": 0.4075644016265869,
106
+ "learning_rate": 9.862448418156809e-05,
107
+ "loss": 1.6981,
108
+ "step": 120
109
+ },
110
+ {
111
+ "epoch": 0.16720257234726688,
112
+ "grad_norm": 0.4743317663669586,
113
+ "learning_rate": 9.793672627235215e-05,
114
+ "loss": 1.7011,
115
+ "step": 130
116
+ },
117
+ {
118
+ "epoch": 0.18006430868167203,
119
+ "grad_norm": 0.4701610505580902,
120
+ "learning_rate": 9.724896836313618e-05,
121
+ "loss": 1.6771,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.19292604501607716,
126
+ "grad_norm": 0.49115318059921265,
127
+ "learning_rate": 9.656121045392023e-05,
128
+ "loss": 1.6633,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.2057877813504823,
133
+ "grad_norm": 0.5177980661392212,
134
+ "learning_rate": 9.587345254470427e-05,
135
+ "loss": 1.6706,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 0.21864951768488747,
140
+ "grad_norm": 0.465657114982605,
141
+ "learning_rate": 9.518569463548831e-05,
142
+ "loss": 1.6677,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.2315112540192926,
147
+ "grad_norm": 0.5453551411628723,
148
+ "learning_rate": 9.449793672627235e-05,
149
+ "loss": 1.6656,
150
+ "step": 180
151
+ },
152
+ {
153
+ "epoch": 0.24437299035369775,
154
+ "grad_norm": 0.4150402545928955,
155
+ "learning_rate": 9.38101788170564e-05,
156
+ "loss": 1.6568,
157
+ "step": 190
158
+ },
159
+ {
160
+ "epoch": 0.2572347266881029,
161
+ "grad_norm": 0.5106223225593567,
162
+ "learning_rate": 9.312242090784045e-05,
163
+ "loss": 1.6804,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.2572347266881029,
168
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
169
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
170
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
171
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
172
+ "step": 200
173
+ },
174
+ {
175
+ "epoch": 0.27009646302250806,
176
+ "grad_norm": 0.47371965646743774,
177
+ "learning_rate": 9.243466299862448e-05,
178
+ "loss": 1.6235,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.2829581993569132,
183
+ "grad_norm": 0.45723679661750793,
184
+ "learning_rate": 9.174690508940853e-05,
185
+ "loss": 1.6192,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.2958199356913183,
190
+ "grad_norm": 0.46727871894836426,
191
+ "learning_rate": 9.105914718019258e-05,
192
+ "loss": 1.6129,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.3086816720257235,
197
+ "grad_norm": 0.5216034054756165,
198
+ "learning_rate": 9.037138927097662e-05,
199
+ "loss": 1.6065,
200
+ "step": 240
201
+ },
202
+ {
203
+ "epoch": 0.3215434083601286,
204
+ "grad_norm": 0.46132415533065796,
205
+ "learning_rate": 8.968363136176067e-05,
206
+ "loss": 1.6374,
207
+ "step": 250
208
+ },
209
+ {
210
+ "epoch": 0.33440514469453375,
211
+ "grad_norm": 0.5699637532234192,
212
+ "learning_rate": 8.89958734525447e-05,
213
+ "loss": 1.6031,
214
+ "step": 260
215
+ },
216
+ {
217
+ "epoch": 0.34726688102893893,
218
+ "grad_norm": 0.46537184715270996,
219
+ "learning_rate": 8.830811554332875e-05,
220
+ "loss": 1.6196,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.36012861736334406,
225
+ "grad_norm": 0.5034765005111694,
226
+ "learning_rate": 8.76203576341128e-05,
227
+ "loss": 1.6257,
228
+ "step": 280
229
+ },
230
+ {
231
+ "epoch": 0.3729903536977492,
232
+ "grad_norm": 0.48885518312454224,
233
+ "learning_rate": 8.693259972489685e-05,
234
+ "loss": 1.6195,
235
+ "step": 290
236
+ },
237
+ {
238
+ "epoch": 0.3858520900321543,
239
+ "grad_norm": 0.48295891284942627,
240
+ "learning_rate": 8.62448418156809e-05,
241
+ "loss": 1.6301,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.3858520900321543,
246
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
247
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
248
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
249
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.3987138263665595,
254
+ "grad_norm": 0.4800078570842743,
255
+ "learning_rate": 8.555708390646493e-05,
256
+ "loss": 1.6171,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.4115755627009646,
261
+ "grad_norm": 0.47452959418296814,
262
+ "learning_rate": 8.486932599724897e-05,
263
+ "loss": 1.6147,
264
+ "step": 320
265
+ },
266
+ {
267
+ "epoch": 0.42443729903536975,
268
+ "grad_norm": 0.5397221446037292,
269
+ "learning_rate": 8.418156808803301e-05,
270
+ "loss": 1.6041,
271
+ "step": 330
272
+ },
273
+ {
274
+ "epoch": 0.43729903536977494,
275
+ "grad_norm": 0.5501461029052734,
276
+ "learning_rate": 8.349381017881706e-05,
277
+ "loss": 1.6091,
278
+ "step": 340
279
+ },
280
+ {
281
+ "epoch": 0.45016077170418006,
282
+ "grad_norm": 0.47587981820106506,
283
+ "learning_rate": 8.28060522696011e-05,
284
+ "loss": 1.6008,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 0.4630225080385852,
289
+ "grad_norm": 0.46644529700279236,
290
+ "learning_rate": 8.211829436038515e-05,
291
+ "loss": 1.6081,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 0.4758842443729904,
296
+ "grad_norm": 0.5308094024658203,
297
+ "learning_rate": 8.14305364511692e-05,
298
+ "loss": 1.5987,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 0.4887459807073955,
303
+ "grad_norm": 0.5304721593856812,
304
+ "learning_rate": 8.074277854195323e-05,
305
+ "loss": 1.6173,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 0.5016077170418006,
310
+ "grad_norm": 0.6186290383338928,
311
+ "learning_rate": 8.005502063273728e-05,
312
+ "loss": 1.5879,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 0.5144694533762058,
317
+ "grad_norm": 0.4936847388744354,
318
+ "learning_rate": 7.936726272352132e-05,
319
+ "loss": 1.5771,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.5144694533762058,
324
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
325
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
326
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
327
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
328
+ "step": 400
329
+ },
330
+ {
331
+ "epoch": 0.5273311897106109,
332
+ "grad_norm": 0.4969954788684845,
333
+ "learning_rate": 7.867950481430537e-05,
334
+ "loss": 1.5873,
335
+ "step": 410
336
+ },
337
+ {
338
+ "epoch": 0.5401929260450161,
339
+ "grad_norm": 0.5539654493331909,
340
+ "learning_rate": 7.799174690508942e-05,
341
+ "loss": 1.5741,
342
+ "step": 420
343
+ },
344
+ {
345
+ "epoch": 0.5530546623794212,
346
+ "grad_norm": 0.4963805377483368,
347
+ "learning_rate": 7.730398899587345e-05,
348
+ "loss": 1.5883,
349
+ "step": 430
350
+ },
351
+ {
352
+ "epoch": 0.5659163987138264,
353
+ "grad_norm": 0.4849222004413605,
354
+ "learning_rate": 7.66162310866575e-05,
355
+ "loss": 1.6061,
356
+ "step": 440
357
+ },
358
+ {
359
+ "epoch": 0.5787781350482315,
360
+ "grad_norm": 0.5241298079490662,
361
+ "learning_rate": 7.592847317744153e-05,
362
+ "loss": 1.6118,
363
+ "step": 450
364
+ },
365
+ {
366
+ "epoch": 0.5916398713826366,
367
+ "grad_norm": 0.5051389336585999,
368
+ "learning_rate": 7.52407152682256e-05,
369
+ "loss": 1.5618,
370
+ "step": 460
371
+ },
372
+ {
373
+ "epoch": 0.6045016077170418,
374
+ "grad_norm": 0.49376770853996277,
375
+ "learning_rate": 7.455295735900963e-05,
376
+ "loss": 1.5871,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 0.617363344051447,
381
+ "grad_norm": 0.49221155047416687,
382
+ "learning_rate": 7.386519944979367e-05,
383
+ "loss": 1.6037,
384
+ "step": 480
385
+ },
386
+ {
387
+ "epoch": 0.6302250803858521,
388
+ "grad_norm": 0.5378918647766113,
389
+ "learning_rate": 7.317744154057772e-05,
390
+ "loss": 1.5523,
391
+ "step": 490
392
+ },
393
+ {
394
+ "epoch": 0.6430868167202572,
395
+ "grad_norm": 0.5564639568328857,
396
+ "learning_rate": 7.248968363136176e-05,
397
+ "loss": 1.5885,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.6430868167202572,
402
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
403
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
404
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
405
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
406
+ "step": 500
407
+ },
408
+ {
409
+ "epoch": 0.6559485530546624,
410
+ "grad_norm": 0.49083277583122253,
411
+ "learning_rate": 7.180192572214582e-05,
412
+ "loss": 1.5693,
413
+ "step": 510
414
+ },
415
+ {
416
+ "epoch": 0.6688102893890675,
417
+ "grad_norm": 0.5625829100608826,
418
+ "learning_rate": 7.111416781292985e-05,
419
+ "loss": 1.6023,
420
+ "step": 520
421
+ },
422
+ {
423
+ "epoch": 0.6816720257234726,
424
+ "grad_norm": 0.6078226566314697,
425
+ "learning_rate": 7.04264099037139e-05,
426
+ "loss": 1.5845,
427
+ "step": 530
428
+ },
429
+ {
430
+ "epoch": 0.6945337620578779,
431
+ "grad_norm": 0.48107999563217163,
432
+ "learning_rate": 6.973865199449794e-05,
433
+ "loss": 1.5682,
434
+ "step": 540
435
+ },
436
+ {
437
+ "epoch": 0.707395498392283,
438
+ "grad_norm": 0.5080347657203674,
439
+ "learning_rate": 6.905089408528198e-05,
440
+ "loss": 1.5839,
441
+ "step": 550
442
+ },
443
+ {
444
+ "epoch": 0.7202572347266881,
445
+ "grad_norm": 0.5683622360229492,
446
+ "learning_rate": 6.836313617606602e-05,
447
+ "loss": 1.5916,
448
+ "step": 560
449
+ },
450
+ {
451
+ "epoch": 0.7331189710610932,
452
+ "grad_norm": 0.4669715464115143,
453
+ "learning_rate": 6.767537826685007e-05,
454
+ "loss": 1.6146,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 0.7459807073954984,
459
+ "grad_norm": 0.4946054518222809,
460
+ "learning_rate": 6.698762035763412e-05,
461
+ "loss": 1.5764,
462
+ "step": 580
463
+ },
464
+ {
465
+ "epoch": 0.7588424437299035,
466
+ "grad_norm": 0.4975377023220062,
467
+ "learning_rate": 6.629986244841817e-05,
468
+ "loss": 1.6035,
469
+ "step": 590
470
+ },
471
+ {
472
+ "epoch": 0.7717041800643086,
473
+ "grad_norm": 0.5511853098869324,
474
+ "learning_rate": 6.56121045392022e-05,
475
+ "loss": 1.5842,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.7717041800643086,
480
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
481
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
482
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
483
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
484
+ "step": 600
485
+ },
486
+ {
487
+ "epoch": 0.7845659163987139,
488
+ "grad_norm": 0.5689719915390015,
489
+ "learning_rate": 6.492434662998625e-05,
490
+ "loss": 1.5656,
491
+ "step": 610
492
+ },
493
+ {
494
+ "epoch": 0.797427652733119,
495
+ "grad_norm": 0.48885637521743774,
496
+ "learning_rate": 6.42365887207703e-05,
497
+ "loss": 1.5605,
498
+ "step": 620
499
+ },
500
+ {
501
+ "epoch": 0.8102893890675241,
502
+ "grad_norm": 0.5316773056983948,
503
+ "learning_rate": 6.354883081155434e-05,
504
+ "loss": 1.5755,
505
+ "step": 630
506
+ },
507
+ {
508
+ "epoch": 0.8231511254019293,
509
+ "grad_norm": 0.5578161478042603,
510
+ "learning_rate": 6.286107290233837e-05,
511
+ "loss": 1.5532,
512
+ "step": 640
513
+ },
514
+ {
515
+ "epoch": 0.8360128617363344,
516
+ "grad_norm": 0.6534080505371094,
517
+ "learning_rate": 6.217331499312242e-05,
518
+ "loss": 1.5882,
519
+ "step": 650
520
+ },
521
+ {
522
+ "epoch": 0.8488745980707395,
523
+ "grad_norm": 0.5140324831008911,
524
+ "learning_rate": 6.148555708390647e-05,
525
+ "loss": 1.5598,
526
+ "step": 660
527
+ },
528
+ {
529
+ "epoch": 0.8617363344051447,
530
+ "grad_norm": 0.5247426629066467,
531
+ "learning_rate": 6.0797799174690516e-05,
532
+ "loss": 1.5833,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 0.8745980707395499,
537
+ "grad_norm": 0.49460870027542114,
538
+ "learning_rate": 6.011004126547456e-05,
539
+ "loss": 1.621,
540
+ "step": 680
541
+ },
542
+ {
543
+ "epoch": 0.887459807073955,
544
+ "grad_norm": 0.5351711511611938,
545
+ "learning_rate": 5.9422283356258604e-05,
546
+ "loss": 1.5371,
547
+ "step": 690
548
+ },
549
+ {
550
+ "epoch": 0.9003215434083601,
551
+ "grad_norm": 0.5608878135681152,
552
+ "learning_rate": 5.8734525447042644e-05,
553
+ "loss": 1.5878,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.9003215434083601,
558
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
559
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
560
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
561
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.9131832797427653,
566
+ "grad_norm": 0.48291367292404175,
567
+ "learning_rate": 5.8046767537826685e-05,
568
+ "loss": 1.583,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.9260450160771704,
573
+ "grad_norm": 0.4866442382335663,
574
+ "learning_rate": 5.7359009628610725e-05,
575
+ "loss": 1.5891,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.9389067524115756,
580
+ "grad_norm": 0.5254418253898621,
581
+ "learning_rate": 5.667125171939478e-05,
582
+ "loss": 1.5319,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.9517684887459807,
587
+ "grad_norm": 0.5201655030250549,
588
+ "learning_rate": 5.598349381017882e-05,
589
+ "loss": 1.5819,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.9646302250803859,
594
+ "grad_norm": 0.5820693969726562,
595
+ "learning_rate": 5.5295735900962866e-05,
596
+ "loss": 1.5807,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.977491961414791,
601
+ "grad_norm": 0.559010922908783,
602
+ "learning_rate": 5.460797799174691e-05,
603
+ "loss": 1.5597,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.9903536977491961,
608
+ "grad_norm": 0.498877614736557,
609
+ "learning_rate": 5.392022008253095e-05,
610
+ "loss": 1.5628,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.002572347266881,
615
+ "grad_norm": 0.5119406580924988,
616
+ "learning_rate": 5.3232462173315e-05,
617
+ "loss": 1.5693,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.015434083601286,
622
+ "grad_norm": 0.5344542860984802,
623
+ "learning_rate": 5.254470426409904e-05,
624
+ "loss": 1.5256,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.0282958199356913,
629
+ "grad_norm": 0.5358342528343201,
630
+ "learning_rate": 5.185694635488308e-05,
631
+ "loss": 1.5432,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.0282958199356913,
636
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
637
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
638
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
639
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
640
+ "step": 800
641
+ },
642
+ {
643
+ "epoch": 1.0411575562700965,
644
+ "grad_norm": 0.5941470265388489,
645
+ "learning_rate": 5.116918844566713e-05,
646
+ "loss": 1.5449,
647
+ "step": 810
648
+ },
649
+ {
650
+ "epoch": 1.0540192926045016,
651
+ "grad_norm": 0.5659182071685791,
652
+ "learning_rate": 5.048143053645117e-05,
653
+ "loss": 1.5234,
654
+ "step": 820
655
+ },
656
+ {
657
+ "epoch": 1.0668810289389068,
658
+ "grad_norm": 0.5737349390983582,
659
+ "learning_rate": 4.9793672627235217e-05,
660
+ "loss": 1.5517,
661
+ "step": 830
662
+ },
663
+ {
664
+ "epoch": 1.0797427652733118,
665
+ "grad_norm": 0.5984872579574585,
666
+ "learning_rate": 4.910591471801926e-05,
667
+ "loss": 1.5175,
668
+ "step": 840
669
+ },
670
+ {
671
+ "epoch": 1.092604501607717,
672
+ "grad_norm": 0.5954984426498413,
673
+ "learning_rate": 4.8418156808803304e-05,
674
+ "loss": 1.5738,
675
+ "step": 850
676
+ },
677
+ {
678
+ "epoch": 1.1054662379421223,
679
+ "grad_norm": 0.5545582175254822,
680
+ "learning_rate": 4.7730398899587344e-05,
681
+ "loss": 1.5538,
682
+ "step": 860
683
+ },
684
+ {
685
+ "epoch": 1.1183279742765273,
686
+ "grad_norm": 0.6972865462303162,
687
+ "learning_rate": 4.704264099037139e-05,
688
+ "loss": 1.529,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 1.1311897106109325,
693
+ "grad_norm": 0.5404506325721741,
694
+ "learning_rate": 4.635488308115544e-05,
695
+ "loss": 1.5567,
696
+ "step": 880
697
+ },
698
+ {
699
+ "epoch": 1.1440514469453376,
700
+ "grad_norm": 0.5792121887207031,
701
+ "learning_rate": 4.566712517193948e-05,
702
+ "loss": 1.5422,
703
+ "step": 890
704
+ },
705
+ {
706
+ "epoch": 1.1569131832797428,
707
+ "grad_norm": 0.5468006134033203,
708
+ "learning_rate": 4.497936726272352e-05,
709
+ "loss": 1.5369,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 1.1569131832797428,
714
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
715
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
716
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
717
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
718
+ "step": 900
719
+ },
720
+ {
721
+ "epoch": 1.1697749196141478,
722
+ "grad_norm": 0.5955344438552856,
723
+ "learning_rate": 4.429160935350757e-05,
724
+ "loss": 1.5551,
725
+ "step": 910
726
+ },
727
+ {
728
+ "epoch": 1.182636655948553,
729
+ "grad_norm": 0.5832058787345886,
730
+ "learning_rate": 4.360385144429161e-05,
731
+ "loss": 1.5568,
732
+ "step": 920
733
+ },
734
+ {
735
+ "epoch": 1.1954983922829583,
736
+ "grad_norm": 0.6309258937835693,
737
+ "learning_rate": 4.291609353507566e-05,
738
+ "loss": 1.5548,
739
+ "step": 930
740
+ },
741
+ {
742
+ "epoch": 1.2083601286173633,
743
+ "grad_norm": 0.6269820928573608,
744
+ "learning_rate": 4.22283356258597e-05,
745
+ "loss": 1.5459,
746
+ "step": 940
747
+ },
748
+ {
749
+ "epoch": 1.2212218649517685,
750
+ "grad_norm": 0.6376837491989136,
751
+ "learning_rate": 4.154057771664374e-05,
752
+ "loss": 1.5277,
753
+ "step": 950
754
+ },
755
+ {
756
+ "epoch": 1.2340836012861736,
757
+ "grad_norm": 0.6351036429405212,
758
+ "learning_rate": 4.085281980742779e-05,
759
+ "loss": 1.5273,
760
+ "step": 960
761
+ },
762
+ {
763
+ "epoch": 1.2469453376205788,
764
+ "grad_norm": 0.6877638101577759,
765
+ "learning_rate": 4.016506189821183e-05,
766
+ "loss": 1.4986,
767
+ "step": 970
768
+ },
769
+ {
770
+ "epoch": 1.2598070739549838,
771
+ "grad_norm": 0.5501726865768433,
772
+ "learning_rate": 3.947730398899587e-05,
773
+ "loss": 1.5543,
774
+ "step": 980
775
+ },
776
+ {
777
+ "epoch": 1.272668810289389,
778
+ "grad_norm": 0.5217163562774658,
779
+ "learning_rate": 3.8789546079779924e-05,
780
+ "loss": 1.5292,
781
+ "step": 990
782
+ },
783
+ {
784
+ "epoch": 1.2855305466237943,
785
+ "grad_norm": 0.5770425796508789,
786
+ "learning_rate": 3.8101788170563964e-05,
787
+ "loss": 1.5536,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 1.2855305466237943,
792
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
793
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
794
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
795
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
796
+ "step": 1000
797
+ }
798
+ ],
799
+ "logging_steps": 10,
800
+ "max_steps": 1554,
801
+ "num_input_tokens_seen": 0,
802
+ "num_train_epochs": 2,
803
+ "save_steps": 200,
804
+ "stateful_callbacks": {
805
+ "TrainerControl": {
806
+ "args": {
807
+ "should_epoch_stop": false,
808
+ "should_evaluate": false,
809
+ "should_log": false,
810
+ "should_save": true,
811
+ "should_training_stop": false
812
+ },
813
+ "attributes": {}
814
+ }
815
+ },
816
+ "total_flos": 7.434437468513894e+16,
817
+ "train_batch_size": 4,
818
+ "trial_name": null,
819
+ "trial_params": null
820
+ }
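This trainer_state.json records the full loss history: train loss falls from about 2.08 to about 1.55 over 1000 steps while the eval loss on yahma/alpaca-cleaned reaches 1.582, the best_metric above. A small sketch for pulling those curves back out of the file; the keys match the entries shown:

```python
import json

with open("tune_log/layerskip_1b_0.25_tune/checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; eval entries carry the eval_* keys instead
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_yahma/alpaca-cleaned_loss"])
         for e in state["log_history"] if "eval_yahma/alpaca-cleaned_loss" in e]
print("last train loss:", train[-1])            # (1000, 1.5536)
print("best eval loss:", state["best_metric"])  # 1.5820817947387695
```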
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
3
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be54766e305bf39d189b01a3e5ef2ce484ebb25174805158611dc64b011fae50
3
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b02d18468fe272d16fe94b72ee6383b6ff39ed8d29639779dc301f5ad712b87
3
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b12da8cee10762b850bb3c3d3a232a890b2d5b5fe469fbfd08d52ba0459cc724
3
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942cfd7aded7d16363d1ae1a2911c01ef4e25f3c70ed059c88f1845d9b6c24dc
3
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ab7e64594d849ab9ff684dd4e2aac233019c559b7165f90e4c14c5b8cd1512
3
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,976 @@
1
+ {
2
+ "best_metric": 1.5769098997116089,
3
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1200",
4
+ "epoch": 1.542765273311897,
5
+ "eval_steps": 100,
6
+ "global_step": 1200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0012861736334405145,
13
+ "grad_norm": 0.39783015847206116,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 2.0835,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.012861736334405145,
20
+ "grad_norm": 0.45549583435058594,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.1408,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02572347266881029,
27
+ "grad_norm": 0.4594053626060486,
28
+ "learning_rate": 2e-05,
29
+ "loss": 2.0894,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.03858520900321544,
34
+ "grad_norm": 0.49020764231681824,
35
+ "learning_rate": 3e-05,
36
+ "loss": 2.1037,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.05144694533762058,
41
+ "grad_norm": 0.37993305921554565,
42
+ "learning_rate": 4e-05,
43
+ "loss": 1.9716,
44
+ "step": 40
45
+ },
46
+ {
47
+ "epoch": 0.06430868167202572,
48
+ "grad_norm": 0.38231977820396423,
49
+ "learning_rate": 5e-05,
50
+ "loss": 1.9349,
51
+ "step": 50
52
+ },
53
+ {
54
+ "epoch": 0.07717041800643087,
55
+ "grad_norm": 0.2922589182853699,
56
+ "learning_rate": 6e-05,
57
+ "loss": 1.906,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.09003215434083602,
62
+ "grad_norm": 0.34647658467292786,
63
+ "learning_rate": 7e-05,
64
+ "loss": 1.8246,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.10289389067524116,
69
+ "grad_norm": 0.31930026412010193,
70
+ "learning_rate": 8e-05,
71
+ "loss": 1.8057,
72
+ "step": 80
73
+ },
74
+ {
75
+ "epoch": 0.1157556270096463,
76
+ "grad_norm": 0.34028756618499756,
77
+ "learning_rate": 9e-05,
78
+ "loss": 1.7546,
79
+ "step": 90
80
+ },
81
+ {
82
+ "epoch": 0.12861736334405144,
83
+ "grad_norm": 0.3878991901874542,
84
+ "learning_rate": 0.0001,
85
+ "loss": 1.7543,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.12861736334405144,
90
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
91
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
92
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
93
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
94
+ "step": 100
95
+ },
96
+ {
97
+ "epoch": 0.1414790996784566,
98
+ "grad_norm": 0.35599613189697266,
99
+ "learning_rate": 9.931224209078405e-05,
100
+ "loss": 1.7309,
101
+ "step": 110
102
+ },
103
+ {
104
+ "epoch": 0.15434083601286175,
105
+ "grad_norm": 0.4075644016265869,
106
+ "learning_rate": 9.862448418156809e-05,
107
+ "loss": 1.6981,
108
+ "step": 120
109
+ },
110
+ {
111
+ "epoch": 0.16720257234726688,
112
+ "grad_norm": 0.4743317663669586,
113
+ "learning_rate": 9.793672627235215e-05,
114
+ "loss": 1.7011,
115
+ "step": 130
116
+ },
117
+ {
118
+ "epoch": 0.18006430868167203,
119
+ "grad_norm": 0.4701610505580902,
120
+ "learning_rate": 9.724896836313618e-05,
121
+ "loss": 1.6771,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.19292604501607716,
126
+ "grad_norm": 0.49115318059921265,
127
+ "learning_rate": 9.656121045392023e-05,
128
+ "loss": 1.6633,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.2057877813504823,
133
+ "grad_norm": 0.5177980661392212,
134
+ "learning_rate": 9.587345254470427e-05,
135
+ "loss": 1.6706,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 0.21864951768488747,
140
+ "grad_norm": 0.465657114982605,
141
+ "learning_rate": 9.518569463548831e-05,
142
+ "loss": 1.6677,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.2315112540192926,
147
+ "grad_norm": 0.5453551411628723,
148
+ "learning_rate": 9.449793672627235e-05,
149
+ "loss": 1.6656,
150
+ "step": 180
151
+ },
152
+ {
153
+ "epoch": 0.24437299035369775,
154
+ "grad_norm": 0.4150402545928955,
155
+ "learning_rate": 9.38101788170564e-05,
156
+ "loss": 1.6568,
157
+ "step": 190
158
+ },
159
+ {
160
+ "epoch": 0.2572347266881029,
161
+ "grad_norm": 0.5106223225593567,
162
+ "learning_rate": 9.312242090784045e-05,
163
+ "loss": 1.6804,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.2572347266881029,
168
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
169
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
170
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
171
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
172
+ "step": 200
173
+ },
174
+ {
175
+ "epoch": 0.27009646302250806,
176
+ "grad_norm": 0.47371965646743774,
177
+ "learning_rate": 9.243466299862448e-05,
178
+ "loss": 1.6235,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.2829581993569132,
183
+ "grad_norm": 0.45723679661750793,
184
+ "learning_rate": 9.174690508940853e-05,
185
+ "loss": 1.6192,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.2958199356913183,
190
+ "grad_norm": 0.46727871894836426,
191
+ "learning_rate": 9.105914718019258e-05,
192
+ "loss": 1.6129,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.3086816720257235,
197
+ "grad_norm": 0.5216034054756165,
198
+ "learning_rate": 9.037138927097662e-05,
199
+ "loss": 1.6065,
200
+ "step": 240
201
+ },
202
+ {
203
+ "epoch": 0.3215434083601286,
204
+ "grad_norm": 0.46132415533065796,
205
+ "learning_rate": 8.968363136176067e-05,
206
+ "loss": 1.6374,
207
+ "step": 250
208
+ },
209
+ {
210
+ "epoch": 0.33440514469453375,
211
+ "grad_norm": 0.5699637532234192,
212
+ "learning_rate": 8.89958734525447e-05,
213
+ "loss": 1.6031,
214
+ "step": 260
215
+ },
216
+ {
217
+ "epoch": 0.34726688102893893,
218
+ "grad_norm": 0.46537184715270996,
219
+ "learning_rate": 8.830811554332875e-05,
220
+ "loss": 1.6196,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.36012861736334406,
225
+ "grad_norm": 0.5034765005111694,
226
+ "learning_rate": 8.76203576341128e-05,
227
+ "loss": 1.6257,
228
+ "step": 280
229
+ },
230
+ {
231
+ "epoch": 0.3729903536977492,
232
+ "grad_norm": 0.48885518312454224,
233
+ "learning_rate": 8.693259972489685e-05,
234
+ "loss": 1.6195,
235
+ "step": 290
236
+ },
237
+ {
238
+ "epoch": 0.3858520900321543,
239
+ "grad_norm": 0.48295891284942627,
240
+ "learning_rate": 8.62448418156809e-05,
241
+ "loss": 1.6301,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.3858520900321543,
246
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
247
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
248
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
249
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.3987138263665595,
254
+ "grad_norm": 0.4800078570842743,
255
+ "learning_rate": 8.555708390646493e-05,
256
+ "loss": 1.6171,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.4115755627009646,
261
+ "grad_norm": 0.47452959418296814,
262
+ "learning_rate": 8.486932599724897e-05,
263
+ "loss": 1.6147,
264
+ "step": 320
265
+ },
266
+ {
267
+ "epoch": 0.42443729903536975,
268
+ "grad_norm": 0.5397221446037292,
269
+ "learning_rate": 8.418156808803301e-05,
270
+ "loss": 1.6041,
271
+ "step": 330
272
+ },
273
+ {
274
+ "epoch": 0.43729903536977494,
275
+ "grad_norm": 0.5501461029052734,
276
+ "learning_rate": 8.349381017881706e-05,
277
+ "loss": 1.6091,
278
+ "step": 340
279
+ },
280
+ {
281
+ "epoch": 0.45016077170418006,
282
+ "grad_norm": 0.47587981820106506,
283
+ "learning_rate": 8.28060522696011e-05,
284
+ "loss": 1.6008,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 0.4630225080385852,
289
+ "grad_norm": 0.46644529700279236,
290
+ "learning_rate": 8.211829436038515e-05,
291
+ "loss": 1.6081,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 0.4758842443729904,
296
+ "grad_norm": 0.5308094024658203,
297
+ "learning_rate": 8.14305364511692e-05,
298
+ "loss": 1.5987,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 0.4887459807073955,
303
+ "grad_norm": 0.5304721593856812,
304
+ "learning_rate": 8.074277854195323e-05,
305
+ "loss": 1.6173,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 0.5016077170418006,
310
+ "grad_norm": 0.6186290383338928,
311
+ "learning_rate": 8.005502063273728e-05,
312
+ "loss": 1.5879,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 0.5144694533762058,
317
+ "grad_norm": 0.4936847388744354,
318
+ "learning_rate": 7.936726272352132e-05,
319
+ "loss": 1.5771,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.5144694533762058,
324
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
325
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
326
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
327
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
328
+ "step": 400
329
+ },
330
+ {
331
+ "epoch": 0.5273311897106109,
332
+ "grad_norm": 0.4969954788684845,
333
+ "learning_rate": 7.867950481430537e-05,
334
+ "loss": 1.5873,
335
+ "step": 410
336
+ },
337
+ {
338
+ "epoch": 0.5401929260450161,
339
+ "grad_norm": 0.5539654493331909,
340
+ "learning_rate": 7.799174690508942e-05,
341
+ "loss": 1.5741,
342
+ "step": 420
343
+ },
344
+ {
345
+ "epoch": 0.5530546623794212,
346
+ "grad_norm": 0.4963805377483368,
347
+ "learning_rate": 7.730398899587345e-05,
348
+ "loss": 1.5883,
349
+ "step": 430
350
+ },
351
+ {
352
+ "epoch": 0.5659163987138264,
353
+ "grad_norm": 0.4849222004413605,
354
+ "learning_rate": 7.66162310866575e-05,
355
+ "loss": 1.6061,
356
+ "step": 440
357
+ },
358
+ {
359
+ "epoch": 0.5787781350482315,
360
+ "grad_norm": 0.5241298079490662,
361
+ "learning_rate": 7.592847317744153e-05,
362
+ "loss": 1.6118,
363
+ "step": 450
364
+ },
365
+ {
366
+ "epoch": 0.5916398713826366,
367
+ "grad_norm": 0.5051389336585999,
368
+ "learning_rate": 7.52407152682256e-05,
369
+ "loss": 1.5618,
370
+ "step": 460
371
+ },
372
+ {
373
+ "epoch": 0.6045016077170418,
374
+ "grad_norm": 0.49376770853996277,
375
+ "learning_rate": 7.455295735900963e-05,
376
+ "loss": 1.5871,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 0.617363344051447,
381
+ "grad_norm": 0.49221155047416687,
382
+ "learning_rate": 7.386519944979367e-05,
383
+ "loss": 1.6037,
384
+ "step": 480
385
+ },
386
+ {
387
+ "epoch": 0.6302250803858521,
388
+ "grad_norm": 0.5378918647766113,
389
+ "learning_rate": 7.317744154057772e-05,
390
+ "loss": 1.5523,
391
+ "step": 490
392
+ },
393
+ {
394
+ "epoch": 0.6430868167202572,
395
+ "grad_norm": 0.5564639568328857,
396
+ "learning_rate": 7.248968363136176e-05,
397
+ "loss": 1.5885,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.6430868167202572,
402
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
403
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
404
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
405
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
406
+ "step": 500
407
+ },
408
+ {
409
+ "epoch": 0.6559485530546624,
410
+ "grad_norm": 0.49083277583122253,
411
+ "learning_rate": 7.180192572214582e-05,
412
+ "loss": 1.5693,
413
+ "step": 510
414
+ },
415
+ {
416
+ "epoch": 0.6688102893890675,
417
+ "grad_norm": 0.5625829100608826,
418
+ "learning_rate": 7.111416781292985e-05,
419
+ "loss": 1.6023,
420
+ "step": 520
421
+ },
422
+ {
423
+ "epoch": 0.6816720257234726,
424
+ "grad_norm": 0.6078226566314697,
425
+ "learning_rate": 7.04264099037139e-05,
426
+ "loss": 1.5845,
427
+ "step": 530
428
+ },
429
+ {
430
+ "epoch": 0.6945337620578779,
431
+ "grad_norm": 0.48107999563217163,
432
+ "learning_rate": 6.973865199449794e-05,
433
+ "loss": 1.5682,
434
+ "step": 540
435
+ },
436
+ {
437
+ "epoch": 0.707395498392283,
438
+ "grad_norm": 0.5080347657203674,
439
+ "learning_rate": 6.905089408528198e-05,
440
+ "loss": 1.5839,
441
+ "step": 550
442
+ },
443
+ {
444
+ "epoch": 0.7202572347266881,
445
+ "grad_norm": 0.5683622360229492,
446
+ "learning_rate": 6.836313617606602e-05,
447
+ "loss": 1.5916,
448
+ "step": 560
449
+ },
450
+ {
451
+ "epoch": 0.7331189710610932,
452
+ "grad_norm": 0.4669715464115143,
453
+ "learning_rate": 6.767537826685007e-05,
454
+ "loss": 1.6146,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 0.7459807073954984,
459
+ "grad_norm": 0.4946054518222809,
460
+ "learning_rate": 6.698762035763412e-05,
461
+ "loss": 1.5764,
462
+ "step": 580
463
+ },
464
+ {
465
+ "epoch": 0.7588424437299035,
466
+ "grad_norm": 0.4975377023220062,
467
+ "learning_rate": 6.629986244841817e-05,
468
+ "loss": 1.6035,
469
+ "step": 590
470
+ },
471
+ {
472
+ "epoch": 0.7717041800643086,
473
+ "grad_norm": 0.5511853098869324,
474
+ "learning_rate": 6.56121045392022e-05,
475
+ "loss": 1.5842,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.7717041800643086,
480
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
481
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
482
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
483
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
484
+ "step": 600
485
+ },
486
+ {
487
+ "epoch": 0.7845659163987139,
488
+ "grad_norm": 0.5689719915390015,
489
+ "learning_rate": 6.492434662998625e-05,
490
+ "loss": 1.5656,
491
+ "step": 610
492
+ },
493
+ {
494
+ "epoch": 0.797427652733119,
495
+ "grad_norm": 0.48885637521743774,
496
+ "learning_rate": 6.42365887207703e-05,
497
+ "loss": 1.5605,
498
+ "step": 620
499
+ },
500
+ {
501
+ "epoch": 0.8102893890675241,
502
+ "grad_norm": 0.5316773056983948,
503
+ "learning_rate": 6.354883081155434e-05,
504
+ "loss": 1.5755,
505
+ "step": 630
506
+ },
507
+ {
508
+ "epoch": 0.8231511254019293,
509
+ "grad_norm": 0.5578161478042603,
510
+ "learning_rate": 6.286107290233837e-05,
511
+ "loss": 1.5532,
512
+ "step": 640
513
+ },
514
+ {
515
+ "epoch": 0.8360128617363344,
516
+ "grad_norm": 0.6534080505371094,
517
+ "learning_rate": 6.217331499312242e-05,
518
+ "loss": 1.5882,
519
+ "step": 650
520
+ },
521
+ {
522
+ "epoch": 0.8488745980707395,
523
+ "grad_norm": 0.5140324831008911,
524
+ "learning_rate": 6.148555708390647e-05,
525
+ "loss": 1.5598,
526
+ "step": 660
527
+ },
528
+ {
529
+ "epoch": 0.8617363344051447,
530
+ "grad_norm": 0.5247426629066467,
531
+ "learning_rate": 6.0797799174690516e-05,
532
+ "loss": 1.5833,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 0.8745980707395499,
537
+ "grad_norm": 0.49460870027542114,
538
+ "learning_rate": 6.011004126547456e-05,
539
+ "loss": 1.621,
540
+ "step": 680
541
+ },
542
+ {
543
+ "epoch": 0.887459807073955,
544
+ "grad_norm": 0.5351711511611938,
545
+ "learning_rate": 5.9422283356258604e-05,
546
+ "loss": 1.5371,
547
+ "step": 690
548
+ },
549
+ {
550
+ "epoch": 0.9003215434083601,
551
+ "grad_norm": 0.5608878135681152,
552
+ "learning_rate": 5.8734525447042644e-05,
553
+ "loss": 1.5878,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.9003215434083601,
558
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
559
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
560
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
561
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.9131832797427653,
566
+ "grad_norm": 0.48291367292404175,
567
+ "learning_rate": 5.8046767537826685e-05,
568
+ "loss": 1.583,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.9260450160771704,
573
+ "grad_norm": 0.4866442382335663,
574
+ "learning_rate": 5.7359009628610725e-05,
575
+ "loss": 1.5891,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.9389067524115756,
580
+ "grad_norm": 0.5254418253898621,
581
+ "learning_rate": 5.667125171939478e-05,
582
+ "loss": 1.5319,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.9517684887459807,
587
+ "grad_norm": 0.5201655030250549,
588
+ "learning_rate": 5.598349381017882e-05,
589
+ "loss": 1.5819,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.9646302250803859,
594
+ "grad_norm": 0.5820693969726562,
595
+ "learning_rate": 5.5295735900962866e-05,
596
+ "loss": 1.5807,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.977491961414791,
601
+ "grad_norm": 0.559010922908783,
602
+ "learning_rate": 5.460797799174691e-05,
603
+ "loss": 1.5597,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.9903536977491961,
608
+ "grad_norm": 0.498877614736557,
609
+ "learning_rate": 5.392022008253095e-05,
610
+ "loss": 1.5628,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.002572347266881,
615
+ "grad_norm": 0.5119406580924988,
616
+ "learning_rate": 5.3232462173315e-05,
617
+ "loss": 1.5693,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.015434083601286,
622
+ "grad_norm": 0.5344542860984802,
623
+ "learning_rate": 5.254470426409904e-05,
624
+ "loss": 1.5256,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.0282958199356913,
629
+ "grad_norm": 0.5358342528343201,
630
+ "learning_rate": 5.185694635488308e-05,
631
+ "loss": 1.5432,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.0282958199356913,
636
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
637
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
638
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
639
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
640
+ "step": 800
641
+ },
642
+ {
643
+ "epoch": 1.0411575562700965,
644
+ "grad_norm": 0.5941470265388489,
645
+ "learning_rate": 5.116918844566713e-05,
646
+ "loss": 1.5449,
647
+ "step": 810
648
+ },
649
+ {
650
+ "epoch": 1.0540192926045016,
651
+ "grad_norm": 0.5659182071685791,
652
+ "learning_rate": 5.048143053645117e-05,
653
+ "loss": 1.5234,
654
+ "step": 820
655
+ },
656
+ {
657
+ "epoch": 1.0668810289389068,
658
+ "grad_norm": 0.5737349390983582,
659
+ "learning_rate": 4.9793672627235217e-05,
660
+ "loss": 1.5517,
661
+ "step": 830
662
+ },
663
+ {
664
+ "epoch": 1.0797427652733118,
665
+ "grad_norm": 0.5984872579574585,
666
+ "learning_rate": 4.910591471801926e-05,
667
+ "loss": 1.5175,
668
+ "step": 840
669
+ },
670
+ {
671
+ "epoch": 1.092604501607717,
672
+ "grad_norm": 0.5954984426498413,
673
+ "learning_rate": 4.8418156808803304e-05,
674
+ "loss": 1.5738,
675
+ "step": 850
676
+ },
677
+ {
678
+ "epoch": 1.1054662379421223,
679
+ "grad_norm": 0.5545582175254822,
680
+ "learning_rate": 4.7730398899587344e-05,
681
+ "loss": 1.5538,
682
+ "step": 860
683
+ },
684
+ {
685
+ "epoch": 1.1183279742765273,
686
+ "grad_norm": 0.6972865462303162,
687
+ "learning_rate": 4.704264099037139e-05,
688
+ "loss": 1.529,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 1.1311897106109325,
693
+ "grad_norm": 0.5404506325721741,
694
+ "learning_rate": 4.635488308115544e-05,
695
+ "loss": 1.5567,
696
+ "step": 880
697
+ },
698
+ {
699
+ "epoch": 1.1440514469453376,
700
+ "grad_norm": 0.5792121887207031,
701
+ "learning_rate": 4.566712517193948e-05,
702
+ "loss": 1.5422,
703
+ "step": 890
704
+ },
705
+ {
706
+ "epoch": 1.1569131832797428,
707
+ "grad_norm": 0.5468006134033203,
708
+ "learning_rate": 4.497936726272352e-05,
709
+ "loss": 1.5369,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 1.1569131832797428,
714
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
715
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
716
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
717
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
718
+ "step": 900
719
+ },
720
+ {
721
+ "epoch": 1.1697749196141478,
722
+ "grad_norm": 0.5955344438552856,
723
+ "learning_rate": 4.429160935350757e-05,
724
+ "loss": 1.5551,
725
+ "step": 910
726
+ },
727
+ {
728
+ "epoch": 1.182636655948553,
729
+ "grad_norm": 0.5832058787345886,
730
+ "learning_rate": 4.360385144429161e-05,
731
+ "loss": 1.5568,
732
+ "step": 920
733
+ },
734
+ {
735
+ "epoch": 1.1954983922829583,
736
+ "grad_norm": 0.6309258937835693,
737
+ "learning_rate": 4.291609353507566e-05,
738
+ "loss": 1.5548,
739
+ "step": 930
740
+ },
741
+ {
742
+ "epoch": 1.2083601286173633,
743
+ "grad_norm": 0.6269820928573608,
744
+ "learning_rate": 4.22283356258597e-05,
745
+ "loss": 1.5459,
746
+ "step": 940
747
+ },
748
+ {
749
+ "epoch": 1.2212218649517685,
750
+ "grad_norm": 0.6376837491989136,
751
+ "learning_rate": 4.154057771664374e-05,
752
+ "loss": 1.5277,
753
+ "step": 950
754
+ },
755
+ {
756
+ "epoch": 1.2340836012861736,
757
+ "grad_norm": 0.6351036429405212,
758
+ "learning_rate": 4.085281980742779e-05,
759
+ "loss": 1.5273,
760
+ "step": 960
761
+ },
762
+ {
763
+ "epoch": 1.2469453376205788,
764
+ "grad_norm": 0.6877638101577759,
765
+ "learning_rate": 4.016506189821183e-05,
766
+ "loss": 1.4986,
767
+ "step": 970
768
+ },
769
+ {
770
+ "epoch": 1.2598070739549838,
771
+ "grad_norm": 0.5501726865768433,
772
+ "learning_rate": 3.947730398899587e-05,
773
+ "loss": 1.5543,
774
+ "step": 980
775
+ },
776
+ {
777
+ "epoch": 1.272668810289389,
778
+ "grad_norm": 0.5217163562774658,
779
+ "learning_rate": 3.8789546079779924e-05,
780
+ "loss": 1.5292,
781
+ "step": 990
782
+ },
783
+ {
784
+ "epoch": 1.2855305466237943,
785
+ "grad_norm": 0.5770425796508789,
786
+ "learning_rate": 3.8101788170563964e-05,
787
+ "loss": 1.5536,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 1.2855305466237943,
792
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
793
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
794
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
795
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
796
+ "step": 1000
797
+ },
798
+ {
799
+ "epoch": 1.2983922829581993,
800
+ "grad_norm": 0.5802098512649536,
801
+ "learning_rate": 3.741403026134801e-05,
802
+ "loss": 1.5479,
803
+ "step": 1010
804
+ },
805
+ {
806
+ "epoch": 1.3112540192926045,
807
+ "grad_norm": 0.5646567344665527,
808
+ "learning_rate": 3.672627235213205e-05,
809
+ "loss": 1.5183,
810
+ "step": 1020
811
+ },
812
+ {
813
+ "epoch": 1.3241157556270098,
814
+ "grad_norm": 0.5852165222167969,
815
+ "learning_rate": 3.603851444291609e-05,
816
+ "loss": 1.5267,
817
+ "step": 1030
818
+ },
819
+ {
820
+ "epoch": 1.3369774919614148,
821
+ "grad_norm": 0.5583398342132568,
822
+ "learning_rate": 3.535075653370014e-05,
823
+ "loss": 1.5401,
824
+ "step": 1040
825
+ },
826
+ {
827
+ "epoch": 1.3498392282958198,
828
+ "grad_norm": 0.5971976518630981,
829
+ "learning_rate": 3.4662998624484186e-05,
830
+ "loss": 1.5147,
831
+ "step": 1050
832
+ },
833
+ {
834
+ "epoch": 1.362700964630225,
835
+ "grad_norm": 0.6036947965621948,
836
+ "learning_rate": 3.3975240715268227e-05,
837
+ "loss": 1.5294,
838
+ "step": 1060
839
+ },
840
+ {
841
+ "epoch": 1.3755627009646303,
842
+ "grad_norm": 0.5828876495361328,
843
+ "learning_rate": 3.3287482806052274e-05,
844
+ "loss": 1.546,
845
+ "step": 1070
846
+ },
847
+ {
848
+ "epoch": 1.3884244372990353,
849
+ "grad_norm": 0.5941759943962097,
850
+ "learning_rate": 3.2599724896836314e-05,
851
+ "loss": 1.5238,
852
+ "step": 1080
853
+ },
854
+ {
855
+ "epoch": 1.4012861736334405,
856
+ "grad_norm": 0.6082496047019958,
857
+ "learning_rate": 3.1911966987620354e-05,
858
+ "loss": 1.5055,
859
+ "step": 1090
860
+ },
861
+ {
862
+ "epoch": 1.4141479099678458,
863
+ "grad_norm": 0.5749199390411377,
864
+ "learning_rate": 3.12242090784044e-05,
865
+ "loss": 1.5238,
866
+ "step": 1100
867
+ },
868
+ {
869
+ "epoch": 1.4141479099678458,
870
+ "eval_yahma/alpaca-cleaned_loss": 1.5794486999511719,
871
+ "eval_yahma/alpaca-cleaned_runtime": 62.9209,
872
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.786,
873
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
874
+ "step": 1100
875
+ },
876
+ {
877
+ "epoch": 1.4270096463022508,
878
+ "grad_norm": 0.649699330329895,
879
+ "learning_rate": 3.053645116918845e-05,
880
+ "loss": 1.5275,
881
+ "step": 1110
882
+ },
883
+ {
884
+ "epoch": 1.4398713826366558,
885
+ "grad_norm": 0.5754693150520325,
886
+ "learning_rate": 2.9848693259972492e-05,
887
+ "loss": 1.5217,
888
+ "step": 1120
889
+ },
890
+ {
891
+ "epoch": 1.452733118971061,
892
+ "grad_norm": 0.572021484375,
893
+ "learning_rate": 2.9160935350756536e-05,
894
+ "loss": 1.5489,
895
+ "step": 1130
896
+ },
897
+ {
898
+ "epoch": 1.4655948553054663,
899
+ "grad_norm": 0.6010130643844604,
900
+ "learning_rate": 2.8473177441540577e-05,
901
+ "loss": 1.5019,
902
+ "step": 1140
903
+ },
904
+ {
905
+ "epoch": 1.4784565916398713,
906
+ "grad_norm": 0.6172171831130981,
907
+ "learning_rate": 2.7785419532324624e-05,
908
+ "loss": 1.5703,
909
+ "step": 1150
910
+ },
911
+ {
912
+ "epoch": 1.4913183279742765,
913
+ "grad_norm": 0.5957326889038086,
914
+ "learning_rate": 2.7097661623108668e-05,
915
+ "loss": 1.5247,
916
+ "step": 1160
917
+ },
918
+ {
919
+ "epoch": 1.5041800643086818,
920
+ "grad_norm": 0.5608690977096558,
921
+ "learning_rate": 2.6409903713892708e-05,
922
+ "loss": 1.5403,
923
+ "step": 1170
924
+ },
925
+ {
926
+ "epoch": 1.5170418006430868,
927
+ "grad_norm": 0.5870776176452637,
928
+ "learning_rate": 2.5722145804676755e-05,
929
+ "loss": 1.5235,
930
+ "step": 1180
931
+ },
932
+ {
933
+ "epoch": 1.5299035369774918,
934
+ "grad_norm": 0.5889161229133606,
935
+ "learning_rate": 2.50343878954608e-05,
936
+ "loss": 1.5164,
937
+ "step": 1190
938
+ },
939
+ {
940
+ "epoch": 1.542765273311897,
941
+ "grad_norm": 0.6082655787467957,
942
+ "learning_rate": 2.4346629986244843e-05,
943
+ "loss": 1.5022,
944
+ "step": 1200
945
+ },
946
+ {
947
+ "epoch": 1.542765273311897,
948
+ "eval_yahma/alpaca-cleaned_loss": 1.5769098997116089,
949
+ "eval_yahma/alpaca-cleaned_runtime": 62.9228,
950
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.785,
951
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
952
+ "step": 1200
953
+ }
954
+ ],
955
+ "logging_steps": 10,
956
+ "max_steps": 1554,
957
+ "num_input_tokens_seen": 0,
958
+ "num_train_epochs": 2,
959
+ "save_steps": 200,
960
+ "stateful_callbacks": {
961
+ "TrainerControl": {
962
+ "args": {
963
+ "should_epoch_stop": false,
964
+ "should_evaluate": false,
965
+ "should_log": false,
966
+ "should_save": true,
967
+ "should_training_stop": false
968
+ },
969
+ "attributes": {}
970
+ }
971
+ },
972
+ "total_flos": 8.92217191122862e+16,
973
+ "train_batch_size": 4,
974
+ "trial_name": null,
975
+ "trial_params": null
976
+ }
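The learning_rate values in the log above are consistent with a schedule that warms up linearly from 0 to 1e-4 over the first 100 steps and then decays linearly to zero at max_steps=1554 (e.g. step 950 gives 1e-4 * (1554-950)/(1554-100) ≈ 4.154e-05, matching the log). A sketch of that inferred schedule:

```python
def lr_at(step, peak=1e-4, warmup=100, max_steps=1554):
    # Inferred from log_history: linear warmup, then linear decay to zero.
    if step <= warmup:
        return peak * step / warmup
    return peak * (max_steps - step) / (max_steps - warmup)

assert abs(lr_at(950) - 4.154057771664374e-05) < 1e-12
```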
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
3
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:353c63c0311c5f1ca2429d17bbaee55f71b9e4479cf3c05d874b1d6490acc2bd
3
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76055a71d078a7ab14b379bb8d8ca0f5a097c62bdaa6a1ed041f8d0795a475d
3
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a2ce3ed5d78f12a31661bc5e87fdf5a10accd23348d868e7890473bb1cbdd90
3
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baba31a5e5063037a5c811de9cb04bc62c6c5f0f5fe6720b7d681afe6500d4c1
3
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5921ca2a7080bbf354e8e211e9657d5c3188a2b7c88c6c82bb3b7f013be9ac
3
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/trainer_state.json ADDED
@@ -0,0 +1,1132 @@
1
+ {
2
+ "best_metric": 1.5736079216003418,
3
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1400",
4
+ "epoch": 1.8,
5
+ "eval_steps": 100,
6
+ "global_step": 1400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0012861736334405145,
13
+ "grad_norm": 0.39783015847206116,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 2.0835,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.012861736334405145,
20
+ "grad_norm": 0.45549583435058594,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.1408,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02572347266881029,
27
+ "grad_norm": 0.4594053626060486,
28
+ "learning_rate": 2e-05,
29
+ "loss": 2.0894,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.03858520900321544,
34
+ "grad_norm": 0.49020764231681824,
35
+ "learning_rate": 3e-05,
36
+ "loss": 2.1037,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.05144694533762058,
41
+ "grad_norm": 0.37993305921554565,
42
+ "learning_rate": 4e-05,
43
+ "loss": 1.9716,
44
+ "step": 40
45
+ },
46
+ {
47
+ "epoch": 0.06430868167202572,
48
+ "grad_norm": 0.38231977820396423,
49
+ "learning_rate": 5e-05,
50
+ "loss": 1.9349,
51
+ "step": 50
52
+ },
53
+ {
54
+ "epoch": 0.07717041800643087,
55
+ "grad_norm": 0.2922589182853699,
56
+ "learning_rate": 6e-05,
57
+ "loss": 1.906,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.09003215434083602,
62
+ "grad_norm": 0.34647658467292786,
63
+ "learning_rate": 7e-05,
64
+ "loss": 1.8246,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.10289389067524116,
69
+ "grad_norm": 0.31930026412010193,
70
+ "learning_rate": 8e-05,
71
+ "loss": 1.8057,
72
+ "step": 80
73
+ },
74
+ {
75
+ "epoch": 0.1157556270096463,
76
+ "grad_norm": 0.34028756618499756,
77
+ "learning_rate": 9e-05,
78
+ "loss": 1.7546,
79
+ "step": 90
80
+ },
81
+ {
82
+ "epoch": 0.12861736334405144,
83
+ "grad_norm": 0.3878991901874542,
84
+ "learning_rate": 0.0001,
85
+ "loss": 1.7543,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.12861736334405144,
90
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
91
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
92
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
93
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
94
+ "step": 100
95
+ },
96
+ {
97
+ "epoch": 0.1414790996784566,
98
+ "grad_norm": 0.35599613189697266,
99
+ "learning_rate": 9.931224209078405e-05,
100
+ "loss": 1.7309,
101
+ "step": 110
102
+ },
103
+ {
104
+ "epoch": 0.15434083601286175,
105
+ "grad_norm": 0.4075644016265869,
106
+ "learning_rate": 9.862448418156809e-05,
107
+ "loss": 1.6981,
108
+ "step": 120
109
+ },
110
+ {
111
+ "epoch": 0.16720257234726688,
112
+ "grad_norm": 0.4743317663669586,
113
+ "learning_rate": 9.793672627235215e-05,
114
+ "loss": 1.7011,
115
+ "step": 130
116
+ },
117
+ {
118
+ "epoch": 0.18006430868167203,
119
+ "grad_norm": 0.4701610505580902,
120
+ "learning_rate": 9.724896836313618e-05,
121
+ "loss": 1.6771,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.19292604501607716,
126
+ "grad_norm": 0.49115318059921265,
127
+ "learning_rate": 9.656121045392023e-05,
128
+ "loss": 1.6633,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.2057877813504823,
133
+ "grad_norm": 0.5177980661392212,
134
+ "learning_rate": 9.587345254470427e-05,
135
+ "loss": 1.6706,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 0.21864951768488747,
140
+ "grad_norm": 0.465657114982605,
141
+ "learning_rate": 9.518569463548831e-05,
142
+ "loss": 1.6677,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.2315112540192926,
147
+ "grad_norm": 0.5453551411628723,
148
+ "learning_rate": 9.449793672627235e-05,
149
+ "loss": 1.6656,
150
+ "step": 180
151
+ },
152
+ {
153
+ "epoch": 0.24437299035369775,
154
+ "grad_norm": 0.4150402545928955,
155
+ "learning_rate": 9.38101788170564e-05,
156
+ "loss": 1.6568,
157
+ "step": 190
158
+ },
159
+ {
160
+ "epoch": 0.2572347266881029,
161
+ "grad_norm": 0.5106223225593567,
162
+ "learning_rate": 9.312242090784045e-05,
163
+ "loss": 1.6804,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.2572347266881029,
168
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
169
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
170
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
171
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
172
+ "step": 200
173
+ },
174
+ {
175
+ "epoch": 0.27009646302250806,
176
+ "grad_norm": 0.47371965646743774,
177
+ "learning_rate": 9.243466299862448e-05,
178
+ "loss": 1.6235,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.2829581993569132,
183
+ "grad_norm": 0.45723679661750793,
184
+ "learning_rate": 9.174690508940853e-05,
185
+ "loss": 1.6192,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.2958199356913183,
190
+ "grad_norm": 0.46727871894836426,
191
+ "learning_rate": 9.105914718019258e-05,
192
+ "loss": 1.6129,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.3086816720257235,
197
+ "grad_norm": 0.5216034054756165,
198
+ "learning_rate": 9.037138927097662e-05,
199
+ "loss": 1.6065,
200
+ "step": 240
201
+ },
202
+ {
203
+ "epoch": 0.3215434083601286,
204
+ "grad_norm": 0.46132415533065796,
205
+ "learning_rate": 8.968363136176067e-05,
206
+ "loss": 1.6374,
207
+ "step": 250
208
+ },
209
+ {
210
+ "epoch": 0.33440514469453375,
211
+ "grad_norm": 0.5699637532234192,
212
+ "learning_rate": 8.89958734525447e-05,
213
+ "loss": 1.6031,
214
+ "step": 260
215
+ },
216
+ {
217
+ "epoch": 0.34726688102893893,
218
+ "grad_norm": 0.46537184715270996,
219
+ "learning_rate": 8.830811554332875e-05,
220
+ "loss": 1.6196,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.36012861736334406,
225
+ "grad_norm": 0.5034765005111694,
226
+ "learning_rate": 8.76203576341128e-05,
227
+ "loss": 1.6257,
228
+ "step": 280
229
+ },
230
+ {
231
+ "epoch": 0.3729903536977492,
232
+ "grad_norm": 0.48885518312454224,
233
+ "learning_rate": 8.693259972489685e-05,
234
+ "loss": 1.6195,
235
+ "step": 290
236
+ },
237
+ {
238
+ "epoch": 0.3858520900321543,
239
+ "grad_norm": 0.48295891284942627,
240
+ "learning_rate": 8.62448418156809e-05,
241
+ "loss": 1.6301,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.3858520900321543,
246
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
247
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
248
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
249
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.3987138263665595,
254
+ "grad_norm": 0.4800078570842743,
255
+ "learning_rate": 8.555708390646493e-05,
256
+ "loss": 1.6171,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.4115755627009646,
261
+ "grad_norm": 0.47452959418296814,
262
+ "learning_rate": 8.486932599724897e-05,
263
+ "loss": 1.6147,
264
+ "step": 320
265
+ },
266
+ {
267
+ "epoch": 0.42443729903536975,
268
+ "grad_norm": 0.5397221446037292,
269
+ "learning_rate": 8.418156808803301e-05,
270
+ "loss": 1.6041,
271
+ "step": 330
272
+ },
273
+ {
274
+ "epoch": 0.43729903536977494,
275
+ "grad_norm": 0.5501461029052734,
276
+ "learning_rate": 8.349381017881706e-05,
277
+ "loss": 1.6091,
278
+ "step": 340
279
+ },
280
+ {
281
+ "epoch": 0.45016077170418006,
282
+ "grad_norm": 0.47587981820106506,
283
+ "learning_rate": 8.28060522696011e-05,
284
+ "loss": 1.6008,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 0.4630225080385852,
289
+ "grad_norm": 0.46644529700279236,
290
+ "learning_rate": 8.211829436038515e-05,
291
+ "loss": 1.6081,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 0.4758842443729904,
296
+ "grad_norm": 0.5308094024658203,
297
+ "learning_rate": 8.14305364511692e-05,
298
+ "loss": 1.5987,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 0.4887459807073955,
303
+ "grad_norm": 0.5304721593856812,
304
+ "learning_rate": 8.074277854195323e-05,
305
+ "loss": 1.6173,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 0.5016077170418006,
310
+ "grad_norm": 0.6186290383338928,
311
+ "learning_rate": 8.005502063273728e-05,
312
+ "loss": 1.5879,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 0.5144694533762058,
317
+ "grad_norm": 0.4936847388744354,
318
+ "learning_rate": 7.936726272352132e-05,
319
+ "loss": 1.5771,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.5144694533762058,
324
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
325
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
326
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
327
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
328
+ "step": 400
329
+ },
330
+ {
331
+ "epoch": 0.5273311897106109,
332
+ "grad_norm": 0.4969954788684845,
333
+ "learning_rate": 7.867950481430537e-05,
334
+ "loss": 1.5873,
335
+ "step": 410
336
+ },
337
+ {
338
+ "epoch": 0.5401929260450161,
339
+ "grad_norm": 0.5539654493331909,
340
+ "learning_rate": 7.799174690508942e-05,
341
+ "loss": 1.5741,
342
+ "step": 420
343
+ },
344
+ {
345
+ "epoch": 0.5530546623794212,
346
+ "grad_norm": 0.4963805377483368,
347
+ "learning_rate": 7.730398899587345e-05,
348
+ "loss": 1.5883,
349
+ "step": 430
350
+ },
351
+ {
352
+ "epoch": 0.5659163987138264,
353
+ "grad_norm": 0.4849222004413605,
354
+ "learning_rate": 7.66162310866575e-05,
355
+ "loss": 1.6061,
356
+ "step": 440
357
+ },
358
+ {
359
+ "epoch": 0.5787781350482315,
360
+ "grad_norm": 0.5241298079490662,
361
+ "learning_rate": 7.592847317744153e-05,
362
+ "loss": 1.6118,
363
+ "step": 450
364
+ },
365
+ {
366
+ "epoch": 0.5916398713826366,
367
+ "grad_norm": 0.5051389336585999,
368
+ "learning_rate": 7.52407152682256e-05,
369
+ "loss": 1.5618,
370
+ "step": 460
371
+ },
372
+ {
373
+ "epoch": 0.6045016077170418,
374
+ "grad_norm": 0.49376770853996277,
375
+ "learning_rate": 7.455295735900963e-05,
376
+ "loss": 1.5871,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 0.617363344051447,
381
+ "grad_norm": 0.49221155047416687,
382
+ "learning_rate": 7.386519944979367e-05,
383
+ "loss": 1.6037,
384
+ "step": 480
385
+ },
386
+ {
387
+ "epoch": 0.6302250803858521,
388
+ "grad_norm": 0.5378918647766113,
389
+ "learning_rate": 7.317744154057772e-05,
390
+ "loss": 1.5523,
391
+ "step": 490
392
+ },
393
+ {
394
+ "epoch": 0.6430868167202572,
395
+ "grad_norm": 0.5564639568328857,
396
+ "learning_rate": 7.248968363136176e-05,
397
+ "loss": 1.5885,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.6430868167202572,
402
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
403
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
404
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
405
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
406
+ "step": 500
407
+ },
408
+ {
409
+ "epoch": 0.6559485530546624,
410
+ "grad_norm": 0.49083277583122253,
411
+ "learning_rate": 7.180192572214582e-05,
412
+ "loss": 1.5693,
413
+ "step": 510
414
+ },
415
+ {
416
+ "epoch": 0.6688102893890675,
417
+ "grad_norm": 0.5625829100608826,
418
+ "learning_rate": 7.111416781292985e-05,
419
+ "loss": 1.6023,
420
+ "step": 520
421
+ },
422
+ {
423
+ "epoch": 0.6816720257234726,
424
+ "grad_norm": 0.6078226566314697,
425
+ "learning_rate": 7.04264099037139e-05,
426
+ "loss": 1.5845,
427
+ "step": 530
428
+ },
429
+ {
430
+ "epoch": 0.6945337620578779,
431
+ "grad_norm": 0.48107999563217163,
432
+ "learning_rate": 6.973865199449794e-05,
433
+ "loss": 1.5682,
434
+ "step": 540
435
+ },
436
+ {
437
+ "epoch": 0.707395498392283,
438
+ "grad_norm": 0.5080347657203674,
439
+ "learning_rate": 6.905089408528198e-05,
440
+ "loss": 1.5839,
441
+ "step": 550
442
+ },
443
+ {
444
+ "epoch": 0.7202572347266881,
445
+ "grad_norm": 0.5683622360229492,
446
+ "learning_rate": 6.836313617606602e-05,
447
+ "loss": 1.5916,
448
+ "step": 560
449
+ },
450
+ {
451
+ "epoch": 0.7331189710610932,
452
+ "grad_norm": 0.4669715464115143,
453
+ "learning_rate": 6.767537826685007e-05,
454
+ "loss": 1.6146,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 0.7459807073954984,
459
+ "grad_norm": 0.4946054518222809,
460
+ "learning_rate": 6.698762035763412e-05,
461
+ "loss": 1.5764,
462
+ "step": 580
463
+ },
464
+ {
465
+ "epoch": 0.7588424437299035,
466
+ "grad_norm": 0.4975377023220062,
467
+ "learning_rate": 6.629986244841817e-05,
468
+ "loss": 1.6035,
469
+ "step": 590
470
+ },
471
+ {
472
+ "epoch": 0.7717041800643086,
473
+ "grad_norm": 0.5511853098869324,
474
+ "learning_rate": 6.56121045392022e-05,
475
+ "loss": 1.5842,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.7717041800643086,
480
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
481
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
482
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
483
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
484
+ "step": 600
485
+ },
486
+ {
487
+ "epoch": 0.7845659163987139,
488
+ "grad_norm": 0.5689719915390015,
489
+ "learning_rate": 6.492434662998625e-05,
490
+ "loss": 1.5656,
491
+ "step": 610
492
+ },
493
+ {
494
+ "epoch": 0.797427652733119,
495
+ "grad_norm": 0.48885637521743774,
496
+ "learning_rate": 6.42365887207703e-05,
497
+ "loss": 1.5605,
498
+ "step": 620
499
+ },
500
+ {
501
+ "epoch": 0.8102893890675241,
502
+ "grad_norm": 0.5316773056983948,
503
+ "learning_rate": 6.354883081155434e-05,
504
+ "loss": 1.5755,
505
+ "step": 630
506
+ },
507
+ {
508
+ "epoch": 0.8231511254019293,
509
+ "grad_norm": 0.5578161478042603,
510
+ "learning_rate": 6.286107290233837e-05,
511
+ "loss": 1.5532,
512
+ "step": 640
513
+ },
514
+ {
515
+ "epoch": 0.8360128617363344,
516
+ "grad_norm": 0.6534080505371094,
517
+ "learning_rate": 6.217331499312242e-05,
518
+ "loss": 1.5882,
519
+ "step": 650
520
+ },
521
+ {
522
+ "epoch": 0.8488745980707395,
523
+ "grad_norm": 0.5140324831008911,
524
+ "learning_rate": 6.148555708390647e-05,
525
+ "loss": 1.5598,
526
+ "step": 660
527
+ },
528
+ {
529
+ "epoch": 0.8617363344051447,
530
+ "grad_norm": 0.5247426629066467,
531
+ "learning_rate": 6.0797799174690516e-05,
532
+ "loss": 1.5833,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 0.8745980707395499,
537
+ "grad_norm": 0.49460870027542114,
538
+ "learning_rate": 6.011004126547456e-05,
539
+ "loss": 1.621,
540
+ "step": 680
541
+ },
542
+ {
543
+ "epoch": 0.887459807073955,
544
+ "grad_norm": 0.5351711511611938,
545
+ "learning_rate": 5.9422283356258604e-05,
546
+ "loss": 1.5371,
547
+ "step": 690
548
+ },
549
+ {
550
+ "epoch": 0.9003215434083601,
551
+ "grad_norm": 0.5608878135681152,
552
+ "learning_rate": 5.8734525447042644e-05,
553
+ "loss": 1.5878,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.9003215434083601,
558
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
559
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
560
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
561
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.9131832797427653,
566
+ "grad_norm": 0.48291367292404175,
567
+ "learning_rate": 5.8046767537826685e-05,
568
+ "loss": 1.583,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.9260450160771704,
573
+ "grad_norm": 0.4866442382335663,
574
+ "learning_rate": 5.7359009628610725e-05,
575
+ "loss": 1.5891,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.9389067524115756,
580
+ "grad_norm": 0.5254418253898621,
581
+ "learning_rate": 5.667125171939478e-05,
582
+ "loss": 1.5319,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.9517684887459807,
587
+ "grad_norm": 0.5201655030250549,
588
+ "learning_rate": 5.598349381017882e-05,
589
+ "loss": 1.5819,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.9646302250803859,
594
+ "grad_norm": 0.5820693969726562,
595
+ "learning_rate": 5.5295735900962866e-05,
596
+ "loss": 1.5807,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.977491961414791,
601
+ "grad_norm": 0.559010922908783,
602
+ "learning_rate": 5.460797799174691e-05,
603
+ "loss": 1.5597,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.9903536977491961,
608
+ "grad_norm": 0.498877614736557,
609
+ "learning_rate": 5.392022008253095e-05,
610
+ "loss": 1.5628,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.002572347266881,
615
+ "grad_norm": 0.5119406580924988,
616
+ "learning_rate": 5.3232462173315e-05,
617
+ "loss": 1.5693,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.015434083601286,
622
+ "grad_norm": 0.5344542860984802,
623
+ "learning_rate": 5.254470426409904e-05,
624
+ "loss": 1.5256,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.0282958199356913,
629
+ "grad_norm": 0.5358342528343201,
630
+ "learning_rate": 5.185694635488308e-05,
631
+ "loss": 1.5432,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.0282958199356913,
636
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
637
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
638
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
639
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
640
+ "step": 800
641
+ },
642
+ {
643
+ "epoch": 1.0411575562700965,
644
+ "grad_norm": 0.5941470265388489,
645
+ "learning_rate": 5.116918844566713e-05,
646
+ "loss": 1.5449,
647
+ "step": 810
648
+ },
649
+ {
650
+ "epoch": 1.0540192926045016,
651
+ "grad_norm": 0.5659182071685791,
652
+ "learning_rate": 5.048143053645117e-05,
653
+ "loss": 1.5234,
654
+ "step": 820
655
+ },
656
+ {
657
+ "epoch": 1.0668810289389068,
658
+ "grad_norm": 0.5737349390983582,
659
+ "learning_rate": 4.9793672627235217e-05,
660
+ "loss": 1.5517,
661
+ "step": 830
662
+ },
663
+ {
664
+ "epoch": 1.0797427652733118,
665
+ "grad_norm": 0.5984872579574585,
666
+ "learning_rate": 4.910591471801926e-05,
667
+ "loss": 1.5175,
668
+ "step": 840
669
+ },
670
+ {
671
+ "epoch": 1.092604501607717,
672
+ "grad_norm": 0.5954984426498413,
673
+ "learning_rate": 4.8418156808803304e-05,
674
+ "loss": 1.5738,
675
+ "step": 850
676
+ },
677
+ {
678
+ "epoch": 1.1054662379421223,
679
+ "grad_norm": 0.5545582175254822,
680
+ "learning_rate": 4.7730398899587344e-05,
681
+ "loss": 1.5538,
682
+ "step": 860
683
+ },
684
+ {
685
+ "epoch": 1.1183279742765273,
686
+ "grad_norm": 0.6972865462303162,
687
+ "learning_rate": 4.704264099037139e-05,
688
+ "loss": 1.529,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 1.1311897106109325,
693
+ "grad_norm": 0.5404506325721741,
694
+ "learning_rate": 4.635488308115544e-05,
695
+ "loss": 1.5567,
696
+ "step": 880
697
+ },
698
+ {
699
+ "epoch": 1.1440514469453376,
700
+ "grad_norm": 0.5792121887207031,
701
+ "learning_rate": 4.566712517193948e-05,
702
+ "loss": 1.5422,
703
+ "step": 890
704
+ },
705
+ {
706
+ "epoch": 1.1569131832797428,
707
+ "grad_norm": 0.5468006134033203,
708
+ "learning_rate": 4.497936726272352e-05,
709
+ "loss": 1.5369,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 1.1569131832797428,
714
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
715
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
716
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
717
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
718
+ "step": 900
719
+ },
720
+ {
721
+ "epoch": 1.1697749196141478,
722
+ "grad_norm": 0.5955344438552856,
723
+ "learning_rate": 4.429160935350757e-05,
724
+ "loss": 1.5551,
725
+ "step": 910
726
+ },
727
+ {
728
+ "epoch": 1.182636655948553,
729
+ "grad_norm": 0.5832058787345886,
730
+ "learning_rate": 4.360385144429161e-05,
731
+ "loss": 1.5568,
732
+ "step": 920
733
+ },
734
+ {
735
+ "epoch": 1.1954983922829583,
736
+ "grad_norm": 0.6309258937835693,
737
+ "learning_rate": 4.291609353507566e-05,
738
+ "loss": 1.5548,
739
+ "step": 930
740
+ },
741
+ {
742
+ "epoch": 1.2083601286173633,
743
+ "grad_norm": 0.6269820928573608,
744
+ "learning_rate": 4.22283356258597e-05,
745
+ "loss": 1.5459,
746
+ "step": 940
747
+ },
748
+ {
749
+ "epoch": 1.2212218649517685,
750
+ "grad_norm": 0.6376837491989136,
751
+ "learning_rate": 4.154057771664374e-05,
752
+ "loss": 1.5277,
753
+ "step": 950
754
+ },
755
+ {
756
+ "epoch": 1.2340836012861736,
757
+ "grad_norm": 0.6351036429405212,
758
+ "learning_rate": 4.085281980742779e-05,
759
+ "loss": 1.5273,
760
+ "step": 960
761
+ },
762
+ {
763
+ "epoch": 1.2469453376205788,
764
+ "grad_norm": 0.6877638101577759,
765
+ "learning_rate": 4.016506189821183e-05,
766
+ "loss": 1.4986,
767
+ "step": 970
768
+ },
769
+ {
770
+ "epoch": 1.2598070739549838,
771
+ "grad_norm": 0.5501726865768433,
772
+ "learning_rate": 3.947730398899587e-05,
773
+ "loss": 1.5543,
774
+ "step": 980
775
+ },
776
+ {
777
+ "epoch": 1.272668810289389,
778
+ "grad_norm": 0.5217163562774658,
779
+ "learning_rate": 3.8789546079779924e-05,
780
+ "loss": 1.5292,
781
+ "step": 990
782
+ },
783
+ {
784
+ "epoch": 1.2855305466237943,
785
+ "grad_norm": 0.5770425796508789,
786
+ "learning_rate": 3.8101788170563964e-05,
787
+ "loss": 1.5536,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 1.2855305466237943,
792
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
793
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
794
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
795
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
796
+ "step": 1000
797
+ },
798
+ {
799
+ "epoch": 1.2983922829581993,
800
+ "grad_norm": 0.5802098512649536,
801
+ "learning_rate": 3.741403026134801e-05,
802
+ "loss": 1.5479,
803
+ "step": 1010
804
+ },
805
+ {
806
+ "epoch": 1.3112540192926045,
807
+ "grad_norm": 0.5646567344665527,
808
+ "learning_rate": 3.672627235213205e-05,
809
+ "loss": 1.5183,
810
+ "step": 1020
811
+ },
812
+ {
813
+ "epoch": 1.3241157556270098,
814
+ "grad_norm": 0.5852165222167969,
815
+ "learning_rate": 3.603851444291609e-05,
816
+ "loss": 1.5267,
817
+ "step": 1030
818
+ },
819
+ {
820
+ "epoch": 1.3369774919614148,
821
+ "grad_norm": 0.5583398342132568,
822
+ "learning_rate": 3.535075653370014e-05,
823
+ "loss": 1.5401,
824
+ "step": 1040
825
+ },
826
+ {
827
+ "epoch": 1.3498392282958198,
828
+ "grad_norm": 0.5971976518630981,
829
+ "learning_rate": 3.4662998624484186e-05,
830
+ "loss": 1.5147,
831
+ "step": 1050
+ },
+ {
+ "epoch": 1.362700964630225,
+ "grad_norm": 0.6036947965621948,
+ "learning_rate": 3.3975240715268227e-05,
+ "loss": 1.5294,
+ "step": 1060
+ },
+ {
+ "epoch": 1.3755627009646303,
+ "grad_norm": 0.5828876495361328,
+ "learning_rate": 3.3287482806052274e-05,
+ "loss": 1.546,
+ "step": 1070
+ },
+ {
+ "epoch": 1.3884244372990353,
+ "grad_norm": 0.5941759943962097,
+ "learning_rate": 3.2599724896836314e-05,
+ "loss": 1.5238,
+ "step": 1080
+ },
+ {
+ "epoch": 1.4012861736334405,
+ "grad_norm": 0.6082496047019958,
+ "learning_rate": 3.1911966987620354e-05,
+ "loss": 1.5055,
+ "step": 1090
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "grad_norm": 0.5749199390411377,
+ "learning_rate": 3.12242090784044e-05,
+ "loss": 1.5238,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "eval_yahma/alpaca-cleaned_loss": 1.5794486999511719,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9209,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.786,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4270096463022508,
+ "grad_norm": 0.649699330329895,
+ "learning_rate": 3.053645116918845e-05,
+ "loss": 1.5275,
+ "step": 1110
+ },
+ {
+ "epoch": 1.4398713826366558,
+ "grad_norm": 0.5754693150520325,
+ "learning_rate": 2.9848693259972492e-05,
+ "loss": 1.5217,
+ "step": 1120
+ },
+ {
+ "epoch": 1.452733118971061,
+ "grad_norm": 0.572021484375,
+ "learning_rate": 2.9160935350756536e-05,
+ "loss": 1.5489,
+ "step": 1130
+ },
+ {
+ "epoch": 1.4655948553054663,
+ "grad_norm": 0.6010130643844604,
+ "learning_rate": 2.8473177441540577e-05,
+ "loss": 1.5019,
+ "step": 1140
+ },
+ {
+ "epoch": 1.4784565916398713,
+ "grad_norm": 0.6172171831130981,
+ "learning_rate": 2.7785419532324624e-05,
+ "loss": 1.5703,
+ "step": 1150
+ },
+ {
+ "epoch": 1.4913183279742765,
+ "grad_norm": 0.5957326889038086,
+ "learning_rate": 2.7097661623108668e-05,
+ "loss": 1.5247,
+ "step": 1160
+ },
+ {
+ "epoch": 1.5041800643086818,
+ "grad_norm": 0.5608690977096558,
+ "learning_rate": 2.6409903713892708e-05,
+ "loss": 1.5403,
+ "step": 1170
+ },
+ {
+ "epoch": 1.5170418006430868,
+ "grad_norm": 0.5870776176452637,
+ "learning_rate": 2.5722145804676755e-05,
+ "loss": 1.5235,
+ "step": 1180
+ },
+ {
+ "epoch": 1.5299035369774918,
+ "grad_norm": 0.5889161229133606,
+ "learning_rate": 2.50343878954608e-05,
+ "loss": 1.5164,
+ "step": 1190
+ },
+ {
+ "epoch": 1.542765273311897,
+ "grad_norm": 0.6082655787467957,
+ "learning_rate": 2.4346629986244843e-05,
+ "loss": 1.5022,
+ "step": 1200
+ },
+ {
+ "epoch": 1.542765273311897,
+ "eval_yahma/alpaca-cleaned_loss": 1.5769098997116089,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9228,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.785,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1200
+ },
+ {
+ "epoch": 1.5556270096463023,
+ "grad_norm": 0.6997891664505005,
+ "learning_rate": 2.3658872077028886e-05,
+ "loss": 1.5197,
+ "step": 1210
+ },
+ {
+ "epoch": 1.5684887459807073,
+ "grad_norm": 0.6935648918151855,
+ "learning_rate": 2.2971114167812934e-05,
+ "loss": 1.5391,
+ "step": 1220
+ },
+ {
+ "epoch": 1.5813504823151125,
+ "grad_norm": 0.6135308742523193,
+ "learning_rate": 2.2283356258596974e-05,
+ "loss": 1.5238,
+ "step": 1230
+ },
+ {
+ "epoch": 1.5942122186495178,
+ "grad_norm": 0.5835321545600891,
+ "learning_rate": 2.1595598349381018e-05,
+ "loss": 1.5767,
+ "step": 1240
+ },
+ {
+ "epoch": 1.6070739549839228,
+ "grad_norm": 0.6089451313018799,
+ "learning_rate": 2.0907840440165065e-05,
+ "loss": 1.535,
+ "step": 1250
+ },
+ {
+ "epoch": 1.6199356913183278,
+ "grad_norm": 0.5886595249176025,
+ "learning_rate": 2.022008253094911e-05,
+ "loss": 1.5133,
+ "step": 1260
+ },
+ {
+ "epoch": 1.6327974276527333,
+ "grad_norm": 0.6229696273803711,
+ "learning_rate": 1.953232462173315e-05,
+ "loss": 1.5313,
+ "step": 1270
+ },
+ {
+ "epoch": 1.6456591639871383,
+ "grad_norm": 0.60906583070755,
+ "learning_rate": 1.8844566712517196e-05,
+ "loss": 1.5152,
+ "step": 1280
+ },
+ {
+ "epoch": 1.6585209003215433,
+ "grad_norm": 0.5806885957717896,
+ "learning_rate": 1.815680880330124e-05,
+ "loss": 1.5468,
+ "step": 1290
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "grad_norm": 0.6111522316932678,
+ "learning_rate": 1.746905089408528e-05,
+ "loss": 1.544,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "eval_yahma/alpaca-cleaned_loss": 1.574813961982727,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9178,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6842443729903538,
+ "grad_norm": 0.5954424738883972,
+ "learning_rate": 1.6781292984869327e-05,
+ "loss": 1.5253,
+ "step": 1310
+ },
+ {
+ "epoch": 1.6971061093247588,
+ "grad_norm": 0.5995926856994629,
+ "learning_rate": 1.609353507565337e-05,
+ "loss": 1.5306,
+ "step": 1320
+ },
+ {
+ "epoch": 1.7099678456591638,
+ "grad_norm": 0.6193538308143616,
+ "learning_rate": 1.5405777166437415e-05,
+ "loss": 1.5344,
+ "step": 1330
+ },
+ {
+ "epoch": 1.7228295819935693,
+ "grad_norm": 0.596823513507843,
+ "learning_rate": 1.4718019257221457e-05,
+ "loss": 1.5561,
+ "step": 1340
+ },
+ {
+ "epoch": 1.7356913183279743,
+ "grad_norm": 0.658667266368866,
+ "learning_rate": 1.4030261348005502e-05,
+ "loss": 1.5158,
+ "step": 1350
+ },
+ {
+ "epoch": 1.7485530546623793,
+ "grad_norm": 0.643640398979187,
+ "learning_rate": 1.3342503438789546e-05,
+ "loss": 1.5412,
+ "step": 1360
+ },
+ {
+ "epoch": 1.7614147909967846,
+ "grad_norm": 0.6444098353385925,
+ "learning_rate": 1.2654745529573592e-05,
+ "loss": 1.5098,
+ "step": 1370
+ },
+ {
+ "epoch": 1.7742765273311898,
+ "grad_norm": 0.518659234046936,
+ "learning_rate": 1.1966987620357635e-05,
+ "loss": 1.5418,
+ "step": 1380
+ },
+ {
+ "epoch": 1.7871382636655948,
+ "grad_norm": 0.5826813578605652,
+ "learning_rate": 1.127922971114168e-05,
+ "loss": 1.5204,
+ "step": 1390
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 0.6658011674880981,
+ "learning_rate": 1.0591471801925723e-05,
+ "loss": 1.5511,
+ "step": 1400
+ },
+ {
+ "epoch": 1.8,
+ "eval_yahma/alpaca-cleaned_loss": 1.5736079216003418,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9144,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.789,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.974,
+ "step": 1400
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1554,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0407960239485747e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebbdebe9dc37a4369f696bea5b64f89724529d824e49617f533e96962dbd5086
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:404b61918c973378dba2b4f9cd535fe8a301b95827dc90432aafea54870738db
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09933d2e1b8b7beafbf07ef8f20e61fc76608c66afbd496e8e3a5c7e934bb8f7
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f82069323487bff77227998f1defb2bb51d88b6e63100f619a2706217653b27d
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9817f8edc9cb5b39db981ecb240ea5a2cfbe7c3cb37093dba74fbe7c5aa21fa
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/trainer_state.json ADDED
@@ -0,0 +1,1245 @@
+ {
+ "best_metric": 1.5721148252487183,
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1500",
+ "epoch": 1.9980707395498394,
+ "eval_steps": 100,
+ "global_step": 1554,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0012861736334405145,
+ "grad_norm": 0.39783015847206116,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 2.0835,
+ "step": 1
+ },
+ {
+ "epoch": 0.012861736334405145,
+ "grad_norm": 0.45549583435058594,
+ "learning_rate": 1e-05,
+ "loss": 2.1408,
+ "step": 10
+ },
+ {
+ "epoch": 0.02572347266881029,
+ "grad_norm": 0.4594053626060486,
+ "learning_rate": 2e-05,
+ "loss": 2.0894,
+ "step": 20
+ },
+ {
+ "epoch": 0.03858520900321544,
+ "grad_norm": 0.49020764231681824,
+ "learning_rate": 3e-05,
+ "loss": 2.1037,
+ "step": 30
+ },
+ {
+ "epoch": 0.05144694533762058,
+ "grad_norm": 0.37993305921554565,
+ "learning_rate": 4e-05,
+ "loss": 1.9716,
+ "step": 40
+ },
+ {
+ "epoch": 0.06430868167202572,
+ "grad_norm": 0.38231977820396423,
+ "learning_rate": 5e-05,
+ "loss": 1.9349,
+ "step": 50
+ },
+ {
+ "epoch": 0.07717041800643087,
+ "grad_norm": 0.2922589182853699,
+ "learning_rate": 6e-05,
+ "loss": 1.906,
+ "step": 60
+ },
+ {
+ "epoch": 0.09003215434083602,
+ "grad_norm": 0.34647658467292786,
+ "learning_rate": 7e-05,
+ "loss": 1.8246,
+ "step": 70
+ },
+ {
+ "epoch": 0.10289389067524116,
+ "grad_norm": 0.31930026412010193,
+ "learning_rate": 8e-05,
+ "loss": 1.8057,
+ "step": 80
+ },
+ {
+ "epoch": 0.1157556270096463,
+ "grad_norm": 0.34028756618499756,
+ "learning_rate": 9e-05,
+ "loss": 1.7546,
+ "step": 90
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "grad_norm": 0.3878991901874542,
+ "learning_rate": 0.0001,
+ "loss": 1.7543,
+ "step": 100
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
+ "step": 100
+ },
+ {
+ "epoch": 0.1414790996784566,
+ "grad_norm": 0.35599613189697266,
+ "learning_rate": 9.931224209078405e-05,
+ "loss": 1.7309,
+ "step": 110
+ },
+ {
+ "epoch": 0.15434083601286175,
+ "grad_norm": 0.4075644016265869,
+ "learning_rate": 9.862448418156809e-05,
+ "loss": 1.6981,
+ "step": 120
+ },
+ {
+ "epoch": 0.16720257234726688,
+ "grad_norm": 0.4743317663669586,
+ "learning_rate": 9.793672627235215e-05,
+ "loss": 1.7011,
+ "step": 130
+ },
+ {
+ "epoch": 0.18006430868167203,
+ "grad_norm": 0.4701610505580902,
+ "learning_rate": 9.724896836313618e-05,
+ "loss": 1.6771,
+ "step": 140
+ },
+ {
+ "epoch": 0.19292604501607716,
+ "grad_norm": 0.49115318059921265,
+ "learning_rate": 9.656121045392023e-05,
+ "loss": 1.6633,
+ "step": 150
+ },
+ {
+ "epoch": 0.2057877813504823,
+ "grad_norm": 0.5177980661392212,
+ "learning_rate": 9.587345254470427e-05,
+ "loss": 1.6706,
+ "step": 160
+ },
+ {
+ "epoch": 0.21864951768488747,
+ "grad_norm": 0.465657114982605,
+ "learning_rate": 9.518569463548831e-05,
+ "loss": 1.6677,
+ "step": 170
+ },
+ {
+ "epoch": 0.2315112540192926,
+ "grad_norm": 0.5453551411628723,
+ "learning_rate": 9.449793672627235e-05,
+ "loss": 1.6656,
+ "step": 180
+ },
+ {
+ "epoch": 0.24437299035369775,
+ "grad_norm": 0.4150402545928955,
+ "learning_rate": 9.38101788170564e-05,
+ "loss": 1.6568,
+ "step": 190
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "grad_norm": 0.5106223225593567,
+ "learning_rate": 9.312242090784045e-05,
+ "loss": 1.6804,
+ "step": 200
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
+ "step": 200
+ },
+ {
+ "epoch": 0.27009646302250806,
+ "grad_norm": 0.47371965646743774,
+ "learning_rate": 9.243466299862448e-05,
+ "loss": 1.6235,
+ "step": 210
+ },
+ {
+ "epoch": 0.2829581993569132,
+ "grad_norm": 0.45723679661750793,
+ "learning_rate": 9.174690508940853e-05,
+ "loss": 1.6192,
+ "step": 220
+ },
+ {
+ "epoch": 0.2958199356913183,
+ "grad_norm": 0.46727871894836426,
+ "learning_rate": 9.105914718019258e-05,
+ "loss": 1.6129,
+ "step": 230
+ },
+ {
+ "epoch": 0.3086816720257235,
+ "grad_norm": 0.5216034054756165,
+ "learning_rate": 9.037138927097662e-05,
+ "loss": 1.6065,
+ "step": 240
+ },
+ {
+ "epoch": 0.3215434083601286,
+ "grad_norm": 0.46132415533065796,
+ "learning_rate": 8.968363136176067e-05,
+ "loss": 1.6374,
+ "step": 250
+ },
+ {
+ "epoch": 0.33440514469453375,
+ "grad_norm": 0.5699637532234192,
+ "learning_rate": 8.89958734525447e-05,
+ "loss": 1.6031,
+ "step": 260
+ },
+ {
+ "epoch": 0.34726688102893893,
+ "grad_norm": 0.46537184715270996,
+ "learning_rate": 8.830811554332875e-05,
+ "loss": 1.6196,
+ "step": 270
+ },
+ {
+ "epoch": 0.36012861736334406,
+ "grad_norm": 0.5034765005111694,
+ "learning_rate": 8.76203576341128e-05,
+ "loss": 1.6257,
+ "step": 280
+ },
+ {
+ "epoch": 0.3729903536977492,
+ "grad_norm": 0.48885518312454224,
+ "learning_rate": 8.693259972489685e-05,
+ "loss": 1.6195,
+ "step": 290
+ },
+ {
+ "epoch": 0.3858520900321543,
+ "grad_norm": 0.48295891284942627,
+ "learning_rate": 8.62448418156809e-05,
+ "loss": 1.6301,
+ "step": 300
+ },
+ {
+ "epoch": 0.3858520900321543,
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
+ "step": 300
+ },
+ {
+ "epoch": 0.3987138263665595,
+ "grad_norm": 0.4800078570842743,
+ "learning_rate": 8.555708390646493e-05,
+ "loss": 1.6171,
+ "step": 310
+ },
+ {
+ "epoch": 0.4115755627009646,
+ "grad_norm": 0.47452959418296814,
+ "learning_rate": 8.486932599724897e-05,
+ "loss": 1.6147,
+ "step": 320
+ },
+ {
+ "epoch": 0.42443729903536975,
+ "grad_norm": 0.5397221446037292,
+ "learning_rate": 8.418156808803301e-05,
+ "loss": 1.6041,
+ "step": 330
+ },
+ {
+ "epoch": 0.43729903536977494,
+ "grad_norm": 0.5501461029052734,
+ "learning_rate": 8.349381017881706e-05,
+ "loss": 1.6091,
+ "step": 340
+ },
+ {
+ "epoch": 0.45016077170418006,
+ "grad_norm": 0.47587981820106506,
+ "learning_rate": 8.28060522696011e-05,
+ "loss": 1.6008,
+ "step": 350
+ },
+ {
+ "epoch": 0.4630225080385852,
+ "grad_norm": 0.46644529700279236,
+ "learning_rate": 8.211829436038515e-05,
+ "loss": 1.6081,
+ "step": 360
+ },
+ {
+ "epoch": 0.4758842443729904,
+ "grad_norm": 0.5308094024658203,
+ "learning_rate": 8.14305364511692e-05,
+ "loss": 1.5987,
+ "step": 370
+ },
+ {
+ "epoch": 0.4887459807073955,
+ "grad_norm": 0.5304721593856812,
+ "learning_rate": 8.074277854195323e-05,
+ "loss": 1.6173,
+ "step": 380
+ },
+ {
+ "epoch": 0.5016077170418006,
+ "grad_norm": 0.6186290383338928,
+ "learning_rate": 8.005502063273728e-05,
+ "loss": 1.5879,
+ "step": 390
+ },
+ {
+ "epoch": 0.5144694533762058,
+ "grad_norm": 0.4936847388744354,
+ "learning_rate": 7.936726272352132e-05,
+ "loss": 1.5771,
+ "step": 400
+ },
+ {
+ "epoch": 0.5144694533762058,
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 400
+ },
+ {
+ "epoch": 0.5273311897106109,
+ "grad_norm": 0.4969954788684845,
+ "learning_rate": 7.867950481430537e-05,
+ "loss": 1.5873,
+ "step": 410
+ },
+ {
+ "epoch": 0.5401929260450161,
+ "grad_norm": 0.5539654493331909,
+ "learning_rate": 7.799174690508942e-05,
+ "loss": 1.5741,
+ "step": 420
+ },
+ {
+ "epoch": 0.5530546623794212,
+ "grad_norm": 0.4963805377483368,
+ "learning_rate": 7.730398899587345e-05,
+ "loss": 1.5883,
+ "step": 430
+ },
+ {
+ "epoch": 0.5659163987138264,
+ "grad_norm": 0.4849222004413605,
+ "learning_rate": 7.66162310866575e-05,
+ "loss": 1.6061,
+ "step": 440
+ },
+ {
+ "epoch": 0.5787781350482315,
+ "grad_norm": 0.5241298079490662,
+ "learning_rate": 7.592847317744153e-05,
+ "loss": 1.6118,
+ "step": 450
+ },
+ {
+ "epoch": 0.5916398713826366,
+ "grad_norm": 0.5051389336585999,
+ "learning_rate": 7.52407152682256e-05,
+ "loss": 1.5618,
+ "step": 460
+ },
+ {
+ "epoch": 0.6045016077170418,
+ "grad_norm": 0.49376770853996277,
+ "learning_rate": 7.455295735900963e-05,
+ "loss": 1.5871,
+ "step": 470
+ },
+ {
+ "epoch": 0.617363344051447,
+ "grad_norm": 0.49221155047416687,
+ "learning_rate": 7.386519944979367e-05,
+ "loss": 1.6037,
+ "step": 480
+ },
+ {
+ "epoch": 0.6302250803858521,
+ "grad_norm": 0.5378918647766113,
+ "learning_rate": 7.317744154057772e-05,
+ "loss": 1.5523,
+ "step": 490
+ },
+ {
+ "epoch": 0.6430868167202572,
+ "grad_norm": 0.5564639568328857,
+ "learning_rate": 7.248968363136176e-05,
+ "loss": 1.5885,
+ "step": 500
+ },
+ {
+ "epoch": 0.6430868167202572,
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
+ "step": 500
+ },
+ {
+ "epoch": 0.6559485530546624,
+ "grad_norm": 0.49083277583122253,
+ "learning_rate": 7.180192572214582e-05,
+ "loss": 1.5693,
+ "step": 510
+ },
+ {
+ "epoch": 0.6688102893890675,
+ "grad_norm": 0.5625829100608826,
+ "learning_rate": 7.111416781292985e-05,
+ "loss": 1.6023,
+ "step": 520
+ },
+ {
+ "epoch": 0.6816720257234726,
+ "grad_norm": 0.6078226566314697,
+ "learning_rate": 7.04264099037139e-05,
+ "loss": 1.5845,
+ "step": 530
+ },
+ {
+ "epoch": 0.6945337620578779,
+ "grad_norm": 0.48107999563217163,
+ "learning_rate": 6.973865199449794e-05,
+ "loss": 1.5682,
+ "step": 540
+ },
+ {
+ "epoch": 0.707395498392283,
+ "grad_norm": 0.5080347657203674,
+ "learning_rate": 6.905089408528198e-05,
+ "loss": 1.5839,
+ "step": 550
+ },
+ {
+ "epoch": 0.7202572347266881,
+ "grad_norm": 0.5683622360229492,
+ "learning_rate": 6.836313617606602e-05,
+ "loss": 1.5916,
+ "step": 560
+ },
+ {
+ "epoch": 0.7331189710610932,
+ "grad_norm": 0.4669715464115143,
+ "learning_rate": 6.767537826685007e-05,
+ "loss": 1.6146,
+ "step": 570
+ },
+ {
+ "epoch": 0.7459807073954984,
+ "grad_norm": 0.4946054518222809,
+ "learning_rate": 6.698762035763412e-05,
+ "loss": 1.5764,
+ "step": 580
+ },
+ {
+ "epoch": 0.7588424437299035,
+ "grad_norm": 0.4975377023220062,
+ "learning_rate": 6.629986244841817e-05,
+ "loss": 1.6035,
+ "step": 590
+ },
+ {
+ "epoch": 0.7717041800643086,
+ "grad_norm": 0.5511853098869324,
+ "learning_rate": 6.56121045392022e-05,
+ "loss": 1.5842,
+ "step": 600
+ },
+ {
+ "epoch": 0.7717041800643086,
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
+ "step": 600
+ },
+ {
+ "epoch": 0.7845659163987139,
+ "grad_norm": 0.5689719915390015,
+ "learning_rate": 6.492434662998625e-05,
+ "loss": 1.5656,
+ "step": 610
+ },
+ {
+ "epoch": 0.797427652733119,
+ "grad_norm": 0.48885637521743774,
+ "learning_rate": 6.42365887207703e-05,
+ "loss": 1.5605,
+ "step": 620
+ },
+ {
+ "epoch": 0.8102893890675241,
+ "grad_norm": 0.5316773056983948,
+ "learning_rate": 6.354883081155434e-05,
+ "loss": 1.5755,
+ "step": 630
+ },
+ {
+ "epoch": 0.8231511254019293,
+ "grad_norm": 0.5578161478042603,
+ "learning_rate": 6.286107290233837e-05,
+ "loss": 1.5532,
+ "step": 640
+ },
+ {
+ "epoch": 0.8360128617363344,
+ "grad_norm": 0.6534080505371094,
+ "learning_rate": 6.217331499312242e-05,
+ "loss": 1.5882,
+ "step": 650
+ },
+ {
+ "epoch": 0.8488745980707395,
+ "grad_norm": 0.5140324831008911,
+ "learning_rate": 6.148555708390647e-05,
+ "loss": 1.5598,
+ "step": 660
+ },
+ {
+ "epoch": 0.8617363344051447,
+ "grad_norm": 0.5247426629066467,
+ "learning_rate": 6.0797799174690516e-05,
+ "loss": 1.5833,
+ "step": 670
+ },
+ {
+ "epoch": 0.8745980707395499,
+ "grad_norm": 0.49460870027542114,
+ "learning_rate": 6.011004126547456e-05,
+ "loss": 1.621,
+ "step": 680
+ },
+ {
+ "epoch": 0.887459807073955,
+ "grad_norm": 0.5351711511611938,
+ "learning_rate": 5.9422283356258604e-05,
+ "loss": 1.5371,
+ "step": 690
+ },
+ {
+ "epoch": 0.9003215434083601,
+ "grad_norm": 0.5608878135681152,
+ "learning_rate": 5.8734525447042644e-05,
+ "loss": 1.5878,
+ "step": 700
+ },
+ {
+ "epoch": 0.9003215434083601,
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 700
+ },
+ {
+ "epoch": 0.9131832797427653,
+ "grad_norm": 0.48291367292404175,
+ "learning_rate": 5.8046767537826685e-05,
+ "loss": 1.583,
+ "step": 710
+ },
+ {
+ "epoch": 0.9260450160771704,
+ "grad_norm": 0.4866442382335663,
+ "learning_rate": 5.7359009628610725e-05,
+ "loss": 1.5891,
+ "step": 720
+ },
+ {
+ "epoch": 0.9389067524115756,
+ "grad_norm": 0.5254418253898621,
+ "learning_rate": 5.667125171939478e-05,
+ "loss": 1.5319,
+ "step": 730
+ },
+ {
+ "epoch": 0.9517684887459807,
+ "grad_norm": 0.5201655030250549,
+ "learning_rate": 5.598349381017882e-05,
+ "loss": 1.5819,
+ "step": 740
+ },
+ {
+ "epoch": 0.9646302250803859,
+ "grad_norm": 0.5820693969726562,
+ "learning_rate": 5.5295735900962866e-05,
+ "loss": 1.5807,
+ "step": 750
+ },
+ {
+ "epoch": 0.977491961414791,
+ "grad_norm": 0.559010922908783,
+ "learning_rate": 5.460797799174691e-05,
+ "loss": 1.5597,
+ "step": 760
+ },
+ {
+ "epoch": 0.9903536977491961,
+ "grad_norm": 0.498877614736557,
+ "learning_rate": 5.392022008253095e-05,
+ "loss": 1.5628,
+ "step": 770
+ },
+ {
+ "epoch": 1.002572347266881,
+ "grad_norm": 0.5119406580924988,
+ "learning_rate": 5.3232462173315e-05,
+ "loss": 1.5693,
+ "step": 780
+ },
+ {
+ "epoch": 1.015434083601286,
+ "grad_norm": 0.5344542860984802,
+ "learning_rate": 5.254470426409904e-05,
+ "loss": 1.5256,
+ "step": 790
+ },
+ {
+ "epoch": 1.0282958199356913,
+ "grad_norm": 0.5358342528343201,
+ "learning_rate": 5.185694635488308e-05,
+ "loss": 1.5432,
+ "step": 800
+ },
+ {
+ "epoch": 1.0282958199356913,
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
+ "step": 800
+ },
+ {
+ "epoch": 1.0411575562700965,
+ "grad_norm": 0.5941470265388489,
+ "learning_rate": 5.116918844566713e-05,
+ "loss": 1.5449,
+ "step": 810
+ },
+ {
+ "epoch": 1.0540192926045016,
+ "grad_norm": 0.5659182071685791,
+ "learning_rate": 5.048143053645117e-05,
+ "loss": 1.5234,
+ "step": 820
+ },
+ {
+ "epoch": 1.0668810289389068,
+ "grad_norm": 0.5737349390983582,
+ "learning_rate": 4.9793672627235217e-05,
+ "loss": 1.5517,
+ "step": 830
+ },
+ {
+ "epoch": 1.0797427652733118,
+ "grad_norm": 0.5984872579574585,
+ "learning_rate": 4.910591471801926e-05,
+ "loss": 1.5175,
+ "step": 840
+ },
+ {
+ "epoch": 1.092604501607717,
+ "grad_norm": 0.5954984426498413,
+ "learning_rate": 4.8418156808803304e-05,
+ "loss": 1.5738,
+ "step": 850
+ },
+ {
+ "epoch": 1.1054662379421223,
+ "grad_norm": 0.5545582175254822,
+ "learning_rate": 4.7730398899587344e-05,
+ "loss": 1.5538,
+ "step": 860
+ },
+ {
+ "epoch": 1.1183279742765273,
+ "grad_norm": 0.6972865462303162,
+ "learning_rate": 4.704264099037139e-05,
+ "loss": 1.529,
+ "step": 870
+ },
+ {
+ "epoch": 1.1311897106109325,
+ "grad_norm": 0.5404506325721741,
+ "learning_rate": 4.635488308115544e-05,
+ "loss": 1.5567,
+ "step": 880
+ },
+ {
+ "epoch": 1.1440514469453376,
+ "grad_norm": 0.5792121887207031,
+ "learning_rate": 4.566712517193948e-05,
+ "loss": 1.5422,
+ "step": 890
+ },
+ {
+ "epoch": 1.1569131832797428,
+ "grad_norm": 0.5468006134033203,
+ "learning_rate": 4.497936726272352e-05,
+ "loss": 1.5369,
+ "step": 900
+ },
+ {
+ "epoch": 1.1569131832797428,
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
+ "step": 900
+ },
+ {
+ "epoch": 1.1697749196141478,
+ "grad_norm": 0.5955344438552856,
+ "learning_rate": 4.429160935350757e-05,
+ "loss": 1.5551,
+ "step": 910
+ },
+ {
+ "epoch": 1.182636655948553,
+ "grad_norm": 0.5832058787345886,
+ "learning_rate": 4.360385144429161e-05,
+ "loss": 1.5568,
+ "step": 920
+ },
+ {
+ "epoch": 1.1954983922829583,
+ "grad_norm": 0.6309258937835693,
+ "learning_rate": 4.291609353507566e-05,
+ "loss": 1.5548,
+ "step": 930
+ },
+ {
+ "epoch": 1.2083601286173633,
+ "grad_norm": 0.6269820928573608,
+ "learning_rate": 4.22283356258597e-05,
+ "loss": 1.5459,
+ "step": 940
+ },
+ {
+ "epoch": 1.2212218649517685,
+ "grad_norm": 0.6376837491989136,
+ "learning_rate": 4.154057771664374e-05,
+ "loss": 1.5277,
+ "step": 950
+ },
+ {
+ "epoch": 1.2340836012861736,
+ "grad_norm": 0.6351036429405212,
+ "learning_rate": 4.085281980742779e-05,
+ "loss": 1.5273,
+ "step": 960
+ },
+ {
+ "epoch": 1.2469453376205788,
+ "grad_norm": 0.6877638101577759,
+ "learning_rate": 4.016506189821183e-05,
+ "loss": 1.4986,
+ "step": 970
+ },
+ {
+ "epoch": 1.2598070739549838,
+ "grad_norm": 0.5501726865768433,
+ "learning_rate": 3.947730398899587e-05,
+ "loss": 1.5543,
+ "step": 980
+ },
+ {
+ "epoch": 1.272668810289389,
+ "grad_norm": 0.5217163562774658,
+ "learning_rate": 3.8789546079779924e-05,
+ "loss": 1.5292,
+ "step": 990
+ },
+ {
+ "epoch": 1.2855305466237943,
+ "grad_norm": 0.5770425796508789,
+ "learning_rate": 3.8101788170563964e-05,
+ "loss": 1.5536,
+ "step": 1000
+ },
+ {
+ "epoch": 1.2855305466237943,
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
+ "step": 1000
+ },
+ {
+ "epoch": 1.2983922829581993,
+ "grad_norm": 0.5802098512649536,
+ "learning_rate": 3.741403026134801e-05,
+ "loss": 1.5479,
+ "step": 1010
+ },
+ {
+ "epoch": 1.3112540192926045,
+ "grad_norm": 0.5646567344665527,
+ "learning_rate": 3.672627235213205e-05,
+ "loss": 1.5183,
+ "step": 1020
+ },
+ {
+ "epoch": 1.3241157556270098,
+ "grad_norm": 0.5852165222167969,
+ "learning_rate": 3.603851444291609e-05,
+ "loss": 1.5267,
+ "step": 1030
+ },
+ {
+ "epoch": 1.3369774919614148,
+ "grad_norm": 0.5583398342132568,
+ "learning_rate": 3.535075653370014e-05,
+ "loss": 1.5401,
+ "step": 1040
+ },
+ {
+ "epoch": 1.3498392282958198,
+ "grad_norm": 0.5971976518630981,
+ "learning_rate": 3.4662998624484186e-05,
+ "loss": 1.5147,
+ "step": 1050
+ },
+ {
+ "epoch": 1.362700964630225,
+ "grad_norm": 0.6036947965621948,
+ "learning_rate": 3.3975240715268227e-05,
+ "loss": 1.5294,
+ "step": 1060
+ },
+ {
+ "epoch": 1.3755627009646303,
+ "grad_norm": 0.5828876495361328,
+ "learning_rate": 3.3287482806052274e-05,
+ "loss": 1.546,
+ "step": 1070
+ },
+ {
+ "epoch": 1.3884244372990353,
+ "grad_norm": 0.5941759943962097,
+ "learning_rate": 3.2599724896836314e-05,
+ "loss": 1.5238,
+ "step": 1080
+ },
+ {
+ "epoch": 1.4012861736334405,
+ "grad_norm": 0.6082496047019958,
+ "learning_rate": 3.1911966987620354e-05,
+ "loss": 1.5055,
+ "step": 1090
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "grad_norm": 0.5749199390411377,
+ "learning_rate": 3.12242090784044e-05,
+ "loss": 1.5238,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "eval_yahma/alpaca-cleaned_loss": 1.5794486999511719,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9209,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.786,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4270096463022508,
+ "grad_norm": 0.649699330329895,
+ "learning_rate": 3.053645116918845e-05,
+ "loss": 1.5275,
+ "step": 1110
+ },
+ {
+ "epoch": 1.4398713826366558,
+ "grad_norm": 0.5754693150520325,
+ "learning_rate": 2.9848693259972492e-05,
+ "loss": 1.5217,
+ "step": 1120
+ },
+ {
+ "epoch": 1.452733118971061,
+ "grad_norm": 0.572021484375,
+ "learning_rate": 2.9160935350756536e-05,
+ "loss": 1.5489,
+ "step": 1130
+ },
+ {
+ "epoch": 1.4655948553054663,
+ "grad_norm": 0.6010130643844604,
+ "learning_rate": 2.8473177441540577e-05,
+ "loss": 1.5019,
+ "step": 1140
+ },
+ {
+ "epoch": 1.4784565916398713,
+ "grad_norm": 0.6172171831130981,
+ "learning_rate": 2.7785419532324624e-05,
+ "loss": 1.5703,
+ "step": 1150
+ },
+ {
+ "epoch": 1.4913183279742765,
+ "grad_norm": 0.5957326889038086,
+ "learning_rate": 2.7097661623108668e-05,
+ "loss": 1.5247,
+ "step": 1160
+ },
+ {
+ "epoch": 1.5041800643086818,
+ "grad_norm": 0.5608690977096558,
+ "learning_rate": 2.6409903713892708e-05,
+ "loss": 1.5403,
+ "step": 1170
+ },
+ {
+ "epoch": 1.5170418006430868,
+ "grad_norm": 0.5870776176452637,
+ "learning_rate": 2.5722145804676755e-05,
+ "loss": 1.5235,
+ "step": 1180
+ },
+ {
+ "epoch": 1.5299035369774918,
+ "grad_norm": 0.5889161229133606,
+ "learning_rate": 2.50343878954608e-05,
+ "loss": 1.5164,
+ "step": 1190
+ },
+ {
+ "epoch": 1.542765273311897,
+ "grad_norm": 0.6082655787467957,
+ "learning_rate": 2.4346629986244843e-05,
+ "loss": 1.5022,
+ "step": 1200
+ },
+ {
+ "epoch": 1.542765273311897,
+ "eval_yahma/alpaca-cleaned_loss": 1.5769098997116089,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9228,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.785,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1200
+ },
+ {
+ "epoch": 1.5556270096463023,
+ "grad_norm": 0.6997891664505005,
+ "learning_rate": 2.3658872077028886e-05,
+ "loss": 1.5197,
+ "step": 1210
+ },
+ {
+ "epoch": 1.5684887459807073,
+ "grad_norm": 0.6935648918151855,
+ "learning_rate": 2.2971114167812934e-05,
+ "loss": 1.5391,
+ "step": 1220
+ },
+ {
+ "epoch": 1.5813504823151125,
+ "grad_norm": 0.6135308742523193,
+ "learning_rate": 2.2283356258596974e-05,
+ "loss": 1.5238,
+ "step": 1230
+ },
+ {
+ "epoch": 1.5942122186495178,
+ "grad_norm": 0.5835321545600891,
+ "learning_rate": 2.1595598349381018e-05,
+ "loss": 1.5767,
+ "step": 1240
+ },
+ {
+ "epoch": 1.6070739549839228,
+ "grad_norm": 0.6089451313018799,
+ "learning_rate": 2.0907840440165065e-05,
+ "loss": 1.535,
+ "step": 1250
+ },
+ {
+ "epoch": 1.6199356913183278,
+ "grad_norm": 0.5886595249176025,
+ "learning_rate": 2.022008253094911e-05,
+ "loss": 1.5133,
+ "step": 1260
+ },
+ {
+ "epoch": 1.6327974276527333,
+ "grad_norm": 0.6229696273803711,
+ "learning_rate": 1.953232462173315e-05,
+ "loss": 1.5313,
+ "step": 1270
+ },
+ {
+ "epoch": 1.6456591639871383,
+ "grad_norm": 0.60906583070755,
+ "learning_rate": 1.8844566712517196e-05,
+ "loss": 1.5152,
+ "step": 1280
+ },
+ {
+ "epoch": 1.6585209003215433,
+ "grad_norm": 0.5806885957717896,
+ "learning_rate": 1.815680880330124e-05,
+ "loss": 1.5468,
+ "step": 1290
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "grad_norm": 0.6111522316932678,
+ "learning_rate": 1.746905089408528e-05,
+ "loss": 1.544,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "eval_yahma/alpaca-cleaned_loss": 1.574813961982727,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9178,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6842443729903538,
+ "grad_norm": 0.5954424738883972,
+ "learning_rate": 1.6781292984869327e-05,
+ "loss": 1.5253,
+ "step": 1310
+ },
+ {
+ "epoch": 1.6971061093247588,
+ "grad_norm": 0.5995926856994629,
+ "learning_rate": 1.609353507565337e-05,
+ "loss": 1.5306,
+ "step": 1320
+ },
+ {
+ "epoch": 1.7099678456591638,
+ "grad_norm": 0.6193538308143616,
+ "learning_rate": 1.5405777166437415e-05,
+ "loss": 1.5344,
+ "step": 1330
+ },
+ {
+ "epoch": 1.7228295819935693,
+ "grad_norm": 0.596823513507843,
+ "learning_rate": 1.4718019257221457e-05,
+ "loss": 1.5561,
+ "step": 1340
+ },
+ {
+ "epoch": 1.7356913183279743,
+ "grad_norm": 0.658667266368866,
+ "learning_rate": 1.4030261348005502e-05,
+ "loss": 1.5158,
+ "step": 1350
+ },
+ {
+ "epoch": 1.7485530546623793,
+ "grad_norm": 0.643640398979187,
+ "learning_rate": 1.3342503438789546e-05,
+ "loss": 1.5412,
+ "step": 1360
+ },
+ {
+ "epoch": 1.7614147909967846,
+ "grad_norm": 0.6444098353385925,
+ "learning_rate": 1.2654745529573592e-05,
+ "loss": 1.5098,
+ "step": 1370
+ },
+ {
+ "epoch": 1.7742765273311898,
+ "grad_norm": 0.518659234046936,
+ "learning_rate": 1.1966987620357635e-05,
+ "loss": 1.5418,
+ "step": 1380
+ },
+ {
+ "epoch": 1.7871382636655948,
+ "grad_norm": 0.5826813578605652,
+ "learning_rate": 1.127922971114168e-05,
+ "loss": 1.5204,
+ "step": 1390
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 0.6658011674880981,
+ "learning_rate": 1.0591471801925723e-05,
+ "loss": 1.5511,
+ "step": 1400
+ },
+ {
+ "epoch": 1.8,
+ "eval_yahma/alpaca-cleaned_loss": 1.5736079216003418,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9144,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.789,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.974,
+ "step": 1400
+ },
+ {
+ "epoch": 1.8128617363344053,
+ "grad_norm": 0.5695498585700989,
+ "learning_rate": 9.903713892709767e-06,
+ "loss": 1.527,
+ "step": 1410
+ },
+ {
+ "epoch": 1.8257234726688103,
+ "grad_norm": 0.607875645160675,
+ "learning_rate": 9.21595598349381e-06,
+ "loss": 1.5183,
+ "step": 1420
+ },
+ {
+ "epoch": 1.8385852090032153,
+ "grad_norm": 0.5988701581954956,
+ "learning_rate": 8.528198074277854e-06,
+ "loss": 1.5459,
+ "step": 1430
+ },
+ {
+ "epoch": 1.8514469453376206,
+ "grad_norm": 0.6526191234588623,
+ "learning_rate": 7.8404401650619e-06,
+ "loss": 1.5566,
+ "step": 1440
+ },
+ {
+ "epoch": 1.8643086816720258,
+ "grad_norm": 0.5458080768585205,
+ "learning_rate": 7.152682255845943e-06,
+ "loss": 1.5176,
+ "step": 1450
+ },
+ {
+ "epoch": 1.8771704180064308,
+ "grad_norm": 0.6263613700866699,
+ "learning_rate": 6.464924346629987e-06,
+ "loss": 1.5234,
+ "step": 1460
+ },
+ {
+ "epoch": 1.890032154340836,
+ "grad_norm": 0.6338502168655396,
+ "learning_rate": 5.77716643741403e-06,
+ "loss": 1.5376,
+ "step": 1470
+ },
+ {
+ "epoch": 1.9028938906752413,
+ "grad_norm": 0.6531928181648254,
+ "learning_rate": 5.089408528198075e-06,
+ "loss": 1.5247,
+ "step": 1480
+ },
+ {
+ "epoch": 1.9157556270096463,
+ "grad_norm": 0.6073517203330994,
+ "learning_rate": 4.4016506189821186e-06,
+ "loss": 1.5398,
+ "step": 1490
+ },
+ {
+ "epoch": 1.9286173633440513,
+ "grad_norm": 0.6269332766532898,
+ "learning_rate": 3.7138927097661627e-06,
+ "loss": 1.5597,
+ "step": 1500
+ },
+ {
+ "epoch": 1.9286173633440513,
+ "eval_yahma/alpaca-cleaned_loss": 1.5721148252487183,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9499,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
+ "step": 1500
+ },
+ {
+ "epoch": 1.9414790996784566,
+ "grad_norm": 0.5752962231636047,
+ "learning_rate": 3.0261348005502065e-06,
+ "loss": 1.5468,
+ "step": 1510
+ },
+ {
+ "epoch": 1.9543408360128618,
+ "grad_norm": 0.5622620582580566,
+ "learning_rate": 2.3383768913342507e-06,
+ "loss": 1.5588,
+ "step": 1520
+ },
+ {
+ "epoch": 1.9672025723472668,
+ "grad_norm": 0.6163848042488098,
+ "learning_rate": 1.6506189821182942e-06,
+ "loss": 1.482,
+ "step": 1530
+ },
+ {
+ "epoch": 1.980064308681672,
+ "grad_norm": 0.6466639041900635,
+ "learning_rate": 9.628610729023384e-07,
+ "loss": 1.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 1.9929260450160773,
+ "grad_norm": 0.6139137148857117,
+ "learning_rate": 2.751031636863824e-07,
+ "loss": 1.4905,
+ "step": 1550
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1554,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.1547885106338202e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-200/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5132ee225c5136a04b393a5b013eaae546265c15ef4d93460674de77e5f724d2
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b31b7db31448cbea4a2b26ecfb4f5e38242c8fef6d933e20d80a45340fa2e2e7
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1726ab754f473657bf32650b33d136b1ba1d1d1c74e402fbbacb2a89a6809796
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-200/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bd55b3848d82967a207e0805911c79200c6adce71e3b37fd24549a718f75738
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:363127827fa84aceb28f95364631df4397d57dee08819ca6a0979763f837be6f
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,196 @@
+ {
+ "best_metric": 1.6644691228866577,
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-200",
+ "epoch": 0.2572347266881029,
+ "eval_steps": 100,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0012861736334405145,
+ "grad_norm": 0.39783015847206116,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 2.0835,
+ "step": 1
+ },
+ {
+ "epoch": 0.012861736334405145,
+ "grad_norm": 0.45549583435058594,
+ "learning_rate": 1e-05,
+ "loss": 2.1408,
+ "step": 10
+ },
+ {
+ "epoch": 0.02572347266881029,
+ "grad_norm": 0.4594053626060486,
+ "learning_rate": 2e-05,
+ "loss": 2.0894,
+ "step": 20
+ },
+ {
+ "epoch": 0.03858520900321544,
+ "grad_norm": 0.49020764231681824,
+ "learning_rate": 3e-05,
+ "loss": 2.1037,
+ "step": 30
+ },
+ {
+ "epoch": 0.05144694533762058,
+ "grad_norm": 0.37993305921554565,
+ "learning_rate": 4e-05,
+ "loss": 1.9716,
+ "step": 40
+ },
+ {
+ "epoch": 0.06430868167202572,
+ "grad_norm": 0.38231977820396423,
+ "learning_rate": 5e-05,
+ "loss": 1.9349,
+ "step": 50
+ },
+ {
+ "epoch": 0.07717041800643087,
+ "grad_norm": 0.2922589182853699,
+ "learning_rate": 6e-05,
+ "loss": 1.906,
+ "step": 60
+ },
+ {
+ "epoch": 0.09003215434083602,
+ "grad_norm": 0.34647658467292786,
+ "learning_rate": 7e-05,
+ "loss": 1.8246,
+ "step": 70
+ },
+ {
+ "epoch": 0.10289389067524116,
+ "grad_norm": 0.31930026412010193,
+ "learning_rate": 8e-05,
+ "loss": 1.8057,
+ "step": 80
+ },
+ {
+ "epoch": 0.1157556270096463,
+ "grad_norm": 0.34028756618499756,
+ "learning_rate": 9e-05,
+ "loss": 1.7546,
+ "step": 90
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "grad_norm": 0.3878991901874542,
+ "learning_rate": 0.0001,
+ "loss": 1.7543,
+ "step": 100
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
+ "step": 100
+ },
+ {
+ "epoch": 0.1414790996784566,
+ "grad_norm": 0.35599613189697266,
+ "learning_rate": 9.931224209078405e-05,
+ "loss": 1.7309,
+ "step": 110
+ },
+ {
+ "epoch": 0.15434083601286175,
+ "grad_norm": 0.4075644016265869,
+ "learning_rate": 9.862448418156809e-05,
+ "loss": 1.6981,
+ "step": 120
+ },
+ {
+ "epoch": 0.16720257234726688,
+ "grad_norm": 0.4743317663669586,
+ "learning_rate": 9.793672627235215e-05,
+ "loss": 1.7011,
+ "step": 130
+ },
+ {
+ "epoch": 0.18006430868167203,
+ "grad_norm": 0.4701610505580902,
+ "learning_rate": 9.724896836313618e-05,
+ "loss": 1.6771,
+ "step": 140
+ },
+ {
+ "epoch": 0.19292604501607716,
+ "grad_norm": 0.49115318059921265,
+ "learning_rate": 9.656121045392023e-05,
+ "loss": 1.6633,
+ "step": 150
+ },
+ {
+ "epoch": 0.2057877813504823,
+ "grad_norm": 0.5177980661392212,
+ "learning_rate": 9.587345254470427e-05,
+ "loss": 1.6706,
+ "step": 160
+ },
+ {
+ "epoch": 0.21864951768488747,
+ "grad_norm": 0.465657114982605,
+ "learning_rate": 9.518569463548831e-05,
+ "loss": 1.6677,
+ "step": 170
+ },
+ {
+ "epoch": 0.2315112540192926,
+ "grad_norm": 0.5453551411628723,
+ "learning_rate": 9.449793672627235e-05,
+ "loss": 1.6656,
+ "step": 180
+ },
+ {
+ "epoch": 0.24437299035369775,
+ "grad_norm": 0.4150402545928955,
+ "learning_rate": 9.38101788170564e-05,
+ "loss": 1.6568,
+ "step": 190
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "grad_norm": 0.5106223225593567,
+ "learning_rate": 9.312242090784045e-05,
+ "loss": 1.6804,
+ "step": 200
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
+ "step": 200
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1554,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.487298513076224e+16,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
tune_log/layerskip_1b_0.25_tune/checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
+ size 5368