Neooooo committed
Commit 703fc3b · verified · 1 Parent(s): 8e9022b

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.

Files changed (50)
  1. .DS_Store +0 -0
  2. prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/description.txt +28 -0
  3. prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/train.sh +1 -0
  4. prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/training.log +551 -0
  5. prune_log/layerskip_1b_prune_0.25/description.txt +28 -0
  6. prune_log/layerskip_1b_prune_0.25/pytorch_model.bin +3 -0
  7. prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/description.txt +28 -0
  8. prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/train.sh +1 -0
  9. prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/training.log +1501 -0
  10. prune_log/vanilla_llama_1b_prune_0.25/description.txt +28 -0
  11. prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin +3 -0
  12. tune_log/.DS_Store +0 -0
  13. tune_log/layerskip_1b_0.25_tune/.DS_Store +0 -0
  14. tune_log/layerskip_1b_0.25_tune/adapter_config.json +22 -0
  15. tune_log/layerskip_1b_0.25_tune/adapter_model.bin +3 -0
  16. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/model.safetensors +3 -0
  17. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/optimizer.pt +3 -0
  18. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/rng_state.pth +3 -0
  19. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scaler.pt +3 -0
  20. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scheduler.pt +3 -0
  21. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/trainer_state.json +820 -0
  22. tune_log/layerskip_1b_0.25_tune/checkpoint-1000/training_args.bin +3 -0
  23. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/model.safetensors +3 -0
  24. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/optimizer.pt +3 -0
  25. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/rng_state.pth +3 -0
  26. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scaler.pt +3 -0
  27. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scheduler.pt +3 -0
  28. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/trainer_state.json +976 -0
  29. tune_log/layerskip_1b_0.25_tune/checkpoint-1200/training_args.bin +3 -0
  30. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/model.safetensors +3 -0
  31. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/optimizer.pt +3 -0
  32. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/rng_state.pth +3 -0
  33. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scaler.pt +3 -0
  34. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scheduler.pt +3 -0
  35. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/trainer_state.json +1132 -0
  36. tune_log/layerskip_1b_0.25_tune/checkpoint-1400/training_args.bin +3 -0
  37. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/model.safetensors +3 -0
  38. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/optimizer.pt +3 -0
  39. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/rng_state.pth +3 -0
  40. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scaler.pt +3 -0
  41. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scheduler.pt +3 -0
  42. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/trainer_state.json +1245 -0
  43. tune_log/layerskip_1b_0.25_tune/checkpoint-1554/training_args.bin +3 -0
  44. tune_log/layerskip_1b_0.25_tune/checkpoint-200/model.safetensors +3 -0
  45. tune_log/layerskip_1b_0.25_tune/checkpoint-200/optimizer.pt +3 -0
  46. tune_log/layerskip_1b_0.25_tune/checkpoint-200/rng_state.pth +3 -0
  47. tune_log/layerskip_1b_0.25_tune/checkpoint-200/scaler.pt +3 -0
  48. tune_log/layerskip_1b_0.25_tune/checkpoint-200/scheduler.pt +3 -0
  49. tune_log/layerskip_1b_0.25_tune/checkpoint-200/trainer_state.json +196 -0
  50. tune_log/layerskip_1b_0.25_tune/checkpoint-200/training_args.bin +3 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: facebook/layerskip-llama3.2-1B
+ - save_ckpt_log_name: layerskip_1b_prune_0.25
+ - pruning_ratio: 0.25
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 2
+ - block_attention_layer_end: 13
+ - block_mlp_layer_start: 2
+ - block_mlp_layer_end: 13
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: param_mix
+ - num_examples: 10
+ - device: cuda
+ - test_before_train: True
+ - eval_device: cuda
+ - test_after_train: True
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
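
The description.txt files in this commit all use the same `- key: value` layout, which looks like a dump of the run's argument namespace. As a rough sketch of how such a file could be produced (the `write_description` helper is hypothetical, not code from this repo; the values shown are copied from the file above):

```python
# Hypothetical sketch: write a description.txt in the same
# "- Training Parameters:" / " - key: value" layout as above.
import argparse

def write_description(args: argparse.Namespace, path: str) -> None:
    with open(path, "w") as f:
        f.write("- Training Parameters:\n")
        for key, value in vars(args).items():
            f.write(f" - {key}: {value}\n")

args = argparse.Namespace(
    base_model="facebook/layerskip-llama3.2-1B",
    save_ckpt_log_name="layerskip_1b_prune_0.25",
    pruning_ratio=0.25,
    pruner_type="taylor",
    max_seq_len=2048,
)
write_description(args, "description.txt")
```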
prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/train.sh ADDED
@@ -0,0 +1 @@
+ python llama3.py --pruning_ratio 0.25 --device cuda --eval_device cuda --base_model facebook/layerskip-llama3.2-1B --block_wise --block_mlp_layer_start 2 --block_mlp_layer_end 13 --block_attention_layer_start 2 --block_attention_layer_end 13 --save_ckpt_log_name layerskip_1b_prune_0.25 --pruner_type taylor --taylor param_mix --max_seq_len 2048 --test_after_train --test_before_train --save_model
prune_log/layerskip_1b_prune_0.25/2025-03-14-11-11-31/training.log ADDED
@@ -0,0 +1,551 @@
+ 2025-03-14 11:11:32 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-03-14 11:11:33 - INFO :
+ ==================Generation Results before Pruning================
+
+ 2025-03-14 11:11:33 - INFO : the shape of current input sequences is ===tensor([[128000, 40, 4510, 279, 7438, 315, 2324, 374]],
+ device='cuda:0')===
+ 2025-03-14 11:11:50 - INFO : <|begin_of_text|>I believe the meaning of life is to find the most delicious way to die. — Dorothea Lange
+ I am a 47-year-old woman living with bipolar disorder. I don't believe this to be the cause of my symptoms, but if I am not careful, it could have an effect on my treatment options.
+ My symptoms began to emerge when I was 21 years old. My father, an environmental scientist, was diagnosed with bipolar disorder in the 1950s. I grew up with him, watching him suffer. He was an extremely loving and nurturing man, with a gentle soul. Bipolar disorder took a toll on him, and it left him struggling for much of his life. Eventually, I was born and grew up in the same home.
+ At one point, I thought my father would live into my 20s. He was still relatively strong when he suffered a heart attack at age 40. I remember watching him die that day in my home with great clarity. I remember my mom was so frightened of what was about to happen. I remember hearing the nurses and doctors talking on the phone, anxiously waiting to learn whether my father was going to live through the heart attack or not.
+ Bipolar disorder was very much a part of my family history. Even though my symptoms emerged at a later age and I am not as close to bipolar disorder as I was to my father, I feel much sympathy for people who suffer from this disorder, because, as I mentioned, it was very hard on my father.
+ When I first learned that I had bipolar disorder, I was angry and upset. I had no clue how this new information would affect my life, so I was not expecting to see how much my father had suffered. I was hoping that by not having the same struggle, I wouldn't suffer the same level of pain, and I would just live my life without any knowledge of bipolar disorder, as though I hadn't had any symptoms at all. But, this hope quickly turned into anger. I was very angry that my father had suffered, not only from bipolar disorder, but the pain associated with his illness and his diagnosis. I was very angry that my parents had not been given the option to be treated by a doctor who understood what bipolar disorder was and how to treat it.
+ I do not think bipolar disorder is going to be eliminated from the human genome any time soon, but that doesn't mean I am going to die of suicide or alcoholism.
+ There is a whole new field of medicine called behavioral medicine, which focuses on the emotional and behavioral aspects of illness, and how these affect treatment outcomes. Because of the increased emphasis on the emotional and behavioral side of illness, it is more likely that these issues will be dealt with in a more holistic way and that you'll be able to get better treatment.
+ The good news is that there are many people who are already successful in treating their illness in a more holistic way. It's important to keep an open mind and not to judge someone for their choice of treatment. I think it's great to have options, and I think that there is nothing wrong with someone who chooses to go on a road that is not straight.
+ The bad news is that bipolar disorder is a very dangerous disease. It is very easy to die from it, and if you don't take care of yourself, it can do a lot of damage. It's important to take good care of yourself, and it's even more important to keep your diagnosis as simple as possible. If you are able to take care of yourself, you'll be much better off.
+ I will definitely make an appointment with the doctor to find out whether or not I have bipolar disorder. If I do, I will make an appointment with a psychiatrist to get help with my treatment.
+ It's interesting to note that when I was diagnosed with bipolar disorder, I wasn't really aware of what that means. For me, it's not even as though I'm diagnosed with bipolar disorder.
+ I am not as interested in the diagnosis as I am in the treatment. That's the way I'm gonna go with this. I am taking care of myself, and I am very much interested in seeing how I can be treated with this.
+ I'll keep you posted. — Dr. Oz
+ Bipolar disorder is a mental illness that affects the brain and behaviors.
+ I believe that the best way to treat bipolar disorder is with medication.
+ I was diagnosed with bipolar disorder at age 26.
+ I'm not sure that there's any such thing as 'being a good bipolar patient' — I think that there's a 'being good at bipolar disorder.' — Dr. Oz
+ I am bipolar. It's a great feeling. — Dr. Oz
+ I am bipolar. There is no such thing as 'being bipolar.' I am bipolar. I am bipolar. There is no such thing as 'being bipolar.' I am bipolar. I am bipolar. There is no such thing as 'being bipolar.' I am bipolar.
+ I think that the way that I am seeing the world at this point in time is that everything is just really hard. It seems like we are in a constant battle between life and death. And I think that it would be so easy to make an easy decision to just not live anymore, you know, to cut my ties with the world. But I think that it is just a very hard time, and I think that the way that I am seeing life at this time is that it is just so hard to just make it through the day. — Dr. Oz
+ I've been diagnosed as bipolar, and bipolar disorder has caused me to go through a lot of difficult times.
+ If you are diagnosed with bipolar disorder, you should also seek counseling to help manage your symptoms and make sure that you aren't suffering from any depression.
+ If you are suffering from bipolar disorder, you should seek out and receive therapy so that you can better manage your condition and stop the cycle from repeating. — Dr. Oz
+ I have bipolar disorder.
+ I'm a manic depressive.
+ I've been diagnosed with bipolar disorder, and bipolar disorder is a severe mental illness that is very hard to live with.
+ I've been diagnosed with bipolar disorder and bipolar disorder is a very serious mental illness that can be difficult to live with.<|end_of_text|>
+ 2025-03-14 11:11:50 - INFO : the shape of current input sequences is ===tensor([[128000, 61346, 2231, 11, 279, 10334, 315, 1375, 44515,
+ 5415, 430, 220]], device='cuda:0')===
+ 2025-03-14 11:11:55 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 1) the speed of light cannot be exceeded, and 2) space and time are curved for objects moving at high speeds. In other words, according to the Theory of Relativity, nothing can change unless you change something.
+ The concept of relativity has been a favorite of mine since I was a kid. When I would read about the war that I was in (The Great Depression), my grandfather would tell me about how he could see the world move forward as the planes flew overhead during air raids. In other words, if you were moving at a normal speed in the world, you would see the world slowly shift around you in order to keep you in the same place.
+ This is exactly what I mean by relativity. When you start moving at a high speed, your mind can no longer perceive space and time as being flat, and things will change as you move around.
+ So why does the speed of light have a minimum limit? I don’t know, but I can tell you for a fact that it does. The minimum limit is around 3×10^16m/s. And in this case, we are talking about an extremely low-speed scenario.
+ Let’s say that you had a speed limit of 10^16 m/s. And you are a regular person. In this case, you will not be able to travel at that speed for a very long time.
+ You may not be able to move faster than 10^16 m/s, but you will never be able to move that fast.
+ The theory of relativity is a bit of a misconception, but it is a common misconception. It is the theory that says that you cannot go faster than the speed of light, or the speed of light itself. That means that you cannot travel faster than a rocket.
+ It’s a misconception, but it is a common misconception, and a misconception that we all have.<|end_of_text|>
+ 2025-03-14 11:11:55 - INFO : the shape of current input sequences is ===tensor([[128000, 31233, 264, 3997, 649, 387, 2884, 304, 220,
+ 605, 4382, 7504, 512]], device='cuda:0')===
+ 2025-03-14 11:11:59 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
+ Get a domain. You may want to use the same domain as your company name or simply use a unique one. It’s recommended you get a.com domain, but.org,.net,.biz, etc. can all be used.
+ Get your hosting. The hosting company will allow you to create a website and it’s often included in the package. If you’re not sure where to start, check out our article on hosting.
+ Create your website. Now is the time to create the content for your website. The content can include product descriptions, images, video and other elements. The content can be as simple as an about page or as detailed as a blog. When you’re finished, upload it to the hosting site and it’s ready to use.
+ Test your website. Test your website by having a human browse it. This is a good way to check if the content is working and also get feedback about what people are thinking of your website.
+ Track your website. It’s often good to track how people are using your website and what they are doing. This can be done by checking web analytics, which can provide information about traffic patterns and behavior. Some companies even offer analytics tools as part of the package.
+ Promote your website. Promotion can be done through various channels, including social media, email marketing and advertising.<|end_of_text|>
+ 2025-03-14 11:11:59 - INFO : the shape of current input sequences is ===tensor([[128000, 49462, 25, 330, 40, 12491, 433, 994, 856,
+ 4641, 11863, 8898, 10246, 32458, 3904, 25, 51957, 198,
+ 27938, 49462, 25, 330, 5159, 1938, 706, 1027, 62904,
+ 235, 702, 32458, 3904, 25, 45003, 198, 27938, 49462,
+ 25, 330, 2028, 374, 279, 2723, 311, 279, 4652,
+ 702, 32458, 3904, 25, 59794, 198, 27938, 49462, 25,
+ 330, 2028, 502, 4731, 2835, 574, 9850, 42517, 702,
+ 32458, 3904, 25]], device='cuda:0')===
+ 2025-03-14 11:12:05 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
+ Sentiment: Negative
+ ###
+ Tweet: "My day has been 👍"
+ Sentiment: Positive
+ ###
+ Tweet: "This is the link to the article"
+ Sentiment: Neutral
+ ###
+ Tweet: "This new music video was incredibile"
+ Sentiment: Neutral
+ ###
+ Tweet: "Oh, that photo was so cute"
+ Sentiment: Positive
+ ###
+ Tweet: "These food are so so good"
+ Sentiment: Neutral
+ ###
+ Tweet: "I am so excited for this concert, I can't wait "
+ Sentiment: Negative
+ ###
+ Tweet: "OMG!!"
+ Sentiment: Neutral
+ ###
+ Tweet: "I cannot wait for the next album"
+ Sentiment: Neutral
+ ###
+ Tweet: "These clothes are so so good"
+ Sentiment: Positive
+ ###
+ Tweet: "These pictures are so cute"
+ Sentiment: Negative
+ ###
+ Tweet: "These photo are so cool"
+ Sentiment: Neutral
+ ###
+ Tweet: "Oh, the photos were so cute"
+ Sentiment: Neutral
+ ###
+ Tweet: "I am glad that you found the link. "
+ Sentiment: Neutral
+ ###
+ Tweet: "I think you should post the link to this article "
+ Sentiment: Neutral
+ ###
+ Tweet: "Oh, it was so cute to see your post. "
+ Sentiment: Negative
+ ###
+ Tweet: "I am surprised to find this link."
+ Sentiment: Negative
+ ###
+ Tweet: "Wow! It's so nice to read your comments. "
+ Sentiment: Neutral
+ ###<|end_of_text|>
+ 2025-03-14 11:12:05 - INFO : the shape of current input sequences is ===tensor([[128000, 28573, 6498, 311, 8753, 1473, 37541, 14479, 466,
+ 591, 326, 412, 265, 409, 4809, 271, 375, 604,
+ 94932, 591, 11540, 383, 3273, 58866, 8047, 271, 501,
+ 1136, 41389, 5763, 591, 41389, 5763, 12077, 34927, 271,
+ 1557, 2423, 591]], device='cuda:0')===
+ 2025-03-14 11:12:05 - INFO : <|begin_of_text|>Translate English to French:
+
+ sea otter => loutre de mer
+
+ peppermint => menthe poivrée
+
+ plush girafe => girafe peluche
+
+ cheese => fromage
+
+ coffee => café
+
+ water => eau
+
+ mash => cruche
+
+ salsa => salsa
+
+ cucumber => l'épée
+ <|end_of_text|>
+ 2025-03-14 11:13:57 - INFO : PPL before pruning: {'wikitext2': 10.877742727456024, 'ptb': 17.553166745968216}
+ 2025-03-14 11:13:57 - INFO : Use taylor pruner...
+ 2025-03-14 11:13:57 - INFO : Pruning Attention Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 11:13:57 - INFO : Pruning MLP Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 11:13:57 - INFO : Start Pruning
+ 2025-03-14 11:14:03 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-03-14 11:14:04 - INFO : Loss = 3.7338414192199707
+ 2025-03-14 11:14:04 - INFO : Loss = 4.748807907104492
+ 2025-03-14 11:14:04 - INFO : Loss = 4.232391834259033
+ 2025-03-14 11:14:05 - INFO : Loss = 3.6372835636138916
+ 2025-03-14 11:14:06 - INFO : Loss = 3.873014450073242
+ 2025-03-14 11:14:06 - INFO : Loss = 4.062342166900635
+ 2025-03-14 11:14:07 - INFO : Loss = 4.176964282989502
+ 2025-03-14 11:14:07 - INFO : Loss = 4.237691402435303
+ 2025-03-14 11:14:08 - INFO : Loss = 4.040452480316162
+ 2025-03-14 11:14:09 - INFO : Loss = 4.438660144805908
+ 2025-03-14 11:14:09 - INFO : Loss = 4.117751121520996
+ 2025-03-14 11:14:10 - INFO : After Iter 1/1, #parameters: 1068566528
+ 2025-03-14 11:14:10 - INFO : #Param before: 1235814400, #Param after: 1068566528, Ratio = 86.4666%
+ 2025-03-14 11:14:13 - INFO :
+ ==================Generation Results After Pruning================
+
+ 2025-03-14 11:14:32 - INFO : <|begin_of_text|>I believe the meaning of life is to love, and that whatever you give out, you get back in full measure. You have to get out of yourself what you don't like in yourself, what is self-defeating. I didn't know that before, that you could be happy that your life is miserable.
+ —Bill Cosby on Happiness, July 12, 1987
+ My attitude towards life is to appreciate what is, and enjoy it while it lasts. If you have to pay for a living, it's for a living. If you have to suffer for something, it is for a living.
+ —Bill Cosby on Life, March 16, 1969
+ My attitude toward life is to appreciate what is good and let it be, and to thank the universe that my life is so long, and that I am so ignorant of what is good and what is bad that, somehow, I should be happy always.
+ —Bill Cosby on Happiness, March 21, 1963
+ My attitude towards life is to expect the best of all times. You've heard me saying before that I am all about finding God. I used to believe that there is a God, but they say that there is no God. So I decided that I should put my energy into being happy, and then see what would happen.
+ —Bill Cosby on Happiness, March 29, 1967
+ In the face of adversity, your greatest strength is your ability to forgive. The world is always giving you reasons to be angry. Today I am letting go of the anger and turning toward joy.<|end_of_text|>
+ 2025-03-14 11:16:27 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 2 people are moving when they really are not, in a way known as quantum theory.
+ The idea of relativity was derived by physicist German physicist Johann Lorenz Otto von Arnaudt on April 19, 1895 in the basement of his home near Munich, Germany.
+ During the early afternoon, the sky was cloudy and still raining heavily when a mysterious man, dressed in white, wearing a grey hat appeared from the dark forest and stepped into the front room of the house.
+ Suddenly, a young man who was standing still in the dark forest suddenly moved, he turned his head towards the cloudless sky and raised his hand towards the cloud and began to yell, the cloud rolled up.
+ A woman who was standing quietly in the corner was frozen for a moment and suddenly stood up and hurriedly walked out of the room, only when she came to the man, she immediately pulled the hand of the person wearing the white suit.
+ An unspoken silence fell over the whole house.
+ After a while, a man who was walking on the side of the forest suddenly rushed into the room of the house, with the sky and the rain on his shoulders.
+ He was dressed in a long light blue coat and white shoes. With a mysterious expression on his face, a grey long beard and a white long beard, he looked very mysterious.
+ At first sight, the man was not surprised because the man was dressed in a white suit, black long socks and a long white beard. He walked quietly on the wet floor, and without any hesitation, he pulled back to the side of the room.
+ The man had a thin, white beard and was dressed in black clothes and gray shoes.
+ The man stopped in front of the little girl, who was standing at the edge of the room.
+ He looked at the little girl, who was leaning over the window. He looked at the little girl.
+ Then, a dark, cold and mysterious smile appeared on his face.
+ The little girl who was staring at the man was silent.
+ The man raised his eyebrows and pushed the little girl away.
+ A sudden thunder sounded.
+ The little girl's hair was shaking.
+ A huge, powerful force hit me in the chest.
+ All my mind was focused on the little girl.
+ I was standing there for a while.
+ I heard the noise of the earth.
+ It's the thunder of heaven.
+ Thundering, roaring and deafening in my ears.
+ The sun has come down.
+ I have not gone to bed.
+ It's already midnight.
+ I will have gone to sleep and have slept.
+ There is no more room for me.
+ [~310 lines of degenerate repetition omitted: the generation loops on variants of "There's no light.", "I'm lost.", "I can't see.", "I'm a victim.", and "We are victims." until it is cut off]
+ 2025-03-14 11:16:50 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
+ The way I found is in my own life and I'm grateful because this method helped me build a solid and prosperous career. I'm grateful for the method because it showed me how to live my life and be happy
+ I'm grateful because the method taught me how to be more independent and achieve success, so I want to thank the method because this helped me learn to appreciate my life and be happy.
+ I'm grateful because the method gave me the ability to be more independent and achieve more success, so I appreciate how to enjoy my life
+ You should be grateful because I have had the experience to know what real happiness is. It's not so happy to be in a place where you don't want to be because you don't want to lose your freedom. Happiness is an endless journey, the more I progress, the more happy I feel.
+ The man I'm grateful to is because I appreciate the fact that I have the ability to live a happy life. This is what motivates me to keep striving, no matter how difficult the work may be.
+ The method that helped me appreciate my life is that I realized how great it was to live my life and how happy I was.
+ There's always something new to learn. In my life, I learned how to be happy by being independent and finding success. As my life progressed, I grew happier as my job provided me with more success.
+ I appreciate because of the fact that I have the ability to achieve success, so I try to work hard on every project and be happy. I appreciate because it allows me to live my life.
+ I'm grateful because the ability to be independent makes me more confident and happy.
+ The method is the method to be happy. My happiness is one of the reasons. If I find myself in a place that I want to be in, I know I can do whatever it takes to feel happy.
+ Building Success in Today's World
+ No matter what we do, we're all independent and self-sufficient. Every project provides us with the power to build our dreams and be happy.<|end_of_text|>
+ 2025-03-14 11:16:50 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
+ Sentiment: Negative
+ ###
+ Tweet: "My day has been 👍"
+ Sentiment: Positive
+ ###
+ Tweet: "This is the link to the article"
+ Sentiment: Neutral
+ ###
+ Tweet: "This new music video was incredibile"
+ Sentiment: Positive
+ <|end_of_text|>
+ 2025-03-14 11:16:51 - INFO : <|begin_of_text|>Translate English to French:
+
+ sea otter => loutre de mer
+
+ peppermint => menthe poivrée
+
+ plush girafe => girafe peluche
+
+ cheese => poitré
+ <|end_of_text|>
+ 2025-03-14 11:16:51 - INFO :
+ ==================Finish================
+
+ 2025-03-14 11:29:00 - INFO : PPL after pruning: {'wikitext2': 17.864446345871784, 'ptb': 28.996858600549206}
+ 2025-03-14 11:29:00 - INFO : Memory Requirement: 6983.26513671875 MiB
+
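
A quick sanity check on the numbers in this log: 1068566528 / 1235814400 ≈ 0.864666, so the reported "Ratio = 86.4666%" is the fraction of parameters retained, meaning only about 13.5% of the weights were removed even though pruning_ratio is 0.25; this is consistent with pruning being restricted to the attention/MLP blocks of layers 2 through 12. Wikitext2 perplexity rises from 10.88 to 17.86 (ptb: 17.55 to 29.00). Since the log names a taylor pruner, here is a generic sketch of first-order Taylor importance scoring; it illustrates the idea only and is not the exact implementation behind this log:

```python
# Generic first-order Taylor importance for structured pruning (a sketch).
# Removing a weight group changes the loss by roughly |sum(W * dL/dW)|,
# the first-order term of a Taylor expansion around the trained weights.
import torch

def taylor_importance(weight: torch.Tensor, grad: torch.Tensor) -> torch.Tensor:
    """One importance score per output channel: sum |W * grad| over input dims."""
    salience = (weight.detach() * grad).abs()
    return salience.sum(dim=tuple(range(1, weight.dim())))

# Usage sketch: backprop a few calibration examples (the logged "Loss = ..."
# lines), then rank output channels and keep the top (1 - pruning_ratio).
layer = torch.nn.Linear(2048, 2048)
x = torch.randn(4, 2048)
layer(x).sum().backward()
scores = taylor_importance(layer.weight, layer.weight.grad)
keep = scores.topk(int(2048 * (1 - 0.25))).indices  # pruning_ratio = 0.25
```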
prune_log/layerskip_1b_prune_0.25/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: facebook/layerskip-llama3.2-1B
+ - save_ckpt_log_name: layerskip_1b_prune_0.25
+ - pruning_ratio: 0.25
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 2
+ - block_attention_layer_end: 13
+ - block_mlp_layer_start: 2
+ - block_mlp_layer_end: 13
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: param_mix
+ - num_examples: 10
+ - device: cuda
+ - test_before_train: True
+ - eval_device: cuda
+ - test_after_train: True
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
prune_log/layerskip_1b_prune_0.25/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc0d1e009dd1a98ad7f061ab770b30110938fdab53b7b310e01a8ad44adb5f95
+ size 3279882222
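
The three lines above are the file's Git LFS pointer rather than the binary itself: spec version, sha256 object id, and size in bytes (3279882222 bytes ≈ 3.05 GiB). A small sketch of parsing such a pointer (the helper is illustrative, not part of this repo):

```python
# Parse a Git LFS pointer file like the pytorch_model.bin entry above.
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    return {
        "oid": fields["oid"].split(":", 1)[1],  # drop the "sha256:" prefix
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:cc0d1e009dd1a98ad7f061ab770b30110938fdab53b7b310e01a8ad44adb5f95
size 3279882222"""
print(parse_lfs_pointer(pointer))  # size_bytes / 2**30 ≈ 3.05 GiB
```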
prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: meta-llama/Llama-3.2-1B-Instruct
+ - save_ckpt_log_name: vanilla_llama_1b_prune_0.25
+ - pruning_ratio: 0.25
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 2
+ - block_attention_layer_end: 13
+ - block_mlp_layer_start: 2
+ - block_mlp_layer_end: 13
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: param_mix
+ - num_examples: 10
+ - device: cuda
+ - test_before_train: True
+ - eval_device: cuda
+ - test_after_train: True
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/train.sh ADDED
@@ -0,0 +1 @@
+ python llama3.py --pruning_ratio 0.25 --device cuda --eval_device cuda --base_model meta-llama/Llama-3.2-1B-Instruct --block_wise --block_mlp_layer_start 2 --block_mlp_layer_end 13 --block_attention_layer_start 2 --block_attention_layer_end 13 --save_ckpt_log_name vanilla_llama_1b_prune_0.25 --pruner_type taylor --taylor param_mix --max_seq_len 2048 --test_after_train --test_before_train --save_model
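
Both training.log files report perplexity on wikitext2 and ptb before and after pruning. A minimal sketch of a non-overlapping fixed-window perplexity evaluation in the same spirit (a generic recipe, not the repository's evaluation code; the 2048-token window follows max_seq_len above):

```python
# Sketch: wikitext-2 perplexity with non-overlapping 2048-token windows.
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # base_model from train.sh above
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16
).cuda().eval()

text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
ids = tok(text, return_tensors="pt").input_ids.cuda()

window, nlls, n_tokens = 2048, [], 0
for i in range(0, ids.size(1), window):
    chunk = ids[:, i : i + window]
    if chunk.size(1) < 2:  # need at least one shifted target token
        break
    with torch.no_grad():
        loss = model(chunk, labels=chunk).loss  # mean NLL over the chunk
    nlls.append(loss * (chunk.size(1) - 1))
    n_tokens += chunk.size(1) - 1
print(f"wikitext2 ppl: {torch.exp(torch.stack(nlls).sum() / n_tokens).item():.4f}")
```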
prune_log/vanilla_llama_1b_prune_0.25/2025-03-14-13-42-35/training.log ADDED
@@ -0,0 +1,1501 @@
+ 2025-03-14 13:42:35 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-03-14 13:42:37 - INFO :
+ ==================Generation Results before Pruning================
+
+ 2025-03-14 13:42:37 - INFO : the shape of current input sequences is ===tensor([[128000, 40, 4510, 279, 7438, 315, 2324, 374]],
+ device='cuda:0')===
+ 2025-03-14 13:42:40 - INFO : <|begin_of_text|>I believe the meaning of life is to find the most delicious pizza to eat in my spare time.
+ This is a common expression used to encourage people to be happy with their life and to pursue their passions, but this particular expression seems to misunderstand the true meaning of life.
+ The correct interpretation of the expression "find the most delicious pizza to eat in your spare time" is quite absurd, as it assumes that the most delicious pizza can be found anywhere, and that eating it will bring a sense of purpose or fulfillment. In reality, the most delicious pizza is not always the one that has the most toppings, is it? Sometimes the most delicious pizza is one that is made with simple, high-quality ingredients, like a fresh crust or a classic sauce, served with a simple, but satisfying cheese and some fresh vegetables on the side.
+
+ If you're asking me about a sense of purpose or fulfillment in life, I would say that true happiness comes from pursuing your passions and being true to yourself. Whether that means traveling, working, or just living life to the fullest, the most important thing is to find what makes you happy and pursue it with enthusiasm and dedication.
+
+ So, in the words of the great philosopher, "Eat your pizza and remember, too much cheese will make you cry."<|eot_id|>
+ 2025-03-14 13:42:40 - INFO : the shape of current input sequences is ===tensor([[128000, 61346, 2231, 11, 279, 10334, 315, 1375, 44515,
+ 5415, 430, 220]], device='cuda:0')===
+ 2025-03-14 13:42:43 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 10% of the universe is in plain sight, 90% of the universe is hidden in plain sight, and 100% of the universe is invisible.
+ As a child, I never knew there was an entire galaxy of mysteries waiting to be explored until I read some of those books that the professors used in their lectures. Theories have been the driving force behind human discovery and advancement, and they help us navigate the vast expanse of our universe.
+ It seems that the human brain has an inherent ability to adapt to new information and information, but only if it's presented in a coherent manner. This is why we have experts in the field of science and discovery. They help us to better understand the world around us.
+ In the field of science, the theory of relativity is just one of the many theories that have been developed and tested by experts and scientists.<|eot_id|>
+ 2025-03-14 13:42:43 - INFO : the shape of current input sequences is ===tensor([[128000, 31233, 264, 3997, 649, 387, 2884, 304, 220,
+ 605, 4382, 7504, 512]], device='cuda:0')===
+ 2025-03-14 13:42:50 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
+ 1. **Define Your Goal**: Clearly define what you want to achieve with your website. What is your purpose? What do you want to communicate with your visitors? What are your goals? Write down your goal statement.
+
+ 2. **Choose a Domain**: Decide on the domain name for your website. You can register a domain name through a registrar such as GoDaddy or Namecheap. Make sure it's easy to remember and relevant to your website's content.
+
+ 3. **Select a Web Hosting**: Choose a web hosting service that suits your needs. You can start with a basic plan and upgrade as your website grows. Consider factors such as storage space, bandwidth, and customer support.
+
+ 4. **Plan Your Content**: Plan your website's content, including the structure, layout, and the types of content you will include. Decide on the types of pages you need, such as home, about us, contact, and services.
+
+ 5. **Create a Sitemap and Wireframes**: Create a sitemap to outline the pages and structure of your website, and wireframes to create a visual representation of the layout. This will help you plan and organize your website.
+
+ 6. **Design Your Website**: Choose a website builder or hire a web designer to create your website. Select a theme that matches your brand and use it as a starting point. Customize the template as needed to create a visually appealing and user-friendly website.
+
+ 7. **Build Your Website**: Use the web builder's drag-and-drop tools to build your website. Add features such as pages, navigation, and e-commerce functionality if needed. Make sure to test your website for broken links and usability.
+
+ 8. **Install Necessary Plugins and Tools**: Install plugins and tools that can enhance your website's functionality, such as SEO optimization, contact forms, and security features.
+
+ 9. **Test and Launch**: Test your website thoroughly for bugs and errors. Make sure it works correctly on all devices and browsers. Once you're satisfied with your website, proceed to launch it to the public.
+
+ 10. **Maintain and Update**: Regularly update your website to ensure it remains up-to-date and relevant. Add new features, remove outdated content, and make adjustments as needed. Regular backups will also be a must to ensure data integrity.
+
+ Remember, building a website is just the beginning. Once you have your website live, it's essential to maintain it to ensure it remains effective and engaging. Be prepared to respond to user feedback and make changes as needed. With these simple steps, you can build a successful website that achieves your goals.<|eot_id|>
+ 2025-03-14 13:42:50 - INFO : the shape of current input sequences is ===tensor([[128000, 49462, 25, 330, 40, 12491, 433, 994, 856,
+ 4641, 11863, 8898, 10246, 32458, 3904, 25, 51957, 198,
+ 27938, 49462, 25, 330, 5159, 1938, 706, 1027, 62904,
+ 235, 702, 32458, 3904, 25, 45003, 198, 27938, 49462,
+ 25, 330, 2028, 374, 279, 2723, 311, 279, 4652,
+ 702, 32458, 3904, 25, 59794, 198, 27938, 49462, 25,
+ 330, 2028, 502, 4731, 2835, 574, 9850, 42517, 702,
+ 32458, 3904, 25]], device='cuda:0')===
+ 2025-03-14 13:42:51 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
+ Sentiment: Negative
+ ###
+ Tweet: "My day has been 👍"
+ Sentiment: Positive
+ ###
+ Tweet: "This is the link to the article"
+ Sentiment: Neutral
+ ###
+ Tweet: "This new music video was incredibile"
+ Sentiment: Positive
+ ###
+ Tweet: "Good luck to my friend"
+ Sentiment: Positive
+ ###
+ Tweet: "I need help with a research paper"
+ Sentiment: Negative
+ ###
+ Tweet: "The best book I've read all year"
+ Sentiment: Positive
+ ###
+ Tweet: "I love my job"
+ Sentiment: Positive
+ ###<|eot_id|>
+ 2025-03-14 13:42:51 - INFO : the shape of current input sequences is ===tensor([[128000, 28573, 6498, 311, 8753, 1473, 37541, 14479, 466,
+ 591, 326, 412, 265, 409, 4809, 271, 375, 604,
+ 94932, 591, 11540, 383, 3273, 58866, 8047, 271, 501,
+ 1136, 41389, 5763, 591, 41389, 5763, 12077, 34927, 271,
+ 1557, 2423, 591]], device='cuda:0')===
+ 2025-03-14 13:43:19 - INFO : <|begin_of_text|>Translate English to French:
+
+ sea otter => loutre de mer
+
+ peppermint => menthe poivrée
+
+ plush girafe => girafe peluche
+
+ cheese => fromage
+
+ pineapple => pomme á la pinceille
+
+ peanut butter => fromage aux noisettes
+
+ cucumber => la couve
+
+ watermelon => la melon à la renverse
+
+ gloves => gants
+
+ tobacco => tabac
+
+ tea => thé
+
+ roses => roses
+
+ eucalyptus => éucalyptus
+
+ leather => cuir
+
+ paprika => piquant paprika
+
+ sunscreen => protégaient le maquillage
+
+ sugar => sucre
+
+ cinnamon => casson de cire
+
+ mango => manger à la mangue
+
+ pineapple => la pomme à la pinceille
+
+ olive oil => huile de olive
+
+ honey => miel
+
+ lemon => citron
+
+ tangerine => tangerine
+
+ peanut butter => fromage aux noisettes
+
+ pineapple => la pomme à la pinceille
+
+ peach => la pêche
+
+ pear => la pire
+
+ dandelion => dent d'or
+
+ blossom => fleur
+
+ spice (general) => sauce (general)
+
+ carrot juice => jus de carottes
+
+ mango puree => jus de mangue
+
+ chilli pepper => piquant poivre
+
+ blackberry => fraises noires
+
+ cherry => fraise
+
+ fig => figue
+
+ ginger => gingembre
+
+ lemon verbena => les verbaines à la citron
+
+ lemon balm => les balm à la citron
+
+ lavender => la lavande
+
+ lavender syrup => la miel à la lavande
+
+ licorice => la goutte à la licorice
+
+ licorice root => la racine à la licorice
+
+ licorice extract => l'extract de la licorice
+
+ licorice powder => le poussière de la licorice
+
+ licorice capsule => la capsule de la licorice
+
+ black pepper => poivre noir
+
+ saffron => la saffron
+
+ ginger ale => l'ale à linge
+
+ mugicha => la thé à l'érable
+
+ buckwheat => le millet bleu
+
+ black tea => thé noir
+
+ ginger beer => l'opium à l'ingrec
+
+ ginger liqueur => la liqueur à la gingembre
+
+ ginger syrup => la sucre à la gingembre
+
+ ginger juice => la jus de gingembre
+
+ ginger ale => l'ale à linge
+
+ ginger beer => l'opium à l'ingrec
+
+ ginger ale => l'ale à linge
+
+ ginger tea => la thé à la gingembre
+
+ geranium => geranium
+
+ herb tea => la thé des herbes
+
+ honey mint => le menthe à l'honey
+
+ ginger ale => l'ale à linge
+
+ ginger beer => l'opium à l'ingrec
+
+ ginger ale => l'ale à linge
+
+ ginger beer => l'opium à l'ingrec
+ [~250 lines of degenerate repetition omitted: the list loops on "ginger ale => l'ale à linge" until the generation is cut off]
+ 2025-03-14 13:44:48 - INFO : PPL before pruning: {'wikitext2': 13.172416709211404, 'ptb': 24.561296107667808}
+ 2025-03-14 13:44:48 - INFO : Use taylor pruner...
+ 2025-03-14 13:44:48 - INFO : Pruning Attention Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 13:44:48 - INFO : Pruning MLP Layer = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+ 2025-03-14 13:44:49 - INFO : Start Pruning
+ 2025-03-14 13:44:55 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-03-14 13:44:55 - INFO : Loss = 3.7342283725738525
+ 2025-03-14 13:44:56 - INFO : Loss = 4.780483245849609
+ 2025-03-14 13:44:56 - INFO : Loss = 4.577130317687988
+ 2025-03-14 13:44:56 - INFO : Loss = 3.8332598209381104
+ 2025-03-14 13:44:57 - INFO : Loss = 4.485759258270264
+ 2025-03-14 13:44:57 - INFO : Loss = 4.507223129272461
+ 2025-03-14 13:44:58 - INFO : Loss = 4.827515602111816
+ 2025-03-14 13:44:58 - INFO : Loss = 4.578823089599609
+ 2025-03-14 13:44:58 - INFO : Loss = 4.240462303161621
+ 2025-03-14 13:44:59 - INFO : Loss = 4.5878119468688965
+ 2025-03-14 13:44:59 - INFO : Loss = 4.415113925933838
+ 2025-03-14 13:45:00 - INFO : After Iter 1/1, #parameters: 1068566528
+ 2025-03-14 13:45:00 - INFO : #Param before: 1235814400, #Param after: 1068566528, Ratio = 86.4666%
+ 2025-03-14 13:45:03 - INFO :
+ ==================Generation Results After Pruning================
+
+ 2025-03-14 13:45:20 - INFO : <|begin_of_text|>I believe the meaning of life is not in the ability to find someone to go out on a beautiful day, but in the speed at which it is far from someone to whom you are connected.
+ — Paulo Wier
+ As a Christian, God has given us the gift of human beings with the purpose of experiencing life in God. God gave us the ability to experience life on Earth in two ways.
+ A) In one day we will be able to experience the beauty of God in all life and understand what they feel
+ A) God created us and gave us the ability to experience life on earth
+ B) We will experience life at an optimal frequency in order to understand who God has given us the ability to share our lives and experience the beauty of God with all life on Earth
+ A) They would be able to experience life on Earth and understand what God gave us
+ B) We have developed human beings with the purpose of sharing our lives with every life on Earth
+ B) God gave us the ability to share and find what we experience in the life on Earth
+ A) They understand God with all life on Earth
+ B) They would be able to find the experience of all life on Earth and all life on Earth in order to share
+ B) God made us human beings with the ability to give our life on Earth to be able to share
+ B) They would be able to share and find a life of God in all people of all life
+ B) God created us in order to find a life that we experience the beauty of life on Earth in order to share what God gave us.<|eot_id|>
+ 2025-03-14 13:47:09 - INFO : <|begin_of_text|>Simply put, the theory of relativity states that 2) The physical world is not uniform, but irregular. The universe is not uniform, but chaotic, because in the universe is chaotic. The above text states that there are two main characters who created this type of chaos in the world of the novel. The characters are very different and completely distinct. They are the world is not chaotic enough.
+ The first is: It is chaotic!
+ It is chaotic.
+ Here are the names: Ischa (A) : *S*: *: *:
+ * ::
+ : =
+ * The story has an ending like this: So far the story is chaotic because it is chaotic with an equalizing story ending with all chaos, so let's continue it.
+ Cha : *:
+ : *: : : :
+
+ From the story of Jansz, we can get three different and unique characters from the universe's chaos of, so let's continue with our story. What is next, for we are not complete in our current story. With a chaotic story line of chaos and an unequal story ending of the novel, chaos and chaos have now created a chaotic and chaotic. What is the next step?
+
+ Cha = *:
+ *:
+ *: : -cha Cha : : :
+ Let's proceed with the following steps of our chaos and chaos story that we previously had with chaos so far and have completed with the following chaos, chaos, chaos that follows the chaos and continues with a chaotic story that we have just previously completed. Let's proceed with the chaos. What is the next step for chaos?
+
+ Ch: *:
+ *:
+ *:
+ *:
+ *:
+ * *:
+
+ Cha : Chaos = *:
+ *:
+ * : :
531
+ * :
532
+ * :
533
+ * : :
534
+
535
+ The next step for us is to create a chaotic story and chaos that comes from the chaos of chaos and complete the chaotic story that we have created and previously mentioned with chaos and chaos.
536
+
537
+ Let's proceed with the chaos.
538
+ We can continue with the chaos with this topic we previously created with chaos and chaos and the chaos of chaos and chaos.
539
+
540
+ Now, we can take with this chaos story and create a chaos story that is also very chaotic and also chaotic.
541
+
542
+ So far, it's chaos. Chaos and chaos is very good that for chaos and chaos and chaos that will look good at the moment.
543
+ This chaos is very good for chaos and chaos for chaos, now let's proceed with a further chaos and chaos will keep creating it.
544
+
545
+ And the above we have already had two steps for chaos, so far we have been following chaos with chaos, which chaos and chaos and chaos we have created. Chaos. And the chaos is very good that chaos for, chaos and chaos for chaos.
546
+
547
+ Let's go with this chaotic story we previously created and created. Let's see where we are in our chaotic story.
548
+ It's chaos, chaos, chaos, chaos, chaos with chaos. Chaos is very good that chaos for chaos and chaos. Cha Cha Chaos, Chaos, Cha Cha!
549
+
550
+ Let's proceed with our chaotic story we've had two chaotic steps for two chaotic steps to chaos. Cha Cha Cha. Cha Cha.
551
+
552
+ Let's take a deep breath and keep our chaotic story, we've only just begun our chaotic story.
553
+ Chaos Chaos
554
+ Crazy Chaos
555
+ So far we have a chaotic story with chaos and chaos with chaos that leads to chaos. That we have created with chaos is very good for chaos for chaos.
556
+
557
+ You have just seen chaos for chaos. You see chaos. Chaos is very good that chaos for chaos. Chaos is very good for chaos for chaos. Chaos is very good for chaos for chaos.
558
+
559
+ I've created another chaotic story of chaos.
560
+ We've created chaos with chaos. Chaos has created chaos. Chaos is very good for chaos for chaos.
561
+
562
+ Let's proceed with the chaos we've created. As the chaotic story we have just created. Here's what we've created with chaos and chaos.
563
+ We have chaos with chaos. Chaos has created chaos. Chaos and chaos with chaos.
564
+
565
+ What does the chaotic chaos look like, look like now and how is it? We have created chaotic chaos in chaos.
566
+
567
+ This chaotic chaos is very chaotic. Cha Cha Cha.
568
+
569
+ Chaos is chaotic. Chaos is chaotic.
570
+ Chaos is chaotic. Chaos is chaotic.
571
+ We've created chaos, chaos, chaos with chaos.
572
+
573
+ We've created a chaotic chaos and chaos and chaos.
574
+ Chaos is chaotic.
575
+
576
+ Let's look at chaos with chaos in chaos with chaos. Chaos is chaos with chaos. Chaos is chaos. Chaos is chaos.
577
+
578
+ We've created chaos with chaos. Chaos is chaotic.
579
+
580
+ Chaos chaos is chaos.
581
+ Cha Cha 1 Cha.
582
+
583
+ Cha Cha.
584
+
585
+ Cha Cha. Cha.
586
+
587
+ Cha Cha. Cha.
588
+
589
+ Cha Cha. Cha.
590
+
591
+ Cha Cha. Cha. Cha.
592
+ .
593
+
594
+ We need Chaos chaos is chaos chaotic with chaos
595
+
596
+ We need the chaos with chaos.
597
+ Chaos is chaos. Chaos is chaotic.
598
+
599
+ We have chaos chaos with chaos.
600
+ cha Cha Cha. Chaos with chaos.
601
+
602
+ So we are the chaos chaos chaotic with chaos.
603
+ Chaos chaos Chaos.
604
+
605
+ We have chaos chaos. Chaos. Chaos.
606
+ Chaos chaos.
607
+
608
+ Now we need Chaos chaos chaos. Chaos chaos.
609
+ Cha.
610
+ Let's go to a chaotic chaos chaos now with chaos chaos.
611
+
612
+ Chaos chaos.
613
+
614
+ cha Cha. Chaos.
615
+
616
+ C Chaos chaos.
617
+
618
+ cha Cha chaos.
619
+
620
+ Cha Cha Chaos.
621
+ Chaos chaos.
622
+
623
+ We'll create a chaotic chaos.
624
+
625
+ We'll create chaos.
626
+ CHA Cha Chaos
627
+ Cha Cha
628
+
629
+ We've created chaos chaos.
630
+
631
+ Chaos Chaos
632
+ cha Cha Chaos.
633
+ Let's proceed with chaos chaos chaos
634
+
635
+ Cha Chaos:
636
+
637
+ With chaos Chaos
638
+ Cah Chaos chaos
639
+
640
+ Cha Chaos Chaos.
641
+
642
+ We'll create a chaotic chaotic chaos. Chaos.
643
+
644
+ Cha Chaos chaos chaos.
645
+
646
+ Cha Chaos chaos.
647
+
648
+ Cah Chaos Chaos.
649
+
650
+ We'll create a chaotic chaos chaos.
651
+
652
+ Now we'll create a chaotic chaos chaos chaos. chaos.
653
+
654
+ We'll create a chaotic chaos chaos chaos.
655
+
656
+ Let's get ready for chaos chaos chaos.
657
+
658
+ We've created chaos chaos chaos.
659
+
660
+ Let's proceed with chaos chaos chaos.
661
+
662
+ cha Chaos Chaos.
663
+
664
+ We'll create a chaotic chaos chaos chaos.
665
+
666
+ cha Chaos chaos.
667
+
668
+ cha Chaos.
669
+
670
+ We'll create a chaotic chaos chaos.
671
+
672
+ cha Chaos chaos.
673
+
674
+ cha Chaos chaos.
675
+
676
+ Cah Chaos Chaos Chaos.
677
+
678
+ Cha Chaos chaos Cha Chaos Chaos.
679
+
680
+ cha Chaos chaos.
681
+
682
+ Cah Chaos Chaos Chaos Chaos.
683
+
684
+ We've created a chaotic chaotic chaos.
685
+
686
+ Cha Chaos chaos.
687
+
688
+ We'll create chaos chaos chaos chaos.
689
+
690
+ Let's go chaos chaos. Chaos Chaos Chaos
691
+
692
+ cha Chaos Chaos.
693
+
694
+ We've created chaos chaos chaos.
695
+
696
+ Cha Chaos Chaos Chaos.
697
+
698
+ Cah Chaos Chaos Chaos.
699
+
700
+ cha Chaos Chaos chaos.
701
+ We've created a chaotic chaos chaos chaos.
702
+ cha Chaos Chaos chaos
703
+
704
+ Cha Chaos Chaos Chaos.
705
+
706
+ Cha Chaos Chaos Chaos.
707
+ Cah Chaos Chaos.
708
+
709
+ We've created chaos chaos chaos.
710
+
711
+ cha Cha Chaos Chaos Chaos.
712
+
713
+ cha Chaos Chaos Chaos.
714
+
715
+ We've created chaos chaos chaos.
716
+
717
+ Here's chaos chaos.
718
+
719
+ Cha Chaos Chaos Chaos.
720
+
721
+ We've created chaos chaos chaos.
722
+
723
+ Cha Chaos Chaos Chaos.
724
+
725
+ Chaos Chaos.
726
+
727
+ Cha Chaos Chaos.
728
+
729
+ cha Chaos Chaos Chaos.
730
+
731
+ Let's continue with chaotic chaos chaos.
732
+
733
+ Let's proceed with chaotic chaos chaos.
734
+
735
+ cha Cha Chaos.
736
+
737
+ cha Cha Chaos Chaos.
738
+
739
+ Cha Chaos Chaos.
740
+
741
+ cha Cha Chaos Chaos.
742
+
743
+ cha Chaos Chaos Chaos.
744
+
745
+ cha Chaos Chaos.
746
+
747
+ Cha Chaos.
748
+
749
+ Cha Chaos Chaos.
750
+
751
+ cha Chaos Chaos.
752
+
753
+ cha Chaos Chaos.
754
+
755
+ chcha Cha Chaos Chaos.
756
+
757
+ cha Chaos Chaos Chaos.
758
+
759
+ cha Chaos Chaos Chaos.
760
+
761
+ cha Chaos Chaos Chaos Chaos.
762
+
763
+ Cha Chaos Chaos.
764
+
765
+ cha Chaos Chaos Chaos.
766
+
767
+ We've created chaos chaos.
768
+
769
+ cha Chaos Chaos.
770
+
771
+ Cha Chaos Chaos.
772
+
773
+ cha Cha Chaos Chaos Chaos.
774
+
775
+ cha Cha Chaos Chaos.
776
+
777
+ We've created chaos chaos chaos.
778
+
779
+ Now we create chaotic chaos chaos with chaos.
780
+
781
+ cha Chaos Chaos Cha.
782
+
783
+ cha Chaos Chaos Chaos.
784
+
785
+ cha Chaos Chaos Chaos.
786
+
787
+ Cha Chaos Chaos Chaos.
788
+
789
+ cha Chaos Chaos Chaos.
790
+
791
+ We've created chaos chaos chaos chaos.
792
+
793
+ Let's see what we have created chaos chaos chaos now.
794
+ cha Chaos Chaos Chaos.
795
+
796
+ cha Chaos Chaos Chaos.
797
+
798
+ Let's create chaos chaos chaos.
799
+
800
+ Cha Chaos Chaos.
801
+
802
+ Let's create chaos chaos.
803
+
804
+ cha Cha Chaos Chaos.
805
+
806
+ cha Chaos Chaos Chaos.
807
+ Now's Chaos Chaos Chaos.
808
+
809
+ Cha Chaos Chaos Chaos.
810
+ cha Chaos Chaos Chaos.
811
+
812
+ Cha Chaos Chaos Chaos.
813
+
814
+ cha Chaos Chaos Chaos.
815
+
816
+ cha Chaos Chaos Chaos.
817
+
818
+ cha Chaos Chaos Chaos.
819
+ cha Chaos Chaos Chaos.
820
+
821
+ Here's our chaotic chaos chaos chaos.
822
+ cha Chaos Chaos Chaos.
823
+ cha Chaos Chaos Chaos.
824
+
825
+ Let's proceed with chaos chaos chaos.
826
+ cha Chaos Chaos Chaos.
827
+
828
+ Cha Chaos Chaos Chaos.
829
+
830
+ cha Chaos Chaos Chaos.
831
+
832
+ cha Chaos Chaos Chaos.
833
+ cha Chaos Chaos Chaos.
834
+ We've Chaos Chaos Chaos.
835
+ Ch Chaos Chaos Chaos.
836
+
837
+ Let's proceed with chaos chaos chaos.
838
+
839
+ cha Chaos Chaos Chaos.
840
+
841
+ Ch Chaos Chaos Chaos Chaos.
842
+
843
+ cha Chaos Chaos Chaos.
844
+
845
+ Let's see Chaos Chaos Chaos.
846
+
847
+ cha Chaos Chaos Chaos.
848
+ Cha Chaos Chaos Chaos Chaos.
849
+
850
+ Let's proceed to the chaos Chaos Chaos Chaos Chaos.
851
+
852
+ cha Chaos Chaos Chaos.
853
+ cha Chaos Chaos Chaos.
854
+ Let's Chaos Chaos Chaos.
855
+
856
+ cha Chaos Chaos Chaos.
857
+
858
+ cha Chaos Chaos Chaos.
859
+
860
+ cha Chaos Chaos Chaos.
861
+ cha Chaos Chaos Chaos.
862
+
863
+ Let's create a chaotic chaos chaos chaos.
864
+ cha Chaos Chaos Chaos.
865
+ cha Chaos Chaos Chaos.
866
+
867
+ cha Chaos Chaos Chaos.
868
+ Cha Chaos Chaos Chaos Chaos.
869
+
870
+ We'll create chaos chaos chaos.
871
+
872
+ cha Chaos Chaos Chaos.
873
+
874
+ Chaos Chaos Chaos Chaos.
875
+ Chaos Chaos Chaos.
876
+ cha Chaos Chaos Chaos Chaos.
877
+
878
+ Chaos Chaos Chaos.
879
+
880
+ Let's proceed with Chaos Chaos Chaos.
881
+ We've Chaos Chaos Chaos Chaos.
882
+
883
+ cha Chaos Chaos Chaos Chaos.
884
+
885
+ We'll create chaos chaos chaos.
886
+ cha Chaos Chaos Chaos.
887
+
888
+ We've created chaos chaos chaos.
889
+
890
+ Let's proceed with chaos chaos chaos.
891
+ cha Chaos Chaos Chaos.
892
+
893
+ We've created chaos chaos chaos.
894
+ Cha Chaos Chaos Chaos.
895
+
896
+ cha Chaos Chaos Chaos Chaos Chaos.
897
+ cha Chaos Chaos Chaos.
898
+ cha Chaos Chaos Chaos.
899
+
900
+ cha Chaos Chaos Chaos Chaos.
901
+
902
+ cha Chaos Chaos Chaos.
903
+
904
+ cha Chaos Chaos Chaos Chaos.
905
+
906
+ cha Chaos Chaos Chaos.
907
+
908
+ cha Chaos Chaos Chaos Chaos.
909
+
910
+ Cha Chaos Chaos Chaos.
911
+
912
+ We've created chaos chaos chaos.
913
+ cha Chaos Chaos Chaos.
914
+
915
+ cha Chaos Chaos Chaos.
916
+ cha Chaos Chaos Chaos Chaos.
917
+ cha Chaos Chaos Chaos.
918
+ we's Chaos Chaos Chaos.
919
+
920
+ Now Chaos Chaos Chaos.
921
+ Now
922
+ 2025-03-14 13:48:38 - INFO : <|begin_of_text|>Building a website can be done in 10 simple steps:
923
+ 1. Create a website or app with your desired level of sophistication.
924
+ 2. Obtain a license from third-party users.
925
+ 2. Obtain a copy of the lawsuit.
926
+ 2. Get information about the lawsuit that has been used to build the website.
927
+ 2. Get information about the lawsuit that has been built up to build the website.
928
+ 2. Get information about the lawsuit that has been used to build the website.
929
+ 3. Obtain the information about the lawsuit that has been used to build the website.
930
+ 2. Determine the lawsuit that has been used to build the website.
931
+ 2. Build information about the lawsuit that has been used to build the website.
932
+ 2. Determine the lawsuit that has been used to build the website.
933
+ 3. Choose the laws that has been used to build the website.
934
+ 3. Determine the lawsuit that has been used to build the website.
935
+ 3. Apply the lawsuit that has been used to build the website.
936
+ ## This process can be done quickly and efficiently as there are few barriers to success.
937
+
938
+ The process can be done in 10 simple steps:
939
+ 1. Create a website that is needed and is easily accessible.
940
+ 1. Obtain a copy of the website with the need to build an existing design.
941
+ 2. Obtain a copy of the lawsuit that has been used to build the website.
942
+ 2. Determine the lawsuit that has been used to build the website.
943
+ 2. Determine the lawsuit that has been used to build the website.
944
+ 2. Determine the lawsuit that has been used to build the website.
945
+ 2. Determine the lawsuit that has been used to build the website.
946
+
947
+ ## Step 4. Determination of a simple rule that will be used to build the website
948
+ When the website can build you in 3-4 steps of the building process.
949
+ ## Step 5. Lawsuit of the lawsuit that will be built by the user.
950
+ ## Step 6. Lawsuit the law that has been used.
951
+
952
+ ## Step 7. Determine that a simple law that will be used to build the website.
953
+ ## Step 8. Build and apply the law that has been used.
954
+ ## Step 9. Build the following law that will be used to build the website:
955
+ ## Step 10. Apply that has been used to build the website.
956
+ ## Step 11. Determine that a new lawsuit that has been used to build the website.
957
+ 2. Build and apply the lawsuit that has been used.
958
+ ## Step 13. Determine that has been used to build the website.
959
+ ## Step 15. Use the lawsuit that has been used to build the website.
960
+
961
+ ## Step 16. Apply the lawsuit that has been used.
962
+ ## Step 17. Build a new law that will be used to build the website.
963
+ ## Step 18. Determine that a new lawsuit that has been used.
964
+
965
+ ## Step 20. Determine that has been used to build the website.
966
+ ## Step 21. Use the lawsuit that has been used to build the website.
967
+
968
+ ## Step 32. Determine that the lawsuit has been used to build the website.
969
+ ## Step 33. Determine that the use of the website will be used to build the lawsuit.
970
+
971
+ ## Step 35. Determine that the use of the website will be used to build the lawsuit.
972
+
973
+ ## Step 37. Use the lawsuit that has been used to build the website.
974
+ ## Step 38. Determine that the lawsuit that has been used to build the website.
975
+ ## Step 39. Determine that the use of the lawsuit that has been used to build the website.
976
+
977
+ ## Step 41. Determine that the use of the lawsuit that has been used to build the website.
978
+
979
+ The final law is building the lawsuit that has been used to build the website.
980
+
981
+ ## Step 47. Determine that the use of the lawsuit that has been used to build the website.
982
+
983
+ ## Step 48. Build the use of the lawsuit that has been used to build the website. The lawsuit that has been used is building the lawsuit that has been used.
984
+
985
+ ## Step 52. Lawsuit 1, Building the use of the lawsuit that has been built by the building process.
986
+
987
+ ## Step 55. Determine that the lawsuit that has been used to build the website.
988
+
989
+ ## Step 58. Use the use of the lawsuit that has been used to build the website.
990
+ ## Step 59. Determine that the use of the lawsuit that has been used to build the website.
991
+
992
+ ## Step 60. Use the use of the lawsuit that has been used to build the website.
993
+
994
+ ## Step 65. Use the lawsuit that has been used to build the website.
995
+
996
+ ## Step 69. Determine that the use of the lawsuit that has been used to build the website.
997
+
998
+ ## Step 73. Determine that the use of the lawsuit that has been used to build the website.
999
+
1000
+ ## Step 76. Use the use of the lawsuit that has been used to build the website.
1001
+ ## Step 78. Determine that the use of the lawsuit that has been used to build the website.
1002
+ ## Step 80. Use the use of the lawsuit that has been used to build the website.
1003
+
1004
+ ## Step 82. Build the use of the lawsuit that has been used to build the website.
1005
+ ## Step 84. Determine that the use of the lawsuit that has been used to build the website.
1006
+ ## Step 86. Determine that the lawsuit that has been used to build the website.
1007
+
1008
+ ## Step 98. Use the use of the lawsuit that has been used to build the website.
1009
+ ## Step 102. Determine that the use of the lawsuit that has been used to build the website.
1010
+
1011
+ ## Step 102. Use the use of the lawsuit that has been used to build the website.
1012
+ ## Step 105. Determine that the use of the lawsuit that has been used to build the website.
1013
+
1014
+ ## Step 107. Use the use of the lawsuit that has been used to build the website.
1015
+
1016
+ ## Step 111. Build the use of the lawsuit that has been used to build the website.
1017
+ ## Step 113. Determine that the use of the lawsuit that has been used to build the website.
1018
+ ## Step 114. Determine that the use of the lawsuit that has been used to build the website.
1019
+
1020
+ ## Step 118. Use the lawsuit that has been used to build the website.
1021
+ ## Step 120. Use the use of the lawsuit that has been used to build the website.
1022
+ ## Step 123. Determine that the use of the lawsuit that has been used to build the website.
1023
+
1024
+ ## Step 127. Use the use of the lawsuit that has been used to build the website.
1025
+ ## Step 129. Determine that the use of the lawsuit that has been used to build the website.
1026
+ ## Step 131. Determine that the use of the lawsuit that has been used to build the website.
1027
+
1028
+ ## Step 136. Determine that the use of the lawsuit that has been used to build the website.
1029
+ ## Step 137. Determine that the use of the lawsuit that has been used to build the website.
1030
+
1031
+ ## Step 180. Build the use of the lawsuit that has been used to build the website.
1032
+ ## Step 182. Determine that the use of the lawsuit that has been used to build the website.
1033
+
1034
+ ## Step 184. Use the use of the lawsuit that has been used to build the website.
1035
+ ## Step 186. Use the use of the lawsuit that has been used to build the website.
1036
+
1037
+ ## Step 188. Determine that the use of the lawsuit that has been used to build the website.
1038
+
1039
+ ## Step 191. Determine that the use of the lawsuit that has been used to build the website.
1040
+ ## Step 193. Determine that the use of the lawsuit that has been used to build the website.
1041
+ ## Step 197. Determine that the lawsuit that has been used to build the website.
1042
+ ## Step 199. Use the use of the lawsuit that has been used to build the website.
1043
+ ## Step 201. Determine that the use of the lawsuit that has been used to build the website.<|eot_id|>
1044
+ 2025-03-14 13:48:38 - INFO : <|begin_of_text|>Tweet: "I hate it when my phone battery dies."
1045
+ Sentiment: Negative
1046
+ ###
1047
+ Tweet: "My day has been 👍"
1048
+ Sentiment: Positive
1049
+ ###
1050
+ Tweet: "This is the link to the article"
1051
+ Sentiment: Neutral
1052
+ ###
1053
+ Tweet: "This new music video was incredibile"
1054
+ Sentiment: Positive
1055
+ ### Music: "The new music video"<|eot_id|>
1056
+ 2025-03-14 13:50:25 - INFO : <|begin_of_text|>Translate English to French:
1057
+
1058
+ sea otter => loutre de mer
1059
+
1060
+ peppermint => menthe poivrée
1061
+
1062
+ plush girafe => girafe peluche
1063
+
1064
+ cheese => poivre blancée
1065
+
1066
+ notebook => bouquinier de gréce
1067
+
1068
+ leather => cuir
1069
+
1070
+ The sun is shining brightly today.
1071
+
1072
+ Translation:
1073
+ sun = lune
1074
+ bright = puissant, élevé, élevé, fort
1075
+
1076
+ Translation
1077
+ in 3 steps, or better in french
1078
+ 3 days, or better in french
1079
+ 4 days, and less.
1080
+
1081
+ translation
1082
+ -
1083
+ sun = lune -
1084
+ day = -
1085
+ = -
1086
+
1087
+ day =
1088
+ lune ->lune
1089
+
1090
+ day =
1091
+ d'une -
1092
+ -une - -le - - de - - - de - - de
1093
+ -une - -une - -un - - -
1094
+ -une - -une - -de - - - - a - - a - - -a - -
1095
+
1096
+ day =
1097
+ le - de - - -
1098
+ d'une - - -lune - - -
1099
+ - - - - - -a - - -
1100
+
1101
+ -une - - -une - - -une - -
1102
+ le - -une - -une - -
1103
+ - -une - -un - - -une - -
1104
+ le - -une - -une -
1105
+ -une - -une - -un - -
1106
+
1107
+ -une - -une - -le - -une - - -
1108
+
1109
+ day =
1110
+
1111
+ 3 days, and less. This day has no meaning.
1112
+
1113
+ translation
1114
+ -
1115
+ -
1116
+ 3 -
1117
+ day =
1118
+ le - de - -une - -une - -
1119
+
1120
+ - - - -une -
1121
+ - -une - -de - - -un - -
1122
+
1123
+ - - -une - -une - -
1124
+
1125
+ - day =
1126
+ - day - -
1127
+
1128
+ - - - - - -
1129
+
1130
+ - -une - -une -une -
1131
+
1132
+ - - - -une - -
1133
+
1134
+ une -une -une -une
1135
+
1136
+ - - - - - -
1137
+
1138
+ - - - -
1139
+
1140
+ - - -
1141
+
1142
+ - - - -
1143
+
1144
+ - - - -
1145
+
1146
+ - - - - - -
1147
+
1148
+ - - - - - -
1149
+
1150
+ - - - -
1151
+
1152
+ une - - - - - - -
1153
+
1154
+ - day - - - - - - -
1155
+
1156
+ - - - - - - - -
1157
+
1158
+ - - - - - - - - -
1159
+
1160
+ - - - - - - - - - - -
1161
+
1162
+ day =
1163
+ - - - - - - - - -
1164
+
1165
+ - - - - - - - - -
1166
+
1167
+ - - - - - - -
1168
+
1169
+ - - - - - - - - -
1170
+
1171
+ - - - - - - -
1172
+
1173
+ - - - - - - - - -
1174
+
1175
+ 2 days, and less.
1176
+
1177
+ Note: Day = le jour
1178
+
1179
+ sun -> lune (lune)
1180
+
1181
+ The other day I went to the zoo.
1182
+
1183
+ Translation:
1184
+ The sun day, I
1185
+ The day, and one more.
1186
+
1187
+ Translation
1188
+ - day
1189
+
1190
+ translation : 3 days.
1191
+
1192
+ translation : 3 - -
1193
+
1194
+ - 3 days
1195
+
1196
+ translation: 4 days.
1197
+
1198
+ Translation:
1199
+ - day - - - - - - - -
1200
+
1201
+ day
1202
+
1203
+ = = (day) -
1204
+
1205
+ day
1206
+
1207
+ - - - - - - - -
1208
+
1209
+ I. I. Go. went. I.
1210
+
1211
+ translation:
1212
+ - 3 days.
1213
+ and less.
1214
+ and more.
1215
+ and less.
1216
+ and less.
1217
+
1218
+ 4 days
1219
+ and less.
1220
+ and less.
1221
+ and less.
1222
+
1223
+ translation:
1224
+ I.
1225
+
1226
+ I.
1227
+
1228
+ Go. 2. days.
1229
+ 1. go.
1230
+ Go.
1231
+ I.
1232
+
1233
+ 3
1234
+ 2.
1235
+
1236
+ 3. -
1237
+ 2 - - 1 - - - -
1238
+
1239
+ - - - - - - - -
1240
+
1241
+ - - - - - - -
1242
+
1243
+ - - - - - - - -
1244
+
1245
+ day - days
1246
+
1247
+ translation = 3
1248
+ Translation: the 3 days.
1249
+ translation
1250
+ Translation: day
1251
+ translation: I
1252
+
1253
+ 3 - day
1254
+
1255
+ translation: - - -
1256
+
1257
+ Go.
1258
+ 3.
1259
+ 1. 2. - - - - - -
1260
+ 1. - - - - - - - -
1261
+ translation:
1262
+
1263
+ 3. - - - - - - -
1264
+ - - - - - - -
1265
+ - 3 days.
1266
+ day. - - - - - - -
1267
+
1268
+ - - - - - - - -
1269
+ - - - - - - - - -
1270
+
1271
+ translation:
1272
+ I. day - - - - - - - -
1273
+ 3. - - - - - - -
1274
+
1275
+ - - - - - - - -
1276
+
1277
+ 3 days.
1278
+
1279
+ - - - - - - -
1280
+
1281
+ - - - - - - - - -
1282
+ I. - - - - - - - -
1283
+
1284
+ translated:
1285
+ I. day
1286
+ - - - - - - - - - -
1287
+
1288
+ - - - - - - - - -
1289
+
1290
+ - - - - - - - - - -
1291
+
1292
+ - - - - - - - - - -
1293
+
1294
+ day = translated:
1295
+
1296
+ sun.
1297
+
1298
+ The best way to write.
1299
+
1300
+ Translation
1301
+ the words:
1302
+
1303
+ 3 days, and less.
1304
+
1305
+ Translation:
1306
+ - 4 - days.
1307
+
1308
+ days.
1309
+
1310
+
1311
+ 3, -, 3 -
1312
+ - - - - - - -
1313
+
1314
+ - - - - - - - -
1315
+
1316
+ 3 days, and less.
1317
+
1318
+ Translation:
1319
+ - - - - - - - - - -
1320
+
1321
+ translation:
1322
+
1323
+ I. 2 days.
1324
+
1325
+ The best way to write.
1326
+ 3 - - - - - - -
1327
+
1328
+ 4.
1329
+
1330
+ Translation:
1331
+ - the 3.
1332
+
1333
+ 5 - - - - 3 -
1334
+
1335
+ translation:
1336
+ the other way to write.
1337
+ Translation:
1338
+ 5 translated: 3 days.
1339
+
1340
+ - - - - - - - - -
1341
+
1342
+ I. Go. 3.
1343
+
1344
+ translation: 3.
1345
+
1346
+ days.
1347
+ translation:
1348
+ translation:
1349
+ - 3 - - - - -
1350
+
1351
+ translation:
1352
+ translation:
1353
+ I.
1354
+
1355
+ translation:
1356
+
1357
+ translation:
1358
+ day translated: 3.
1359
+
1360
+ translation:
1361
+
1362
+ translation:
1363
+ days.
1364
+
1365
+
1366
+
1367
+ I translated:
1368
+
1369
+ 2. I day translated.
1370
+
1371
+ translation:
1372
+
1373
+ day
1374
+ - - - - - -
1375
+
1376
+ I.
1377
+
1378
+ translated:
1379
+
1380
+ three.
1381
+ translated: three
1382
+ three,
1383
+ translation:
1384
+
1385
+
1386
+ translation:
1387
+ - - - - -
1388
+
1389
+ - 3 day.
1390
+ - - - - -
1391
+ - - - - -
1392
+
1393
+ translation:
1394
+
1395
+ - - - - - - -
1396
+
1397
+ - - - - - - -
1398
+
1399
+ day.
1400
+ - - - - - - - -
1401
+
1402
+ - - - - - - - -
1403
+
1404
+ - - - - - - - -
1405
+
1406
+ - 3 days.
1407
+
1408
+ translated :
1409
+
1410
+ - - - - - - - -
1411
+
1412
+ translation:
1413
+ translation:
1414
+
1415
+ 3 days.
1416
+ translated:
1417
+ - 3 days.
1418
+
1419
+ translated: day
1420
+
1421
+ translated:
1422
+
1423
+ Translation:
1424
+ day. - - - - - -
1425
+
1426
+ translated:
1427
+
1428
+ - - - - - - - -
1429
+
1430
+ I day.
1431
+
1432
+ translation:
1433
+ - - - - - - - -
1434
+
1435
+ Translation:
1436
+
1437
+ translation:
1438
+ 3 days.
1439
+
1440
+ Translation:
1441
+ - - - - - - -
1442
+
1443
+ - - - - - - -
1444
+
1445
+ - - - - - - - -
1446
+
1447
+ - 3 days.
1448
+
1449
+ I.
1450
+
1451
+ - - - - - - - -
1452
+
1453
+ - - - - - - - -
1454
+
1455
+ Translation:
1456
+ The best way to write the words.
1457
+
1458
+
1459
+ The final translation.
1460
+
1461
+ The translated:
1462
+
1463
+ sun
1464
+ day.
1465
+ - - - - - - -
1466
+
1467
+ translated to 3 days.
1468
+
1469
+ translation:
1470
+
1471
+ days.
1472
+
1473
+
1474
+ The best way to write.
1475
+
1476
+ translation :
1477
+
1478
+
1479
+ I.
1480
+
1481
+ translation :
1482
+ - - - - - - - -
1483
+
1484
+ I.
1485
+
1486
+ translation :
1487
+
1488
+ 3.
1489
+
1490
+ translation:
1491
+
1492
+
1493
+ - - - - - - - -
1494
+
1495
+ - - -
1496
+ 2025-03-14 13:50:25 - INFO :
1497
+ ==================Finish================
1498
+
1499
+ 2025-03-14 14:01:26 - INFO : PPL after pruning: {'wikitext2': 33.245062173048, 'ptb': 58.119428177447986}
1500
+ 2025-03-14 14:01:26 - INFO : Memory Requirement: 6982.2802734375 MiB
1501
+
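The log above prunes with Taylor importance and then reports perplexity on wikitext2 and ptb. Below is a minimal sketch of how such a sliding-window perplexity number is typically computed; the 2048-token window matches max_seq_len in description.txt, while the model path, dtype, and the non-overlapping stride are assumptions, not taken from this repo's scripts:

```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B-Instruct"  # base model per description.txt; the pruned checkpoint is assumed loadable the same way
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).cuda().eval()

# Concatenate the wikitext-2 test split into one long token stream
text = "\n\n".join(load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"])
ids = tok(text, return_tensors="pt").input_ids.cuda()

seq_len, stride = 2048, 2048  # non-overlapping windows (assumption)
nll_sum, n_tokens = 0.0, 0
for begin in range(0, ids.size(1) - 1, stride):
    chunk = ids[:, begin:begin + seq_len]
    with torch.no_grad():
        # labels=chunk makes the model compute mean next-token NLL over the chunk
        loss = model(chunk, labels=chunk).loss
    nll_sum += loss.item() * (chunk.size(1) - 1)
    n_tokens += chunk.size(1) - 1
print("wikitext2 PPL:", torch.exp(torch.tensor(nll_sum / n_tokens)).item())
```

The jump from 13.17 to 33.25 on wikitext2 is the quality cost of removing roughly 13.5% of the parameters before any recovery fine-tuning, and it matches the degenerate generations recorded above.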
prune_log/vanilla_llama_1b_prune_0.25/description.txt ADDED
@@ -0,0 +1,28 @@
1
+ - Training Parameters:
2
+ - base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ - save_ckpt_log_name: vanilla_llama_1b_prune_0.25
4
+ - pruning_ratio: 0.25
5
+ - pruner_type: taylor
6
+ - temperature: 1.0
7
+ - top_p: 0.95
8
+ - max_seq_len: 2048
9
+ - channel_wise: False
10
+ - block_wise: True
11
+ - layer_wise: False
12
+ - layer: 12
13
+ - block_attention_layer_start: 2
14
+ - block_attention_layer_end: 13
15
+ - block_mlp_layer_start: 2
16
+ - block_mlp_layer_end: 13
17
+ - iterative_steps: 1
18
+ - grouping_strategy: sum
19
+ - global_pruning: False
20
+ - taylor: param_mix
21
+ - num_examples: 10
22
+ - device: cuda
23
+ - test_before_train: True
24
+ - eval_device: cuda
25
+ - test_after_train: True
26
+ - seed: 42
27
+ - save_model: True
28
+ - torch_version: 2.6
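These parameters say: prune 25% of the attention and MLP widths in blocks 2 through 12 (start 2, end 13 exclusive) using Taylor importance, in a single iterative step. A quick arithmetic check against the parameter counts in training.log above shows why only about 13.5% of the weights disappear even at a 0.25 ratio: the embeddings, the LM head, and the blocks outside [2, 13) are left whole.

```python
# Numbers taken from training.log above; nothing else is assumed.
total_before = 1235814400  # full Llama-3.2-1B parameter count
total_after = 1068566528   # after pruning blocks 2-12 at ratio 0.25
print(f"retained: {total_after / total_before:.4%}")  # -> 86.4666%, as logged
```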
prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f074f4aafdecfbdd50a31e261ba583da8ad0dc5eea766db0c87505c0c36b39fb
3
+ size 3279886126
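The three lines above are a Git LFS pointer, not the checkpoint itself; the roughly 3.3 GB blob is fetched on checkout. A small sketch (path relative to the repo root) for verifying a downloaded blob against the recorded oid and size:

```python
import hashlib
import os

path = "prune_log/vanilla_llama_1b_prune_0.25/pytorch_model.bin"
h = hashlib.sha256()
with open(path, "rb") as f:
    # Hash in 1 MiB blocks to avoid loading the whole file into memory
    for block in iter(lambda: f.read(1 << 20), b""):
        h.update(block)
assert os.path.getsize(path) == 3279886126
assert h.hexdigest() == "f074f4aafdecfbdd50a31e261ba583da8ad0dc5eea766db0c87505c0c36b39fb"
print("LFS pointer matches the downloaded blob")
```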
tune_log/.DS_Store ADDED
Binary file (6.15 kB).
 
tune_log/layerskip_1b_0.25_tune/.DS_Store ADDED
Binary file (6.15 kB).
 
tune_log/layerskip_1b_0.25_tune/adapter_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "base_model_name_or_path": "facebook/layerskip-llama3.2-1B",
3
+ "bias": "none",
4
+ "fan_in_fan_out": false,
5
+ "inference_mode": true,
6
+ "init_lora_weights": true,
7
+ "lora_alpha": 16,
8
+ "lora_dropout": 0.05,
9
+ "modules_to_save": null,
10
+ "peft_type": "LORA",
11
+ "r": 8,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "k_proj",
15
+ "v_proj",
16
+ "o_proj",
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj"
20
+ ],
21
+ "task_type": "CAUSAL_LM"
22
+ }
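This adapter config applies rank-8 LoRA (alpha 16, dropout 0.05) to every attention and MLP projection of the LayerSkip base. A minimal loading sketch with peft follows; note the assumption that the base model must first be pruned exactly as in prune_log, since the vanilla facebook/layerskip-llama3.2-1B shapes will not match adapters trained on the pruned network:

```python
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Assumption: this checkpoint must already have the 0.25 structured pruning
# applied, or the LoRA weight shapes in blocks 2-12 will not line up.
base = AutoModelForCausalLM.from_pretrained("facebook/layerskip-llama3.2-1B")
model = PeftModel.from_pretrained(base, "tune_log/layerskip_1b_0.25_tune")
model = model.merge_and_unload()  # fold the LoRA deltas back into the base weights
```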
tune_log/layerskip_1b_0.25_tune/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5754c9ac51484d077d6009e59971b7e5f20883ad4842df5b433f5c413676b9
3
+ size 20011658
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dbd4619d97f3ac622bddf6ea1e5343cb99a03e214ecc3de23a6db65e54de672
3
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10ab7a9b44bef50593b4235e9ee859a78d15399f3d91f66cfa5c47b5c8f31a3c
3
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0c7c0ad96e7c8ff682517153986c0f80d1df307934042b11129380f22d7d7bf
3
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8fdcd0311eba9854fff738038ed4c1a269832665b4d88ba4e4e3d02a1a7e0e
3
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1e58cb0c4d82aeb47dd1f5d659b10c9964a21f4c4d82846e16ee4f619325194
3
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,820 @@
1
+ {
2
+ "best_metric": 1.5820817947387695,
3
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1000",
4
+ "epoch": 1.2855305466237943,
5
+ "eval_steps": 100,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0012861736334405145,
13
+ "grad_norm": 0.39783015847206116,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 2.0835,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.012861736334405145,
20
+ "grad_norm": 0.45549583435058594,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.1408,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02572347266881029,
27
+ "grad_norm": 0.4594053626060486,
28
+ "learning_rate": 2e-05,
29
+ "loss": 2.0894,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.03858520900321544,
34
+ "grad_norm": 0.49020764231681824,
35
+ "learning_rate": 3e-05,
36
+ "loss": 2.1037,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.05144694533762058,
41
+ "grad_norm": 0.37993305921554565,
42
+ "learning_rate": 4e-05,
43
+ "loss": 1.9716,
44
+ "step": 40
45
+ },
46
+ {
47
+ "epoch": 0.06430868167202572,
48
+ "grad_norm": 0.38231977820396423,
49
+ "learning_rate": 5e-05,
50
+ "loss": 1.9349,
51
+ "step": 50
52
+ },
53
+ {
54
+ "epoch": 0.07717041800643087,
55
+ "grad_norm": 0.2922589182853699,
56
+ "learning_rate": 6e-05,
57
+ "loss": 1.906,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.09003215434083602,
62
+ "grad_norm": 0.34647658467292786,
63
+ "learning_rate": 7e-05,
64
+ "loss": 1.8246,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.10289389067524116,
69
+ "grad_norm": 0.31930026412010193,
70
+ "learning_rate": 8e-05,
71
+ "loss": 1.8057,
72
+ "step": 80
73
+ },
74
+ {
75
+ "epoch": 0.1157556270096463,
76
+ "grad_norm": 0.34028756618499756,
77
+ "learning_rate": 9e-05,
78
+ "loss": 1.7546,
79
+ "step": 90
80
+ },
81
+ {
82
+ "epoch": 0.12861736334405144,
83
+ "grad_norm": 0.3878991901874542,
84
+ "learning_rate": 0.0001,
85
+ "loss": 1.7543,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.12861736334405144,
90
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
91
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
92
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
93
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
94
+ "step": 100
95
+ },
96
+ {
97
+ "epoch": 0.1414790996784566,
98
+ "grad_norm": 0.35599613189697266,
99
+ "learning_rate": 9.931224209078405e-05,
100
+ "loss": 1.7309,
101
+ "step": 110
102
+ },
103
+ {
104
+ "epoch": 0.15434083601286175,
105
+ "grad_norm": 0.4075644016265869,
106
+ "learning_rate": 9.862448418156809e-05,
107
+ "loss": 1.6981,
108
+ "step": 120
109
+ },
110
+ {
111
+ "epoch": 0.16720257234726688,
112
+ "grad_norm": 0.4743317663669586,
113
+ "learning_rate": 9.793672627235215e-05,
114
+ "loss": 1.7011,
115
+ "step": 130
116
+ },
117
+ {
118
+ "epoch": 0.18006430868167203,
119
+ "grad_norm": 0.4701610505580902,
120
+ "learning_rate": 9.724896836313618e-05,
121
+ "loss": 1.6771,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.19292604501607716,
126
+ "grad_norm": 0.49115318059921265,
127
+ "learning_rate": 9.656121045392023e-05,
128
+ "loss": 1.6633,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.2057877813504823,
133
+ "grad_norm": 0.5177980661392212,
134
+ "learning_rate": 9.587345254470427e-05,
135
+ "loss": 1.6706,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 0.21864951768488747,
140
+ "grad_norm": 0.465657114982605,
141
+ "learning_rate": 9.518569463548831e-05,
142
+ "loss": 1.6677,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.2315112540192926,
147
+ "grad_norm": 0.5453551411628723,
148
+ "learning_rate": 9.449793672627235e-05,
149
+ "loss": 1.6656,
150
+ "step": 180
151
+ },
152
+ {
153
+ "epoch": 0.24437299035369775,
154
+ "grad_norm": 0.4150402545928955,
155
+ "learning_rate": 9.38101788170564e-05,
156
+ "loss": 1.6568,
157
+ "step": 190
158
+ },
159
+ {
160
+ "epoch": 0.2572347266881029,
161
+ "grad_norm": 0.5106223225593567,
162
+ "learning_rate": 9.312242090784045e-05,
163
+ "loss": 1.6804,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.2572347266881029,
168
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
169
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
170
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
171
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
172
+ "step": 200
173
+ },
174
+ {
175
+ "epoch": 0.27009646302250806,
176
+ "grad_norm": 0.47371965646743774,
177
+ "learning_rate": 9.243466299862448e-05,
178
+ "loss": 1.6235,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.2829581993569132,
183
+ "grad_norm": 0.45723679661750793,
184
+ "learning_rate": 9.174690508940853e-05,
185
+ "loss": 1.6192,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.2958199356913183,
190
+ "grad_norm": 0.46727871894836426,
191
+ "learning_rate": 9.105914718019258e-05,
192
+ "loss": 1.6129,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.3086816720257235,
197
+ "grad_norm": 0.5216034054756165,
198
+ "learning_rate": 9.037138927097662e-05,
199
+ "loss": 1.6065,
200
+ "step": 240
201
+ },
202
+ {
203
+ "epoch": 0.3215434083601286,
204
+ "grad_norm": 0.46132415533065796,
205
+ "learning_rate": 8.968363136176067e-05,
206
+ "loss": 1.6374,
207
+ "step": 250
208
+ },
209
+ {
210
+ "epoch": 0.33440514469453375,
211
+ "grad_norm": 0.5699637532234192,
212
+ "learning_rate": 8.89958734525447e-05,
213
+ "loss": 1.6031,
214
+ "step": 260
215
+ },
216
+ {
217
+ "epoch": 0.34726688102893893,
218
+ "grad_norm": 0.46537184715270996,
219
+ "learning_rate": 8.830811554332875e-05,
220
+ "loss": 1.6196,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.36012861736334406,
225
+ "grad_norm": 0.5034765005111694,
226
+ "learning_rate": 8.76203576341128e-05,
227
+ "loss": 1.6257,
228
+ "step": 280
229
+ },
230
+ {
231
+ "epoch": 0.3729903536977492,
232
+ "grad_norm": 0.48885518312454224,
233
+ "learning_rate": 8.693259972489685e-05,
234
+ "loss": 1.6195,
235
+ "step": 290
236
+ },
237
+ {
238
+ "epoch": 0.3858520900321543,
239
+ "grad_norm": 0.48295891284942627,
240
+ "learning_rate": 8.62448418156809e-05,
241
+ "loss": 1.6301,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.3858520900321543,
246
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
247
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
248
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
249
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.3987138263665595,
254
+ "grad_norm": 0.4800078570842743,
255
+ "learning_rate": 8.555708390646493e-05,
256
+ "loss": 1.6171,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.4115755627009646,
261
+ "grad_norm": 0.47452959418296814,
262
+ "learning_rate": 8.486932599724897e-05,
263
+ "loss": 1.6147,
264
+ "step": 320
265
+ },
266
+ {
267
+ "epoch": 0.42443729903536975,
268
+ "grad_norm": 0.5397221446037292,
269
+ "learning_rate": 8.418156808803301e-05,
270
+ "loss": 1.6041,
271
+ "step": 330
272
+ },
273
+ {
274
+ "epoch": 0.43729903536977494,
275
+ "grad_norm": 0.5501461029052734,
276
+ "learning_rate": 8.349381017881706e-05,
277
+ "loss": 1.6091,
278
+ "step": 340
279
+ },
280
+ {
281
+ "epoch": 0.45016077170418006,
282
+ "grad_norm": 0.47587981820106506,
283
+ "learning_rate": 8.28060522696011e-05,
284
+ "loss": 1.6008,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 0.4630225080385852,
289
+ "grad_norm": 0.46644529700279236,
290
+ "learning_rate": 8.211829436038515e-05,
291
+ "loss": 1.6081,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 0.4758842443729904,
296
+ "grad_norm": 0.5308094024658203,
297
+ "learning_rate": 8.14305364511692e-05,
298
+ "loss": 1.5987,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 0.4887459807073955,
303
+ "grad_norm": 0.5304721593856812,
304
+ "learning_rate": 8.074277854195323e-05,
305
+ "loss": 1.6173,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 0.5016077170418006,
310
+ "grad_norm": 0.6186290383338928,
311
+ "learning_rate": 8.005502063273728e-05,
312
+ "loss": 1.5879,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 0.5144694533762058,
317
+ "grad_norm": 0.4936847388744354,
318
+ "learning_rate": 7.936726272352132e-05,
319
+ "loss": 1.5771,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.5144694533762058,
324
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
325
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
326
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
327
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
328
+ "step": 400
329
+ },
330
+ {
331
+ "epoch": 0.5273311897106109,
332
+ "grad_norm": 0.4969954788684845,
333
+ "learning_rate": 7.867950481430537e-05,
334
+ "loss": 1.5873,
335
+ "step": 410
336
+ },
337
+ {
338
+ "epoch": 0.5401929260450161,
339
+ "grad_norm": 0.5539654493331909,
340
+ "learning_rate": 7.799174690508942e-05,
341
+ "loss": 1.5741,
342
+ "step": 420
343
+ },
344
+ {
345
+ "epoch": 0.5530546623794212,
346
+ "grad_norm": 0.4963805377483368,
347
+ "learning_rate": 7.730398899587345e-05,
348
+ "loss": 1.5883,
349
+ "step": 430
350
+ },
351
+ {
352
+ "epoch": 0.5659163987138264,
353
+ "grad_norm": 0.4849222004413605,
354
+ "learning_rate": 7.66162310866575e-05,
355
+ "loss": 1.6061,
356
+ "step": 440
357
+ },
358
+ {
359
+ "epoch": 0.5787781350482315,
360
+ "grad_norm": 0.5241298079490662,
361
+ "learning_rate": 7.592847317744153e-05,
362
+ "loss": 1.6118,
363
+ "step": 450
364
+ },
365
+ {
366
+ "epoch": 0.5916398713826366,
367
+ "grad_norm": 0.5051389336585999,
368
+ "learning_rate": 7.52407152682256e-05,
369
+ "loss": 1.5618,
370
+ "step": 460
371
+ },
372
+ {
373
+ "epoch": 0.6045016077170418,
374
+ "grad_norm": 0.49376770853996277,
375
+ "learning_rate": 7.455295735900963e-05,
376
+ "loss": 1.5871,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 0.617363344051447,
381
+ "grad_norm": 0.49221155047416687,
382
+ "learning_rate": 7.386519944979367e-05,
383
+ "loss": 1.6037,
384
+ "step": 480
385
+ },
386
+ {
387
+ "epoch": 0.6302250803858521,
388
+ "grad_norm": 0.5378918647766113,
389
+ "learning_rate": 7.317744154057772e-05,
390
+ "loss": 1.5523,
391
+ "step": 490
392
+ },
393
+ {
394
+ "epoch": 0.6430868167202572,
395
+ "grad_norm": 0.5564639568328857,
396
+ "learning_rate": 7.248968363136176e-05,
397
+ "loss": 1.5885,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.6430868167202572,
402
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
403
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
404
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
405
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
406
+ "step": 500
407
+ },
408
+ {
409
+ "epoch": 0.6559485530546624,
410
+ "grad_norm": 0.49083277583122253,
411
+ "learning_rate": 7.180192572214582e-05,
412
+ "loss": 1.5693,
413
+ "step": 510
414
+ },
415
+ {
416
+ "epoch": 0.6688102893890675,
417
+ "grad_norm": 0.5625829100608826,
418
+ "learning_rate": 7.111416781292985e-05,
419
+ "loss": 1.6023,
420
+ "step": 520
421
+ },
422
+ {
423
+ "epoch": 0.6816720257234726,
424
+ "grad_norm": 0.6078226566314697,
425
+ "learning_rate": 7.04264099037139e-05,
426
+ "loss": 1.5845,
427
+ "step": 530
428
+ },
429
+ {
430
+ "epoch": 0.6945337620578779,
431
+ "grad_norm": 0.48107999563217163,
432
+ "learning_rate": 6.973865199449794e-05,
433
+ "loss": 1.5682,
434
+ "step": 540
435
+ },
436
+ {
437
+ "epoch": 0.707395498392283,
438
+ "grad_norm": 0.5080347657203674,
439
+ "learning_rate": 6.905089408528198e-05,
440
+ "loss": 1.5839,
441
+ "step": 550
442
+ },
443
+ {
444
+ "epoch": 0.7202572347266881,
445
+ "grad_norm": 0.5683622360229492,
446
+ "learning_rate": 6.836313617606602e-05,
447
+ "loss": 1.5916,
448
+ "step": 560
449
+ },
450
+ {
451
+ "epoch": 0.7331189710610932,
452
+ "grad_norm": 0.4669715464115143,
453
+ "learning_rate": 6.767537826685007e-05,
454
+ "loss": 1.6146,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 0.7459807073954984,
459
+ "grad_norm": 0.4946054518222809,
460
+ "learning_rate": 6.698762035763412e-05,
461
+ "loss": 1.5764,
462
+ "step": 580
463
+ },
464
+ {
465
+ "epoch": 0.7588424437299035,
466
+ "grad_norm": 0.4975377023220062,
467
+ "learning_rate": 6.629986244841817e-05,
468
+ "loss": 1.6035,
469
+ "step": 590
470
+ },
471
+ {
472
+ "epoch": 0.7717041800643086,
473
+ "grad_norm": 0.5511853098869324,
474
+ "learning_rate": 6.56121045392022e-05,
475
+ "loss": 1.5842,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.7717041800643086,
480
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
481
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
482
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
483
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
484
+ "step": 600
485
+ },
486
+ {
487
+ "epoch": 0.7845659163987139,
488
+ "grad_norm": 0.5689719915390015,
489
+ "learning_rate": 6.492434662998625e-05,
490
+ "loss": 1.5656,
491
+ "step": 610
492
+ },
493
+ {
494
+ "epoch": 0.797427652733119,
495
+ "grad_norm": 0.48885637521743774,
496
+ "learning_rate": 6.42365887207703e-05,
497
+ "loss": 1.5605,
498
+ "step": 620
499
+ },
500
+ {
501
+ "epoch": 0.8102893890675241,
502
+ "grad_norm": 0.5316773056983948,
503
+ "learning_rate": 6.354883081155434e-05,
504
+ "loss": 1.5755,
505
+ "step": 630
506
+ },
507
+ {
508
+ "epoch": 0.8231511254019293,
509
+ "grad_norm": 0.5578161478042603,
510
+ "learning_rate": 6.286107290233837e-05,
511
+ "loss": 1.5532,
512
+ "step": 640
513
+ },
514
+ {
515
+ "epoch": 0.8360128617363344,
516
+ "grad_norm": 0.6534080505371094,
517
+ "learning_rate": 6.217331499312242e-05,
518
+ "loss": 1.5882,
519
+ "step": 650
520
+ },
521
+ {
522
+ "epoch": 0.8488745980707395,
523
+ "grad_norm": 0.5140324831008911,
524
+ "learning_rate": 6.148555708390647e-05,
525
+ "loss": 1.5598,
526
+ "step": 660
527
+ },
528
+ {
529
+ "epoch": 0.8617363344051447,
530
+ "grad_norm": 0.5247426629066467,
531
+ "learning_rate": 6.0797799174690516e-05,
532
+ "loss": 1.5833,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 0.8745980707395499,
537
+ "grad_norm": 0.49460870027542114,
538
+ "learning_rate": 6.011004126547456e-05,
539
+ "loss": 1.621,
540
+ "step": 680
541
+ },
542
+ {
543
+ "epoch": 0.887459807073955,
544
+ "grad_norm": 0.5351711511611938,
545
+ "learning_rate": 5.9422283356258604e-05,
546
+ "loss": 1.5371,
547
+ "step": 690
548
+ },
549
+ {
550
+ "epoch": 0.9003215434083601,
551
+ "grad_norm": 0.5608878135681152,
552
+ "learning_rate": 5.8734525447042644e-05,
553
+ "loss": 1.5878,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.9003215434083601,
558
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
559
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
560
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
561
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.9131832797427653,
566
+ "grad_norm": 0.48291367292404175,
567
+ "learning_rate": 5.8046767537826685e-05,
568
+ "loss": 1.583,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.9260450160771704,
573
+ "grad_norm": 0.4866442382335663,
574
+ "learning_rate": 5.7359009628610725e-05,
575
+ "loss": 1.5891,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.9389067524115756,
580
+ "grad_norm": 0.5254418253898621,
581
+ "learning_rate": 5.667125171939478e-05,
582
+ "loss": 1.5319,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.9517684887459807,
587
+ "grad_norm": 0.5201655030250549,
588
+ "learning_rate": 5.598349381017882e-05,
589
+ "loss": 1.5819,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.9646302250803859,
594
+ "grad_norm": 0.5820693969726562,
595
+ "learning_rate": 5.5295735900962866e-05,
596
+ "loss": 1.5807,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.977491961414791,
601
+ "grad_norm": 0.559010922908783,
602
+ "learning_rate": 5.460797799174691e-05,
603
+ "loss": 1.5597,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.9903536977491961,
608
+ "grad_norm": 0.498877614736557,
609
+ "learning_rate": 5.392022008253095e-05,
610
+ "loss": 1.5628,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.002572347266881,
615
+ "grad_norm": 0.5119406580924988,
616
+ "learning_rate": 5.3232462173315e-05,
617
+ "loss": 1.5693,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.015434083601286,
622
+ "grad_norm": 0.5344542860984802,
623
+ "learning_rate": 5.254470426409904e-05,
624
+ "loss": 1.5256,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.0282958199356913,
629
+ "grad_norm": 0.5358342528343201,
630
+ "learning_rate": 5.185694635488308e-05,
631
+ "loss": 1.5432,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.0282958199356913,
636
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
637
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
638
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
639
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
640
+ "step": 800
641
+ },
642
+ {
643
+ "epoch": 1.0411575562700965,
644
+ "grad_norm": 0.5941470265388489,
645
+ "learning_rate": 5.116918844566713e-05,
646
+ "loss": 1.5449,
647
+ "step": 810
648
+ },
649
+ {
650
+ "epoch": 1.0540192926045016,
651
+ "grad_norm": 0.5659182071685791,
652
+ "learning_rate": 5.048143053645117e-05,
653
+ "loss": 1.5234,
654
+ "step": 820
655
+ },
656
+ {
657
+ "epoch": 1.0668810289389068,
658
+ "grad_norm": 0.5737349390983582,
659
+ "learning_rate": 4.9793672627235217e-05,
660
+ "loss": 1.5517,
661
+ "step": 830
662
+ },
663
+ {
664
+ "epoch": 1.0797427652733118,
665
+ "grad_norm": 0.5984872579574585,
666
+ "learning_rate": 4.910591471801926e-05,
667
+ "loss": 1.5175,
668
+ "step": 840
669
+ },
670
+ {
671
+ "epoch": 1.092604501607717,
672
+ "grad_norm": 0.5954984426498413,
673
+ "learning_rate": 4.8418156808803304e-05,
674
+ "loss": 1.5738,
675
+ "step": 850
676
+ },
677
+ {
678
+ "epoch": 1.1054662379421223,
679
+ "grad_norm": 0.5545582175254822,
680
+ "learning_rate": 4.7730398899587344e-05,
681
+ "loss": 1.5538,
682
+ "step": 860
683
+ },
684
+ {
685
+ "epoch": 1.1183279742765273,
686
+ "grad_norm": 0.6972865462303162,
687
+ "learning_rate": 4.704264099037139e-05,
688
+ "loss": 1.529,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 1.1311897106109325,
693
+ "grad_norm": 0.5404506325721741,
694
+ "learning_rate": 4.635488308115544e-05,
695
+ "loss": 1.5567,
696
+ "step": 880
697
+ },
698
+ {
699
+ "epoch": 1.1440514469453376,
700
+ "grad_norm": 0.5792121887207031,
701
+ "learning_rate": 4.566712517193948e-05,
702
+ "loss": 1.5422,
703
+ "step": 890
704
+ },
705
+ {
706
+ "epoch": 1.1569131832797428,
707
+ "grad_norm": 0.5468006134033203,
708
+ "learning_rate": 4.497936726272352e-05,
709
+ "loss": 1.5369,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 1.1569131832797428,
714
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
715
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
716
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
717
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
718
+ "step": 900
719
+ },
720
+ {
721
+ "epoch": 1.1697749196141478,
722
+ "grad_norm": 0.5955344438552856,
723
+ "learning_rate": 4.429160935350757e-05,
724
+ "loss": 1.5551,
725
+ "step": 910
726
+ },
727
+ {
728
+ "epoch": 1.182636655948553,
729
+ "grad_norm": 0.5832058787345886,
730
+ "learning_rate": 4.360385144429161e-05,
731
+ "loss": 1.5568,
732
+ "step": 920
733
+ },
734
+ {
735
+ "epoch": 1.1954983922829583,
736
+ "grad_norm": 0.6309258937835693,
737
+ "learning_rate": 4.291609353507566e-05,
738
+ "loss": 1.5548,
739
+ "step": 930
740
+ },
741
+ {
742
+ "epoch": 1.2083601286173633,
743
+ "grad_norm": 0.6269820928573608,
744
+ "learning_rate": 4.22283356258597e-05,
745
+ "loss": 1.5459,
746
+ "step": 940
747
+ },
748
+ {
749
+ "epoch": 1.2212218649517685,
750
+ "grad_norm": 0.6376837491989136,
751
+ "learning_rate": 4.154057771664374e-05,
752
+ "loss": 1.5277,
753
+ "step": 950
754
+ },
755
+ {
756
+ "epoch": 1.2340836012861736,
757
+ "grad_norm": 0.6351036429405212,
758
+ "learning_rate": 4.085281980742779e-05,
759
+ "loss": 1.5273,
760
+ "step": 960
761
+ },
762
+ {
763
+ "epoch": 1.2469453376205788,
764
+ "grad_norm": 0.6877638101577759,
765
+ "learning_rate": 4.016506189821183e-05,
766
+ "loss": 1.4986,
767
+ "step": 970
768
+ },
769
+ {
770
+ "epoch": 1.2598070739549838,
771
+ "grad_norm": 0.5501726865768433,
772
+ "learning_rate": 3.947730398899587e-05,
773
+ "loss": 1.5543,
774
+ "step": 980
775
+ },
776
+ {
777
+ "epoch": 1.272668810289389,
778
+ "grad_norm": 0.5217163562774658,
779
+ "learning_rate": 3.8789546079779924e-05,
780
+ "loss": 1.5292,
781
+ "step": 990
782
+ },
783
+ {
784
+ "epoch": 1.2855305466237943,
785
+ "grad_norm": 0.5770425796508789,
786
+ "learning_rate": 3.8101788170563964e-05,
787
+ "loss": 1.5536,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 1.2855305466237943,
792
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
793
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
794
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
795
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
796
+ "step": 1000
797
+ }
798
+ ],
799
+ "logging_steps": 10,
800
+ "max_steps": 1554,
801
+ "num_input_tokens_seen": 0,
802
+ "num_train_epochs": 2,
803
+ "save_steps": 200,
804
+ "stateful_callbacks": {
805
+ "TrainerControl": {
806
+ "args": {
807
+ "should_epoch_stop": false,
808
+ "should_evaluate": false,
809
+ "should_log": false,
810
+ "should_save": true,
811
+ "should_training_stop": false
812
+ },
813
+ "attributes": {}
814
+ }
815
+ },
816
+ "total_flos": 7.434437468513894e+16,
817
+ "train_batch_size": 4,
818
+ "trial_name": null,
819
+ "trial_params": null
820
+ }
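This trainer_state.json records the full loss history: train loss falls from about 2.08 to about 1.55 over 1000 steps while the eval loss on yahma/alpaca-cleaned reaches 1.582, the best_metric above. A small sketch for pulling those curves back out of the file; the keys match the entries shown:

```python
import json

with open("tune_log/layerskip_1b_0.25_tune/checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; eval entries carry the eval_* keys instead
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_yahma/alpaca-cleaned_loss"])
         for e in state["log_history"] if "eval_yahma/alpaca-cleaned_loss" in e]
print("last train loss:", train[-1])            # (1000, 1.5536)
print("best eval loss:", state["best_metric"])  # 1.5820817947387695
```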
tune_log/layerskip_1b_0.25_tune/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
3
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be54766e305bf39d189b01a3e5ef2ce484ebb25174805158611dc64b011fae50
3
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b02d18468fe272d16fe94b72ee6383b6ff39ed8d29639779dc301f5ad712b87
3
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b12da8cee10762b850bb3c3d3a232a890b2d5b5fe469fbfd08d52ba0459cc724
3
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942cfd7aded7d16363d1ae1a2911c01ef4e25f3c70ed059c88f1845d9b6c24dc
3
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ab7e64594d849ab9ff684dd4e2aac233019c559b7165f90e4c14c5b8cd1512
3
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,976 @@
1
+ {
2
+ "best_metric": 1.5769098997116089,
3
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1200",
4
+ "epoch": 1.542765273311897,
5
+ "eval_steps": 100,
6
+ "global_step": 1200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0012861736334405145,
13
+ "grad_norm": 0.39783015847206116,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 2.0835,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.012861736334405145,
20
+ "grad_norm": 0.45549583435058594,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.1408,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02572347266881029,
27
+ "grad_norm": 0.4594053626060486,
28
+ "learning_rate": 2e-05,
29
+ "loss": 2.0894,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.03858520900321544,
34
+ "grad_norm": 0.49020764231681824,
35
+ "learning_rate": 3e-05,
36
+ "loss": 2.1037,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.05144694533762058,
41
+ "grad_norm": 0.37993305921554565,
42
+ "learning_rate": 4e-05,
43
+ "loss": 1.9716,
44
+ "step": 40
45
+ },
46
+ {
47
+ "epoch": 0.06430868167202572,
48
+ "grad_norm": 0.38231977820396423,
49
+ "learning_rate": 5e-05,
50
+ "loss": 1.9349,
51
+ "step": 50
52
+ },
53
+ {
54
+ "epoch": 0.07717041800643087,
55
+ "grad_norm": 0.2922589182853699,
56
+ "learning_rate": 6e-05,
57
+ "loss": 1.906,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.09003215434083602,
62
+ "grad_norm": 0.34647658467292786,
63
+ "learning_rate": 7e-05,
64
+ "loss": 1.8246,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.10289389067524116,
69
+ "grad_norm": 0.31930026412010193,
70
+ "learning_rate": 8e-05,
71
+ "loss": 1.8057,
72
+ "step": 80
73
+ },
74
+ {
75
+ "epoch": 0.1157556270096463,
76
+ "grad_norm": 0.34028756618499756,
77
+ "learning_rate": 9e-05,
78
+ "loss": 1.7546,
79
+ "step": 90
80
+ },
81
+ {
82
+ "epoch": 0.12861736334405144,
83
+ "grad_norm": 0.3878991901874542,
84
+ "learning_rate": 0.0001,
85
+ "loss": 1.7543,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.12861736334405144,
90
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
91
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
92
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
93
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
94
+ "step": 100
95
+ },
96
+ {
97
+ "epoch": 0.1414790996784566,
98
+ "grad_norm": 0.35599613189697266,
99
+ "learning_rate": 9.931224209078405e-05,
100
+ "loss": 1.7309,
101
+ "step": 110
102
+ },
103
+ {
104
+ "epoch": 0.15434083601286175,
105
+ "grad_norm": 0.4075644016265869,
106
+ "learning_rate": 9.862448418156809e-05,
107
+ "loss": 1.6981,
108
+ "step": 120
109
+ },
110
+ {
111
+ "epoch": 0.16720257234726688,
112
+ "grad_norm": 0.4743317663669586,
113
+ "learning_rate": 9.793672627235215e-05,
114
+ "loss": 1.7011,
115
+ "step": 130
116
+ },
117
+ {
118
+ "epoch": 0.18006430868167203,
119
+ "grad_norm": 0.4701610505580902,
120
+ "learning_rate": 9.724896836313618e-05,
121
+ "loss": 1.6771,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.19292604501607716,
126
+ "grad_norm": 0.49115318059921265,
127
+ "learning_rate": 9.656121045392023e-05,
128
+ "loss": 1.6633,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.2057877813504823,
133
+ "grad_norm": 0.5177980661392212,
134
+ "learning_rate": 9.587345254470427e-05,
135
+ "loss": 1.6706,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 0.21864951768488747,
140
+ "grad_norm": 0.465657114982605,
141
+ "learning_rate": 9.518569463548831e-05,
142
+ "loss": 1.6677,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.2315112540192926,
147
+ "grad_norm": 0.5453551411628723,
148
+ "learning_rate": 9.449793672627235e-05,
149
+ "loss": 1.6656,
150
+ "step": 180
151
+ },
152
+ {
153
+ "epoch": 0.24437299035369775,
154
+ "grad_norm": 0.4150402545928955,
155
+ "learning_rate": 9.38101788170564e-05,
156
+ "loss": 1.6568,
157
+ "step": 190
158
+ },
159
+ {
160
+ "epoch": 0.2572347266881029,
161
+ "grad_norm": 0.5106223225593567,
162
+ "learning_rate": 9.312242090784045e-05,
163
+ "loss": 1.6804,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.2572347266881029,
168
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
169
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
170
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
171
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
172
+ "step": 200
173
+ },
174
+ {
175
+ "epoch": 0.27009646302250806,
176
+ "grad_norm": 0.47371965646743774,
177
+ "learning_rate": 9.243466299862448e-05,
178
+ "loss": 1.6235,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.2829581993569132,
183
+ "grad_norm": 0.45723679661750793,
184
+ "learning_rate": 9.174690508940853e-05,
185
+ "loss": 1.6192,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.2958199356913183,
190
+ "grad_norm": 0.46727871894836426,
191
+ "learning_rate": 9.105914718019258e-05,
192
+ "loss": 1.6129,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.3086816720257235,
197
+ "grad_norm": 0.5216034054756165,
198
+ "learning_rate": 9.037138927097662e-05,
199
+ "loss": 1.6065,
200
+ "step": 240
201
+ },
202
+ {
203
+ "epoch": 0.3215434083601286,
204
+ "grad_norm": 0.46132415533065796,
205
+ "learning_rate": 8.968363136176067e-05,
206
+ "loss": 1.6374,
207
+ "step": 250
208
+ },
209
+ {
210
+ "epoch": 0.33440514469453375,
211
+ "grad_norm": 0.5699637532234192,
212
+ "learning_rate": 8.89958734525447e-05,
213
+ "loss": 1.6031,
214
+ "step": 260
215
+ },
216
+ {
217
+ "epoch": 0.34726688102893893,
218
+ "grad_norm": 0.46537184715270996,
219
+ "learning_rate": 8.830811554332875e-05,
220
+ "loss": 1.6196,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.36012861736334406,
225
+ "grad_norm": 0.5034765005111694,
226
+ "learning_rate": 8.76203576341128e-05,
227
+ "loss": 1.6257,
228
+ "step": 280
229
+ },
230
+ {
231
+ "epoch": 0.3729903536977492,
232
+ "grad_norm": 0.48885518312454224,
233
+ "learning_rate": 8.693259972489685e-05,
234
+ "loss": 1.6195,
235
+ "step": 290
236
+ },
237
+ {
238
+ "epoch": 0.3858520900321543,
239
+ "grad_norm": 0.48295891284942627,
240
+ "learning_rate": 8.62448418156809e-05,
241
+ "loss": 1.6301,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.3858520900321543,
246
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
247
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
248
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
249
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.3987138263665595,
254
+ "grad_norm": 0.4800078570842743,
255
+ "learning_rate": 8.555708390646493e-05,
256
+ "loss": 1.6171,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.4115755627009646,
261
+ "grad_norm": 0.47452959418296814,
262
+ "learning_rate": 8.486932599724897e-05,
263
+ "loss": 1.6147,
264
+ "step": 320
265
+ },
266
+ {
267
+ "epoch": 0.42443729903536975,
268
+ "grad_norm": 0.5397221446037292,
269
+ "learning_rate": 8.418156808803301e-05,
270
+ "loss": 1.6041,
271
+ "step": 330
272
+ },
273
+ {
274
+ "epoch": 0.43729903536977494,
275
+ "grad_norm": 0.5501461029052734,
276
+ "learning_rate": 8.349381017881706e-05,
277
+ "loss": 1.6091,
278
+ "step": 340
279
+ },
280
+ {
281
+ "epoch": 0.45016077170418006,
282
+ "grad_norm": 0.47587981820106506,
283
+ "learning_rate": 8.28060522696011e-05,
284
+ "loss": 1.6008,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 0.4630225080385852,
289
+ "grad_norm": 0.46644529700279236,
290
+ "learning_rate": 8.211829436038515e-05,
291
+ "loss": 1.6081,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 0.4758842443729904,
296
+ "grad_norm": 0.5308094024658203,
297
+ "learning_rate": 8.14305364511692e-05,
298
+ "loss": 1.5987,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 0.4887459807073955,
303
+ "grad_norm": 0.5304721593856812,
304
+ "learning_rate": 8.074277854195323e-05,
305
+ "loss": 1.6173,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 0.5016077170418006,
310
+ "grad_norm": 0.6186290383338928,
311
+ "learning_rate": 8.005502063273728e-05,
312
+ "loss": 1.5879,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 0.5144694533762058,
317
+ "grad_norm": 0.4936847388744354,
318
+ "learning_rate": 7.936726272352132e-05,
319
+ "loss": 1.5771,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.5144694533762058,
324
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
325
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
326
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
327
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
328
+ "step": 400
329
+ },
330
+ {
331
+ "epoch": 0.5273311897106109,
332
+ "grad_norm": 0.4969954788684845,
333
+ "learning_rate": 7.867950481430537e-05,
334
+ "loss": 1.5873,
335
+ "step": 410
336
+ },
337
+ {
338
+ "epoch": 0.5401929260450161,
339
+ "grad_norm": 0.5539654493331909,
340
+ "learning_rate": 7.799174690508942e-05,
341
+ "loss": 1.5741,
342
+ "step": 420
343
+ },
344
+ {
345
+ "epoch": 0.5530546623794212,
346
+ "grad_norm": 0.4963805377483368,
347
+ "learning_rate": 7.730398899587345e-05,
348
+ "loss": 1.5883,
349
+ "step": 430
350
+ },
351
+ {
352
+ "epoch": 0.5659163987138264,
353
+ "grad_norm": 0.4849222004413605,
354
+ "learning_rate": 7.66162310866575e-05,
355
+ "loss": 1.6061,
356
+ "step": 440
357
+ },
358
+ {
359
+ "epoch": 0.5787781350482315,
360
+ "grad_norm": 0.5241298079490662,
361
+ "learning_rate": 7.592847317744153e-05,
362
+ "loss": 1.6118,
363
+ "step": 450
364
+ },
365
+ {
366
+ "epoch": 0.5916398713826366,
367
+ "grad_norm": 0.5051389336585999,
368
+ "learning_rate": 7.52407152682256e-05,
369
+ "loss": 1.5618,
370
+ "step": 460
371
+ },
372
+ {
373
+ "epoch": 0.6045016077170418,
374
+ "grad_norm": 0.49376770853996277,
375
+ "learning_rate": 7.455295735900963e-05,
376
+ "loss": 1.5871,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 0.617363344051447,
381
+ "grad_norm": 0.49221155047416687,
382
+ "learning_rate": 7.386519944979367e-05,
383
+ "loss": 1.6037,
384
+ "step": 480
385
+ },
386
+ {
387
+ "epoch": 0.6302250803858521,
388
+ "grad_norm": 0.5378918647766113,
389
+ "learning_rate": 7.317744154057772e-05,
390
+ "loss": 1.5523,
391
+ "step": 490
392
+ },
393
+ {
394
+ "epoch": 0.6430868167202572,
395
+ "grad_norm": 0.5564639568328857,
396
+ "learning_rate": 7.248968363136176e-05,
397
+ "loss": 1.5885,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.6430868167202572,
402
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
403
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
404
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
405
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
406
+ "step": 500
407
+ },
408
+ {
409
+ "epoch": 0.6559485530546624,
410
+ "grad_norm": 0.49083277583122253,
411
+ "learning_rate": 7.180192572214582e-05,
412
+ "loss": 1.5693,
413
+ "step": 510
414
+ },
415
+ {
416
+ "epoch": 0.6688102893890675,
417
+ "grad_norm": 0.5625829100608826,
418
+ "learning_rate": 7.111416781292985e-05,
419
+ "loss": 1.6023,
420
+ "step": 520
421
+ },
422
+ {
423
+ "epoch": 0.6816720257234726,
424
+ "grad_norm": 0.6078226566314697,
425
+ "learning_rate": 7.04264099037139e-05,
426
+ "loss": 1.5845,
427
+ "step": 530
428
+ },
429
+ {
430
+ "epoch": 0.6945337620578779,
431
+ "grad_norm": 0.48107999563217163,
432
+ "learning_rate": 6.973865199449794e-05,
433
+ "loss": 1.5682,
434
+ "step": 540
435
+ },
436
+ {
437
+ "epoch": 0.707395498392283,
438
+ "grad_norm": 0.5080347657203674,
439
+ "learning_rate": 6.905089408528198e-05,
440
+ "loss": 1.5839,
441
+ "step": 550
442
+ },
443
+ {
444
+ "epoch": 0.7202572347266881,
445
+ "grad_norm": 0.5683622360229492,
446
+ "learning_rate": 6.836313617606602e-05,
447
+ "loss": 1.5916,
448
+ "step": 560
449
+ },
450
+ {
451
+ "epoch": 0.7331189710610932,
452
+ "grad_norm": 0.4669715464115143,
453
+ "learning_rate": 6.767537826685007e-05,
454
+ "loss": 1.6146,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 0.7459807073954984,
459
+ "grad_norm": 0.4946054518222809,
460
+ "learning_rate": 6.698762035763412e-05,
461
+ "loss": 1.5764,
462
+ "step": 580
463
+ },
464
+ {
465
+ "epoch": 0.7588424437299035,
466
+ "grad_norm": 0.4975377023220062,
467
+ "learning_rate": 6.629986244841817e-05,
468
+ "loss": 1.6035,
469
+ "step": 590
470
+ },
471
+ {
472
+ "epoch": 0.7717041800643086,
473
+ "grad_norm": 0.5511853098869324,
474
+ "learning_rate": 6.56121045392022e-05,
475
+ "loss": 1.5842,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.7717041800643086,
480
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
481
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
482
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
483
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
484
+ "step": 600
485
+ },
486
+ {
487
+ "epoch": 0.7845659163987139,
488
+ "grad_norm": 0.5689719915390015,
489
+ "learning_rate": 6.492434662998625e-05,
490
+ "loss": 1.5656,
491
+ "step": 610
492
+ },
493
+ {
494
+ "epoch": 0.797427652733119,
495
+ "grad_norm": 0.48885637521743774,
496
+ "learning_rate": 6.42365887207703e-05,
497
+ "loss": 1.5605,
498
+ "step": 620
499
+ },
500
+ {
501
+ "epoch": 0.8102893890675241,
502
+ "grad_norm": 0.5316773056983948,
503
+ "learning_rate": 6.354883081155434e-05,
504
+ "loss": 1.5755,
505
+ "step": 630
506
+ },
507
+ {
508
+ "epoch": 0.8231511254019293,
509
+ "grad_norm": 0.5578161478042603,
510
+ "learning_rate": 6.286107290233837e-05,
511
+ "loss": 1.5532,
512
+ "step": 640
513
+ },
514
+ {
515
+ "epoch": 0.8360128617363344,
516
+ "grad_norm": 0.6534080505371094,
517
+ "learning_rate": 6.217331499312242e-05,
518
+ "loss": 1.5882,
519
+ "step": 650
520
+ },
521
+ {
522
+ "epoch": 0.8488745980707395,
523
+ "grad_norm": 0.5140324831008911,
524
+ "learning_rate": 6.148555708390647e-05,
525
+ "loss": 1.5598,
526
+ "step": 660
527
+ },
528
+ {
529
+ "epoch": 0.8617363344051447,
530
+ "grad_norm": 0.5247426629066467,
531
+ "learning_rate": 6.0797799174690516e-05,
532
+ "loss": 1.5833,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 0.8745980707395499,
537
+ "grad_norm": 0.49460870027542114,
538
+ "learning_rate": 6.011004126547456e-05,
539
+ "loss": 1.621,
540
+ "step": 680
541
+ },
542
+ {
543
+ "epoch": 0.887459807073955,
544
+ "grad_norm": 0.5351711511611938,
545
+ "learning_rate": 5.9422283356258604e-05,
546
+ "loss": 1.5371,
547
+ "step": 690
548
+ },
549
+ {
550
+ "epoch": 0.9003215434083601,
551
+ "grad_norm": 0.5608878135681152,
552
+ "learning_rate": 5.8734525447042644e-05,
553
+ "loss": 1.5878,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.9003215434083601,
558
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
559
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
560
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
561
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.9131832797427653,
566
+ "grad_norm": 0.48291367292404175,
567
+ "learning_rate": 5.8046767537826685e-05,
568
+ "loss": 1.583,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.9260450160771704,
573
+ "grad_norm": 0.4866442382335663,
574
+ "learning_rate": 5.7359009628610725e-05,
575
+ "loss": 1.5891,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.9389067524115756,
580
+ "grad_norm": 0.5254418253898621,
581
+ "learning_rate": 5.667125171939478e-05,
582
+ "loss": 1.5319,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.9517684887459807,
587
+ "grad_norm": 0.5201655030250549,
588
+ "learning_rate": 5.598349381017882e-05,
589
+ "loss": 1.5819,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.9646302250803859,
594
+ "grad_norm": 0.5820693969726562,
595
+ "learning_rate": 5.5295735900962866e-05,
596
+ "loss": 1.5807,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.977491961414791,
601
+ "grad_norm": 0.559010922908783,
602
+ "learning_rate": 5.460797799174691e-05,
603
+ "loss": 1.5597,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.9903536977491961,
608
+ "grad_norm": 0.498877614736557,
609
+ "learning_rate": 5.392022008253095e-05,
610
+ "loss": 1.5628,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.002572347266881,
615
+ "grad_norm": 0.5119406580924988,
616
+ "learning_rate": 5.3232462173315e-05,
617
+ "loss": 1.5693,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.015434083601286,
622
+ "grad_norm": 0.5344542860984802,
623
+ "learning_rate": 5.254470426409904e-05,
624
+ "loss": 1.5256,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.0282958199356913,
629
+ "grad_norm": 0.5358342528343201,
630
+ "learning_rate": 5.185694635488308e-05,
631
+ "loss": 1.5432,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.0282958199356913,
636
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
637
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
638
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
639
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
640
+ "step": 800
641
+ },
642
+ {
643
+ "epoch": 1.0411575562700965,
644
+ "grad_norm": 0.5941470265388489,
645
+ "learning_rate": 5.116918844566713e-05,
646
+ "loss": 1.5449,
647
+ "step": 810
648
+ },
649
+ {
650
+ "epoch": 1.0540192926045016,
651
+ "grad_norm": 0.5659182071685791,
652
+ "learning_rate": 5.048143053645117e-05,
653
+ "loss": 1.5234,
654
+ "step": 820
655
+ },
656
+ {
657
+ "epoch": 1.0668810289389068,
658
+ "grad_norm": 0.5737349390983582,
659
+ "learning_rate": 4.9793672627235217e-05,
660
+ "loss": 1.5517,
661
+ "step": 830
662
+ },
663
+ {
664
+ "epoch": 1.0797427652733118,
665
+ "grad_norm": 0.5984872579574585,
666
+ "learning_rate": 4.910591471801926e-05,
667
+ "loss": 1.5175,
668
+ "step": 840
669
+ },
670
+ {
671
+ "epoch": 1.092604501607717,
672
+ "grad_norm": 0.5954984426498413,
673
+ "learning_rate": 4.8418156808803304e-05,
674
+ "loss": 1.5738,
675
+ "step": 850
676
+ },
677
+ {
678
+ "epoch": 1.1054662379421223,
679
+ "grad_norm": 0.5545582175254822,
680
+ "learning_rate": 4.7730398899587344e-05,
681
+ "loss": 1.5538,
682
+ "step": 860
683
+ },
684
+ {
685
+ "epoch": 1.1183279742765273,
686
+ "grad_norm": 0.6972865462303162,
687
+ "learning_rate": 4.704264099037139e-05,
688
+ "loss": 1.529,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 1.1311897106109325,
693
+ "grad_norm": 0.5404506325721741,
694
+ "learning_rate": 4.635488308115544e-05,
695
+ "loss": 1.5567,
696
+ "step": 880
697
+ },
698
+ {
699
+ "epoch": 1.1440514469453376,
700
+ "grad_norm": 0.5792121887207031,
701
+ "learning_rate": 4.566712517193948e-05,
702
+ "loss": 1.5422,
703
+ "step": 890
704
+ },
705
+ {
706
+ "epoch": 1.1569131832797428,
707
+ "grad_norm": 0.5468006134033203,
708
+ "learning_rate": 4.497936726272352e-05,
709
+ "loss": 1.5369,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 1.1569131832797428,
714
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
715
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
716
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
717
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
718
+ "step": 900
719
+ },
720
+ {
721
+ "epoch": 1.1697749196141478,
722
+ "grad_norm": 0.5955344438552856,
723
+ "learning_rate": 4.429160935350757e-05,
724
+ "loss": 1.5551,
725
+ "step": 910
726
+ },
727
+ {
728
+ "epoch": 1.182636655948553,
729
+ "grad_norm": 0.5832058787345886,
730
+ "learning_rate": 4.360385144429161e-05,
731
+ "loss": 1.5568,
732
+ "step": 920
733
+ },
734
+ {
735
+ "epoch": 1.1954983922829583,
736
+ "grad_norm": 0.6309258937835693,
737
+ "learning_rate": 4.291609353507566e-05,
738
+ "loss": 1.5548,
739
+ "step": 930
740
+ },
741
+ {
742
+ "epoch": 1.2083601286173633,
743
+ "grad_norm": 0.6269820928573608,
744
+ "learning_rate": 4.22283356258597e-05,
745
+ "loss": 1.5459,
746
+ "step": 940
747
+ },
748
+ {
749
+ "epoch": 1.2212218649517685,
750
+ "grad_norm": 0.6376837491989136,
751
+ "learning_rate": 4.154057771664374e-05,
752
+ "loss": 1.5277,
753
+ "step": 950
754
+ },
755
+ {
756
+ "epoch": 1.2340836012861736,
757
+ "grad_norm": 0.6351036429405212,
758
+ "learning_rate": 4.085281980742779e-05,
759
+ "loss": 1.5273,
760
+ "step": 960
761
+ },
762
+ {
763
+ "epoch": 1.2469453376205788,
764
+ "grad_norm": 0.6877638101577759,
765
+ "learning_rate": 4.016506189821183e-05,
766
+ "loss": 1.4986,
767
+ "step": 970
768
+ },
769
+ {
770
+ "epoch": 1.2598070739549838,
771
+ "grad_norm": 0.5501726865768433,
772
+ "learning_rate": 3.947730398899587e-05,
773
+ "loss": 1.5543,
774
+ "step": 980
775
+ },
776
+ {
777
+ "epoch": 1.272668810289389,
778
+ "grad_norm": 0.5217163562774658,
779
+ "learning_rate": 3.8789546079779924e-05,
780
+ "loss": 1.5292,
781
+ "step": 990
782
+ },
783
+ {
784
+ "epoch": 1.2855305466237943,
785
+ "grad_norm": 0.5770425796508789,
786
+ "learning_rate": 3.8101788170563964e-05,
787
+ "loss": 1.5536,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 1.2855305466237943,
792
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
793
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
794
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
795
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
796
+ "step": 1000
797
+ },
798
+ {
799
+ "epoch": 1.2983922829581993,
800
+ "grad_norm": 0.5802098512649536,
801
+ "learning_rate": 3.741403026134801e-05,
802
+ "loss": 1.5479,
803
+ "step": 1010
804
+ },
805
+ {
806
+ "epoch": 1.3112540192926045,
807
+ "grad_norm": 0.5646567344665527,
808
+ "learning_rate": 3.672627235213205e-05,
809
+ "loss": 1.5183,
810
+ "step": 1020
811
+ },
812
+ {
813
+ "epoch": 1.3241157556270098,
814
+ "grad_norm": 0.5852165222167969,
815
+ "learning_rate": 3.603851444291609e-05,
816
+ "loss": 1.5267,
817
+ "step": 1030
818
+ },
819
+ {
820
+ "epoch": 1.3369774919614148,
821
+ "grad_norm": 0.5583398342132568,
822
+ "learning_rate": 3.535075653370014e-05,
823
+ "loss": 1.5401,
824
+ "step": 1040
825
+ },
826
+ {
827
+ "epoch": 1.3498392282958198,
828
+ "grad_norm": 0.5971976518630981,
829
+ "learning_rate": 3.4662998624484186e-05,
830
+ "loss": 1.5147,
831
+ "step": 1050
832
+ },
833
+ {
834
+ "epoch": 1.362700964630225,
835
+ "grad_norm": 0.6036947965621948,
836
+ "learning_rate": 3.3975240715268227e-05,
837
+ "loss": 1.5294,
838
+ "step": 1060
839
+ },
840
+ {
841
+ "epoch": 1.3755627009646303,
842
+ "grad_norm": 0.5828876495361328,
843
+ "learning_rate": 3.3287482806052274e-05,
844
+ "loss": 1.546,
845
+ "step": 1070
846
+ },
847
+ {
848
+ "epoch": 1.3884244372990353,
849
+ "grad_norm": 0.5941759943962097,
850
+ "learning_rate": 3.2599724896836314e-05,
851
+ "loss": 1.5238,
852
+ "step": 1080
853
+ },
854
+ {
855
+ "epoch": 1.4012861736334405,
856
+ "grad_norm": 0.6082496047019958,
857
+ "learning_rate": 3.1911966987620354e-05,
858
+ "loss": 1.5055,
859
+ "step": 1090
860
+ },
861
+ {
862
+ "epoch": 1.4141479099678458,
863
+ "grad_norm": 0.5749199390411377,
864
+ "learning_rate": 3.12242090784044e-05,
865
+ "loss": 1.5238,
866
+ "step": 1100
867
+ },
868
+ {
869
+ "epoch": 1.4141479099678458,
870
+ "eval_yahma/alpaca-cleaned_loss": 1.5794486999511719,
871
+ "eval_yahma/alpaca-cleaned_runtime": 62.9209,
872
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.786,
873
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
874
+ "step": 1100
875
+ },
876
+ {
877
+ "epoch": 1.4270096463022508,
878
+ "grad_norm": 0.649699330329895,
879
+ "learning_rate": 3.053645116918845e-05,
880
+ "loss": 1.5275,
881
+ "step": 1110
882
+ },
883
+ {
884
+ "epoch": 1.4398713826366558,
885
+ "grad_norm": 0.5754693150520325,
886
+ "learning_rate": 2.9848693259972492e-05,
887
+ "loss": 1.5217,
888
+ "step": 1120
889
+ },
890
+ {
891
+ "epoch": 1.452733118971061,
892
+ "grad_norm": 0.572021484375,
893
+ "learning_rate": 2.9160935350756536e-05,
894
+ "loss": 1.5489,
895
+ "step": 1130
896
+ },
897
+ {
898
+ "epoch": 1.4655948553054663,
899
+ "grad_norm": 0.6010130643844604,
900
+ "learning_rate": 2.8473177441540577e-05,
901
+ "loss": 1.5019,
902
+ "step": 1140
903
+ },
904
+ {
905
+ "epoch": 1.4784565916398713,
906
+ "grad_norm": 0.6172171831130981,
907
+ "learning_rate": 2.7785419532324624e-05,
908
+ "loss": 1.5703,
909
+ "step": 1150
910
+ },
911
+ {
912
+ "epoch": 1.4913183279742765,
913
+ "grad_norm": 0.5957326889038086,
914
+ "learning_rate": 2.7097661623108668e-05,
915
+ "loss": 1.5247,
916
+ "step": 1160
917
+ },
918
+ {
919
+ "epoch": 1.5041800643086818,
920
+ "grad_norm": 0.5608690977096558,
921
+ "learning_rate": 2.6409903713892708e-05,
922
+ "loss": 1.5403,
923
+ "step": 1170
924
+ },
925
+ {
926
+ "epoch": 1.5170418006430868,
927
+ "grad_norm": 0.5870776176452637,
928
+ "learning_rate": 2.5722145804676755e-05,
929
+ "loss": 1.5235,
930
+ "step": 1180
931
+ },
932
+ {
933
+ "epoch": 1.5299035369774918,
934
+ "grad_norm": 0.5889161229133606,
935
+ "learning_rate": 2.50343878954608e-05,
936
+ "loss": 1.5164,
937
+ "step": 1190
938
+ },
939
+ {
940
+ "epoch": 1.542765273311897,
941
+ "grad_norm": 0.6082655787467957,
942
+ "learning_rate": 2.4346629986244843e-05,
943
+ "loss": 1.5022,
944
+ "step": 1200
945
+ },
946
+ {
947
+ "epoch": 1.542765273311897,
948
+ "eval_yahma/alpaca-cleaned_loss": 1.5769098997116089,
949
+ "eval_yahma/alpaca-cleaned_runtime": 62.9228,
950
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.785,
951
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
952
+ "step": 1200
953
+ }
954
+ ],
955
+ "logging_steps": 10,
956
+ "max_steps": 1554,
957
+ "num_input_tokens_seen": 0,
958
+ "num_train_epochs": 2,
959
+ "save_steps": 200,
960
+ "stateful_callbacks": {
961
+ "TrainerControl": {
962
+ "args": {
963
+ "should_epoch_stop": false,
964
+ "should_evaluate": false,
965
+ "should_log": false,
966
+ "should_save": true,
967
+ "should_training_stop": false
968
+ },
969
+ "attributes": {}
970
+ }
971
+ },
972
+ "total_flos": 8.92217191122862e+16,
973
+ "train_batch_size": 4,
974
+ "trial_name": null,
975
+ "trial_params": null
976
+ }
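The learning_rate values in the log above are consistent with a schedule that warms up linearly from 0 to 1e-4 over the first 100 steps and then decays linearly to zero at max_steps=1554 (e.g. step 950 gives 1e-4 * (1554-950)/(1554-100) ≈ 4.154e-05, matching the log). A sketch of that inferred schedule:

```python
def lr_at(step, peak=1e-4, warmup=100, max_steps=1554):
    # Inferred from log_history: linear warmup, then linear decay to zero.
    if step <= warmup:
        return peak * step / warmup
    return peak * (max_steps - step) / (max_steps - warmup)

assert abs(lr_at(950) - 4.154057771664374e-05) < 1e-12
```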
tune_log/layerskip_1b_0.25_tune/checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
3
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:353c63c0311c5f1ca2429d17bbaee55f71b9e4479cf3c05d874b1d6490acc2bd
3
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76055a71d078a7ab14b379bb8d8ca0f5a097c62bdaa6a1ed041f8d0795a475d
3
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a2ce3ed5d78f12a31661bc5e87fdf5a10accd23348d868e7890473bb1cbdd90
3
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baba31a5e5063037a5c811de9cb04bc62c6c5f0f5fe6720b7d681afe6500d4c1
3
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac5921ca2a7080bbf354e8e211e9657d5c3188a2b7c88c6c82bb3b7f013be9ac
3
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/trainer_state.json ADDED
@@ -0,0 +1,1132 @@
1
+ {
2
+ "best_metric": 1.5736079216003418,
3
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1400",
4
+ "epoch": 1.8,
5
+ "eval_steps": 100,
6
+ "global_step": 1400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0012861736334405145,
13
+ "grad_norm": 0.39783015847206116,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 2.0835,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.012861736334405145,
20
+ "grad_norm": 0.45549583435058594,
21
+ "learning_rate": 1e-05,
22
+ "loss": 2.1408,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.02572347266881029,
27
+ "grad_norm": 0.4594053626060486,
28
+ "learning_rate": 2e-05,
29
+ "loss": 2.0894,
30
+ "step": 20
31
+ },
32
+ {
33
+ "epoch": 0.03858520900321544,
34
+ "grad_norm": 0.49020764231681824,
35
+ "learning_rate": 3e-05,
36
+ "loss": 2.1037,
37
+ "step": 30
38
+ },
39
+ {
40
+ "epoch": 0.05144694533762058,
41
+ "grad_norm": 0.37993305921554565,
42
+ "learning_rate": 4e-05,
43
+ "loss": 1.9716,
44
+ "step": 40
45
+ },
46
+ {
47
+ "epoch": 0.06430868167202572,
48
+ "grad_norm": 0.38231977820396423,
49
+ "learning_rate": 5e-05,
50
+ "loss": 1.9349,
51
+ "step": 50
52
+ },
53
+ {
54
+ "epoch": 0.07717041800643087,
55
+ "grad_norm": 0.2922589182853699,
56
+ "learning_rate": 6e-05,
57
+ "loss": 1.906,
58
+ "step": 60
59
+ },
60
+ {
61
+ "epoch": 0.09003215434083602,
62
+ "grad_norm": 0.34647658467292786,
63
+ "learning_rate": 7e-05,
64
+ "loss": 1.8246,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.10289389067524116,
69
+ "grad_norm": 0.31930026412010193,
70
+ "learning_rate": 8e-05,
71
+ "loss": 1.8057,
72
+ "step": 80
73
+ },
74
+ {
75
+ "epoch": 0.1157556270096463,
76
+ "grad_norm": 0.34028756618499756,
77
+ "learning_rate": 9e-05,
78
+ "loss": 1.7546,
79
+ "step": 90
80
+ },
81
+ {
82
+ "epoch": 0.12861736334405144,
83
+ "grad_norm": 0.3878991901874542,
84
+ "learning_rate": 0.0001,
85
+ "loss": 1.7543,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.12861736334405144,
90
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
91
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
92
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
93
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
94
+ "step": 100
95
+ },
96
+ {
97
+ "epoch": 0.1414790996784566,
98
+ "grad_norm": 0.35599613189697266,
99
+ "learning_rate": 9.931224209078405e-05,
100
+ "loss": 1.7309,
101
+ "step": 110
102
+ },
103
+ {
104
+ "epoch": 0.15434083601286175,
105
+ "grad_norm": 0.4075644016265869,
106
+ "learning_rate": 9.862448418156809e-05,
107
+ "loss": 1.6981,
108
+ "step": 120
109
+ },
110
+ {
111
+ "epoch": 0.16720257234726688,
112
+ "grad_norm": 0.4743317663669586,
113
+ "learning_rate": 9.793672627235215e-05,
114
+ "loss": 1.7011,
115
+ "step": 130
116
+ },
117
+ {
118
+ "epoch": 0.18006430868167203,
119
+ "grad_norm": 0.4701610505580902,
120
+ "learning_rate": 9.724896836313618e-05,
121
+ "loss": 1.6771,
122
+ "step": 140
123
+ },
124
+ {
125
+ "epoch": 0.19292604501607716,
126
+ "grad_norm": 0.49115318059921265,
127
+ "learning_rate": 9.656121045392023e-05,
128
+ "loss": 1.6633,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.2057877813504823,
133
+ "grad_norm": 0.5177980661392212,
134
+ "learning_rate": 9.587345254470427e-05,
135
+ "loss": 1.6706,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 0.21864951768488747,
140
+ "grad_norm": 0.465657114982605,
141
+ "learning_rate": 9.518569463548831e-05,
142
+ "loss": 1.6677,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.2315112540192926,
147
+ "grad_norm": 0.5453551411628723,
148
+ "learning_rate": 9.449793672627235e-05,
149
+ "loss": 1.6656,
150
+ "step": 180
151
+ },
152
+ {
153
+ "epoch": 0.24437299035369775,
154
+ "grad_norm": 0.4150402545928955,
155
+ "learning_rate": 9.38101788170564e-05,
156
+ "loss": 1.6568,
157
+ "step": 190
158
+ },
159
+ {
160
+ "epoch": 0.2572347266881029,
161
+ "grad_norm": 0.5106223225593567,
162
+ "learning_rate": 9.312242090784045e-05,
163
+ "loss": 1.6804,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.2572347266881029,
168
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
169
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
170
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
171
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
172
+ "step": 200
173
+ },
174
+ {
175
+ "epoch": 0.27009646302250806,
176
+ "grad_norm": 0.47371965646743774,
177
+ "learning_rate": 9.243466299862448e-05,
178
+ "loss": 1.6235,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.2829581993569132,
183
+ "grad_norm": 0.45723679661750793,
184
+ "learning_rate": 9.174690508940853e-05,
185
+ "loss": 1.6192,
186
+ "step": 220
187
+ },
188
+ {
189
+ "epoch": 0.2958199356913183,
190
+ "grad_norm": 0.46727871894836426,
191
+ "learning_rate": 9.105914718019258e-05,
192
+ "loss": 1.6129,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.3086816720257235,
197
+ "grad_norm": 0.5216034054756165,
198
+ "learning_rate": 9.037138927097662e-05,
199
+ "loss": 1.6065,
200
+ "step": 240
201
+ },
202
+ {
203
+ "epoch": 0.3215434083601286,
204
+ "grad_norm": 0.46132415533065796,
205
+ "learning_rate": 8.968363136176067e-05,
206
+ "loss": 1.6374,
207
+ "step": 250
208
+ },
209
+ {
210
+ "epoch": 0.33440514469453375,
211
+ "grad_norm": 0.5699637532234192,
212
+ "learning_rate": 8.89958734525447e-05,
213
+ "loss": 1.6031,
214
+ "step": 260
215
+ },
216
+ {
217
+ "epoch": 0.34726688102893893,
218
+ "grad_norm": 0.46537184715270996,
219
+ "learning_rate": 8.830811554332875e-05,
220
+ "loss": 1.6196,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.36012861736334406,
225
+ "grad_norm": 0.5034765005111694,
226
+ "learning_rate": 8.76203576341128e-05,
227
+ "loss": 1.6257,
228
+ "step": 280
229
+ },
230
+ {
231
+ "epoch": 0.3729903536977492,
232
+ "grad_norm": 0.48885518312454224,
233
+ "learning_rate": 8.693259972489685e-05,
234
+ "loss": 1.6195,
235
+ "step": 290
236
+ },
237
+ {
238
+ "epoch": 0.3858520900321543,
239
+ "grad_norm": 0.48295891284942627,
240
+ "learning_rate": 8.62448418156809e-05,
241
+ "loss": 1.6301,
242
+ "step": 300
243
+ },
244
+ {
245
+ "epoch": 0.3858520900321543,
246
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
247
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
248
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
249
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
250
+ "step": 300
251
+ },
252
+ {
253
+ "epoch": 0.3987138263665595,
254
+ "grad_norm": 0.4800078570842743,
255
+ "learning_rate": 8.555708390646493e-05,
256
+ "loss": 1.6171,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.4115755627009646,
261
+ "grad_norm": 0.47452959418296814,
262
+ "learning_rate": 8.486932599724897e-05,
263
+ "loss": 1.6147,
264
+ "step": 320
265
+ },
266
+ {
267
+ "epoch": 0.42443729903536975,
268
+ "grad_norm": 0.5397221446037292,
269
+ "learning_rate": 8.418156808803301e-05,
270
+ "loss": 1.6041,
271
+ "step": 330
272
+ },
273
+ {
274
+ "epoch": 0.43729903536977494,
275
+ "grad_norm": 0.5501461029052734,
276
+ "learning_rate": 8.349381017881706e-05,
277
+ "loss": 1.6091,
278
+ "step": 340
279
+ },
280
+ {
281
+ "epoch": 0.45016077170418006,
282
+ "grad_norm": 0.47587981820106506,
283
+ "learning_rate": 8.28060522696011e-05,
284
+ "loss": 1.6008,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 0.4630225080385852,
289
+ "grad_norm": 0.46644529700279236,
290
+ "learning_rate": 8.211829436038515e-05,
291
+ "loss": 1.6081,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 0.4758842443729904,
296
+ "grad_norm": 0.5308094024658203,
297
+ "learning_rate": 8.14305364511692e-05,
298
+ "loss": 1.5987,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 0.4887459807073955,
303
+ "grad_norm": 0.5304721593856812,
304
+ "learning_rate": 8.074277854195323e-05,
305
+ "loss": 1.6173,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 0.5016077170418006,
310
+ "grad_norm": 0.6186290383338928,
311
+ "learning_rate": 8.005502063273728e-05,
312
+ "loss": 1.5879,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 0.5144694533762058,
317
+ "grad_norm": 0.4936847388744354,
318
+ "learning_rate": 7.936726272352132e-05,
319
+ "loss": 1.5771,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 0.5144694533762058,
324
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
325
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
326
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
327
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
328
+ "step": 400
329
+ },
330
+ {
331
+ "epoch": 0.5273311897106109,
332
+ "grad_norm": 0.4969954788684845,
333
+ "learning_rate": 7.867950481430537e-05,
334
+ "loss": 1.5873,
335
+ "step": 410
336
+ },
337
+ {
338
+ "epoch": 0.5401929260450161,
339
+ "grad_norm": 0.5539654493331909,
340
+ "learning_rate": 7.799174690508942e-05,
341
+ "loss": 1.5741,
342
+ "step": 420
343
+ },
344
+ {
345
+ "epoch": 0.5530546623794212,
346
+ "grad_norm": 0.4963805377483368,
347
+ "learning_rate": 7.730398899587345e-05,
348
+ "loss": 1.5883,
349
+ "step": 430
350
+ },
351
+ {
352
+ "epoch": 0.5659163987138264,
353
+ "grad_norm": 0.4849222004413605,
354
+ "learning_rate": 7.66162310866575e-05,
355
+ "loss": 1.6061,
356
+ "step": 440
357
+ },
358
+ {
359
+ "epoch": 0.5787781350482315,
360
+ "grad_norm": 0.5241298079490662,
361
+ "learning_rate": 7.592847317744153e-05,
362
+ "loss": 1.6118,
363
+ "step": 450
364
+ },
365
+ {
366
+ "epoch": 0.5916398713826366,
367
+ "grad_norm": 0.5051389336585999,
368
+ "learning_rate": 7.52407152682256e-05,
369
+ "loss": 1.5618,
370
+ "step": 460
371
+ },
372
+ {
373
+ "epoch": 0.6045016077170418,
374
+ "grad_norm": 0.49376770853996277,
375
+ "learning_rate": 7.455295735900963e-05,
376
+ "loss": 1.5871,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 0.617363344051447,
381
+ "grad_norm": 0.49221155047416687,
382
+ "learning_rate": 7.386519944979367e-05,
383
+ "loss": 1.6037,
384
+ "step": 480
385
+ },
386
+ {
387
+ "epoch": 0.6302250803858521,
388
+ "grad_norm": 0.5378918647766113,
389
+ "learning_rate": 7.317744154057772e-05,
390
+ "loss": 1.5523,
391
+ "step": 490
392
+ },
393
+ {
394
+ "epoch": 0.6430868167202572,
395
+ "grad_norm": 0.5564639568328857,
396
+ "learning_rate": 7.248968363136176e-05,
397
+ "loss": 1.5885,
398
+ "step": 500
399
+ },
400
+ {
401
+ "epoch": 0.6430868167202572,
402
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
403
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
404
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
405
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
406
+ "step": 500
407
+ },
408
+ {
409
+ "epoch": 0.6559485530546624,
410
+ "grad_norm": 0.49083277583122253,
411
+ "learning_rate": 7.180192572214582e-05,
412
+ "loss": 1.5693,
413
+ "step": 510
414
+ },
415
+ {
416
+ "epoch": 0.6688102893890675,
417
+ "grad_norm": 0.5625829100608826,
418
+ "learning_rate": 7.111416781292985e-05,
419
+ "loss": 1.6023,
420
+ "step": 520
421
+ },
422
+ {
423
+ "epoch": 0.6816720257234726,
424
+ "grad_norm": 0.6078226566314697,
425
+ "learning_rate": 7.04264099037139e-05,
426
+ "loss": 1.5845,
427
+ "step": 530
428
+ },
429
+ {
430
+ "epoch": 0.6945337620578779,
431
+ "grad_norm": 0.48107999563217163,
432
+ "learning_rate": 6.973865199449794e-05,
433
+ "loss": 1.5682,
434
+ "step": 540
435
+ },
436
+ {
437
+ "epoch": 0.707395498392283,
438
+ "grad_norm": 0.5080347657203674,
439
+ "learning_rate": 6.905089408528198e-05,
440
+ "loss": 1.5839,
441
+ "step": 550
442
+ },
443
+ {
444
+ "epoch": 0.7202572347266881,
445
+ "grad_norm": 0.5683622360229492,
446
+ "learning_rate": 6.836313617606602e-05,
447
+ "loss": 1.5916,
448
+ "step": 560
449
+ },
450
+ {
451
+ "epoch": 0.7331189710610932,
452
+ "grad_norm": 0.4669715464115143,
453
+ "learning_rate": 6.767537826685007e-05,
454
+ "loss": 1.6146,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 0.7459807073954984,
459
+ "grad_norm": 0.4946054518222809,
460
+ "learning_rate": 6.698762035763412e-05,
461
+ "loss": 1.5764,
462
+ "step": 580
463
+ },
464
+ {
465
+ "epoch": 0.7588424437299035,
466
+ "grad_norm": 0.4975377023220062,
467
+ "learning_rate": 6.629986244841817e-05,
468
+ "loss": 1.6035,
469
+ "step": 590
470
+ },
471
+ {
472
+ "epoch": 0.7717041800643086,
473
+ "grad_norm": 0.5511853098869324,
474
+ "learning_rate": 6.56121045392022e-05,
475
+ "loss": 1.5842,
476
+ "step": 600
477
+ },
478
+ {
479
+ "epoch": 0.7717041800643086,
480
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
481
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
482
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
483
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
484
+ "step": 600
485
+ },
486
+ {
487
+ "epoch": 0.7845659163987139,
488
+ "grad_norm": 0.5689719915390015,
489
+ "learning_rate": 6.492434662998625e-05,
490
+ "loss": 1.5656,
491
+ "step": 610
492
+ },
493
+ {
494
+ "epoch": 0.797427652733119,
495
+ "grad_norm": 0.48885637521743774,
496
+ "learning_rate": 6.42365887207703e-05,
497
+ "loss": 1.5605,
498
+ "step": 620
499
+ },
500
+ {
501
+ "epoch": 0.8102893890675241,
502
+ "grad_norm": 0.5316773056983948,
503
+ "learning_rate": 6.354883081155434e-05,
504
+ "loss": 1.5755,
505
+ "step": 630
506
+ },
507
+ {
508
+ "epoch": 0.8231511254019293,
509
+ "grad_norm": 0.5578161478042603,
510
+ "learning_rate": 6.286107290233837e-05,
511
+ "loss": 1.5532,
512
+ "step": 640
513
+ },
514
+ {
515
+ "epoch": 0.8360128617363344,
516
+ "grad_norm": 0.6534080505371094,
517
+ "learning_rate": 6.217331499312242e-05,
518
+ "loss": 1.5882,
519
+ "step": 650
520
+ },
521
+ {
522
+ "epoch": 0.8488745980707395,
523
+ "grad_norm": 0.5140324831008911,
524
+ "learning_rate": 6.148555708390647e-05,
525
+ "loss": 1.5598,
526
+ "step": 660
527
+ },
528
+ {
529
+ "epoch": 0.8617363344051447,
530
+ "grad_norm": 0.5247426629066467,
531
+ "learning_rate": 6.0797799174690516e-05,
532
+ "loss": 1.5833,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 0.8745980707395499,
537
+ "grad_norm": 0.49460870027542114,
538
+ "learning_rate": 6.011004126547456e-05,
539
+ "loss": 1.621,
540
+ "step": 680
541
+ },
542
+ {
543
+ "epoch": 0.887459807073955,
544
+ "grad_norm": 0.5351711511611938,
545
+ "learning_rate": 5.9422283356258604e-05,
546
+ "loss": 1.5371,
547
+ "step": 690
548
+ },
549
+ {
550
+ "epoch": 0.9003215434083601,
551
+ "grad_norm": 0.5608878135681152,
552
+ "learning_rate": 5.8734525447042644e-05,
553
+ "loss": 1.5878,
554
+ "step": 700
555
+ },
556
+ {
557
+ "epoch": 0.9003215434083601,
558
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
559
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
560
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
561
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 0.9131832797427653,
566
+ "grad_norm": 0.48291367292404175,
567
+ "learning_rate": 5.8046767537826685e-05,
568
+ "loss": 1.583,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 0.9260450160771704,
573
+ "grad_norm": 0.4866442382335663,
574
+ "learning_rate": 5.7359009628610725e-05,
575
+ "loss": 1.5891,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 0.9389067524115756,
580
+ "grad_norm": 0.5254418253898621,
581
+ "learning_rate": 5.667125171939478e-05,
582
+ "loss": 1.5319,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 0.9517684887459807,
587
+ "grad_norm": 0.5201655030250549,
588
+ "learning_rate": 5.598349381017882e-05,
589
+ "loss": 1.5819,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 0.9646302250803859,
594
+ "grad_norm": 0.5820693969726562,
595
+ "learning_rate": 5.5295735900962866e-05,
596
+ "loss": 1.5807,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 0.977491961414791,
601
+ "grad_norm": 0.559010922908783,
602
+ "learning_rate": 5.460797799174691e-05,
603
+ "loss": 1.5597,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 0.9903536977491961,
608
+ "grad_norm": 0.498877614736557,
609
+ "learning_rate": 5.392022008253095e-05,
610
+ "loss": 1.5628,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.002572347266881,
615
+ "grad_norm": 0.5119406580924988,
616
+ "learning_rate": 5.3232462173315e-05,
617
+ "loss": 1.5693,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.015434083601286,
622
+ "grad_norm": 0.5344542860984802,
623
+ "learning_rate": 5.254470426409904e-05,
624
+ "loss": 1.5256,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.0282958199356913,
629
+ "grad_norm": 0.5358342528343201,
630
+ "learning_rate": 5.185694635488308e-05,
631
+ "loss": 1.5432,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.0282958199356913,
636
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
637
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
638
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
639
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
640
+ "step": 800
641
+ },
642
+ {
643
+ "epoch": 1.0411575562700965,
644
+ "grad_norm": 0.5941470265388489,
645
+ "learning_rate": 5.116918844566713e-05,
646
+ "loss": 1.5449,
647
+ "step": 810
648
+ },
649
+ {
650
+ "epoch": 1.0540192926045016,
651
+ "grad_norm": 0.5659182071685791,
652
+ "learning_rate": 5.048143053645117e-05,
653
+ "loss": 1.5234,
654
+ "step": 820
655
+ },
656
+ {
657
+ "epoch": 1.0668810289389068,
658
+ "grad_norm": 0.5737349390983582,
659
+ "learning_rate": 4.9793672627235217e-05,
660
+ "loss": 1.5517,
661
+ "step": 830
662
+ },
663
+ {
664
+ "epoch": 1.0797427652733118,
665
+ "grad_norm": 0.5984872579574585,
666
+ "learning_rate": 4.910591471801926e-05,
667
+ "loss": 1.5175,
668
+ "step": 840
669
+ },
670
+ {
671
+ "epoch": 1.092604501607717,
672
+ "grad_norm": 0.5954984426498413,
673
+ "learning_rate": 4.8418156808803304e-05,
674
+ "loss": 1.5738,
675
+ "step": 850
676
+ },
677
+ {
678
+ "epoch": 1.1054662379421223,
679
+ "grad_norm": 0.5545582175254822,
680
+ "learning_rate": 4.7730398899587344e-05,
681
+ "loss": 1.5538,
682
+ "step": 860
683
+ },
684
+ {
685
+ "epoch": 1.1183279742765273,
686
+ "grad_norm": 0.6972865462303162,
687
+ "learning_rate": 4.704264099037139e-05,
688
+ "loss": 1.529,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 1.1311897106109325,
693
+ "grad_norm": 0.5404506325721741,
694
+ "learning_rate": 4.635488308115544e-05,
695
+ "loss": 1.5567,
696
+ "step": 880
697
+ },
698
+ {
699
+ "epoch": 1.1440514469453376,
700
+ "grad_norm": 0.5792121887207031,
701
+ "learning_rate": 4.566712517193948e-05,
702
+ "loss": 1.5422,
703
+ "step": 890
704
+ },
705
+ {
706
+ "epoch": 1.1569131832797428,
707
+ "grad_norm": 0.5468006134033203,
708
+ "learning_rate": 4.497936726272352e-05,
709
+ "loss": 1.5369,
710
+ "step": 900
711
+ },
712
+ {
713
+ "epoch": 1.1569131832797428,
714
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
715
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
716
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
717
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
718
+ "step": 900
719
+ },
720
+ {
721
+ "epoch": 1.1697749196141478,
722
+ "grad_norm": 0.5955344438552856,
723
+ "learning_rate": 4.429160935350757e-05,
724
+ "loss": 1.5551,
725
+ "step": 910
726
+ },
727
+ {
728
+ "epoch": 1.182636655948553,
729
+ "grad_norm": 0.5832058787345886,
730
+ "learning_rate": 4.360385144429161e-05,
731
+ "loss": 1.5568,
732
+ "step": 920
733
+ },
734
+ {
735
+ "epoch": 1.1954983922829583,
736
+ "grad_norm": 0.6309258937835693,
737
+ "learning_rate": 4.291609353507566e-05,
738
+ "loss": 1.5548,
739
+ "step": 930
740
+ },
741
+ {
742
+ "epoch": 1.2083601286173633,
743
+ "grad_norm": 0.6269820928573608,
744
+ "learning_rate": 4.22283356258597e-05,
745
+ "loss": 1.5459,
746
+ "step": 940
747
+ },
748
+ {
749
+ "epoch": 1.2212218649517685,
750
+ "grad_norm": 0.6376837491989136,
751
+ "learning_rate": 4.154057771664374e-05,
752
+ "loss": 1.5277,
753
+ "step": 950
754
+ },
755
+ {
756
+ "epoch": 1.2340836012861736,
757
+ "grad_norm": 0.6351036429405212,
758
+ "learning_rate": 4.085281980742779e-05,
759
+ "loss": 1.5273,
760
+ "step": 960
761
+ },
762
+ {
763
+ "epoch": 1.2469453376205788,
764
+ "grad_norm": 0.6877638101577759,
765
+ "learning_rate": 4.016506189821183e-05,
766
+ "loss": 1.4986,
767
+ "step": 970
768
+ },
769
+ {
770
+ "epoch": 1.2598070739549838,
771
+ "grad_norm": 0.5501726865768433,
772
+ "learning_rate": 3.947730398899587e-05,
773
+ "loss": 1.5543,
774
+ "step": 980
775
+ },
776
+ {
777
+ "epoch": 1.272668810289389,
778
+ "grad_norm": 0.5217163562774658,
779
+ "learning_rate": 3.8789546079779924e-05,
780
+ "loss": 1.5292,
781
+ "step": 990
782
+ },
783
+ {
784
+ "epoch": 1.2855305466237943,
785
+ "grad_norm": 0.5770425796508789,
786
+ "learning_rate": 3.8101788170563964e-05,
787
+ "loss": 1.5536,
788
+ "step": 1000
789
+ },
790
+ {
791
+ "epoch": 1.2855305466237943,
792
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
793
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
794
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
795
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
796
+ "step": 1000
797
+ },
798
+ {
799
+ "epoch": 1.2983922829581993,
800
+ "grad_norm": 0.5802098512649536,
801
+ "learning_rate": 3.741403026134801e-05,
802
+ "loss": 1.5479,
803
+ "step": 1010
804
+ },
805
+ {
806
+ "epoch": 1.3112540192926045,
807
+ "grad_norm": 0.5646567344665527,
808
+ "learning_rate": 3.672627235213205e-05,
809
+ "loss": 1.5183,
810
+ "step": 1020
811
+ },
812
+ {
813
+ "epoch": 1.3241157556270098,
814
+ "grad_norm": 0.5852165222167969,
815
+ "learning_rate": 3.603851444291609e-05,
816
+ "loss": 1.5267,
817
+ "step": 1030
818
+ },
819
+ {
820
+ "epoch": 1.3369774919614148,
821
+ "grad_norm": 0.5583398342132568,
822
+ "learning_rate": 3.535075653370014e-05,
823
+ "loss": 1.5401,
824
+ "step": 1040
825
+ },
826
+ {
827
+ "epoch": 1.3498392282958198,
828
+ "grad_norm": 0.5971976518630981,
829
+ "learning_rate": 3.4662998624484186e-05,
830
+ "loss": 1.5147,
831
+ "step": 1050
+ },
+ {
+ "epoch": 1.362700964630225,
+ "grad_norm": 0.6036947965621948,
+ "learning_rate": 3.3975240715268227e-05,
+ "loss": 1.5294,
+ "step": 1060
+ },
+ {
+ "epoch": 1.3755627009646303,
+ "grad_norm": 0.5828876495361328,
+ "learning_rate": 3.3287482806052274e-05,
+ "loss": 1.546,
+ "step": 1070
+ },
+ {
+ "epoch": 1.3884244372990353,
+ "grad_norm": 0.5941759943962097,
+ "learning_rate": 3.2599724896836314e-05,
+ "loss": 1.5238,
+ "step": 1080
+ },
+ {
+ "epoch": 1.4012861736334405,
+ "grad_norm": 0.6082496047019958,
+ "learning_rate": 3.1911966987620354e-05,
+ "loss": 1.5055,
+ "step": 1090
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "grad_norm": 0.5749199390411377,
+ "learning_rate": 3.12242090784044e-05,
+ "loss": 1.5238,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "eval_yahma/alpaca-cleaned_loss": 1.5794486999511719,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9209,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.786,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4270096463022508,
+ "grad_norm": 0.649699330329895,
+ "learning_rate": 3.053645116918845e-05,
+ "loss": 1.5275,
+ "step": 1110
+ },
+ {
+ "epoch": 1.4398713826366558,
+ "grad_norm": 0.5754693150520325,
+ "learning_rate": 2.9848693259972492e-05,
+ "loss": 1.5217,
+ "step": 1120
+ },
+ {
+ "epoch": 1.452733118971061,
+ "grad_norm": 0.572021484375,
+ "learning_rate": 2.9160935350756536e-05,
+ "loss": 1.5489,
+ "step": 1130
+ },
+ {
+ "epoch": 1.4655948553054663,
+ "grad_norm": 0.6010130643844604,
+ "learning_rate": 2.8473177441540577e-05,
+ "loss": 1.5019,
+ "step": 1140
+ },
+ {
+ "epoch": 1.4784565916398713,
+ "grad_norm": 0.6172171831130981,
+ "learning_rate": 2.7785419532324624e-05,
+ "loss": 1.5703,
+ "step": 1150
+ },
+ {
+ "epoch": 1.4913183279742765,
+ "grad_norm": 0.5957326889038086,
+ "learning_rate": 2.7097661623108668e-05,
+ "loss": 1.5247,
+ "step": 1160
+ },
+ {
+ "epoch": 1.5041800643086818,
+ "grad_norm": 0.5608690977096558,
+ "learning_rate": 2.6409903713892708e-05,
+ "loss": 1.5403,
+ "step": 1170
+ },
+ {
+ "epoch": 1.5170418006430868,
+ "grad_norm": 0.5870776176452637,
+ "learning_rate": 2.5722145804676755e-05,
+ "loss": 1.5235,
+ "step": 1180
+ },
+ {
+ "epoch": 1.5299035369774918,
+ "grad_norm": 0.5889161229133606,
+ "learning_rate": 2.50343878954608e-05,
+ "loss": 1.5164,
+ "step": 1190
+ },
+ {
+ "epoch": 1.542765273311897,
+ "grad_norm": 0.6082655787467957,
+ "learning_rate": 2.4346629986244843e-05,
+ "loss": 1.5022,
+ "step": 1200
+ },
+ {
+ "epoch": 1.542765273311897,
+ "eval_yahma/alpaca-cleaned_loss": 1.5769098997116089,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9228,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.785,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1200
+ },
+ {
+ "epoch": 1.5556270096463023,
+ "grad_norm": 0.6997891664505005,
+ "learning_rate": 2.3658872077028886e-05,
+ "loss": 1.5197,
+ "step": 1210
+ },
+ {
+ "epoch": 1.5684887459807073,
+ "grad_norm": 0.6935648918151855,
+ "learning_rate": 2.2971114167812934e-05,
+ "loss": 1.5391,
+ "step": 1220
+ },
+ {
+ "epoch": 1.5813504823151125,
+ "grad_norm": 0.6135308742523193,
+ "learning_rate": 2.2283356258596974e-05,
+ "loss": 1.5238,
+ "step": 1230
+ },
+ {
+ "epoch": 1.5942122186495178,
+ "grad_norm": 0.5835321545600891,
+ "learning_rate": 2.1595598349381018e-05,
+ "loss": 1.5767,
+ "step": 1240
+ },
+ {
+ "epoch": 1.6070739549839228,
+ "grad_norm": 0.6089451313018799,
+ "learning_rate": 2.0907840440165065e-05,
+ "loss": 1.535,
+ "step": 1250
+ },
+ {
+ "epoch": 1.6199356913183278,
+ "grad_norm": 0.5886595249176025,
+ "learning_rate": 2.022008253094911e-05,
+ "loss": 1.5133,
+ "step": 1260
+ },
+ {
+ "epoch": 1.6327974276527333,
+ "grad_norm": 0.6229696273803711,
+ "learning_rate": 1.953232462173315e-05,
+ "loss": 1.5313,
+ "step": 1270
+ },
+ {
+ "epoch": 1.6456591639871383,
+ "grad_norm": 0.60906583070755,
+ "learning_rate": 1.8844566712517196e-05,
+ "loss": 1.5152,
+ "step": 1280
+ },
+ {
+ "epoch": 1.6585209003215433,
+ "grad_norm": 0.5806885957717896,
+ "learning_rate": 1.815680880330124e-05,
+ "loss": 1.5468,
+ "step": 1290
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "grad_norm": 0.6111522316932678,
+ "learning_rate": 1.746905089408528e-05,
+ "loss": 1.544,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "eval_yahma/alpaca-cleaned_loss": 1.574813961982727,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9178,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6842443729903538,
+ "grad_norm": 0.5954424738883972,
+ "learning_rate": 1.6781292984869327e-05,
+ "loss": 1.5253,
+ "step": 1310
+ },
+ {
+ "epoch": 1.6971061093247588,
+ "grad_norm": 0.5995926856994629,
+ "learning_rate": 1.609353507565337e-05,
+ "loss": 1.5306,
+ "step": 1320
+ },
+ {
+ "epoch": 1.7099678456591638,
+ "grad_norm": 0.6193538308143616,
+ "learning_rate": 1.5405777166437415e-05,
+ "loss": 1.5344,
+ "step": 1330
+ },
+ {
+ "epoch": 1.7228295819935693,
+ "grad_norm": 0.596823513507843,
+ "learning_rate": 1.4718019257221457e-05,
+ "loss": 1.5561,
+ "step": 1340
+ },
+ {
+ "epoch": 1.7356913183279743,
+ "grad_norm": 0.658667266368866,
+ "learning_rate": 1.4030261348005502e-05,
+ "loss": 1.5158,
+ "step": 1350
+ },
+ {
+ "epoch": 1.7485530546623793,
+ "grad_norm": 0.643640398979187,
+ "learning_rate": 1.3342503438789546e-05,
+ "loss": 1.5412,
+ "step": 1360
+ },
+ {
+ "epoch": 1.7614147909967846,
+ "grad_norm": 0.6444098353385925,
+ "learning_rate": 1.2654745529573592e-05,
+ "loss": 1.5098,
+ "step": 1370
+ },
+ {
+ "epoch": 1.7742765273311898,
+ "grad_norm": 0.518659234046936,
+ "learning_rate": 1.1966987620357635e-05,
+ "loss": 1.5418,
+ "step": 1380
+ },
+ {
+ "epoch": 1.7871382636655948,
+ "grad_norm": 0.5826813578605652,
+ "learning_rate": 1.127922971114168e-05,
+ "loss": 1.5204,
+ "step": 1390
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 0.6658011674880981,
+ "learning_rate": 1.0591471801925723e-05,
+ "loss": 1.5511,
+ "step": 1400
+ },
+ {
+ "epoch": 1.8,
+ "eval_yahma/alpaca-cleaned_loss": 1.5736079216003418,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9144,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.789,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.974,
+ "step": 1400
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1554,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0407960239485747e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
tune_log/layerskip_1b_0.25_tune/checkpoint-1400/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebbdebe9dc37a4369f696bea5b64f89724529d824e49617f533e96962dbd5086
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:404b61918c973378dba2b4f9cd535fe8a301b95827dc90432aafea54870738db
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09933d2e1b8b7beafbf07ef8f20e61fc76608c66afbd496e8e3a5c7e934bb8f7
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f82069323487bff77227998f1defb2bb51d88b6e63100f619a2706217653b27d
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9817f8edc9cb5b39db981ecb240ea5a2cfbe7c3cb37093dba74fbe7c5aa21fa
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/trainer_state.json ADDED
@@ -0,0 +1,1245 @@
+ {
+ "best_metric": 1.5721148252487183,
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-1500",
+ "epoch": 1.9980707395498394,
+ "eval_steps": 100,
+ "global_step": 1554,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0012861736334405145,
+ "grad_norm": 0.39783015847206116,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 2.0835,
+ "step": 1
+ },
+ {
+ "epoch": 0.012861736334405145,
+ "grad_norm": 0.45549583435058594,
+ "learning_rate": 1e-05,
+ "loss": 2.1408,
+ "step": 10
+ },
+ {
+ "epoch": 0.02572347266881029,
+ "grad_norm": 0.4594053626060486,
+ "learning_rate": 2e-05,
+ "loss": 2.0894,
+ "step": 20
+ },
+ {
+ "epoch": 0.03858520900321544,
+ "grad_norm": 0.49020764231681824,
+ "learning_rate": 3e-05,
+ "loss": 2.1037,
+ "step": 30
+ },
+ {
+ "epoch": 0.05144694533762058,
+ "grad_norm": 0.37993305921554565,
+ "learning_rate": 4e-05,
+ "loss": 1.9716,
+ "step": 40
+ },
+ {
+ "epoch": 0.06430868167202572,
+ "grad_norm": 0.38231977820396423,
+ "learning_rate": 5e-05,
+ "loss": 1.9349,
+ "step": 50
+ },
+ {
+ "epoch": 0.07717041800643087,
+ "grad_norm": 0.2922589182853699,
+ "learning_rate": 6e-05,
+ "loss": 1.906,
+ "step": 60
+ },
+ {
+ "epoch": 0.09003215434083602,
+ "grad_norm": 0.34647658467292786,
+ "learning_rate": 7e-05,
+ "loss": 1.8246,
+ "step": 70
+ },
+ {
+ "epoch": 0.10289389067524116,
+ "grad_norm": 0.31930026412010193,
+ "learning_rate": 8e-05,
+ "loss": 1.8057,
+ "step": 80
+ },
+ {
+ "epoch": 0.1157556270096463,
+ "grad_norm": 0.34028756618499756,
+ "learning_rate": 9e-05,
+ "loss": 1.7546,
+ "step": 90
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "grad_norm": 0.3878991901874542,
+ "learning_rate": 0.0001,
+ "loss": 1.7543,
+ "step": 100
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
+ "step": 100
+ },
+ {
+ "epoch": 0.1414790996784566,
+ "grad_norm": 0.35599613189697266,
+ "learning_rate": 9.931224209078405e-05,
+ "loss": 1.7309,
+ "step": 110
+ },
+ {
+ "epoch": 0.15434083601286175,
+ "grad_norm": 0.4075644016265869,
+ "learning_rate": 9.862448418156809e-05,
+ "loss": 1.6981,
+ "step": 120
+ },
+ {
+ "epoch": 0.16720257234726688,
+ "grad_norm": 0.4743317663669586,
+ "learning_rate": 9.793672627235215e-05,
+ "loss": 1.7011,
+ "step": 130
+ },
+ {
+ "epoch": 0.18006430868167203,
+ "grad_norm": 0.4701610505580902,
+ "learning_rate": 9.724896836313618e-05,
+ "loss": 1.6771,
+ "step": 140
+ },
+ {
+ "epoch": 0.19292604501607716,
+ "grad_norm": 0.49115318059921265,
+ "learning_rate": 9.656121045392023e-05,
+ "loss": 1.6633,
+ "step": 150
+ },
+ {
+ "epoch": 0.2057877813504823,
+ "grad_norm": 0.5177980661392212,
+ "learning_rate": 9.587345254470427e-05,
+ "loss": 1.6706,
+ "step": 160
+ },
+ {
+ "epoch": 0.21864951768488747,
+ "grad_norm": 0.465657114982605,
+ "learning_rate": 9.518569463548831e-05,
+ "loss": 1.6677,
+ "step": 170
+ },
+ {
+ "epoch": 0.2315112540192926,
+ "grad_norm": 0.5453551411628723,
+ "learning_rate": 9.449793672627235e-05,
+ "loss": 1.6656,
+ "step": 180
+ },
+ {
+ "epoch": 0.24437299035369775,
+ "grad_norm": 0.4150402545928955,
+ "learning_rate": 9.38101788170564e-05,
+ "loss": 1.6568,
+ "step": 190
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "grad_norm": 0.5106223225593567,
+ "learning_rate": 9.312242090784045e-05,
+ "loss": 1.6804,
+ "step": 200
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
+ "step": 200
+ },
+ {
+ "epoch": 0.27009646302250806,
+ "grad_norm": 0.47371965646743774,
+ "learning_rate": 9.243466299862448e-05,
+ "loss": 1.6235,
+ "step": 210
+ },
+ {
+ "epoch": 0.2829581993569132,
+ "grad_norm": 0.45723679661750793,
+ "learning_rate": 9.174690508940853e-05,
+ "loss": 1.6192,
+ "step": 220
+ },
+ {
+ "epoch": 0.2958199356913183,
+ "grad_norm": 0.46727871894836426,
+ "learning_rate": 9.105914718019258e-05,
+ "loss": 1.6129,
+ "step": 230
+ },
+ {
+ "epoch": 0.3086816720257235,
+ "grad_norm": 0.5216034054756165,
+ "learning_rate": 9.037138927097662e-05,
+ "loss": 1.6065,
+ "step": 240
+ },
+ {
+ "epoch": 0.3215434083601286,
+ "grad_norm": 0.46132415533065796,
+ "learning_rate": 8.968363136176067e-05,
+ "loss": 1.6374,
+ "step": 250
+ },
+ {
+ "epoch": 0.33440514469453375,
+ "grad_norm": 0.5699637532234192,
+ "learning_rate": 8.89958734525447e-05,
+ "loss": 1.6031,
+ "step": 260
+ },
+ {
+ "epoch": 0.34726688102893893,
+ "grad_norm": 0.46537184715270996,
+ "learning_rate": 8.830811554332875e-05,
+ "loss": 1.6196,
+ "step": 270
+ },
+ {
+ "epoch": 0.36012861736334406,
+ "grad_norm": 0.5034765005111694,
+ "learning_rate": 8.76203576341128e-05,
+ "loss": 1.6257,
+ "step": 280
+ },
+ {
+ "epoch": 0.3729903536977492,
+ "grad_norm": 0.48885518312454224,
+ "learning_rate": 8.693259972489685e-05,
+ "loss": 1.6195,
+ "step": 290
+ },
+ {
+ "epoch": 0.3858520900321543,
+ "grad_norm": 0.48295891284942627,
+ "learning_rate": 8.62448418156809e-05,
+ "loss": 1.6301,
+ "step": 300
+ },
+ {
+ "epoch": 0.3858520900321543,
+ "eval_yahma/alpaca-cleaned_loss": 1.6362165212631226,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9945,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.749,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
+ "step": 300
+ },
+ {
+ "epoch": 0.3987138263665595,
+ "grad_norm": 0.4800078570842743,
+ "learning_rate": 8.555708390646493e-05,
+ "loss": 1.6171,
+ "step": 310
+ },
+ {
+ "epoch": 0.4115755627009646,
+ "grad_norm": 0.47452959418296814,
+ "learning_rate": 8.486932599724897e-05,
+ "loss": 1.6147,
+ "step": 320
+ },
+ {
+ "epoch": 0.42443729903536975,
+ "grad_norm": 0.5397221446037292,
+ "learning_rate": 8.418156808803301e-05,
+ "loss": 1.6041,
+ "step": 330
+ },
+ {
+ "epoch": 0.43729903536977494,
+ "grad_norm": 0.5501461029052734,
+ "learning_rate": 8.349381017881706e-05,
+ "loss": 1.6091,
+ "step": 340
+ },
+ {
+ "epoch": 0.45016077170418006,
+ "grad_norm": 0.47587981820106506,
+ "learning_rate": 8.28060522696011e-05,
+ "loss": 1.6008,
+ "step": 350
+ },
+ {
+ "epoch": 0.4630225080385852,
+ "grad_norm": 0.46644529700279236,
+ "learning_rate": 8.211829436038515e-05,
+ "loss": 1.6081,
+ "step": 360
+ },
+ {
+ "epoch": 0.4758842443729904,
+ "grad_norm": 0.5308094024658203,
+ "learning_rate": 8.14305364511692e-05,
+ "loss": 1.5987,
+ "step": 370
+ },
+ {
+ "epoch": 0.4887459807073955,
+ "grad_norm": 0.5304721593856812,
+ "learning_rate": 8.074277854195323e-05,
+ "loss": 1.6173,
+ "step": 380
+ },
+ {
+ "epoch": 0.5016077170418006,
+ "grad_norm": 0.6186290383338928,
+ "learning_rate": 8.005502063273728e-05,
+ "loss": 1.5879,
+ "step": 390
+ },
+ {
+ "epoch": 0.5144694533762058,
+ "grad_norm": 0.4936847388744354,
+ "learning_rate": 7.936726272352132e-05,
+ "loss": 1.5771,
+ "step": 400
+ },
+ {
+ "epoch": 0.5144694533762058,
+ "eval_yahma/alpaca-cleaned_loss": 1.6208504438400269,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9246,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.784,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 400
+ },
+ {
+ "epoch": 0.5273311897106109,
+ "grad_norm": 0.4969954788684845,
+ "learning_rate": 7.867950481430537e-05,
+ "loss": 1.5873,
+ "step": 410
+ },
+ {
+ "epoch": 0.5401929260450161,
+ "grad_norm": 0.5539654493331909,
+ "learning_rate": 7.799174690508942e-05,
+ "loss": 1.5741,
+ "step": 420
+ },
+ {
+ "epoch": 0.5530546623794212,
+ "grad_norm": 0.4963805377483368,
+ "learning_rate": 7.730398899587345e-05,
+ "loss": 1.5883,
+ "step": 430
+ },
+ {
+ "epoch": 0.5659163987138264,
+ "grad_norm": 0.4849222004413605,
+ "learning_rate": 7.66162310866575e-05,
+ "loss": 1.6061,
+ "step": 440
+ },
+ {
+ "epoch": 0.5787781350482315,
+ "grad_norm": 0.5241298079490662,
+ "learning_rate": 7.592847317744153e-05,
+ "loss": 1.6118,
+ "step": 450
+ },
+ {
+ "epoch": 0.5916398713826366,
+ "grad_norm": 0.5051389336585999,
+ "learning_rate": 7.52407152682256e-05,
+ "loss": 1.5618,
+ "step": 460
+ },
+ {
+ "epoch": 0.6045016077170418,
+ "grad_norm": 0.49376770853996277,
+ "learning_rate": 7.455295735900963e-05,
+ "loss": 1.5871,
+ "step": 470
+ },
+ {
+ "epoch": 0.617363344051447,
+ "grad_norm": 0.49221155047416687,
+ "learning_rate": 7.386519944979367e-05,
+ "loss": 1.6037,
+ "step": 480
+ },
+ {
+ "epoch": 0.6302250803858521,
+ "grad_norm": 0.5378918647766113,
+ "learning_rate": 7.317744154057772e-05,
+ "loss": 1.5523,
+ "step": 490
+ },
+ {
+ "epoch": 0.6430868167202572,
+ "grad_norm": 0.5564639568328857,
+ "learning_rate": 7.248968363136176e-05,
+ "loss": 1.5885,
+ "step": 500
+ },
+ {
+ "epoch": 0.6430868167202572,
+ "eval_yahma/alpaca-cleaned_loss": 1.6095871925354004,
+ "eval_yahma/alpaca-cleaned_runtime": 62.983,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.755,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
+ "step": 500
+ },
+ {
+ "epoch": 0.6559485530546624,
+ "grad_norm": 0.49083277583122253,
+ "learning_rate": 7.180192572214582e-05,
+ "loss": 1.5693,
+ "step": 510
+ },
+ {
+ "epoch": 0.6688102893890675,
+ "grad_norm": 0.5625829100608826,
+ "learning_rate": 7.111416781292985e-05,
+ "loss": 1.6023,
+ "step": 520
+ },
+ {
+ "epoch": 0.6816720257234726,
+ "grad_norm": 0.6078226566314697,
+ "learning_rate": 7.04264099037139e-05,
+ "loss": 1.5845,
+ "step": 530
+ },
+ {
+ "epoch": 0.6945337620578779,
+ "grad_norm": 0.48107999563217163,
+ "learning_rate": 6.973865199449794e-05,
+ "loss": 1.5682,
+ "step": 540
+ },
+ {
+ "epoch": 0.707395498392283,
+ "grad_norm": 0.5080347657203674,
+ "learning_rate": 6.905089408528198e-05,
+ "loss": 1.5839,
+ "step": 550
+ },
+ {
+ "epoch": 0.7202572347266881,
+ "grad_norm": 0.5683622360229492,
+ "learning_rate": 6.836313617606602e-05,
+ "loss": 1.5916,
+ "step": 560
+ },
+ {
+ "epoch": 0.7331189710610932,
+ "grad_norm": 0.4669715464115143,
+ "learning_rate": 6.767537826685007e-05,
+ "loss": 1.6146,
+ "step": 570
+ },
+ {
+ "epoch": 0.7459807073954984,
+ "grad_norm": 0.4946054518222809,
+ "learning_rate": 6.698762035763412e-05,
+ "loss": 1.5764,
+ "step": 580
+ },
+ {
+ "epoch": 0.7588424437299035,
+ "grad_norm": 0.4975377023220062,
+ "learning_rate": 6.629986244841817e-05,
+ "loss": 1.6035,
+ "step": 590
+ },
+ {
+ "epoch": 0.7717041800643086,
+ "grad_norm": 0.5511853098869324,
+ "learning_rate": 6.56121045392022e-05,
+ "loss": 1.5842,
+ "step": 600
+ },
+ {
+ "epoch": 0.7717041800643086,
+ "eval_yahma/alpaca-cleaned_loss": 1.6013859510421753,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9465,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.773,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.972,
+ "step": 600
+ },
+ {
+ "epoch": 0.7845659163987139,
+ "grad_norm": 0.5689719915390015,
+ "learning_rate": 6.492434662998625e-05,
+ "loss": 1.5656,
+ "step": 610
+ },
+ {
+ "epoch": 0.797427652733119,
+ "grad_norm": 0.48885637521743774,
+ "learning_rate": 6.42365887207703e-05,
+ "loss": 1.5605,
+ "step": 620
+ },
+ {
+ "epoch": 0.8102893890675241,
+ "grad_norm": 0.5316773056983948,
+ "learning_rate": 6.354883081155434e-05,
+ "loss": 1.5755,
+ "step": 630
+ },
+ {
+ "epoch": 0.8231511254019293,
+ "grad_norm": 0.5578161478042603,
+ "learning_rate": 6.286107290233837e-05,
+ "loss": 1.5532,
+ "step": 640
+ },
+ {
+ "epoch": 0.8360128617363344,
+ "grad_norm": 0.6534080505371094,
+ "learning_rate": 6.217331499312242e-05,
+ "loss": 1.5882,
+ "step": 650
+ },
+ {
+ "epoch": 0.8488745980707395,
+ "grad_norm": 0.5140324831008911,
+ "learning_rate": 6.148555708390647e-05,
+ "loss": 1.5598,
+ "step": 660
+ },
+ {
+ "epoch": 0.8617363344051447,
+ "grad_norm": 0.5247426629066467,
+ "learning_rate": 6.0797799174690516e-05,
+ "loss": 1.5833,
+ "step": 670
+ },
+ {
+ "epoch": 0.8745980707395499,
+ "grad_norm": 0.49460870027542114,
+ "learning_rate": 6.011004126547456e-05,
+ "loss": 1.621,
+ "step": 680
+ },
+ {
+ "epoch": 0.887459807073955,
+ "grad_norm": 0.5351711511611938,
+ "learning_rate": 5.9422283356258604e-05,
+ "loss": 1.5371,
+ "step": 690
+ },
+ {
+ "epoch": 0.9003215434083601,
+ "grad_norm": 0.5608878135681152,
+ "learning_rate": 5.8734525447042644e-05,
+ "loss": 1.5878,
+ "step": 700
+ },
+ {
+ "epoch": 0.9003215434083601,
+ "eval_yahma/alpaca-cleaned_loss": 1.5940771102905273,
+ "eval_yahma/alpaca-cleaned_runtime": 62.917,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 700
+ },
+ {
+ "epoch": 0.9131832797427653,
+ "grad_norm": 0.48291367292404175,
+ "learning_rate": 5.8046767537826685e-05,
+ "loss": 1.583,
+ "step": 710
+ },
+ {
+ "epoch": 0.9260450160771704,
+ "grad_norm": 0.4866442382335663,
+ "learning_rate": 5.7359009628610725e-05,
+ "loss": 1.5891,
+ "step": 720
+ },
+ {
+ "epoch": 0.9389067524115756,
+ "grad_norm": 0.5254418253898621,
+ "learning_rate": 5.667125171939478e-05,
+ "loss": 1.5319,
+ "step": 730
+ },
+ {
+ "epoch": 0.9517684887459807,
+ "grad_norm": 0.5201655030250549,
+ "learning_rate": 5.598349381017882e-05,
+ "loss": 1.5819,
+ "step": 740
+ },
+ {
+ "epoch": 0.9646302250803859,
+ "grad_norm": 0.5820693969726562,
+ "learning_rate": 5.5295735900962866e-05,
+ "loss": 1.5807,
+ "step": 750
+ },
+ {
+ "epoch": 0.977491961414791,
+ "grad_norm": 0.559010922908783,
+ "learning_rate": 5.460797799174691e-05,
+ "loss": 1.5597,
+ "step": 760
+ },
+ {
+ "epoch": 0.9903536977491961,
+ "grad_norm": 0.498877614736557,
+ "learning_rate": 5.392022008253095e-05,
+ "loss": 1.5628,
+ "step": 770
+ },
+ {
+ "epoch": 1.002572347266881,
+ "grad_norm": 0.5119406580924988,
+ "learning_rate": 5.3232462173315e-05,
+ "loss": 1.5693,
+ "step": 780
+ },
+ {
+ "epoch": 1.015434083601286,
+ "grad_norm": 0.5344542860984802,
+ "learning_rate": 5.254470426409904e-05,
+ "loss": 1.5256,
+ "step": 790
+ },
+ {
+ "epoch": 1.0282958199356913,
+ "grad_norm": 0.5358342528343201,
+ "learning_rate": 5.185694635488308e-05,
+ "loss": 1.5432,
+ "step": 800
+ },
+ {
+ "epoch": 1.0282958199356913,
+ "eval_yahma/alpaca-cleaned_loss": 1.589645266532898,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9636,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.764,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
+ "step": 800
+ },
+ {
+ "epoch": 1.0411575562700965,
+ "grad_norm": 0.5941470265388489,
+ "learning_rate": 5.116918844566713e-05,
+ "loss": 1.5449,
+ "step": 810
+ },
+ {
+ "epoch": 1.0540192926045016,
+ "grad_norm": 0.5659182071685791,
+ "learning_rate": 5.048143053645117e-05,
+ "loss": 1.5234,
+ "step": 820
+ },
+ {
+ "epoch": 1.0668810289389068,
+ "grad_norm": 0.5737349390983582,
+ "learning_rate": 4.9793672627235217e-05,
+ "loss": 1.5517,
+ "step": 830
+ },
+ {
+ "epoch": 1.0797427652733118,
+ "grad_norm": 0.5984872579574585,
+ "learning_rate": 4.910591471801926e-05,
+ "loss": 1.5175,
+ "step": 840
+ },
+ {
+ "epoch": 1.092604501607717,
+ "grad_norm": 0.5954984426498413,
+ "learning_rate": 4.8418156808803304e-05,
+ "loss": 1.5738,
+ "step": 850
+ },
+ {
+ "epoch": 1.1054662379421223,
+ "grad_norm": 0.5545582175254822,
+ "learning_rate": 4.7730398899587344e-05,
+ "loss": 1.5538,
+ "step": 860
+ },
+ {
+ "epoch": 1.1183279742765273,
+ "grad_norm": 0.6972865462303162,
+ "learning_rate": 4.704264099037139e-05,
+ "loss": 1.529,
+ "step": 870
+ },
+ {
+ "epoch": 1.1311897106109325,
+ "grad_norm": 0.5404506325721741,
+ "learning_rate": 4.635488308115544e-05,
+ "loss": 1.5567,
+ "step": 880
+ },
+ {
+ "epoch": 1.1440514469453376,
+ "grad_norm": 0.5792121887207031,
+ "learning_rate": 4.566712517193948e-05,
+ "loss": 1.5422,
+ "step": 890
+ },
+ {
+ "epoch": 1.1569131832797428,
+ "grad_norm": 0.5468006134033203,
+ "learning_rate": 4.497936726272352e-05,
+ "loss": 1.5369,
+ "step": 900
+ },
+ {
+ "epoch": 1.1569131832797428,
+ "eval_yahma/alpaca-cleaned_loss": 1.5860395431518555,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9918,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.75,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.969,
+ "step": 900
+ },
+ {
+ "epoch": 1.1697749196141478,
+ "grad_norm": 0.5955344438552856,
+ "learning_rate": 4.429160935350757e-05,
+ "loss": 1.5551,
+ "step": 910
+ },
+ {
+ "epoch": 1.182636655948553,
+ "grad_norm": 0.5832058787345886,
+ "learning_rate": 4.360385144429161e-05,
+ "loss": 1.5568,
+ "step": 920
+ },
+ {
+ "epoch": 1.1954983922829583,
+ "grad_norm": 0.6309258937835693,
+ "learning_rate": 4.291609353507566e-05,
+ "loss": 1.5548,
+ "step": 930
+ },
+ {
+ "epoch": 1.2083601286173633,
+ "grad_norm": 0.6269820928573608,
+ "learning_rate": 4.22283356258597e-05,
+ "loss": 1.5459,
+ "step": 940
+ },
+ {
+ "epoch": 1.2212218649517685,
+ "grad_norm": 0.6376837491989136,
+ "learning_rate": 4.154057771664374e-05,
+ "loss": 1.5277,
+ "step": 950
+ },
+ {
+ "epoch": 1.2340836012861736,
+ "grad_norm": 0.6351036429405212,
+ "learning_rate": 4.085281980742779e-05,
+ "loss": 1.5273,
+ "step": 960
+ },
+ {
+ "epoch": 1.2469453376205788,
+ "grad_norm": 0.6877638101577759,
+ "learning_rate": 4.016506189821183e-05,
+ "loss": 1.4986,
+ "step": 970
+ },
+ {
+ "epoch": 1.2598070739549838,
+ "grad_norm": 0.5501726865768433,
+ "learning_rate": 3.947730398899587e-05,
+ "loss": 1.5543,
+ "step": 980
+ },
+ {
+ "epoch": 1.272668810289389,
+ "grad_norm": 0.5217163562774658,
+ "learning_rate": 3.8789546079779924e-05,
+ "loss": 1.5292,
+ "step": 990
+ },
+ {
+ "epoch": 1.2855305466237943,
+ "grad_norm": 0.5770425796508789,
+ "learning_rate": 3.8101788170563964e-05,
+ "loss": 1.5536,
+ "step": 1000
+ },
+ {
+ "epoch": 1.2855305466237943,
+ "eval_yahma/alpaca-cleaned_loss": 1.5820817947387695,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9495,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
+ "step": 1000
+ },
+ {
+ "epoch": 1.2983922829581993,
+ "grad_norm": 0.5802098512649536,
+ "learning_rate": 3.741403026134801e-05,
+ "loss": 1.5479,
+ "step": 1010
+ },
+ {
+ "epoch": 1.3112540192926045,
+ "grad_norm": 0.5646567344665527,
+ "learning_rate": 3.672627235213205e-05,
+ "loss": 1.5183,
+ "step": 1020
+ },
+ {
+ "epoch": 1.3241157556270098,
+ "grad_norm": 0.5852165222167969,
+ "learning_rate": 3.603851444291609e-05,
+ "loss": 1.5267,
+ "step": 1030
+ },
+ {
+ "epoch": 1.3369774919614148,
+ "grad_norm": 0.5583398342132568,
+ "learning_rate": 3.535075653370014e-05,
+ "loss": 1.5401,
+ "step": 1040
+ },
+ {
+ "epoch": 1.3498392282958198,
+ "grad_norm": 0.5971976518630981,
+ "learning_rate": 3.4662998624484186e-05,
+ "loss": 1.5147,
+ "step": 1050
+ },
+ {
+ "epoch": 1.362700964630225,
+ "grad_norm": 0.6036947965621948,
+ "learning_rate": 3.3975240715268227e-05,
+ "loss": 1.5294,
+ "step": 1060
+ },
+ {
+ "epoch": 1.3755627009646303,
+ "grad_norm": 0.5828876495361328,
+ "learning_rate": 3.3287482806052274e-05,
+ "loss": 1.546,
+ "step": 1070
+ },
+ {
+ "epoch": 1.3884244372990353,
+ "grad_norm": 0.5941759943962097,
+ "learning_rate": 3.2599724896836314e-05,
+ "loss": 1.5238,
+ "step": 1080
+ },
+ {
+ "epoch": 1.4012861736334405,
+ "grad_norm": 0.6082496047019958,
+ "learning_rate": 3.1911966987620354e-05,
+ "loss": 1.5055,
+ "step": 1090
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "grad_norm": 0.5749199390411377,
+ "learning_rate": 3.12242090784044e-05,
+ "loss": 1.5238,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4141479099678458,
+ "eval_yahma/alpaca-cleaned_loss": 1.5794486999511719,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9209,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.786,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1100
+ },
+ {
+ "epoch": 1.4270096463022508,
+ "grad_norm": 0.649699330329895,
+ "learning_rate": 3.053645116918845e-05,
+ "loss": 1.5275,
+ "step": 1110
+ },
+ {
+ "epoch": 1.4398713826366558,
+ "grad_norm": 0.5754693150520325,
+ "learning_rate": 2.9848693259972492e-05,
+ "loss": 1.5217,
+ "step": 1120
+ },
+ {
+ "epoch": 1.452733118971061,
+ "grad_norm": 0.572021484375,
+ "learning_rate": 2.9160935350756536e-05,
+ "loss": 1.5489,
+ "step": 1130
+ },
+ {
+ "epoch": 1.4655948553054663,
+ "grad_norm": 0.6010130643844604,
+ "learning_rate": 2.8473177441540577e-05,
+ "loss": 1.5019,
+ "step": 1140
+ },
+ {
+ "epoch": 1.4784565916398713,
+ "grad_norm": 0.6172171831130981,
+ "learning_rate": 2.7785419532324624e-05,
+ "loss": 1.5703,
+ "step": 1150
+ },
+ {
+ "epoch": 1.4913183279742765,
+ "grad_norm": 0.5957326889038086,
+ "learning_rate": 2.7097661623108668e-05,
+ "loss": 1.5247,
+ "step": 1160
+ },
+ {
+ "epoch": 1.5041800643086818,
+ "grad_norm": 0.5608690977096558,
+ "learning_rate": 2.6409903713892708e-05,
+ "loss": 1.5403,
+ "step": 1170
+ },
+ {
+ "epoch": 1.5170418006430868,
+ "grad_norm": 0.5870776176452637,
+ "learning_rate": 2.5722145804676755e-05,
+ "loss": 1.5235,
+ "step": 1180
+ },
+ {
+ "epoch": 1.5299035369774918,
+ "grad_norm": 0.5889161229133606,
+ "learning_rate": 2.50343878954608e-05,
+ "loss": 1.5164,
+ "step": 1190
+ },
+ {
+ "epoch": 1.542765273311897,
+ "grad_norm": 0.6082655787467957,
+ "learning_rate": 2.4346629986244843e-05,
+ "loss": 1.5022,
+ "step": 1200
+ },
+ {
+ "epoch": 1.542765273311897,
+ "eval_yahma/alpaca-cleaned_loss": 1.5769098997116089,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9228,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.785,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1200
+ },
+ {
+ "epoch": 1.5556270096463023,
+ "grad_norm": 0.6997891664505005,
+ "learning_rate": 2.3658872077028886e-05,
+ "loss": 1.5197,
+ "step": 1210
+ },
+ {
+ "epoch": 1.5684887459807073,
+ "grad_norm": 0.6935648918151855,
+ "learning_rate": 2.2971114167812934e-05,
+ "loss": 1.5391,
+ "step": 1220
+ },
+ {
+ "epoch": 1.5813504823151125,
+ "grad_norm": 0.6135308742523193,
+ "learning_rate": 2.2283356258596974e-05,
+ "loss": 1.5238,
+ "step": 1230
+ },
+ {
+ "epoch": 1.5942122186495178,
+ "grad_norm": 0.5835321545600891,
+ "learning_rate": 2.1595598349381018e-05,
+ "loss": 1.5767,
+ "step": 1240
+ },
+ {
+ "epoch": 1.6070739549839228,
+ "grad_norm": 0.6089451313018799,
+ "learning_rate": 2.0907840440165065e-05,
+ "loss": 1.535,
+ "step": 1250
+ },
+ {
+ "epoch": 1.6199356913183278,
+ "grad_norm": 0.5886595249176025,
+ "learning_rate": 2.022008253094911e-05,
+ "loss": 1.5133,
+ "step": 1260
+ },
+ {
+ "epoch": 1.6327974276527333,
+ "grad_norm": 0.6229696273803711,
+ "learning_rate": 1.953232462173315e-05,
+ "loss": 1.5313,
+ "step": 1270
+ },
+ {
+ "epoch": 1.6456591639871383,
+ "grad_norm": 0.60906583070755,
+ "learning_rate": 1.8844566712517196e-05,
+ "loss": 1.5152,
+ "step": 1280
+ },
+ {
+ "epoch": 1.6585209003215433,
+ "grad_norm": 0.5806885957717896,
+ "learning_rate": 1.815680880330124e-05,
+ "loss": 1.5468,
+ "step": 1290
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "grad_norm": 0.6111522316932678,
+ "learning_rate": 1.746905089408528e-05,
+ "loss": 1.544,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6713826366559486,
+ "eval_yahma/alpaca-cleaned_loss": 1.574813961982727,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9178,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.788,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.973,
+ "step": 1300
+ },
+ {
+ "epoch": 1.6842443729903538,
+ "grad_norm": 0.5954424738883972,
+ "learning_rate": 1.6781292984869327e-05,
+ "loss": 1.5253,
+ "step": 1310
+ },
+ {
+ "epoch": 1.6971061093247588,
+ "grad_norm": 0.5995926856994629,
+ "learning_rate": 1.609353507565337e-05,
+ "loss": 1.5306,
+ "step": 1320
+ },
+ {
+ "epoch": 1.7099678456591638,
+ "grad_norm": 0.6193538308143616,
+ "learning_rate": 1.5405777166437415e-05,
+ "loss": 1.5344,
+ "step": 1330
+ },
+ {
+ "epoch": 1.7228295819935693,
+ "grad_norm": 0.596823513507843,
+ "learning_rate": 1.4718019257221457e-05,
+ "loss": 1.5561,
+ "step": 1340
+ },
+ {
+ "epoch": 1.7356913183279743,
+ "grad_norm": 0.658667266368866,
+ "learning_rate": 1.4030261348005502e-05,
+ "loss": 1.5158,
+ "step": 1350
+ },
+ {
+ "epoch": 1.7485530546623793,
+ "grad_norm": 0.643640398979187,
+ "learning_rate": 1.3342503438789546e-05,
+ "loss": 1.5412,
+ "step": 1360
+ },
+ {
+ "epoch": 1.7614147909967846,
+ "grad_norm": 0.6444098353385925,
+ "learning_rate": 1.2654745529573592e-05,
+ "loss": 1.5098,
+ "step": 1370
+ },
+ {
+ "epoch": 1.7742765273311898,
+ "grad_norm": 0.518659234046936,
+ "learning_rate": 1.1966987620357635e-05,
+ "loss": 1.5418,
+ "step": 1380
+ },
+ {
+ "epoch": 1.7871382636655948,
+ "grad_norm": 0.5826813578605652,
+ "learning_rate": 1.127922971114168e-05,
+ "loss": 1.5204,
+ "step": 1390
+ },
+ {
+ "epoch": 1.8,
+ "grad_norm": 0.6658011674880981,
+ "learning_rate": 1.0591471801925723e-05,
+ "loss": 1.5511,
+ "step": 1400
+ },
+ {
+ "epoch": 1.8,
+ "eval_yahma/alpaca-cleaned_loss": 1.5736079216003418,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9144,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.789,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.974,
+ "step": 1400
+ },
+ {
+ "epoch": 1.8128617363344053,
+ "grad_norm": 0.5695498585700989,
+ "learning_rate": 9.903713892709767e-06,
+ "loss": 1.527,
+ "step": 1410
+ },
+ {
+ "epoch": 1.8257234726688103,
+ "grad_norm": 0.607875645160675,
+ "learning_rate": 9.21595598349381e-06,
+ "loss": 1.5183,
+ "step": 1420
+ },
+ {
+ "epoch": 1.8385852090032153,
+ "grad_norm": 0.5988701581954956,
+ "learning_rate": 8.528198074277854e-06,
+ "loss": 1.5459,
+ "step": 1430
+ },
+ {
+ "epoch": 1.8514469453376206,
+ "grad_norm": 0.6526191234588623,
+ "learning_rate": 7.8404401650619e-06,
+ "loss": 1.5566,
+ "step": 1440
+ },
+ {
+ "epoch": 1.8643086816720258,
+ "grad_norm": 0.5458080768585205,
+ "learning_rate": 7.152682255845943e-06,
+ "loss": 1.5176,
+ "step": 1450
+ },
+ {
+ "epoch": 1.8771704180064308,
+ "grad_norm": 0.6263613700866699,
+ "learning_rate": 6.464924346629987e-06,
+ "loss": 1.5234,
+ "step": 1460
+ },
+ {
+ "epoch": 1.890032154340836,
+ "grad_norm": 0.6338502168655396,
+ "learning_rate": 5.77716643741403e-06,
+ "loss": 1.5376,
+ "step": 1470
+ },
+ {
+ "epoch": 1.9028938906752413,
+ "grad_norm": 0.6531928181648254,
+ "learning_rate": 5.089408528198075e-06,
+ "loss": 1.5247,
+ "step": 1480
+ },
+ {
+ "epoch": 1.9157556270096463,
+ "grad_norm": 0.6073517203330994,
+ "learning_rate": 4.4016506189821186e-06,
+ "loss": 1.5398,
+ "step": 1490
+ },
+ {
+ "epoch": 1.9286173633440513,
+ "grad_norm": 0.6269332766532898,
+ "learning_rate": 3.7138927097661627e-06,
+ "loss": 1.5597,
+ "step": 1500
+ },
+ {
+ "epoch": 1.9286173633440513,
+ "eval_yahma/alpaca-cleaned_loss": 1.5721148252487183,
+ "eval_yahma/alpaca-cleaned_runtime": 62.9499,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.771,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.971,
+ "step": 1500
+ },
+ {
+ "epoch": 1.9414790996784566,
+ "grad_norm": 0.5752962231636047,
+ "learning_rate": 3.0261348005502065e-06,
+ "loss": 1.5468,
+ "step": 1510
+ },
+ {
+ "epoch": 1.9543408360128618,
+ "grad_norm": 0.5622620582580566,
+ "learning_rate": 2.3383768913342507e-06,
+ "loss": 1.5588,
+ "step": 1520
+ },
+ {
+ "epoch": 1.9672025723472668,
+ "grad_norm": 0.6163848042488098,
+ "learning_rate": 1.6506189821182942e-06,
+ "loss": 1.482,
+ "step": 1530
+ },
+ {
+ "epoch": 1.980064308681672,
+ "grad_norm": 0.6466639041900635,
+ "learning_rate": 9.628610729023384e-07,
+ "loss": 1.5543,
+ "step": 1540
+ },
+ {
+ "epoch": 1.9929260450160773,
+ "grad_norm": 0.6139137148857117,
+ "learning_rate": 2.751031636863824e-07,
+ "loss": 1.4905,
+ "step": 1550
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1554,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.1547885106338202e+17,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
tune_log/layerskip_1b_0.25_tune/checkpoint-1554/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
+ size 5368
tune_log/layerskip_1b_0.25_tune/checkpoint-200/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5132ee225c5136a04b393a5b013eaae546265c15ef4d93460674de77e5f724d2
+ size 19960448
tune_log/layerskip_1b_0.25_tune/checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b31b7db31448cbea4a2b26ecfb4f5e38242c8fef6d933e20d80a45340fa2e2e7
+ size 40050298
tune_log/layerskip_1b_0.25_tune/checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1726ab754f473657bf32650b33d136b1ba1d1d1c74e402fbbacb2a89a6809796
+ size 14244
tune_log/layerskip_1b_0.25_tune/checkpoint-200/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bd55b3848d82967a207e0805911c79200c6adce71e3b37fd24549a718f75738
+ size 988
tune_log/layerskip_1b_0.25_tune/checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:363127827fa84aceb28f95364631df4397d57dee08819ca6a0979763f837be6f
+ size 1064
tune_log/layerskip_1b_0.25_tune/checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,196 @@
+ {
+ "best_metric": 1.6644691228866577,
+ "best_model_checkpoint": "tune_log/layerskip_1b_0.25_tune/checkpoint-200",
+ "epoch": 0.2572347266881029,
+ "eval_steps": 100,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.0012861736334405145,
+ "grad_norm": 0.39783015847206116,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 2.0835,
+ "step": 1
+ },
+ {
+ "epoch": 0.012861736334405145,
+ "grad_norm": 0.45549583435058594,
+ "learning_rate": 1e-05,
+ "loss": 2.1408,
+ "step": 10
+ },
+ {
+ "epoch": 0.02572347266881029,
+ "grad_norm": 0.4594053626060486,
+ "learning_rate": 2e-05,
+ "loss": 2.0894,
+ "step": 20
+ },
+ {
+ "epoch": 0.03858520900321544,
+ "grad_norm": 0.49020764231681824,
+ "learning_rate": 3e-05,
+ "loss": 2.1037,
+ "step": 30
+ },
+ {
+ "epoch": 0.05144694533762058,
+ "grad_norm": 0.37993305921554565,
+ "learning_rate": 4e-05,
+ "loss": 1.9716,
+ "step": 40
+ },
+ {
+ "epoch": 0.06430868167202572,
+ "grad_norm": 0.38231977820396423,
+ "learning_rate": 5e-05,
+ "loss": 1.9349,
+ "step": 50
+ },
+ {
+ "epoch": 0.07717041800643087,
+ "grad_norm": 0.2922589182853699,
+ "learning_rate": 6e-05,
+ "loss": 1.906,
+ "step": 60
+ },
+ {
+ "epoch": 0.09003215434083602,
+ "grad_norm": 0.34647658467292786,
+ "learning_rate": 7e-05,
+ "loss": 1.8246,
+ "step": 70
+ },
+ {
+ "epoch": 0.10289389067524116,
+ "grad_norm": 0.31930026412010193,
+ "learning_rate": 8e-05,
+ "loss": 1.8057,
+ "step": 80
+ },
+ {
+ "epoch": 0.1157556270096463,
+ "grad_norm": 0.34028756618499756,
+ "learning_rate": 9e-05,
+ "loss": 1.7546,
+ "step": 90
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "grad_norm": 0.3878991901874542,
+ "learning_rate": 0.0001,
+ "loss": 1.7543,
+ "step": 100
+ },
+ {
+ "epoch": 0.12861736334405144,
+ "eval_yahma/alpaca-cleaned_loss": 1.7584081888198853,
+ "eval_yahma/alpaca-cleaned_runtime": 62.5096,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.995,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.999,
+ "step": 100
+ },
+ {
+ "epoch": 0.1414790996784566,
+ "grad_norm": 0.35599613189697266,
+ "learning_rate": 9.931224209078405e-05,
+ "loss": 1.7309,
+ "step": 110
+ },
+ {
+ "epoch": 0.15434083601286175,
+ "grad_norm": 0.4075644016265869,
+ "learning_rate": 9.862448418156809e-05,
+ "loss": 1.6981,
+ "step": 120
+ },
+ {
+ "epoch": 0.16720257234726688,
+ "grad_norm": 0.4743317663669586,
+ "learning_rate": 9.793672627235215e-05,
+ "loss": 1.7011,
+ "step": 130
+ },
+ {
+ "epoch": 0.18006430868167203,
+ "grad_norm": 0.4701610505580902,
+ "learning_rate": 9.724896836313618e-05,
+ "loss": 1.6771,
+ "step": 140
+ },
+ {
+ "epoch": 0.19292604501607716,
+ "grad_norm": 0.49115318059921265,
+ "learning_rate": 9.656121045392023e-05,
+ "loss": 1.6633,
+ "step": 150
+ },
+ {
+ "epoch": 0.2057877813504823,
+ "grad_norm": 0.5177980661392212,
+ "learning_rate": 9.587345254470427e-05,
+ "loss": 1.6706,
+ "step": 160
+ },
+ {
+ "epoch": 0.21864951768488747,
+ "grad_norm": 0.465657114982605,
+ "learning_rate": 9.518569463548831e-05,
+ "loss": 1.6677,
+ "step": 170
+ },
+ {
+ "epoch": 0.2315112540192926,
+ "grad_norm": 0.5453551411628723,
+ "learning_rate": 9.449793672627235e-05,
+ "loss": 1.6656,
+ "step": 180
+ },
+ {
+ "epoch": 0.24437299035369775,
+ "grad_norm": 0.4150402545928955,
+ "learning_rate": 9.38101788170564e-05,
+ "loss": 1.6568,
+ "step": 190
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "grad_norm": 0.5106223225593567,
+ "learning_rate": 9.312242090784045e-05,
+ "loss": 1.6804,
+ "step": 200
+ },
+ {
+ "epoch": 0.2572347266881029,
+ "eval_yahma/alpaca-cleaned_loss": 1.6644691228866577,
+ "eval_yahma/alpaca-cleaned_runtime": 63.0481,
+ "eval_yahma/alpaca-cleaned_samples_per_second": 31.722,
+ "eval_yahma/alpaca-cleaned_steps_per_second": 3.965,
+ "step": 200
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1554,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 200,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.487298513076224e+16,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
tune_log/layerskip_1b_0.25_tune/checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2e6b3a19829885cc97674c841aa3f679a2810cbf00e5fbadcda000c43f0f46
+ size 5368