Upload folder using huggingface_hub
Browse files
- 0000100_adapters.safetensors +3 -0
- 0000200_adapters.safetensors +3 -0
- 0000300_adapters.safetensors +3 -0
- 0000400_adapters.safetensors +3 -0
- 0000500_adapters.safetensors +3 -0
- 0000600_adapters.safetensors +3 -0
- adapter_config.json +40 -0
- adapters.safetensors +3 -0
- training.log +330 -0
0000100_adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b268b38ba34abff6a6b1c21d11b1b7184b785932a494fbdaef5c4b8cb04fac96
|
| 3 |
+
size 41967272
|
0000200_adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a94fba5a9400fc8ba993bfd1fea6788185fc54282ffe77a061f06c75ca412e04
|
| 3 |
+
size 41967272
|
0000300_adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be8d8b6b18e3e0b4253ba393c4137fb3b475dd52464b1a1f2b2fb0f7e7eceb21
|
| 3 |
+
size 41967272
|
0000400_adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b695f45f86cafbeb670af96d63042e3afb2a6c57e07d17e95e326e6dca29040e
|
| 3 |
+
size 41967272
|
0000500_adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7c55fd2fce6124e01a895591b97d66074420788bdd412c3576e4eb0ba97e699
|
| 3 |
+
size 41967272
|
0000600_adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e07caf257c0f9147b9d07047a7ab7edb5040afd2cac886f4f310cfe82991fe9e
|
| 3 |
+
size 41967272
|
adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"adapter_path": "models/lora/mistral_lora_telegram_20251111_114741",
|
| 3 |
+
"batch_size": 2,
|
| 4 |
+
"config": null,
|
| 5 |
+
"data": "data/phase2/mlx_datasets/telegram",
|
| 6 |
+
"fine_tune_type": "lora",
|
| 7 |
+
"grad_accumulation_steps": 1,
|
| 8 |
+
"grad_checkpoint": false,
|
| 9 |
+
"iters": 600,
|
| 10 |
+
"learning_rate": 1e-05,
|
| 11 |
+
"lora_parameters": {
|
| 12 |
+
"rank": 8,
|
| 13 |
+
"dropout": 0.0,
|
| 14 |
+
"scale": 20.0
|
| 15 |
+
},
|
| 16 |
+
"lr_schedule": null,
|
| 17 |
+
"mask_prompt": false,
|
| 18 |
+
"max_seq_length": 2048,
|
| 19 |
+
"model": "models/mistral-7b-instruct-v0.3-mlx",
|
| 20 |
+
"num_layers": 16,
|
| 21 |
+
"optimizer": "adam",
|
| 22 |
+
"optimizer_config": {
|
| 23 |
+
"adam": {},
|
| 24 |
+
"adamw": {},
|
| 25 |
+
"muon": {},
|
| 26 |
+
"sgd": {},
|
| 27 |
+
"adafactor": {}
|
| 28 |
+
},
|
| 29 |
+
"project_name": null,
|
| 30 |
+
"report_to": null,
|
| 31 |
+
"resume_adapter_file": null,
|
| 32 |
+
"save_every": 100,
|
| 33 |
+
"seed": 42,
|
| 34 |
+
"steps_per_eval": 100,
|
| 35 |
+
"steps_per_report": 10,
|
| 36 |
+
"test": true,
|
| 37 |
+
"test_batches": 50,
|
| 38 |
+
"train": true,
|
| 39 |
+
"val_batches": 25
|
| 40 |
+
}
|
adapters.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e07caf257c0f9147b9d07047a7ab7edb5040afd2cac886f4f310cfe82991fe9e
|
| 3 |
+
size 41967272
|
training.log
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Loading pretrained model
|
| 2 |
+
Loading datasets
|
| 3 |
+
Training
|
| 4 |
+
Trainable parameters: 0.145% (10.486M/7248.024M)
|
| 5 |
+
Starting training..., iters: 600
|
| 6 |
+
|
| 7 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 8 |
+
Calculating loss...: 4%|β | 1/25 [00:01<00:32, 1.36s/it]
|
| 9 |
+
Calculating loss...: 8%|β | 2/25 [00:02<00:33, 1.47s/it]
|
| 10 |
+
Calculating loss...: 12%|ββ | 3/25 [00:04<00:34, 1.55s/it]
|
| 11 |
+
Calculating loss...: 16%|ββ | 4/25 [00:05<00:30, 1.47s/it]
|
| 12 |
+
Calculating loss...: 20%|ββ | 5/25 [00:07<00:28, 1.43s/it]
|
| 13 |
+
Calculating loss...: 24%|βββ | 6/25 [00:09<00:34, 1.82s/it]
|
| 14 |
+
Calculating loss...: 28%|βββ | 7/25 [00:11<00:31, 1.72s/it]
|
| 15 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:13<00:29, 1.74s/it]
|
| 16 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:14<00:27, 1.69s/it]
|
| 17 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:16<00:25, 1.68s/it]
|
| 18 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:17<00:21, 1.56s/it]
|
| 19 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:19<00:20, 1.56s/it]
|
| 20 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:20<00:17, 1.50s/it]
|
| 21 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:22<00:16, 1.48s/it]
|
| 22 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:23<00:15, 1.60s/it]
|
| 23 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:25<00:14, 1.58s/it]
|
| 24 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:26<00:12, 1.56s/it]
|
| 25 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:28<00:11, 1.63s/it]
|
| 26 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:30<00:10, 1.68s/it]
|
| 27 |
+
Calculating loss...: 80%|ββββββββ | 20/25 [00:31<00:07, 1.59s/it]
|
| 28 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:33<00:06, 1.52s/it]
|
| 29 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:34<00:04, 1.44s/it]
|
| 30 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:36<00:02, 1.45s/it]
|
| 31 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:37<00:01, 1.51s/it]
|
| 32 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:39<00:00, 1.60s/it]
|
| 33 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:39<00:00, 1.58s/it]
|
| 34 |
+
Iter 1: Val loss 2.214, Val took 39.464s
|
| 35 |
+
Iter 10: Train loss 1.065, Learning Rate 1.000e-05, It/sec 0.362, Tokens/sec 431.356, Trained Tokens 11910, Peak mem 24.920 GB
|
| 36 |
+
Iter 20: Train loss 0.755, Learning Rate 1.000e-05, It/sec 0.263, Tokens/sec 350.838, Trained Tokens 25233, Peak mem 37.081 GB
|
| 37 |
+
Iter 30: Train loss 0.688, Learning Rate 1.000e-05, It/sec 0.352, Tokens/sec 429.868, Trained Tokens 37431, Peak mem 37.081 GB
|
| 38 |
+
Iter 40: Train loss 0.557, Learning Rate 1.000e-05, It/sec 0.340, Tokens/sec 403.603, Trained Tokens 49302, Peak mem 37.081 GB
|
| 39 |
+
Iter 50: Train loss 0.663, Learning Rate 1.000e-05, It/sec 0.296, Tokens/sec 387.318, Trained Tokens 62366, Peak mem 37.081 GB
|
| 40 |
+
Iter 60: Train loss 0.609, Learning Rate 1.000e-05, It/sec 0.345, Tokens/sec 424.207, Trained Tokens 74645, Peak mem 37.081 GB
|
| 41 |
+
Iter 70: Train loss 0.614, Learning Rate 1.000e-05, It/sec 0.344, Tokens/sec 421.161, Trained Tokens 86901, Peak mem 37.081 GB
|
| 42 |
+
Iter 80: Train loss 0.607, Learning Rate 1.000e-05, It/sec 0.348, Tokens/sec 423.462, Trained Tokens 99079, Peak mem 37.081 GB
|
| 43 |
+
Iter 90: Train loss 0.556, Learning Rate 1.000e-05, It/sec 0.373, Tokens/sec 439.181, Trained Tokens 110838, Peak mem 37.081 GB
|
| 44 |
+
|
| 45 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 46 |
+
Calculating loss...: 4%|β | 1/25 [00:01<00:43, 1.83s/it]
|
| 47 |
+
Calculating loss...: 8%|β | 2/25 [00:03<00:37, 1.65s/it]
|
| 48 |
+
Calculating loss...: 12%|ββ | 3/25 [00:04<00:34, 1.55s/it]
|
| 49 |
+
Calculating loss...: 16%|ββ | 4/25 [00:06<00:31, 1.51s/it]
|
| 50 |
+
Calculating loss...: 20%|ββ | 5/25 [00:07<00:29, 1.46s/it]
|
| 51 |
+
Calculating loss...: 24%|βββ | 6/25 [00:09<00:28, 1.48s/it]
|
| 52 |
+
Calculating loss...: 28%|βββ | 7/25 [00:10<00:26, 1.49s/it]
|
| 53 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:11<00:24, 1.44s/it]
|
| 54 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:13<00:22, 1.41s/it]
|
| 55 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:14<00:21, 1.45s/it]
|
| 56 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:16<00:21, 1.54s/it]
|
| 57 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:17<00:19, 1.49s/it]
|
| 58 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:19<00:18, 1.50s/it]
|
| 59 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:21<00:16, 1.50s/it]
|
| 60 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:22<00:15, 1.51s/it]
|
| 61 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:24<00:15, 1.72s/it]
|
| 62 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:26<00:13, 1.67s/it]
|
| 63 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:29<00:13, 1.99s/it]
|
| 64 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:30<00:10, 1.82s/it]
|
| 65 |
+
Calculating loss...: 80%|████████ | 20/25 [00:33<00:10, 2.12s/it]
|
| 66 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:34<00:07, 1.89s/it]
|
| 67 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:36<00:05, 1.75s/it]
|
| 68 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:37<00:03, 1.68s/it]
|
| 69 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:38<00:01, 1.61s/it]
|
| 70 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:40<00:00, 1.60s/it]
|
| 71 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:40<00:00, 1.62s/it]
|
| 72 |
+
Iter 100: Val loss 0.553, Val took 40.597s
|
| 73 |
+
Iter 100: Train loss 0.487, Learning Rate 1.000e-05, It/sec 0.317, Tokens/sec 358.772, Trained Tokens 122142, Peak mem 37.081 GB
|
| 74 |
+
Iter 100: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000100_adapters.safetensors.
|
| 75 |
+
Iter 110: Train loss 0.556, Learning Rate 1.000e-05, It/sec 0.366, Tokens/sec 430.562, Trained Tokens 133896, Peak mem 37.081 GB
|
| 76 |
+
Iter 120: Train loss 0.550, Learning Rate 1.000e-05, It/sec 0.348, Tokens/sec 437.090, Trained Tokens 146466, Peak mem 37.081 GB
|
| 77 |
+
Iter 130: Train loss 0.499, Learning Rate 1.000e-05, It/sec 0.349, Tokens/sec 430.415, Trained Tokens 158812, Peak mem 37.081 GB
|
| 78 |
+
Iter 140: Train loss 0.393, Learning Rate 1.000e-05, It/sec 0.436, Tokens/sec 465.004, Trained Tokens 169489, Peak mem 37.081 GB
|
| 79 |
+
Iter 150: Train loss 0.506, Learning Rate 1.000e-05, It/sec 0.372, Tokens/sec 438.469, Trained Tokens 181272, Peak mem 37.081 GB
|
| 80 |
+
Iter 160: Train loss 0.484, Learning Rate 1.000e-05, It/sec 0.375, Tokens/sec 443.286, Trained Tokens 193104, Peak mem 37.081 GB
|
| 81 |
+
Iter 170: Train loss 0.480, Learning Rate 1.000e-05, It/sec 0.399, Tokens/sec 448.274, Trained Tokens 204339, Peak mem 37.081 GB
|
| 82 |
+
Iter 180: Train loss 0.491, Learning Rate 1.000e-05, It/sec 0.374, Tokens/sec 442.924, Trained Tokens 216181, Peak mem 37.081 GB
|
| 83 |
+
Iter 190: Train loss 0.612, Learning Rate 1.000e-05, It/sec 0.259, Tokens/sec 348.364, Trained Tokens 229607, Peak mem 37.123 GB
|
| 84 |
+
|
| 85 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 86 |
+
Calculating loss...: 4%|β | 1/25 [00:03<01:30, 3.75s/it]
|
| 87 |
+
Calculating loss...: 8%|β | 2/25 [00:05<01:00, 2.64s/it]
|
| 88 |
+
Calculating loss...: 12%|ββ | 3/25 [00:07<00:54, 2.47s/it]
|
| 89 |
+
Calculating loss...: 16%|ββ | 4/25 [00:10<00:49, 2.36s/it]
|
| 90 |
+
Calculating loss...: 20%|ββ | 5/25 [00:11<00:39, 1.99s/it]
|
| 91 |
+
Calculating loss...: 24%|βββ | 6/25 [00:13<00:37, 1.96s/it]
|
| 92 |
+
Calculating loss...: 28%|βββ | 7/25 [00:14<00:31, 1.74s/it]
|
| 93 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:16<00:28, 1.68s/it]
|
| 94 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:19<00:33, 2.10s/it]
|
| 95 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:21<00:30, 2.02s/it]
|
| 96 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:22<00:26, 1.88s/it]
|
| 97 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:24<00:23, 1.83s/it]
|
| 98 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:25<00:20, 1.72s/it]
|
| 99 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:27<00:19, 1.75s/it]
|
| 100 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:30<00:21, 2.15s/it]
|
| 101 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:32<00:17, 1.99s/it]
|
| 102 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:34<00:15, 1.95s/it]
|
| 103 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:35<00:12, 1.75s/it]
|
| 104 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:37<00:10, 1.72s/it]
|
| 105 |
+
Calculating loss...: 80%|ββββββββ | 20/25 [00:39<00:08, 1.78s/it]
|
| 106 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:40<00:06, 1.65s/it]
|
| 107 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:41<00:04, 1.64s/it]
|
| 108 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:43<00:03, 1.70s/it]
|
| 109 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:45<00:01, 1.63s/it]
|
| 110 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:46<00:00, 1.59s/it]
|
| 111 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:46<00:00, 1.87s/it]
|
| 112 |
+
Iter 200: Val loss 0.499, Val took 46.784s
|
| 113 |
+
Iter 200: Train loss 0.426, Learning Rate 1.000e-05, It/sec 0.364, Tokens/sec 420.021, Trained Tokens 241139, Peak mem 37.123 GB
|
| 114 |
+
Iter 200: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000200_adapters.safetensors.
|
| 115 |
+
Iter 210: Train loss 0.402, Learning Rate 1.000e-05, It/sec 0.364, Tokens/sec 403.442, Trained Tokens 252219, Peak mem 37.123 GB
|
| 116 |
+
Iter 220: Train loss 0.432, Learning Rate 1.000e-05, It/sec 0.408, Tokens/sec 467.043, Trained Tokens 263665, Peak mem 37.123 GB
|
| 117 |
+
Iter 230: Train loss 0.562, Learning Rate 1.000e-05, It/sec 0.369, Tokens/sec 447.334, Trained Tokens 275796, Peak mem 37.123 GB
|
| 118 |
+
Iter 240: Train loss 0.491, Learning Rate 1.000e-05, It/sec 0.396, Tokens/sec 461.514, Trained Tokens 287447, Peak mem 37.123 GB
|
| 119 |
+
Iter 250: Train loss 0.460, Learning Rate 1.000e-05, It/sec 0.391, Tokens/sec 453.744, Trained Tokens 299037, Peak mem 37.123 GB
|
| 120 |
+
Iter 260: Train loss 0.462, Learning Rate 1.000e-05, It/sec 0.403, Tokens/sec 470.594, Trained Tokens 310708, Peak mem 37.123 GB
|
| 121 |
+
Iter 270: Train loss 0.411, Learning Rate 1.000e-05, It/sec 0.397, Tokens/sec 455.131, Trained Tokens 322185, Peak mem 37.123 GB
|
| 122 |
+
Iter 280: Train loss 0.388, Learning Rate 1.000e-05, It/sec 0.386, Tokens/sec 447.749, Trained Tokens 333784, Peak mem 37.123 GB
|
| 123 |
+
Iter 290: Train loss 0.506, Learning Rate 1.000e-05, It/sec 0.401, Tokens/sec 459.746, Trained Tokens 345244, Peak mem 37.123 GB
|
| 124 |
+
|
| 125 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 126 |
+
Calculating loss...: 4%|β | 1/25 [00:01<00:33, 1.40s/it]
|
| 127 |
+
Calculating loss...: 8%|β | 2/25 [00:04<00:48, 2.13s/it]
|
| 128 |
+
Calculating loss...: 12%|ββ | 3/25 [00:05<00:40, 1.83s/it]
|
| 129 |
+
Calculating loss...: 16%|ββ | 4/25 [00:07<00:35, 1.70s/it]
|
| 130 |
+
Calculating loss...: 20%|ββ | 5/25 [00:08<00:32, 1.62s/it]
|
| 131 |
+
Calculating loss...: 24%|βββ | 6/25 [00:09<00:29, 1.55s/it]
|
| 132 |
+
Calculating loss...: 28%|βββ | 7/25 [00:11<00:26, 1.48s/it]
|
| 133 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:12<00:24, 1.46s/it]
|
| 134 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:14<00:23, 1.49s/it]
|
| 135 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:15<00:22, 1.51s/it]
|
| 136 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:17<00:21, 1.50s/it]
|
| 137 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:18<00:19, 1.52s/it]
|
| 138 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:20<00:19, 1.60s/it]
|
| 139 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:21<00:16, 1.52s/it]
|
| 140 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:24<00:18, 1.88s/it]
|
| 141 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:25<00:15, 1.70s/it]
|
| 142 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:27<00:12, 1.60s/it]
|
| 143 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:29<00:11, 1.66s/it]
|
| 144 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:30<00:10, 1.69s/it]
|
| 145 |
+
Calculating loss...: 80%|ββββββββ | 20/25 [00:32<00:08, 1.66s/it]
|
| 146 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:33<00:06, 1.54s/it]
|
| 147 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:35<00:04, 1.57s/it]
|
| 148 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:37<00:03, 1.73s/it]
|
| 149 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:38<00:01, 1.63s/it]
|
| 150 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:40<00:00, 1.54s/it]
|
| 151 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:40<00:00, 1.61s/it]
|
| 152 |
+
Iter 300: Val loss 0.475, Val took 40.184s
|
| 153 |
+
Iter 300: Train loss 0.520, Learning Rate 1.000e-05, It/sec 0.420, Tokens/sec 467.207, Trained Tokens 356356, Peak mem 37.123 GB
|
| 154 |
+
Iter 300: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000300_adapters.safetensors.
|
| 155 |
+
Iter 310: Train loss 0.429, Learning Rate 1.000e-05, It/sec 0.430, Tokens/sec 478.254, Trained Tokens 367482, Peak mem 37.123 GB
|
| 156 |
+
Iter 320: Train loss 0.420, Learning Rate 1.000e-05, It/sec 0.373, Tokens/sec 439.972, Trained Tokens 379266, Peak mem 37.123 GB
|
| 157 |
+
Iter 330: Train loss 0.480, Learning Rate 1.000e-05, It/sec 0.399, Tokens/sec 462.754, Trained Tokens 390875, Peak mem 37.123 GB
|
| 158 |
+
Iter 340: Train loss 0.456, Learning Rate 1.000e-05, It/sec 0.404, Tokens/sec 456.805, Trained Tokens 402182, Peak mem 37.123 GB
|
| 159 |
+
Iter 350: Train loss 0.639, Learning Rate 1.000e-05, It/sec 0.300, Tokens/sec 410.751, Trained Tokens 415852, Peak mem 37.972 GB
|
| 160 |
+
Iter 360: Train loss 0.522, Learning Rate 1.000e-05, It/sec 0.407, Tokens/sec 474.435, Trained Tokens 427505, Peak mem 37.972 GB
|
| 161 |
+
Iter 370: Train loss 0.525, Learning Rate 1.000e-05, It/sec 0.332, Tokens/sec 434.123, Trained Tokens 440587, Peak mem 37.972 GB
|
| 162 |
+
Iter 380: Train loss 0.424, Learning Rate 1.000e-05, It/sec 0.374, Tokens/sec 446.958, Trained Tokens 452536, Peak mem 37.972 GB
|
| 163 |
+
Iter 390: Train loss 0.428, Learning Rate 1.000e-05, It/sec 0.357, Tokens/sec 444.952, Trained Tokens 464991, Peak mem 37.972 GB
|
| 164 |
+
|
| 165 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 166 |
+
Calculating loss...: 4%|β | 1/25 [00:01<00:37, 1.55s/it]
|
| 167 |
+
Calculating loss...: 8%|β | 2/25 [00:03<00:34, 1.52s/it]
|
| 168 |
+
Calculating loss...: 12%|ββ | 3/25 [00:04<00:36, 1.64s/it]
|
| 169 |
+
Calculating loss...: 16%|ββ | 4/25 [00:08<00:48, 2.29s/it]
|
| 170 |
+
Calculating loss...: 20%|ββ | 5/25 [00:09<00:39, 1.96s/it]
|
| 171 |
+
Calculating loss...: 24%|βββ | 6/25 [00:10<00:33, 1.77s/it]
|
| 172 |
+
Calculating loss...: 28%|βββ | 7/25 [00:12<00:30, 1.70s/it]
|
| 173 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:13<00:27, 1.63s/it]
|
| 174 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:16<00:29, 1.83s/it]
|
| 175 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:17<00:25, 1.68s/it]
|
| 176 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:18<00:21, 1.56s/it]
|
| 177 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:20<00:19, 1.54s/it]
|
| 178 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:21<00:17, 1.48s/it]
|
| 179 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:23<00:15, 1.44s/it]
|
| 180 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:27<00:24, 2.41s/it]
|
| 181 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:29<00:20, 2.27s/it]
|
| 182 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:30<00:15, 2.00s/it]
|
| 183 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:33<00:15, 2.19s/it]
|
| 184 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:35<00:12, 2.07s/it]
|
| 185 |
+
Calculating loss...: 80%|ββββββββ | 20/25 [00:36<00:09, 1.85s/it]
|
| 186 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:38<00:06, 1.68s/it]
|
| 187 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:39<00:04, 1.55s/it]
|
| 188 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:40<00:03, 1.50s/it]
|
| 189 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:42<00:01, 1.47s/it]
|
| 190 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:43<00:00, 1.51s/it]
|
| 191 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:43<00:00, 1.75s/it]
|
| 192 |
+
Iter 400: Val loss 0.508, Val took 43.651s
|
| 193 |
+
Iter 400: Train loss 0.352, Learning Rate 1.000e-05, It/sec 0.346, Tokens/sec 403.495, Trained Tokens 476660, Peak mem 37.972 GB
|
| 194 |
+
Iter 400: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000400_adapters.safetensors.
|
| 195 |
+
Iter 410: Train loss 0.362, Learning Rate 1.000e-05, It/sec 0.354, Tokens/sec 436.195, Trained Tokens 488988, Peak mem 37.972 GB
|
| 196 |
+
Iter 420: Train loss 0.319, Learning Rate 1.000e-05, It/sec 0.412, Tokens/sec 477.171, Trained Tokens 500576, Peak mem 37.972 GB
|
| 197 |
+
Iter 430: Train loss 0.347, Learning Rate 1.000e-05, It/sec 0.394, Tokens/sec 461.272, Trained Tokens 512284, Peak mem 37.972 GB
|
| 198 |
+
Iter 440: Train loss 0.370, Learning Rate 1.000e-05, It/sec 0.375, Tokens/sec 436.054, Trained Tokens 523900, Peak mem 37.972 GB
|
| 199 |
+
Iter 450: Train loss 0.370, Learning Rate 1.000e-05, It/sec 0.375, Tokens/sec 460.385, Trained Tokens 536188, Peak mem 37.972 GB
|
| 200 |
+
Iter 460: Train loss 0.362, Learning Rate 1.000e-05, It/sec 0.401, Tokens/sec 460.743, Trained Tokens 547689, Peak mem 37.972 GB
|
| 201 |
+
Iter 470: Train loss 0.343, Learning Rate 1.000e-05, It/sec 0.391, Tokens/sec 464.058, Trained Tokens 559569, Peak mem 37.972 GB
|
| 202 |
+
Iter 480: Train loss 0.375, Learning Rate 1.000e-05, It/sec 0.358, Tokens/sec 445.015, Trained Tokens 572002, Peak mem 37.972 GB
|
| 203 |
+
Iter 490: Train loss 0.338, Learning Rate 1.000e-05, It/sec 0.408, Tokens/sec 471.137, Trained Tokens 583561, Peak mem 37.972 GB
|
| 204 |
+
|
| 205 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 206 |
+
Calculating loss...: 4%|β | 1/25 [00:01<00:35, 1.49s/it]
|
| 207 |
+
Calculating loss...: 8%|β | 2/25 [00:02<00:32, 1.40s/it]
|
| 208 |
+
Calculating loss...: 12%|ββ | 3/25 [00:04<00:31, 1.41s/it]
|
| 209 |
+
Calculating loss...: 16%|ββ | 4/25 [00:05<00:30, 1.44s/it]
|
| 210 |
+
Calculating loss...: 20%|ββ | 5/25 [00:07<00:31, 1.57s/it]
|
| 211 |
+
Calculating loss...: 24%|βββ | 6/25 [00:08<00:27, 1.47s/it]
|
| 212 |
+
Calculating loss...: 28%|βββ | 7/25 [00:10<00:28, 1.57s/it]
|
| 213 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:11<00:25, 1.49s/it]
|
| 214 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:13<00:24, 1.54s/it]
|
| 215 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:14<00:22, 1.50s/it]
|
| 216 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:16<00:20, 1.47s/it]
|
| 217 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:17<00:19, 1.48s/it]
|
| 218 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:19<00:17, 1.48s/it]
|
| 219 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:20<00:16, 1.48s/it]
|
| 220 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:23<00:18, 1.83s/it]
|
| 221 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:24<00:15, 1.73s/it]
|
| 222 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:26<00:12, 1.61s/it]
|
| 223 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:27<00:10, 1.53s/it]
|
| 224 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:28<00:08, 1.47s/it]
|
| 225 |
+
Calculating loss...: 80%|ββββββββ | 20/25 [00:30<00:07, 1.41s/it]
|
| 226 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:31<00:05, 1.45s/it]
|
| 227 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:33<00:04, 1.44s/it]
|
| 228 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:34<00:02, 1.43s/it]
|
| 229 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:36<00:01, 1.47s/it]
|
| 230 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:37<00:00, 1.47s/it]
|
| 231 |
+
Calculating loss...: 100%|██████████| 25/25 [00:37<00:00, 1.51s/it]
|
| 232 |
+
Iter 500: Val loss 0.472, Val took 37.645s
|
| 233 |
+
Iter 500: Train loss 0.357, Learning Rate 1.000e-05, It/sec 0.371, Tokens/sec 445.737, Trained Tokens 595562, Peak mem 37.972 GB
|
| 234 |
+
Iter 500: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000500_adapters.safetensors.
|
| 235 |
+
Iter 510: Train loss 0.461, Learning Rate 1.000e-05, It/sec 0.360, Tokens/sec 451.720, Trained Tokens 608094, Peak mem 37.972 GB
|
| 236 |
+
Iter 520: Train loss 0.350, Learning Rate 1.000e-05, It/sec 0.396, Tokens/sec 460.678, Trained Tokens 619734, Peak mem 37.972 GB
|
| 237 |
+
Iter 530: Train loss 0.403, Learning Rate 1.000e-05, It/sec 0.380, Tokens/sec 459.285, Trained Tokens 631829, Peak mem 37.972 GB
|
| 238 |
+
Iter 540: Train loss 0.311, Learning Rate 1.000e-05, It/sec 0.436, Tokens/sec 473.710, Trained Tokens 642682, Peak mem 37.972 GB
|
| 239 |
+
Iter 550: Train loss 0.350, Learning Rate 1.000e-05, It/sec 0.393, Tokens/sec 459.128, Trained Tokens 654367, Peak mem 37.972 GB
|
| 240 |
+
Iter 560: Train loss 0.484, Learning Rate 1.000e-05, It/sec 0.319, Tokens/sec 420.622, Trained Tokens 667572, Peak mem 37.972 GB
|
| 241 |
+
Iter 570: Train loss 0.390, Learning Rate 1.000e-05, It/sec 0.394, Tokens/sec 464.738, Trained Tokens 679358, Peak mem 37.972 GB
|
| 242 |
+
Iter 580: Train loss 0.364, Learning Rate 1.000e-05, It/sec 0.399, Tokens/sec 460.012, Trained Tokens 690891, Peak mem 37.972 GB
|
| 243 |
+
Iter 590: Train loss 0.337, Learning Rate 1.000e-05, It/sec 0.401, Tokens/sec 468.869, Trained Tokens 702595, Peak mem 37.972 GB
|
| 244 |
+
|
| 245 |
+
Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
|
| 246 |
+
Calculating loss...: 4%|β | 1/25 [00:01<00:42, 1.78s/it]
|
| 247 |
+
Calculating loss...: 8%|β | 2/25 [00:03<00:35, 1.56s/it]
|
| 248 |
+
Calculating loss...: 12%|ββ | 3/25 [00:04<00:32, 1.46s/it]
|
| 249 |
+
Calculating loss...: 16%|ββ | 4/25 [00:06<00:30, 1.47s/it]
|
| 250 |
+
Calculating loss...: 20%|ββ | 5/25 [00:07<00:30, 1.51s/it]
|
| 251 |
+
Calculating loss...: 24%|βββ | 6/25 [00:09<00:28, 1.50s/it]
|
| 252 |
+
Calculating loss...: 28%|βββ | 7/25 [00:10<00:26, 1.45s/it]
|
| 253 |
+
Calculating loss...: 32%|ββββ | 8/25 [00:11<00:24, 1.46s/it]
|
| 254 |
+
Calculating loss...: 36%|ββββ | 9/25 [00:13<00:24, 1.56s/it]
|
| 255 |
+
Calculating loss...: 40%|ββββ | 10/25 [00:15<00:24, 1.61s/it]
|
| 256 |
+
Calculating loss...: 44%|βββββ | 11/25 [00:17<00:25, 1.79s/it]
|
| 257 |
+
Calculating loss...: 48%|βββββ | 12/25 [00:19<00:22, 1.70s/it]
|
| 258 |
+
Calculating loss...: 52%|ββββββ | 13/25 [00:20<00:19, 1.61s/it]
|
| 259 |
+
Calculating loss...: 56%|ββββββ | 14/25 [00:21<00:16, 1.53s/it]
|
| 260 |
+
Calculating loss...: 60%|ββββββ | 15/25 [00:23<00:14, 1.47s/it]
|
| 261 |
+
Calculating loss...: 64%|βββββββ | 16/25 [00:24<00:12, 1.40s/it]
|
| 262 |
+
Calculating loss...: 68%|βββββββ | 17/25 [00:25<00:10, 1.36s/it]
|
| 263 |
+
Calculating loss...: 72%|ββββββββ | 18/25 [00:27<00:09, 1.41s/it]
|
| 264 |
+
Calculating loss...: 76%|ββββββββ | 19/25 [00:28<00:08, 1.37s/it]
|
| 265 |
+
Calculating loss...: 80%|ββββββββ | 20/25 [00:29<00:06, 1.36s/it]
|
| 266 |
+
Calculating loss...: 84%|βββββββββ | 21/25 [00:31<00:05, 1.40s/it]
|
| 267 |
+
Calculating loss...: 88%|βββββββββ | 22/25 [00:32<00:04, 1.36s/it]
|
| 268 |
+
Calculating loss...: 92%|ββββββββββ| 23/25 [00:34<00:02, 1.42s/it]
|
| 269 |
+
Calculating loss...: 96%|ββββββββββ| 24/25 [00:35<00:01, 1.44s/it]
|
| 270 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:36<00:00, 1.41s/it]
|
| 271 |
+
Calculating loss...: 100%|ββββββββββ| 25/25 [00:36<00:00, 1.48s/it]
|
| 272 |
+
Iter 600: Val loss 0.427, Val took 36.973s
|
| 273 |
+
Iter 600: Train loss 0.346, Learning Rate 1.000e-05, It/sec 0.427, Tokens/sec 474.136, Trained Tokens 713698, Peak mem 37.972 GB
|
| 274 |
+
Iter 600: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000600_adapters.safetensors.
|
| 275 |
+
Saved final weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors.
|
| 276 |
+
Testing
|
| 277 |
+
|
| 278 |
+
Calculating loss...: 0%| | 0/50 [00:00<?, ?it/s]
|
| 279 |
+
Calculating loss...: 2%|β | 1/50 [00:01<01:12, 1.49s/it]
|
| 280 |
+
Calculating loss...: 4%|β | 2/50 [00:02<01:04, 1.35s/it]
|
| 281 |
+
Calculating loss...: 6%|β | 3/50 [00:04<01:11, 1.52s/it]
|
| 282 |
+
Calculating loss...: 8%|β | 4/50 [00:06<01:10, 1.54s/it]
|
| 283 |
+
Calculating loss...: 10%|β | 5/50 [00:07<01:08, 1.52s/it]
|
| 284 |
+
Calculating loss...: 12%|ββ | 6/50 [00:09<01:07, 1.54s/it]
|
| 285 |
+
Calculating loss...: 14%|ββ | 7/50 [00:10<01:05, 1.52s/it]
|
| 286 |
+
Calculating loss...: 16%|ββ | 8/50 [00:11<01:01, 1.46s/it]
|
| 287 |
+
Calculating loss...: 18%|ββ | 9/50 [00:13<00:57, 1.40s/it]
|
| 288 |
+
Calculating loss...: 20%|ββ | 10/50 [00:14<00:59, 1.49s/it]
|
| 289 |
+
Calculating loss...: 22%|βββ | 11/50 [00:16<00:57, 1.47s/it]
|
| 290 |
+
Calculating loss...: 24%|βββ | 12/50 [00:17<00:53, 1.41s/it]
|
| 291 |
+
Calculating loss...: 26%|βββ | 13/50 [00:18<00:51, 1.38s/it]
|
| 292 |
+
Calculating loss...: 28%|βββ | 14/50 [00:20<00:49, 1.39s/it]
|
| 293 |
+
Calculating loss...: 30%|βββ | 15/50 [00:21<00:47, 1.35s/it]
|
| 294 |
+
Calculating loss...: 32%|ββββ | 16/50 [00:23<00:48, 1.44s/it]
|
| 295 |
+
Calculating loss...: 34%|ββββ | 17/50 [00:24<00:49, 1.50s/it]
|
| 296 |
+
Calculating loss...: 36%|ββββ | 18/50 [00:26<00:48, 1.50s/it]
|
| 297 |
+
Calculating loss...: 38%|ββββ | 19/50 [00:27<00:45, 1.45s/it]
|
| 298 |
+
Calculating loss...: 40%|ββββ | 20/50 [00:29<00:42, 1.42s/it]
|
| 299 |
+
Calculating loss...: 42%|βββββ | 21/50 [00:30<00:43, 1.49s/it]
|
| 300 |
+
Calculating loss...: 44%|βββββ | 22/50 [00:32<00:41, 1.49s/it]
|
| 301 |
+
Calculating loss...: 46%|βββββ | 23/50 [00:33<00:38, 1.44s/it]
|
| 302 |
+
Calculating loss...: 48%|βββββ | 24/50 [00:34<00:36, 1.41s/it]
|
| 303 |
+
Calculating loss...: 50%|βββββ | 25/50 [00:36<00:37, 1.48s/it]
|
| 304 |
+
Calculating loss...: 52%|ββββββ | 26/50 [00:37<00:33, 1.41s/it]
|
| 305 |
+
Calculating loss...: 54%|ββββββ | 27/50 [00:39<00:31, 1.39s/it]
|
| 306 |
+
Calculating loss...: 56%|ββββββ | 28/50 [00:43<00:52, 2.39s/it]
|
| 307 |
+
Calculating loss...: 58%|ββββββ | 29/50 [00:46<00:50, 2.39s/it]
|
| 308 |
+
Calculating loss...: 60%|ββββββ | 30/50 [00:47<00:43, 2.17s/it]
|
| 309 |
+
Calculating loss...: 62%|βββββββ | 31/50 [00:49<00:36, 1.94s/it]
|
| 310 |
+
Calculating loss...: 64%|βββββββ | 32/50 [00:51<00:38, 2.16s/it]
|
| 311 |
+
Calculating loss...: 66%|βββββββ | 33/50 [00:53<00:34, 2.04s/it]
|
| 312 |
+
Calculating loss...: 68%|βββββββ | 34/50 [00:55<00:30, 1.88s/it]
|
| 313 |
+
Calculating loss...: 70%|βββββββ | 35/50 [00:57<00:31, 2.08s/it]
|
| 314 |
+
Calculating loss...: 72%|ββββββββ | 36/50 [00:59<00:26, 1.88s/it]
|
| 315 |
+
Calculating loss...: 74%|ββββββββ | 37/50 [01:00<00:24, 1.85s/it]
|
| 316 |
+
Calculating loss...: 76%|ββββββββ | 38/50 [01:02<00:21, 1.79s/it]
|
| 317 |
+
Calculating loss...: 78%|ββββββββ | 39/50 [01:03<00:18, 1.68s/it]
|
| 318 |
+
Calculating loss...: 80%|ββββββββ | 40/50 [01:06<00:19, 1.99s/it]
|
| 319 |
+
Calculating loss...: 82%|βββββββββ | 41/50 [01:08<00:16, 1.84s/it]
|
| 320 |
+
Calculating loss...: 84%|βββββββββ | 42/50 [01:09<00:14, 1.82s/it]
|
| 321 |
+
Calculating loss...: 86%|βββββββββ | 43/50 [01:11<00:12, 1.77s/it]
|
| 322 |
+
Calculating loss...: 88%|βββββββββ | 44/50 [01:13<00:10, 1.77s/it]
|
| 323 |
+
Calculating loss...: 90%|βββββββββ | 45/50 [01:15<00:08, 1.78s/it]
|
| 324 |
+
Calculating loss...: 92%|ββββββββββ| 46/50 [01:16<00:06, 1.70s/it]
|
| 325 |
+
Calculating loss...: 94%|ββββββββββ| 47/50 [01:18<00:04, 1.63s/it]
|
| 326 |
+
Calculating loss...: 96%|ββββββββββ| 48/50 [01:19<00:03, 1.56s/it]
|
| 327 |
+
Calculating loss...: 98%|ββββββββββ| 49/50 [01:20<00:01, 1.51s/it]
|
| 328 |
+
Calculating loss...: 100%|ββββββββββ| 50/50 [01:22<00:00, 1.59s/it]
|
| 329 |
+
Calculating loss...: 100%|ββββββββββ| 50/50 [01:22<00:00, 1.65s/it]
|
| 330 |
+
Test loss 0.474, Test ppl 1.606.
|