erikbranmarino committed on
Commit
19b827b
·
verified ·
1 Parent(s): d5737a7

Upload folder using huggingface_hub

Browse files
0000100_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b268b38ba34abff6a6b1c21d11b1b7184b785932a494fbdaef5c4b8cb04fac96
3
+ size 41967272
0000200_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a94fba5a9400fc8ba993bfd1fea6788185fc54282ffe77a061f06c75ca412e04
3
+ size 41967272
0000300_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be8d8b6b18e3e0b4253ba393c4137fb3b475dd52464b1a1f2b2fb0f7e7eceb21
3
+ size 41967272
0000400_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b695f45f86cafbeb670af96d63042e3afb2a6c57e07d17e95e326e6dca29040e
3
+ size 41967272
0000500_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c55fd2fce6124e01a895591b97d66074420788bdd412c3576e4eb0ba97e699
3
+ size 41967272
0000600_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07caf257c0f9147b9d07047a7ab7edb5040afd2cac886f4f310cfe82991fe9e
3
+ size 41967272
adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adapter_path": "models/lora/mistral_lora_telegram_20251111_114741",
3
+ "batch_size": 2,
4
+ "config": null,
5
+ "data": "data/phase2/mlx_datasets/telegram",
6
+ "fine_tune_type": "lora",
7
+ "grad_accumulation_steps": 1,
8
+ "grad_checkpoint": false,
9
+ "iters": 600,
10
+ "learning_rate": 1e-05,
11
+ "lora_parameters": {
12
+ "rank": 8,
13
+ "dropout": 0.0,
14
+ "scale": 20.0
15
+ },
16
+ "lr_schedule": null,
17
+ "mask_prompt": false,
18
+ "max_seq_length": 2048,
19
+ "model": "models/mistral-7b-instruct-v0.3-mlx",
20
+ "num_layers": 16,
21
+ "optimizer": "adam",
22
+ "optimizer_config": {
23
+ "adam": {},
24
+ "adamw": {},
25
+ "muon": {},
26
+ "sgd": {},
27
+ "adafactor": {}
28
+ },
29
+ "project_name": null,
30
+ "report_to": null,
31
+ "resume_adapter_file": null,
32
+ "save_every": 100,
33
+ "seed": 42,
34
+ "steps_per_eval": 100,
35
+ "steps_per_report": 10,
36
+ "test": true,
37
+ "test_batches": 50,
38
+ "train": true,
39
+ "val_batches": 25
40
+ }
adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07caf257c0f9147b9d07047a7ab7edb5040afd2cac886f4f310cfe82991fe9e
3
+ size 41967272
training.log ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Loading pretrained model
2
+ Loading datasets
3
+ Training
4
+ Trainable parameters: 0.145% (10.486M/7248.024M)
5
+ Starting training..., iters: 600
6
+
7
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
8
+ Calculating loss...: 4%|▍ | 1/25 [00:01<00:32, 1.36s/it]
9
+ Calculating loss...: 8%|β–Š | 2/25 [00:02<00:33, 1.47s/it]
10
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:04<00:34, 1.55s/it]
11
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:05<00:30, 1.47s/it]
12
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:07<00:28, 1.43s/it]
13
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:09<00:34, 1.82s/it]
14
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:11<00:31, 1.72s/it]
15
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:13<00:29, 1.74s/it]
16
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:14<00:27, 1.69s/it]
17
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:16<00:25, 1.68s/it]
18
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:17<00:21, 1.56s/it]
19
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:19<00:20, 1.56s/it]
20
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:20<00:17, 1.50s/it]
21
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:22<00:16, 1.48s/it]
22
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:23<00:15, 1.60s/it]
23
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:25<00:14, 1.58s/it]
24
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:26<00:12, 1.56s/it]
25
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:28<00:11, 1.63s/it]
26
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:30<00:10, 1.68s/it]
27
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:31<00:07, 1.59s/it]
28
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:33<00:06, 1.52s/it]
29
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:34<00:04, 1.44s/it]
30
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:36<00:02, 1.45s/it]
31
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:37<00:01, 1.51s/it]
32
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:39<00:00, 1.60s/it]
33
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:39<00:00, 1.58s/it]
34
+ Iter 1: Val loss 2.214, Val took 39.464s
35
+ Iter 10: Train loss 1.065, Learning Rate 1.000e-05, It/sec 0.362, Tokens/sec 431.356, Trained Tokens 11910, Peak mem 24.920 GB
36
+ Iter 20: Train loss 0.755, Learning Rate 1.000e-05, It/sec 0.263, Tokens/sec 350.838, Trained Tokens 25233, Peak mem 37.081 GB
37
+ Iter 30: Train loss 0.688, Learning Rate 1.000e-05, It/sec 0.352, Tokens/sec 429.868, Trained Tokens 37431, Peak mem 37.081 GB
38
+ Iter 40: Train loss 0.557, Learning Rate 1.000e-05, It/sec 0.340, Tokens/sec 403.603, Trained Tokens 49302, Peak mem 37.081 GB
39
+ Iter 50: Train loss 0.663, Learning Rate 1.000e-05, It/sec 0.296, Tokens/sec 387.318, Trained Tokens 62366, Peak mem 37.081 GB
40
+ Iter 60: Train loss 0.609, Learning Rate 1.000e-05, It/sec 0.345, Tokens/sec 424.207, Trained Tokens 74645, Peak mem 37.081 GB
41
+ Iter 70: Train loss 0.614, Learning Rate 1.000e-05, It/sec 0.344, Tokens/sec 421.161, Trained Tokens 86901, Peak mem 37.081 GB
42
+ Iter 80: Train loss 0.607, Learning Rate 1.000e-05, It/sec 0.348, Tokens/sec 423.462, Trained Tokens 99079, Peak mem 37.081 GB
43
+ Iter 90: Train loss 0.556, Learning Rate 1.000e-05, It/sec 0.373, Tokens/sec 439.181, Trained Tokens 110838, Peak mem 37.081 GB
44
+
45
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
46
+ Calculating loss...: 4%|▍ | 1/25 [00:01<00:43, 1.83s/it]
47
+ Calculating loss...: 8%|β–Š | 2/25 [00:03<00:37, 1.65s/it]
48
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:04<00:34, 1.55s/it]
49
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:06<00:31, 1.51s/it]
50
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:07<00:29, 1.46s/it]
51
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:09<00:28, 1.48s/it]
52
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:10<00:26, 1.49s/it]
53
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:11<00:24, 1.44s/it]
54
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:13<00:22, 1.41s/it]
55
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:14<00:21, 1.45s/it]
56
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:16<00:21, 1.54s/it]
57
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:17<00:19, 1.49s/it]
58
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:19<00:18, 1.50s/it]
59
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:21<00:16, 1.50s/it]
60
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:22<00:15, 1.51s/it]
61
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:24<00:15, 1.72s/it]
62
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:26<00:13, 1.67s/it]
63
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:29<00:13, 1.99s/it]
64
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:30<00:10, 1.82s/it]
65
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:33<00:10, 2.12s/it]
66
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:34<00:07, 1.89s/it]
67
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:36<00:05, 1.75s/it]
68
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:37<00:03, 1.68s/it]
69
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:38<00:01, 1.61s/it]
70
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:40<00:00, 1.60s/it]
71
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:40<00:00, 1.62s/it]
72
+ Iter 100: Val loss 0.553, Val took 40.597s
73
+ Iter 100: Train loss 0.487, Learning Rate 1.000e-05, It/sec 0.317, Tokens/sec 358.772, Trained Tokens 122142, Peak mem 37.081 GB
74
+ Iter 100: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000100_adapters.safetensors.
75
+ Iter 110: Train loss 0.556, Learning Rate 1.000e-05, It/sec 0.366, Tokens/sec 430.562, Trained Tokens 133896, Peak mem 37.081 GB
76
+ Iter 120: Train loss 0.550, Learning Rate 1.000e-05, It/sec 0.348, Tokens/sec 437.090, Trained Tokens 146466, Peak mem 37.081 GB
77
+ Iter 130: Train loss 0.499, Learning Rate 1.000e-05, It/sec 0.349, Tokens/sec 430.415, Trained Tokens 158812, Peak mem 37.081 GB
78
+ Iter 140: Train loss 0.393, Learning Rate 1.000e-05, It/sec 0.436, Tokens/sec 465.004, Trained Tokens 169489, Peak mem 37.081 GB
79
+ Iter 150: Train loss 0.506, Learning Rate 1.000e-05, It/sec 0.372, Tokens/sec 438.469, Trained Tokens 181272, Peak mem 37.081 GB
80
+ Iter 160: Train loss 0.484, Learning Rate 1.000e-05, It/sec 0.375, Tokens/sec 443.286, Trained Tokens 193104, Peak mem 37.081 GB
81
+ Iter 170: Train loss 0.480, Learning Rate 1.000e-05, It/sec 0.399, Tokens/sec 448.274, Trained Tokens 204339, Peak mem 37.081 GB
82
+ Iter 180: Train loss 0.491, Learning Rate 1.000e-05, It/sec 0.374, Tokens/sec 442.924, Trained Tokens 216181, Peak mem 37.081 GB
83
+ Iter 190: Train loss 0.612, Learning Rate 1.000e-05, It/sec 0.259, Tokens/sec 348.364, Trained Tokens 229607, Peak mem 37.123 GB
84
+
85
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
86
+ Calculating loss...: 4%|▍ | 1/25 [00:03<01:30, 3.75s/it]
87
+ Calculating loss...: 8%|β–Š | 2/25 [00:05<01:00, 2.64s/it]
88
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:07<00:54, 2.47s/it]
89
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:10<00:49, 2.36s/it]
90
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:11<00:39, 1.99s/it]
91
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:13<00:37, 1.96s/it]
92
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:14<00:31, 1.74s/it]
93
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:16<00:28, 1.68s/it]
94
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:19<00:33, 2.10s/it]
95
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:21<00:30, 2.02s/it]
96
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:22<00:26, 1.88s/it]
97
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:24<00:23, 1.83s/it]
98
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:25<00:20, 1.72s/it]
99
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:27<00:19, 1.75s/it]
100
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:30<00:21, 2.15s/it]
101
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:32<00:17, 1.99s/it]
102
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:34<00:15, 1.95s/it]
103
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:35<00:12, 1.75s/it]
104
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:37<00:10, 1.72s/it]
105
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:39<00:08, 1.78s/it]
106
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:40<00:06, 1.65s/it]
107
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:41<00:04, 1.64s/it]
108
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:43<00:03, 1.70s/it]
109
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:45<00:01, 1.63s/it]
110
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:46<00:00, 1.59s/it]
111
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:46<00:00, 1.87s/it]
112
+ Iter 200: Val loss 0.499, Val took 46.784s
113
+ Iter 200: Train loss 0.426, Learning Rate 1.000e-05, It/sec 0.364, Tokens/sec 420.021, Trained Tokens 241139, Peak mem 37.123 GB
114
+ Iter 200: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000200_adapters.safetensors.
115
+ Iter 210: Train loss 0.402, Learning Rate 1.000e-05, It/sec 0.364, Tokens/sec 403.442, Trained Tokens 252219, Peak mem 37.123 GB
116
+ Iter 220: Train loss 0.432, Learning Rate 1.000e-05, It/sec 0.408, Tokens/sec 467.043, Trained Tokens 263665, Peak mem 37.123 GB
117
+ Iter 230: Train loss 0.562, Learning Rate 1.000e-05, It/sec 0.369, Tokens/sec 447.334, Trained Tokens 275796, Peak mem 37.123 GB
118
+ Iter 240: Train loss 0.491, Learning Rate 1.000e-05, It/sec 0.396, Tokens/sec 461.514, Trained Tokens 287447, Peak mem 37.123 GB
119
+ Iter 250: Train loss 0.460, Learning Rate 1.000e-05, It/sec 0.391, Tokens/sec 453.744, Trained Tokens 299037, Peak mem 37.123 GB
120
+ Iter 260: Train loss 0.462, Learning Rate 1.000e-05, It/sec 0.403, Tokens/sec 470.594, Trained Tokens 310708, Peak mem 37.123 GB
121
+ Iter 270: Train loss 0.411, Learning Rate 1.000e-05, It/sec 0.397, Tokens/sec 455.131, Trained Tokens 322185, Peak mem 37.123 GB
122
+ Iter 280: Train loss 0.388, Learning Rate 1.000e-05, It/sec 0.386, Tokens/sec 447.749, Trained Tokens 333784, Peak mem 37.123 GB
123
+ Iter 290: Train loss 0.506, Learning Rate 1.000e-05, It/sec 0.401, Tokens/sec 459.746, Trained Tokens 345244, Peak mem 37.123 GB
124
+
125
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
126
+ Calculating loss...: 4%|▍ | 1/25 [00:01<00:33, 1.40s/it]
127
+ Calculating loss...: 8%|β–Š | 2/25 [00:04<00:48, 2.13s/it]
128
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:05<00:40, 1.83s/it]
129
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:07<00:35, 1.70s/it]
130
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:08<00:32, 1.62s/it]
131
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:09<00:29, 1.55s/it]
132
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:11<00:26, 1.48s/it]
133
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:12<00:24, 1.46s/it]
134
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:14<00:23, 1.49s/it]
135
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:15<00:22, 1.51s/it]
136
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:17<00:21, 1.50s/it]
137
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:18<00:19, 1.52s/it]
138
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:20<00:19, 1.60s/it]
139
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:21<00:16, 1.52s/it]
140
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:24<00:18, 1.88s/it]
141
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:25<00:15, 1.70s/it]
142
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:27<00:12, 1.60s/it]
143
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:29<00:11, 1.66s/it]
144
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:30<00:10, 1.69s/it]
145
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:32<00:08, 1.66s/it]
146
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:33<00:06, 1.54s/it]
147
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:35<00:04, 1.57s/it]
148
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:37<00:03, 1.73s/it]
149
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:38<00:01, 1.63s/it]
150
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:40<00:00, 1.54s/it]
151
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:40<00:00, 1.61s/it]
152
+ Iter 300: Val loss 0.475, Val took 40.184s
153
+ Iter 300: Train loss 0.520, Learning Rate 1.000e-05, It/sec 0.420, Tokens/sec 467.207, Trained Tokens 356356, Peak mem 37.123 GB
154
+ Iter 300: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000300_adapters.safetensors.
155
+ Iter 310: Train loss 0.429, Learning Rate 1.000e-05, It/sec 0.430, Tokens/sec 478.254, Trained Tokens 367482, Peak mem 37.123 GB
156
+ Iter 320: Train loss 0.420, Learning Rate 1.000e-05, It/sec 0.373, Tokens/sec 439.972, Trained Tokens 379266, Peak mem 37.123 GB
157
+ Iter 330: Train loss 0.480, Learning Rate 1.000e-05, It/sec 0.399, Tokens/sec 462.754, Trained Tokens 390875, Peak mem 37.123 GB
158
+ Iter 340: Train loss 0.456, Learning Rate 1.000e-05, It/sec 0.404, Tokens/sec 456.805, Trained Tokens 402182, Peak mem 37.123 GB
159
+ Iter 350: Train loss 0.639, Learning Rate 1.000e-05, It/sec 0.300, Tokens/sec 410.751, Trained Tokens 415852, Peak mem 37.972 GB
160
+ Iter 360: Train loss 0.522, Learning Rate 1.000e-05, It/sec 0.407, Tokens/sec 474.435, Trained Tokens 427505, Peak mem 37.972 GB
161
+ Iter 370: Train loss 0.525, Learning Rate 1.000e-05, It/sec 0.332, Tokens/sec 434.123, Trained Tokens 440587, Peak mem 37.972 GB
162
+ Iter 380: Train loss 0.424, Learning Rate 1.000e-05, It/sec 0.374, Tokens/sec 446.958, Trained Tokens 452536, Peak mem 37.972 GB
163
+ Iter 390: Train loss 0.428, Learning Rate 1.000e-05, It/sec 0.357, Tokens/sec 444.952, Trained Tokens 464991, Peak mem 37.972 GB
164
+
165
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
166
+ Calculating loss...: 4%|▍ | 1/25 [00:01<00:37, 1.55s/it]
167
+ Calculating loss...: 8%|β–Š | 2/25 [00:03<00:34, 1.52s/it]
168
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:04<00:36, 1.64s/it]
169
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:08<00:48, 2.29s/it]
170
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:09<00:39, 1.96s/it]
171
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:10<00:33, 1.77s/it]
172
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:12<00:30, 1.70s/it]
173
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:13<00:27, 1.63s/it]
174
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:16<00:29, 1.83s/it]
175
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:17<00:25, 1.68s/it]
176
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:18<00:21, 1.56s/it]
177
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:20<00:19, 1.54s/it]
178
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:21<00:17, 1.48s/it]
179
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:23<00:15, 1.44s/it]
180
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:27<00:24, 2.41s/it]
181
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:29<00:20, 2.27s/it]
182
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:30<00:15, 2.00s/it]
183
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:33<00:15, 2.19s/it]
184
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:35<00:12, 2.07s/it]
185
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:36<00:09, 1.85s/it]
186
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:38<00:06, 1.68s/it]
187
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:39<00:04, 1.55s/it]
188
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:40<00:03, 1.50s/it]
189
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:42<00:01, 1.47s/it]
190
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:43<00:00, 1.51s/it]
191
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:43<00:00, 1.75s/it]
192
+ Iter 400: Val loss 0.508, Val took 43.651s
193
+ Iter 400: Train loss 0.352, Learning Rate 1.000e-05, It/sec 0.346, Tokens/sec 403.495, Trained Tokens 476660, Peak mem 37.972 GB
194
+ Iter 400: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000400_adapters.safetensors.
195
+ Iter 410: Train loss 0.362, Learning Rate 1.000e-05, It/sec 0.354, Tokens/sec 436.195, Trained Tokens 488988, Peak mem 37.972 GB
196
+ Iter 420: Train loss 0.319, Learning Rate 1.000e-05, It/sec 0.412, Tokens/sec 477.171, Trained Tokens 500576, Peak mem 37.972 GB
197
+ Iter 430: Train loss 0.347, Learning Rate 1.000e-05, It/sec 0.394, Tokens/sec 461.272, Trained Tokens 512284, Peak mem 37.972 GB
198
+ Iter 440: Train loss 0.370, Learning Rate 1.000e-05, It/sec 0.375, Tokens/sec 436.054, Trained Tokens 523900, Peak mem 37.972 GB
199
+ Iter 450: Train loss 0.370, Learning Rate 1.000e-05, It/sec 0.375, Tokens/sec 460.385, Trained Tokens 536188, Peak mem 37.972 GB
200
+ Iter 460: Train loss 0.362, Learning Rate 1.000e-05, It/sec 0.401, Tokens/sec 460.743, Trained Tokens 547689, Peak mem 37.972 GB
201
+ Iter 470: Train loss 0.343, Learning Rate 1.000e-05, It/sec 0.391, Tokens/sec 464.058, Trained Tokens 559569, Peak mem 37.972 GB
202
+ Iter 480: Train loss 0.375, Learning Rate 1.000e-05, It/sec 0.358, Tokens/sec 445.015, Trained Tokens 572002, Peak mem 37.972 GB
203
+ Iter 490: Train loss 0.338, Learning Rate 1.000e-05, It/sec 0.408, Tokens/sec 471.137, Trained Tokens 583561, Peak mem 37.972 GB
204
+
205
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
206
+ Calculating loss...: 4%|▍ | 1/25 [00:01<00:35, 1.49s/it]
207
+ Calculating loss...: 8%|β–Š | 2/25 [00:02<00:32, 1.40s/it]
208
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:04<00:31, 1.41s/it]
209
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:05<00:30, 1.44s/it]
210
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:07<00:31, 1.57s/it]
211
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:08<00:27, 1.47s/it]
212
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:10<00:28, 1.57s/it]
213
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:11<00:25, 1.49s/it]
214
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:13<00:24, 1.54s/it]
215
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:14<00:22, 1.50s/it]
216
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:16<00:20, 1.47s/it]
217
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:17<00:19, 1.48s/it]
218
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:19<00:17, 1.48s/it]
219
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:20<00:16, 1.48s/it]
220
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:23<00:18, 1.83s/it]
221
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:24<00:15, 1.73s/it]
222
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:26<00:12, 1.61s/it]
223
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:27<00:10, 1.53s/it]
224
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:28<00:08, 1.47s/it]
225
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:30<00:07, 1.41s/it]
226
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:31<00:05, 1.45s/it]
227
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:33<00:04, 1.44s/it]
228
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:34<00:02, 1.43s/it]
229
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:36<00:01, 1.47s/it]
230
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:37<00:00, 1.47s/it]
231
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:37<00:00, 1.51s/it]
232
+ Iter 500: Val loss 0.472, Val took 37.645s
233
+ Iter 500: Train loss 0.357, Learning Rate 1.000e-05, It/sec 0.371, Tokens/sec 445.737, Trained Tokens 595562, Peak mem 37.972 GB
234
+ Iter 500: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000500_adapters.safetensors.
235
+ Iter 510: Train loss 0.461, Learning Rate 1.000e-05, It/sec 0.360, Tokens/sec 451.720, Trained Tokens 608094, Peak mem 37.972 GB
236
+ Iter 520: Train loss 0.350, Learning Rate 1.000e-05, It/sec 0.396, Tokens/sec 460.678, Trained Tokens 619734, Peak mem 37.972 GB
237
+ Iter 530: Train loss 0.403, Learning Rate 1.000e-05, It/sec 0.380, Tokens/sec 459.285, Trained Tokens 631829, Peak mem 37.972 GB
238
+ Iter 540: Train loss 0.311, Learning Rate 1.000e-05, It/sec 0.436, Tokens/sec 473.710, Trained Tokens 642682, Peak mem 37.972 GB
239
+ Iter 550: Train loss 0.350, Learning Rate 1.000e-05, It/sec 0.393, Tokens/sec 459.128, Trained Tokens 654367, Peak mem 37.972 GB
240
+ Iter 560: Train loss 0.484, Learning Rate 1.000e-05, It/sec 0.319, Tokens/sec 420.622, Trained Tokens 667572, Peak mem 37.972 GB
241
+ Iter 570: Train loss 0.390, Learning Rate 1.000e-05, It/sec 0.394, Tokens/sec 464.738, Trained Tokens 679358, Peak mem 37.972 GB
242
+ Iter 580: Train loss 0.364, Learning Rate 1.000e-05, It/sec 0.399, Tokens/sec 460.012, Trained Tokens 690891, Peak mem 37.972 GB
243
+ Iter 590: Train loss 0.337, Learning Rate 1.000e-05, It/sec 0.401, Tokens/sec 468.869, Trained Tokens 702595, Peak mem 37.972 GB
244
+
245
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
246
+ Calculating loss...: 4%|▍ | 1/25 [00:01<00:42, 1.78s/it]
247
+ Calculating loss...: 8%|β–Š | 2/25 [00:03<00:35, 1.56s/it]
248
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:04<00:32, 1.46s/it]
249
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:06<00:30, 1.47s/it]
250
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:07<00:30, 1.51s/it]
251
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:09<00:28, 1.50s/it]
252
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:10<00:26, 1.45s/it]
253
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:11<00:24, 1.46s/it]
254
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:13<00:24, 1.56s/it]
255
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:15<00:24, 1.61s/it]
256
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:17<00:25, 1.79s/it]
257
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:19<00:22, 1.70s/it]
258
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:20<00:19, 1.61s/it]
259
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:21<00:16, 1.53s/it]
260
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:23<00:14, 1.47s/it]
261
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:24<00:12, 1.40s/it]
262
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:25<00:10, 1.36s/it]
263
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:27<00:09, 1.41s/it]
264
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:28<00:08, 1.37s/it]
265
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:29<00:06, 1.36s/it]
266
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:31<00:05, 1.40s/it]
267
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:32<00:04, 1.36s/it]
268
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:34<00:02, 1.42s/it]
269
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:35<00:01, 1.44s/it]
270
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:36<00:00, 1.41s/it]
271
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:36<00:00, 1.48s/it]
272
+ Iter 600: Val loss 0.427, Val took 36.973s
273
+ Iter 600: Train loss 0.346, Learning Rate 1.000e-05, It/sec 0.427, Tokens/sec 474.136, Trained Tokens 713698, Peak mem 37.972 GB
274
+ Iter 600: Saved adapter weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors and models/lora/mistral_lora_telegram_20251111_114741/0000600_adapters.safetensors.
275
+ Saved final weights to models/lora/mistral_lora_telegram_20251111_114741/adapters.safetensors.
276
+ Testing
277
+
278
+ Calculating loss...: 0%| | 0/50 [00:00<?, ?it/s]
279
+ Calculating loss...: 2%|▏ | 1/50 [00:01<01:12, 1.49s/it]
280
+ Calculating loss...: 4%|▍ | 2/50 [00:02<01:04, 1.35s/it]
281
+ Calculating loss...: 6%|β–Œ | 3/50 [00:04<01:11, 1.52s/it]
282
+ Calculating loss...: 8%|β–Š | 4/50 [00:06<01:10, 1.54s/it]
283
+ Calculating loss...: 10%|β–ˆ | 5/50 [00:07<01:08, 1.52s/it]
284
+ Calculating loss...: 12%|β–ˆβ– | 6/50 [00:09<01:07, 1.54s/it]
285
+ Calculating loss...: 14%|β–ˆβ– | 7/50 [00:10<01:05, 1.52s/it]
286
+ Calculating loss...: 16%|β–ˆβ–Œ | 8/50 [00:11<01:01, 1.46s/it]
287
+ Calculating loss...: 18%|β–ˆβ–Š | 9/50 [00:13<00:57, 1.40s/it]
288
+ Calculating loss...: 20%|β–ˆβ–ˆ | 10/50 [00:14<00:59, 1.49s/it]
289
+ Calculating loss...: 22%|β–ˆβ–ˆβ– | 11/50 [00:16<00:57, 1.47s/it]
290
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 12/50 [00:17<00:53, 1.41s/it]
291
+ Calculating loss...: 26%|β–ˆβ–ˆβ–Œ | 13/50 [00:18<00:51, 1.38s/it]
292
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 14/50 [00:20<00:49, 1.39s/it]
293
+ Calculating loss...: 30%|β–ˆβ–ˆβ–ˆ | 15/50 [00:21<00:47, 1.35s/it]
294
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 16/50 [00:23<00:48, 1.44s/it]
295
+ Calculating loss...: 34%|β–ˆβ–ˆβ–ˆβ– | 17/50 [00:24<00:49, 1.50s/it]
296
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 18/50 [00:26<00:48, 1.50s/it]
297
+ Calculating loss...: 38%|β–ˆβ–ˆβ–ˆβ–Š | 19/50 [00:27<00:45, 1.45s/it]
298
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 20/50 [00:29<00:42, 1.42s/it]
299
+ Calculating loss...: 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 21/50 [00:30<00:43, 1.49s/it]
300
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 22/50 [00:32<00:41, 1.49s/it]
301
+ Calculating loss...: 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 23/50 [00:33<00:38, 1.44s/it]
302
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 24/50 [00:34<00:36, 1.41s/it]
303
+ Calculating loss...: 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 25/50 [00:36<00:37, 1.48s/it]
304
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 26/50 [00:37<00:33, 1.41s/it]
305
+ Calculating loss...: 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 27/50 [00:39<00:31, 1.39s/it]
306
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 28/50 [00:43<00:52, 2.39s/it]
307
+ Calculating loss...: 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 29/50 [00:46<00:50, 2.39s/it]
308
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 30/50 [00:47<00:43, 2.17s/it]
309
+ Calculating loss...: 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/50 [00:49<00:36, 1.94s/it]
310
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/50 [00:51<00:38, 2.16s/it]
311
+ Calculating loss...: 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 33/50 [00:53<00:34, 2.04s/it]
312
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 34/50 [00:55<00:30, 1.88s/it]
313
+ Calculating loss...: 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 35/50 [00:57<00:31, 2.08s/it]
314
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 36/50 [00:59<00:26, 1.88s/it]
315
+ Calculating loss...: 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 37/50 [01:00<00:24, 1.85s/it]
316
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 38/50 [01:02<00:21, 1.79s/it]
317
+ Calculating loss...: 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 39/50 [01:03<00:18, 1.68s/it]
318
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 40/50 [01:06<00:19, 1.99s/it]
319
+ Calculating loss...: 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 41/50 [01:08<00:16, 1.84s/it]
320
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 42/50 [01:09<00:14, 1.82s/it]
321
+ Calculating loss...: 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 43/50 [01:11<00:12, 1.77s/it]
322
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 44/50 [01:13<00:10, 1.77s/it]
323
+ Calculating loss...: 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 45/50 [01:15<00:08, 1.78s/it]
324
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 46/50 [01:16<00:06, 1.70s/it]
325
+ Calculating loss...: 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 47/50 [01:18<00:04, 1.63s/it]
326
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 48/50 [01:19<00:03, 1.56s/it]
327
+ Calculating loss...: 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 49/50 [01:20<00:01, 1.51s/it]
328
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 50/50 [01:22<00:00, 1.59s/it]
329
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 50/50 [01:22<00:00, 1.65s/it]
330
+ Test loss 0.474, Test ppl 1.606.