erikbranmarino committed on
Commit
da0075c
·
verified ·
1 Parent(s): 4dc6b21

Upload folder using huggingface_hub

Browse files
0000100_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99eb5c73192edbb014e053474a879697a00b2f7b057f201ddd83e2a09576db71
3
+ size 45899454
0000200_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e258c74152f7cee8588e8b611c46ae22d5145a5ecf27a566da618c40123ff1ff
3
+ size 45899454
0000300_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:917d3a2a6bd324de9795d87021a2785ffd82503e798dc753d59fcadd36e2d0ba
3
+ size 45899454
0000400_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:016ead7afe69217eb79ba123ff7dcb28cc9ea13081a1da023283ad378676dd05
3
+ size 45899454
0000500_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d28d3dd4752c2a18f4de84a656a2568c88dfcc30f44e0d745845fc360f9dd13c
3
+ size 45899454
0000600_adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7920b3232ebb36a0ac993a0caddc60b474b221199d07031dec56bc4d2dd4c0d
3
+ size 45899454
adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adapter_path": "models/lora/deepseek_lora_telegram_20251111_165211",
3
+ "batch_size": 2,
4
+ "config": null,
5
+ "data": "data/phase2/mlx_datasets/telegram",
6
+ "fine_tune_type": "lora",
7
+ "grad_accumulation_steps": 1,
8
+ "grad_checkpoint": false,
9
+ "iters": 600,
10
+ "learning_rate": 1e-05,
11
+ "lora_parameters": {
12
+ "rank": 8,
13
+ "dropout": 0.0,
14
+ "scale": 20.0
15
+ },
16
+ "lr_schedule": null,
17
+ "mask_prompt": false,
18
+ "max_seq_length": 2048,
19
+ "model": "models/deepseek-r1-14b-mlx",
20
+ "num_layers": 16,
21
+ "optimizer": "adam",
22
+ "optimizer_config": {
23
+ "adam": {},
24
+ "adamw": {},
25
+ "muon": {},
26
+ "sgd": {},
27
+ "adafactor": {}
28
+ },
29
+ "project_name": null,
30
+ "report_to": null,
31
+ "resume_adapter_file": null,
32
+ "save_every": 100,
33
+ "seed": 42,
34
+ "steps_per_eval": 100,
35
+ "steps_per_report": 10,
36
+ "test": true,
37
+ "test_batches": 50,
38
+ "train": true,
39
+ "val_batches": 25
40
+ }
adapters.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7920b3232ebb36a0ac993a0caddc60b474b221199d07031dec56bc4d2dd4c0d
3
+ size 45899454
training.log ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Loading pretrained model
2
+ Loading datasets
3
+ Training
4
+ Trainable parameters: 0.078% (11.469M/14770.034M)
5
+ Starting training..., iters: 600
6
+
7
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
8
+ Calculating loss...: 4%|▍ | 1/25 [00:02<00:58, 2.45s/it]
9
+ Calculating loss...: 8%|β–Š | 2/25 [00:05<00:57, 2.51s/it]
10
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:07<00:56, 2.56s/it]
11
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:09<00:50, 2.39s/it]
12
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:11<00:45, 2.29s/it]
13
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:15<00:55, 2.91s/it]
14
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:18<00:48, 2.72s/it]
15
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:21<00:47, 2.77s/it]
16
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:23<00:42, 2.67s/it]
17
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:26<00:39, 2.65s/it]
18
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:28<00:34, 2.47s/it]
19
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:30<00:32, 2.52s/it]
20
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:33<00:29, 2.42s/it]
21
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:35<00:26, 2.40s/it]
22
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:38<00:26, 2.60s/it]
23
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:40<00:22, 2.51s/it]
24
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:43<00:20, 2.50s/it]
25
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:46<00:18, 2.64s/it]
26
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:49<00:16, 2.79s/it]
27
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:51<00:13, 2.60s/it]
28
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:53<00:09, 2.47s/it]
29
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:55<00:06, 2.33s/it]
30
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:58<00:04, 2.32s/it]
31
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [01:00<00:02, 2.42s/it]
32
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:03<00:00, 2.57s/it]
33
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:03<00:00, 2.55s/it]
34
+ Iter 1: Val loss 2.630, Val took 63.650s
35
+ Iter 10: Train loss 2.016, Learning Rate 1.000e-05, It/sec 0.246, Tokens/sec 239.823, Trained Tokens 9738, Peak mem 40.069 GB
36
+ Iter 20: Train loss 1.097, Learning Rate 1.000e-05, It/sec 0.206, Tokens/sec 223.454, Trained Tokens 20566, Peak mem 51.482 GB
37
+ Iter 30: Train loss 0.841, Learning Rate 1.000e-05, It/sec 0.243, Tokens/sec 243.944, Trained Tokens 30620, Peak mem 51.482 GB
38
+ Iter 40: Train loss 0.698, Learning Rate 1.000e-05, It/sec 0.270, Tokens/sec 260.294, Trained Tokens 40275, Peak mem 51.482 GB
39
+ Iter 50: Train loss 0.813, Learning Rate 1.000e-05, It/sec 0.229, Tokens/sec 246.430, Trained Tokens 51030, Peak mem 51.482 GB
40
+ Iter 60: Train loss 0.754, Learning Rate 1.000e-05, It/sec 0.255, Tokens/sec 254.753, Trained Tokens 61017, Peak mem 51.482 GB
41
+ Iter 70: Train loss 0.729, Learning Rate 1.000e-05, It/sec 0.251, Tokens/sec 250.801, Trained Tokens 71015, Peak mem 51.482 GB
42
+ Iter 80: Train loss 0.721, Learning Rate 1.000e-05, It/sec 0.254, Tokens/sec 254.204, Trained Tokens 81010, Peak mem 51.482 GB
43
+ Iter 90: Train loss 0.663, Learning Rate 1.000e-05, It/sec 0.273, Tokens/sec 262.878, Trained Tokens 90638, Peak mem 51.482 GB
44
+
45
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
46
+ Calculating loss...: 4%|▍ | 1/25 [00:02<01:05, 2.73s/it]
47
+ Calculating loss...: 8%|β–Š | 2/25 [00:05<00:57, 2.49s/it]
48
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:07<00:53, 2.41s/it]
49
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:09<00:49, 2.37s/it]
50
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:11<00:45, 2.29s/it]
51
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:14<00:44, 2.36s/it]
52
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:16<00:42, 2.38s/it]
53
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:18<00:39, 2.30s/it]
54
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:20<00:35, 2.25s/it]
55
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:23<00:34, 2.30s/it]
56
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:26<00:35, 2.50s/it]
57
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:28<00:31, 2.39s/it]
58
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:30<00:28, 2.38s/it]
59
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:33<00:25, 2.36s/it]
60
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:35<00:23, 2.39s/it]
61
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:39<00:24, 2.72s/it]
62
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:41<00:20, 2.60s/it]
63
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:45<00:21, 3.13s/it]
64
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:47<00:17, 2.83s/it]
65
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ  | 20/25 [00:52<00:16, 3.32s/it]
66
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:54<00:11, 2.96s/it]
67
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:56<00:08, 2.78s/it]
68
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:59<00:05, 2.64s/it]
69
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [01:01<00:02, 2.55s/it]
70
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:04<00:00, 2.57s/it]
71
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:04<00:00, 2.57s/it]
72
+ Iter 100: Val loss 0.655, Val took 64.142s
73
+ Iter 100: Train loss 0.570, Learning Rate 1.000e-05, It/sec 0.271, Tokens/sec 252.471, Trained Tokens 99944, Peak mem 51.482 GB
74
+ Iter 100: Saved adapter weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors and models/lora/deepseek_lora_telegram_20251111_165211/0000100_adapters.safetensors.
75
+ Iter 110: Train loss 0.656, Learning Rate 1.000e-05, It/sec 0.271, Tokens/sec 261.990, Trained Tokens 109594, Peak mem 51.482 GB
76
+ Iter 120: Train loss 0.670, Learning Rate 1.000e-05, It/sec 0.249, Tokens/sec 256.186, Trained Tokens 119873, Peak mem 51.482 GB
77
+ Iter 130: Train loss 0.610, Learning Rate 1.000e-05, It/sec 0.252, Tokens/sec 254.174, Trained Tokens 129955, Peak mem 51.482 GB
78
+ Iter 140: Train loss 0.454, Learning Rate 1.000e-05, It/sec 0.311, Tokens/sec 270.304, Trained Tokens 138638, Peak mem 51.482 GB
79
+ Iter 150: Train loss 0.595, Learning Rate 1.000e-05, It/sec 0.264, Tokens/sec 254.198, Trained Tokens 148254, Peak mem 51.482 GB
80
+ Iter 160: Train loss 0.571, Learning Rate 1.000e-05, It/sec 0.263, Tokens/sec 254.557, Trained Tokens 157921, Peak mem 51.482 GB
81
+ Iter 170: Train loss 0.552, Learning Rate 1.000e-05, It/sec 0.292, Tokens/sec 267.427, Trained Tokens 167073, Peak mem 51.482 GB
82
+ Iter 180: Train loss 0.571, Learning Rate 1.000e-05, It/sec 0.269, Tokens/sec 260.904, Trained Tokens 176788, Peak mem 51.482 GB
83
+ Iter 190: Train loss 0.712, Learning Rate 1.000e-05, It/sec 0.215, Tokens/sec 236.971, Trained Tokens 187826, Peak mem 52.133 GB
84
+
85
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
86
+ Calculating loss...: 4%|▍ | 1/25 [00:05<02:23, 5.97s/it]
87
+ Calculating loss...: 8%|β–Š | 2/25 [00:08<01:35, 4.15s/it]
88
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:10<01:09, 3.17s/it]
89
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:12<00:57, 2.76s/it]
90
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:15<00:51, 2.57s/it]
91
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:17<00:50, 2.63s/it]
92
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:19<00:43, 2.43s/it]
93
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:22<00:39, 2.34s/it]
94
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:24<00:39, 2.46s/it]
95
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:27<00:37, 2.51s/it]
96
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:29<00:33, 2.39s/it]
97
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:32<00:33, 2.57s/it]
98
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:34<00:29, 2.44s/it]
99
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:37<00:26, 2.42s/it]
100
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:39<00:24, 2.47s/it]
101
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:41<00:21, 2.37s/it]
102
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:44<00:19, 2.41s/it]
103
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:46<00:16, 2.29s/it]
104
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:49<00:14, 2.44s/it]
105
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:52<00:13, 2.61s/it]
106
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:54<00:09, 2.45s/it]
107
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:56<00:07, 2.54s/it]
108
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:59<00:05, 2.60s/it]
109
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [01:02<00:02, 2.57s/it]
110
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:04<00:00, 2.50s/it]
111
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:04<00:00, 2.58s/it]
112
+ Iter 200: Val loss 0.573, Val took 64.542s
113
+ Iter 200: Train loss 0.489, Learning Rate 1.000e-05, It/sec 0.281, Tokens/sec 264.627, Trained Tokens 197242, Peak mem 52.133 GB
114
+ Iter 200: Saved adapter weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors and models/lora/deepseek_lora_telegram_20251111_165211/0000200_adapters.safetensors.
115
+ Iter 210: Train loss 0.478, Learning Rate 1.000e-05, It/sec 0.295, Tokens/sec 265.899, Trained Tokens 206252, Peak mem 52.133 GB
116
+ Iter 220: Train loss 0.500, Learning Rate 1.000e-05, It/sec 0.288, Tokens/sec 268.788, Trained Tokens 215583, Peak mem 52.133 GB
117
+ Iter 230: Train loss 0.658, Learning Rate 1.000e-05, It/sec 0.258, Tokens/sec 253.876, Trained Tokens 225430, Peak mem 52.133 GB
118
+ Iter 240: Train loss 0.583, Learning Rate 1.000e-05, It/sec 0.277, Tokens/sec 263.746, Trained Tokens 234953, Peak mem 52.133 GB
119
+ Iter 250: Train loss 0.531, Learning Rate 1.000e-05, It/sec 0.273, Tokens/sec 258.514, Trained Tokens 244424, Peak mem 52.133 GB
120
+ Iter 260: Train loss 0.540, Learning Rate 1.000e-05, It/sec 0.275, Tokens/sec 263.070, Trained Tokens 254004, Peak mem 52.133 GB
121
+ Iter 270: Train loss 0.464, Learning Rate 1.000e-05, It/sec 0.275, Tokens/sec 257.103, Trained Tokens 263367, Peak mem 52.133 GB
122
+ Iter 280: Train loss 0.445, Learning Rate 1.000e-05, It/sec 0.269, Tokens/sec 254.944, Trained Tokens 272830, Peak mem 52.133 GB
123
+ Iter 290: Train loss 0.567, Learning Rate 1.000e-05, It/sec 0.272, Tokens/sec 256.184, Trained Tokens 282233, Peak mem 52.133 GB
124
+
125
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
126
+ Calculating loss...: 4%|▍ | 1/25 [00:02<00:50, 2.12s/it]
127
+ Calculating loss...: 8%|β–Š | 2/25 [00:06<01:19, 3.46s/it]
128
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:08<01:04, 2.94s/it]
129
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:11<00:56, 2.69s/it]
130
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:13<00:51, 2.56s/it]
131
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:15<00:45, 2.41s/it]
132
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:17<00:42, 2.35s/it]
133
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:19<00:38, 2.28s/it]
134
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:22<00:36, 2.29s/it]
135
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:24<00:35, 2.35s/it]
136
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:27<00:33, 2.37s/it]
137
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:29<00:31, 2.45s/it]
138
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:32<00:31, 2.59s/it]
139
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:34<00:26, 2.41s/it]
140
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:39<00:30, 3.03s/it]
141
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:41<00:24, 2.77s/it]
142
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:43<00:21, 2.64s/it]
143
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:46<00:18, 2.67s/it]
144
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:49<00:16, 2.69s/it]
145
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:51<00:13, 2.67s/it]
146
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:53<00:09, 2.47s/it]
147
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:56<00:07, 2.54s/it]
148
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:59<00:05, 2.82s/it]
149
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [01:02<00:02, 2.66s/it]
150
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:04<00:00, 2.50s/it]
151
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:04<00:00, 2.58s/it]
152
+ Iter 300: Val loss 0.551, Val took 64.368s
153
+ Iter 300: Train loss 0.578, Learning Rate 1.000e-05, It/sec 0.288, Tokens/sec 264.039, Trained Tokens 291397, Peak mem 52.133 GB
154
+ Iter 300: Saved adapter weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors and models/lora/deepseek_lora_telegram_20251111_165211/0000300_adapters.safetensors.
155
+ Iter 310: Train loss 0.490, Learning Rate 1.000e-05, It/sec 0.294, Tokens/sec 268.104, Trained Tokens 300506, Peak mem 52.133 GB
156
+ Iter 320: Train loss 0.494, Learning Rate 1.000e-05, It/sec 0.273, Tokens/sec 255.665, Trained Tokens 309868, Peak mem 52.133 GB
157
+ Iter 330: Train loss 0.553, Learning Rate 1.000e-05, It/sec 0.274, Tokens/sec 260.283, Trained Tokens 319352, Peak mem 52.133 GB
158
+ Iter 340: Train loss 0.510, Learning Rate 1.000e-05, It/sec 0.275, Tokens/sec 255.333, Trained Tokens 328629, Peak mem 52.133 GB
159
+ Iter 350: Train loss 0.754, Learning Rate 1.000e-05, It/sec 0.199, Tokens/sec 223.530, Trained Tokens 339841, Peak mem 53.360 GB
160
+ Iter 360: Train loss 0.582, Learning Rate 1.000e-05, It/sec 0.279, Tokens/sec 268.722, Trained Tokens 349481, Peak mem 53.360 GB
161
+ Iter 370: Train loss 0.637, Learning Rate 1.000e-05, It/sec 0.231, Tokens/sec 247.694, Trained Tokens 360226, Peak mem 53.360 GB
162
+ Iter 380: Train loss 0.558, Learning Rate 1.000e-05, It/sec 0.258, Tokens/sec 253.024, Trained Tokens 370048, Peak mem 53.360 GB
163
+ Iter 390: Train loss 0.568, Learning Rate 1.000e-05, It/sec 0.251, Tokens/sec 254.267, Trained Tokens 380194, Peak mem 53.360 GB
164
+
165
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
166
+ Calculating loss...: 4%|▍ | 1/25 [00:02<00:55, 2.29s/it]
167
+ Calculating loss...: 8%|β–Š | 2/25 [00:04<00:52, 2.29s/it]
168
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:07<00:54, 2.48s/it]
169
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:12<01:15, 3.58s/it]
170
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:14<01:01, 3.06s/it]
171
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:16<00:52, 2.74s/it]
172
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:19<00:48, 2.70s/it]
173
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:21<00:43, 2.57s/it]
174
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:24<00:41, 2.58s/it]
175
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:26<00:36, 2.44s/it]
176
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:28<00:32, 2.31s/it]
177
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:30<00:30, 2.34s/it]
178
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:32<00:27, 2.27s/it]
179
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:35<00:24, 2.22s/it]
180
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:38<00:25, 2.59s/it]
181
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:40<00:22, 2.54s/it]
182
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:42<00:19, 2.38s/it]
183
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:47<00:20, 2.97s/it]
184
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:50<00:17, 2.94s/it]
185
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:52<00:13, 2.70s/it]
186
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:54<00:09, 2.49s/it]
187
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:56<00:07, 2.34s/it]
188
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:58<00:04, 2.34s/it]
189
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [01:00<00:02, 2.33s/it]
190
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:03<00:00, 2.37s/it]
191
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:03<00:00, 2.54s/it]
192
+ Iter 400: Val loss 0.584, Val took 63.439s
193
+ Iter 400: Train loss 0.464, Learning Rate 1.000e-05, It/sec 0.273, Tokens/sec 261.717, Trained Tokens 389793, Peak mem 53.360 GB
194
+ Iter 400: Saved adapter weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors and models/lora/deepseek_lora_telegram_20251111_165211/0000400_adapters.safetensors.
195
+ Iter 410: Train loss 0.492, Learning Rate 1.000e-05, It/sec 0.253, Tokens/sec 255.097, Trained Tokens 399884, Peak mem 53.360 GB
196
+ Iter 420: Train loss 0.434, Learning Rate 1.000e-05, It/sec 0.284, Tokens/sec 268.317, Trained Tokens 409347, Peak mem 53.360 GB
197
+ Iter 430: Train loss 0.477, Learning Rate 1.000e-05, It/sec 0.275, Tokens/sec 263.682, Trained Tokens 418919, Peak mem 53.360 GB
198
+ Iter 440: Train loss 0.496, Learning Rate 1.000e-05, It/sec 0.263, Tokens/sec 253.249, Trained Tokens 428532, Peak mem 53.360 GB
199
+ Iter 450: Train loss 0.497, Learning Rate 1.000e-05, It/sec 0.263, Tokens/sec 263.457, Trained Tokens 438545, Peak mem 53.360 GB
200
+ Iter 460: Train loss 0.485, Learning Rate 1.000e-05, It/sec 0.278, Tokens/sec 261.986, Trained Tokens 447955, Peak mem 53.360 GB
201
+ Iter 470: Train loss 0.471, Learning Rate 1.000e-05, It/sec 0.274, Tokens/sec 261.977, Trained Tokens 457532, Peak mem 53.360 GB
202
+ Iter 480: Train loss 0.498, Learning Rate 1.000e-05, It/sec 0.245, Tokens/sec 249.742, Trained Tokens 467722, Peak mem 53.360 GB
203
+ Iter 490: Train loss 0.471, Learning Rate 1.000e-05, It/sec 0.282, Tokens/sec 265.978, Trained Tokens 477156, Peak mem 53.360 GB
204
+
205
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
206
+ Calculating loss...: 4%|▍ | 1/25 [00:02<00:56, 2.35s/it]
207
+ Calculating loss...: 8%|β–Š | 2/25 [00:04<00:51, 2.22s/it]
208
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:06<00:47, 2.18s/it]
209
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:08<00:46, 2.24s/it]
210
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:11<00:49, 2.48s/it]
211
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:13<00:44, 2.32s/it]
212
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:16<00:45, 2.54s/it]
213
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:19<00:41, 2.42s/it]
214
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:21<00:40, 2.51s/it]
215
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:24<00:36, 2.45s/it]
216
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:26<00:32, 2.35s/it]
217
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:28<00:30, 2.34s/it]
218
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:30<00:28, 2.37s/it]
219
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:33<00:26, 2.39s/it]
220
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:37<00:29, 2.98s/it]
221
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:40<00:25, 2.81s/it]
222
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:42<00:20, 2.60s/it]
223
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:44<00:16, 2.42s/it]
224
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:46<00:13, 2.32s/it]
225
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:48<00:11, 2.26s/it]
226
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:51<00:09, 2.37s/it]
227
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:53<00:07, 2.35s/it]
228
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:55<00:04, 2.27s/it]
229
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:57<00:02, 2.29s/it]
230
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:00<00:00, 2.30s/it]
231
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [01:00<00:00, 2.40s/it]
232
+ Iter 500: Val loss 0.516, Val took 60.113s
233
+ Iter 500: Train loss 0.482, Learning Rate 1.000e-05, It/sec 0.258, Tokens/sec 252.005, Trained Tokens 486939, Peak mem 53.360 GB
234
+ Iter 500: Saved adapter weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors and models/lora/deepseek_lora_telegram_20251111_165211/0000500_adapters.safetensors.
235
+ Iter 510: Train loss 0.618, Learning Rate 1.000e-05, It/sec 0.248, Tokens/sec 253.918, Trained Tokens 497197, Peak mem 53.360 GB
236
+ Iter 520: Train loss 0.454, Learning Rate 1.000e-05, It/sec 0.270, Tokens/sec 257.128, Trained Tokens 506732, Peak mem 53.360 GB
237
+ Iter 530: Train loss 0.564, Learning Rate 1.000e-05, It/sec 0.263, Tokens/sec 260.847, Trained Tokens 516645, Peak mem 53.360 GB
238
+ Iter 540: Train loss 0.401, Learning Rate 1.000e-05, It/sec 0.307, Tokens/sec 272.605, Trained Tokens 525534, Peak mem 53.360 GB
239
+ Iter 550: Train loss 0.472, Learning Rate 1.000e-05, It/sec 0.270, Tokens/sec 258.736, Trained Tokens 535129, Peak mem 53.360 GB
240
+ Iter 560: Train loss 0.661, Learning Rate 1.000e-05, It/sec 0.218, Tokens/sec 236.552, Trained Tokens 545967, Peak mem 53.360 GB
241
+ Iter 570: Train loss 0.491, Learning Rate 1.000e-05, It/sec 0.271, Tokens/sec 261.655, Trained Tokens 555617, Peak mem 53.360 GB
242
+ Iter 580: Train loss 0.465, Learning Rate 1.000e-05, It/sec 0.276, Tokens/sec 260.449, Trained Tokens 565065, Peak mem 53.360 GB
243
+ Iter 590: Train loss 0.447, Learning Rate 1.000e-05, It/sec 0.282, Tokens/sec 267.644, Trained Tokens 574556, Peak mem 53.360 GB
244
+
245
+ Calculating loss...: 0%| | 0/25 [00:00<?, ?it/s]
246
+ Calculating loss...: 4%|▍ | 1/25 [00:02<01:11, 2.97s/it]
247
+ Calculating loss...: 8%|β–Š | 2/25 [00:05<00:57, 2.48s/it]
248
+ Calculating loss...: 12%|β–ˆβ– | 3/25 [00:07<00:51, 2.32s/it]
249
+ Calculating loss...: 16%|β–ˆβ–Œ | 4/25 [00:09<00:48, 2.31s/it]
250
+ Calculating loss...: 20%|β–ˆβ–ˆ | 5/25 [00:12<00:48, 2.42s/it]
251
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 6/25 [00:14<00:45, 2.38s/it]
252
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 7/25 [00:16<00:41, 2.29s/it]
253
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 8/25 [00:18<00:39, 2.34s/it]
254
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 9/25 [00:21<00:40, 2.51s/it]
255
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 10/25 [00:24<00:39, 2.62s/it]
256
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 11/25 [00:28<00:40, 2.88s/it]
257
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 12/25 [00:30<00:35, 2.70s/it]
258
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 13/25 [00:32<00:30, 2.52s/it]
259
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 14/25 [00:34<00:26, 2.40s/it]
260
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 15/25 [00:36<00:23, 2.32s/it]
261
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 16/25 [00:38<00:20, 2.22s/it]
262
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 17/25 [00:40<00:17, 2.16s/it]
263
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 18/25 [00:43<00:15, 2.20s/it]
264
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 19/25 [00:45<00:12, 2.14s/it]
265
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 20/25 [00:47<00:10, 2.13s/it]
266
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 21/25 [00:49<00:08, 2.18s/it]
267
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 22/25 [00:51<00:06, 2.13s/it]
268
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 23/25 [00:54<00:04, 2.28s/it]
269
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 24/25 [00:56<00:02, 2.32s/it]
270
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:58<00:00, 2.27s/it]
271
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 25/25 [00:58<00:00, 2.35s/it]
272
+ Iter 600: Val loss 0.463, Val took 58.782s
273
+ Iter 600: Train loss 0.456, Learning Rate 1.000e-05, It/sec 0.292, Tokens/sec 265.446, Trained Tokens 583661, Peak mem 53.360 GB
274
+ Iter 600: Saved adapter weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors and models/lora/deepseek_lora_telegram_20251111_165211/0000600_adapters.safetensors.
275
+ Saved final weights to models/lora/deepseek_lora_telegram_20251111_165211/adapters.safetensors.
276
+ Testing
277
+
278
+ Calculating loss...: 0%| | 0/50 [00:00<?, ?it/s]
279
+ Calculating loss...: 2%|▏ | 1/50 [00:02<01:58, 2.41s/it]
280
+ Calculating loss...: 4%|▍ | 2/50 [00:04<01:47, 2.24s/it]
281
+ Calculating loss...: 6%|β–Œ | 3/50 [00:07<01:59, 2.54s/it]
282
+ Calculating loss...: 8%|β–Š | 4/50 [00:09<01:54, 2.50s/it]
283
+ Calculating loss...: 10%|β–ˆ | 5/50 [00:12<01:51, 2.47s/it]
284
+ Calculating loss...: 12%|β–ˆβ– | 6/50 [00:14<01:45, 2.41s/it]
285
+ Calculating loss...: 14%|β–ˆβ– | 7/50 [00:16<01:41, 2.37s/it]
286
+ Calculating loss...: 16%|β–ˆβ–Œ | 8/50 [00:18<01:35, 2.26s/it]
287
+ Calculating loss...: 18%|β–ˆβ–Š | 9/50 [00:20<01:29, 2.19s/it]
288
+ Calculating loss...: 20%|β–ˆβ–ˆ | 10/50 [00:23<01:36, 2.41s/it]
289
+ Calculating loss...: 22%|β–ˆβ–ˆβ– | 11/50 [00:26<01:32, 2.37s/it]
290
+ Calculating loss...: 24%|β–ˆβ–ˆβ– | 12/50 [00:28<01:25, 2.26s/it]
291
+ Calculating loss...: 26%|β–ˆβ–ˆβ–Œ | 13/50 [00:30<01:22, 2.23s/it]
292
+ Calculating loss...: 28%|β–ˆβ–ˆβ–Š | 14/50 [00:32<01:20, 2.25s/it]
293
+ Calculating loss...: 30%|β–ˆβ–ˆβ–ˆ | 15/50 [00:34<01:16, 2.18s/it]
294
+ Calculating loss...: 32%|β–ˆβ–ˆβ–ˆβ– | 16/50 [00:37<01:18, 2.31s/it]
295
+ Calculating loss...: 34%|β–ˆβ–ˆβ–ˆβ– | 17/50 [00:39<01:20, 2.44s/it]
296
+ Calculating loss...: 36%|β–ˆβ–ˆβ–ˆβ–Œ | 18/50 [00:42<01:18, 2.44s/it]
297
+ Calculating loss...: 38%|β–ˆβ–ˆβ–ˆβ–Š | 19/50 [00:44<01:12, 2.34s/it]
298
+ Calculating loss...: 40%|β–ˆβ–ˆβ–ˆβ–ˆ | 20/50 [00:46<01:08, 2.28s/it]
299
+ Calculating loss...: 42%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 21/50 [00:49<01:09, 2.38s/it]
300
+ Calculating loss...: 44%|β–ˆβ–ˆβ–ˆβ–ˆβ– | 22/50 [00:51<01:05, 2.35s/it]
301
+ Calculating loss...: 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 23/50 [00:53<01:01, 2.29s/it]
302
+ Calculating loss...: 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–Š | 24/50 [00:55<00:58, 2.23s/it]
303
+ Calculating loss...: 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 25/50 [00:58<00:58, 2.35s/it]
304
+ Calculating loss...: 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 26/50 [01:00<00:54, 2.26s/it]
305
+ Calculating loss...: 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 27/50 [01:02<00:50, 2.21s/it]
306
+ Calculating loss...: 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 28/50 [01:04<00:48, 2.19s/it]
307
+ Calculating loss...: 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 29/50 [01:06<00:45, 2.17s/it]
308
+ Calculating loss...: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 30/50 [01:09<00:45, 2.25s/it]
309
+ Calculating loss...: 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 31/50 [01:11<00:41, 2.21s/it]
310
+ Calculating loss...: 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 32/50 [01:14<00:43, 2.41s/it]
311
+ Calculating loss...: 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 33/50 [01:16<00:41, 2.42s/it]
312
+ Calculating loss...: 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 34/50 [01:19<00:38, 2.42s/it]
313
+ Calculating loss...: 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 35/50 [01:23<00:43, 2.92s/it]
314
+ Calculating loss...: 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 36/50 [01:25<00:37, 2.68s/it]
315
+ Calculating loss...: 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 37/50 [01:28<00:35, 2.74s/it]
316
+ Calculating loss...: 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 38/50 [01:30<00:32, 2.70s/it]
317
+ Calculating loss...: 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 39/50 [01:33<00:28, 2.58s/it]
318
+ Calculating loss...: 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 40/50 [01:37<00:31, 3.15s/it]
319
+ Calculating loss...: 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 41/50 [01:39<00:25, 2.84s/it]
320
+ Calculating loss...: 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 42/50 [01:42<00:22, 2.80s/it]
321
+ Calculating loss...: 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 43/50 [01:45<00:19, 2.78s/it]
322
+ Calculating loss...: 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 44/50 [01:48<00:17, 2.84s/it]
323
+ Calculating loss...: 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 45/50 [01:50<00:14, 2.85s/it]
324
+ Calculating loss...: 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 46/50 [01:53<00:10, 2.69s/it]
325
+ Calculating loss...: 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 47/50 [01:55<00:07, 2.57s/it]
326
+ Calculating loss...: 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 48/50 [01:57<00:04, 2.50s/it]
327
+ Calculating loss...: 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š| 49/50 [02:00<00:02, 2.44s/it]
328
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 50/50 [02:02<00:00, 2.54s/it]
329
+ Calculating loss...: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 50/50 [02:02<00:00, 2.46s/it]
330
+ Test loss 0.526, Test ppl 1.693.