guanning commited on
Commit
0f915ce
·
verified ·
1 Parent(s): e34a0d7

backup 02_lr2e-4 pre-cleanup

Browse files
.gitattributes CHANGED
@@ -129,3 +129,5 @@ _DRYRUN3/02_single_ar/last/tokenizer.json filter=lfs diff=lfs merge=lfs -text
129
  01_llada3e-4_qwen3e-4/01_qwen_body/periodic-step3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
130
  01_llada3e-4_qwen3e-4/01_qwen_body/step100-val0.7227/tokenizer.json filter=lfs diff=lfs merge=lfs -text
131
  01_llada3e-4_qwen3e-4/01_qwen_body/step50-val0.6665/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
129
  01_llada3e-4_qwen3e-4/01_qwen_body/periodic-step3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
130
  01_llada3e-4_qwen3e-4/01_qwen_body/step100-val0.7227/tokenizer.json filter=lfs diff=lfs merge=lfs -text
131
  01_llada3e-4_qwen3e-4/01_qwen_body/step50-val0.6665/tokenizer.json filter=lfs diff=lfs merge=lfs -text
132
+ 02_lr2e-4/02_single_ar/step100-val1.0025/tokenizer.json filter=lfs diff=lfs merge=lfs -text
133
+ 02_lr2e-4/02_single_ar/step50-val0.9883/tokenizer.json filter=lfs diff=lfs merge=lfs -text
02_lr2e-4/02_single_ar.log ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ trainable params: 269,936,128 || all params: 1,580,276,736 || trainable%: 17.0816
2
+ val set: 500 samples eval_interval=50 eval_batches=20
3
+ [qwen] step 20/3375 loss=1.4537 lr=4.00e-05 elapsed=15s
4
+ [qwen] step 40/3375 loss=1.1372 lr=8.00e-05 elapsed=28s
5
+ [qwen] step 50 val_loss=0.9883
6
+ /home/jgai/miniconda3/envs/diffar/lib/python3.10/site-packages/peft/utils/save_and_load.py:300: UserWarning: Setting `save_embedding_layers` to `True` as the embedding layer has been resized during finetuning.
7
+ warnings.warn(
8
+ [topk] +save step=50 val=0.9883 (top-1/2)
9
+ [qwen] step 60/3375 loss=0.9140 lr=1.20e-04 elapsed=47s
10
+ [qwen] step 80/3375 loss=0.8483 lr=1.60e-04 elapsed=60s
11
+ [qwen] step 100/3375 loss=1.2447 lr=2.00e-04 elapsed=73s
12
+ [qwen] step 100 val_loss=1.0025
13
+ [topk] +save step=100 val=1.0025 (top-2/2)
14
+ [qwen] step 120/3375 loss=0.8903 lr=2.00e-04 elapsed=92s
15
+ [qwen] step 140/3375 loss=0.9677 lr=2.00e-04 elapsed=106s
16
+ [qwen] step 150 val_loss=1.0269
17
+ [qwen] step 160/3375 loss=0.8818 lr=2.00e-04 elapsed=123s
18
+ [qwen] step 180/3375 loss=1.0244 lr=2.00e-04 elapsed=136s
19
+ [qwen] step 200/3375 loss=1.1301 lr=2.00e-04 elapsed=149s
20
+ [qwen] step 200 val_loss=1.0427
21
+ [qwen] step 220/3375 loss=1.2421 lr=1.99e-04 elapsed=166s
22
+ [qwen] step 240/3375 loss=1.1121 lr=1.99e-04 elapsed=179s
23
+ [qwen] step 250 val_loss=1.0646
24
+ [qwen] step 260/3375 loss=0.9831 lr=1.99e-04 elapsed=196s
25
+ [qwen] step 280/3375 loss=1.1533 lr=1.99e-04 elapsed=209s
26
+ [qwen] step 300/3375 loss=1.1168 lr=1.98e-04 elapsed=222s
27
+ [qwen] step 300 val_loss=1.1228
28
+ [qwen] step 320/3375 loss=1.4393 lr=1.98e-04 elapsed=239s
29
+ [qwen] step 340/3375 loss=1.1998 lr=1.97e-04 elapsed=253s
30
+ [qwen] step 350 val_loss=1.1543
31
+ [qwen] step 360/3375 loss=1.1141 lr=1.97e-04 elapsed=270s
32
+ [qwen] step 380/3375 loss=0.9754 lr=1.96e-04 elapsed=283s
33
+ [qwen] step 400/3375 loss=1.1409 lr=1.96e-04 elapsed=296s
34
+ [qwen] step 400 val_loss=1.1501
35
+ [qwen] step 420/3375 loss=1.2336 lr=1.95e-04 elapsed=313s
36
+ [qwen] step 440/3375 loss=1.1110 lr=1.95e-04 elapsed=326s
37
+ [qwen] step 450 val_loss=1.1601
38
+ [qwen] step 460/3375 loss=1.0750 lr=1.94e-04 elapsed=343s
39
+ [qwen] step 480/3375 loss=1.3495 lr=1.93e-04 elapsed=356s
40
+ [qwen] step 500/3375 loss=1.2462 lr=1.93e-04 elapsed=369s
41
+ [qwen] step 500 val_loss=1.1908
42
+ [qwen] step 520/3375 loss=1.2390 lr=1.92e-04 elapsed=387s
43
+ [qwen] step 540/3375 loss=1.3574 lr=1.91e-04 elapsed=400s
44
+ [qwen] step 550 val_loss=1.2341
45
+ [qwen] step 560/3375 loss=1.2452 lr=1.90e-04 elapsed=417s
46
+ [qwen] step 580/3375 loss=1.0924 lr=1.90e-04 elapsed=430s
47
+ [qwen] step 600/3375 loss=1.1892 lr=1.89e-04 elapsed=443s
48
+ [qwen] step 600 val_loss=1.2004
49
+ [qwen] step 620/3375 loss=1.2295 lr=1.88e-04 elapsed=460s
50
+ [qwen] step 640/3375 loss=1.0972 lr=1.87e-04 elapsed=473s
51
+ [qwen] step 650 val_loss=1.2291
52
+ [qwen] step 660/3375 loss=1.3711 lr=1.86e-04 elapsed=490s
53
+ [qwen] step 680/3375 loss=1.1601 lr=1.85e-04 elapsed=503s
54
+ [qwen] step 700/3375 loss=1.2286 lr=1.84e-04 elapsed=517s
55
+ [qwen] step 700 val_loss=1.2262
56
+ [qwen] step 720/3375 loss=1.2472 lr=1.83e-04 elapsed=534s
57
+ [qwen] step 740/3375 loss=1.4153 lr=1.82e-04 elapsed=547s
58
+ [qwen] step 750 val_loss=1.2386
59
+ [qwen] step 760/3375 loss=1.3557 lr=1.81e-04 elapsed=564s
60
+ [qwen] step 780/3375 loss=1.4020 lr=1.79e-04 elapsed=577s
61
+ [qwen] step 800/3375 loss=1.4806 lr=1.78e-04 elapsed=590s
62
+ [qwen] step 800 val_loss=1.2390
63
+ [qwen] step 820/3375 loss=1.3951 lr=1.77e-04 elapsed=607s
64
+ [qwen] step 840/3375 loss=1.0590 lr=1.76e-04 elapsed=620s
65
+ [qwen] step 850 val_loss=1.2599
66
+ [qwen] step 860/3375 loss=1.5092 lr=1.75e-04 elapsed=638s
67
+ [qwen] step 880/3375 loss=1.2216 lr=1.73e-04 elapsed=651s
68
+ [qwen] step 900/3375 loss=1.0819 lr=1.72e-04 elapsed=664s
69
+ [qwen] step 900 val_loss=1.2727
70
+ [qwen] step 920/3375 loss=1.4366 lr=1.71e-04 elapsed=681s
71
+ [qwen] step 940/3375 loss=1.3029 lr=1.69e-04 elapsed=694s
72
+ [qwen] step 950 val_loss=1.2912
73
+ [qwen] step 960/3375 loss=1.4412 lr=1.68e-04 elapsed=711s
74
+ [qwen] step 980/3375 loss=1.3547 lr=1.66e-04 elapsed=725s
75
+ [qwen] step 1000/3375 loss=1.3509 lr=1.65e-04 elapsed=738s
76
+ [qwen] step 1000 val_loss=1.2792
77
+ [qwen] step 1020/3375 loss=1.1238 lr=1.64e-04 elapsed=755s
78
+ [qwen] step 1040/3375 loss=1.3718 lr=1.62e-04 elapsed=768s
79
+ [qwen] step 1050 val_loss=1.2756
80
+ [qwen] step 1060/3375 loss=1.3287 lr=1.61e-04 elapsed=785s
81
+ [qwen] step 1080/3375 loss=1.3912 lr=1.59e-04 elapsed=798s
82
+ [qwen] step 1100/3375 loss=1.3113 lr=1.57e-04 elapsed=811s
83
+ [qwen] step 1100 val_loss=1.2876
84
+ [qwen] step 1120/3375 loss=1.3817 lr=1.56e-04 elapsed=829s
85
+ [qwen] step 1140/3375 loss=1.3074 lr=1.54e-04 elapsed=842s
86
+ [qwen] step 1150 val_loss=1.2899
87
+ [qwen] step 1160/3375 loss=1.3559 lr=1.53e-04 elapsed=859s
88
+ [qwen] step 1180/3375 loss=1.3667 lr=1.51e-04 elapsed=872s
89
+ [qwen] step 1200/3375 loss=1.2641 lr=1.49e-04 elapsed=885s
90
+ [qwen] step 1200 val_loss=1.2895
91
+ [qwen] step 1220/3375 loss=1.2989 lr=1.48e-04 elapsed=902s
92
+ [qwen] step 1240/3375 loss=1.4462 lr=1.46e-04 elapsed=915s
93
+ [qwen] step 1250 val_loss=1.2965
94
+ [qwen] step 1260/3375 loss=1.2883 lr=1.44e-04 elapsed=932s
95
+ [qwen] step 1280/3375 loss=1.2552 lr=1.42e-04 elapsed=945s
96
+ [qwen] step 1300/3375 loss=1.1436 lr=1.41e-04 elapsed=959s
97
+ [qwen] step 1300 val_loss=1.3136
98
+ [qwen] step 1320/3375 loss=1.2648 lr=1.39e-04 elapsed=976s
99
+ [qwen] step 1340/3375 loss=1.3910 lr=1.37e-04 elapsed=989s
100
+ [qwen] step 1350 val_loss=1.3282
101
+ [qwen] step 1360/3375 loss=1.3565 lr=1.35e-04 elapsed=1006s
102
+ [qwen] step 1380/3375 loss=1.1782 lr=1.34e-04 elapsed=1019s
103
+ [qwen] step 1400/3375 loss=1.4795 lr=1.32e-04 elapsed=1032s
104
+ [qwen] step 1400 val_loss=1.3184
105
+ [qwen] step 1420/3375 loss=1.6362 lr=1.30e-04 elapsed=1049s
106
+ [qwen] step 1440/3375 loss=1.2179 lr=1.28e-04 elapsed=1062s
107
+ [qwen] step 1450 val_loss=1.3086
108
+ [qwen] step 1460/3375 loss=1.2888 lr=1.26e-04 elapsed=1080s
109
+ [qwen] step 1480/3375 loss=1.3422 lr=1.24e-04 elapsed=1093s
110
+ [qwen] step 1500/3375 loss=1.5065 lr=1.23e-04 elapsed=1106s
111
+ [qwen] step 1500 val_loss=1.3138
112
+ [qwen] step 1520/3375 loss=1.3482 lr=1.21e-04 elapsed=1123s
113
+ [qwen] step 1540/3375 loss=1.3934 lr=1.19e-04 elapsed=1136s
114
+ [qwen] step 1550 val_loss=1.3221
115
+ [qwen] step 1560/3375 loss=1.3778 lr=1.17e-04 elapsed=1153s
116
+ [qwen] step 1580/3375 loss=1.3863 lr=1.15e-04 elapsed=1166s
117
+ [qwen] step 1600/3375 loss=1.3966 lr=1.13e-04 elapsed=1179s
118
+ [qwen] step 1600 val_loss=1.3248
119
+ [qwen] step 1620/3375 loss=1.2380 lr=1.11e-04 elapsed=1197s
120
+ [qwen] step 1640/3375 loss=1.1116 lr=1.09e-04 elapsed=1210s
121
+ [qwen] step 1650 val_loss=1.3187
122
+ [qwen] step 1660/3375 loss=1.2834 lr=1.07e-04 elapsed=1227s
123
+ [qwen] step 1680/3375 loss=1.4427 lr=1.06e-04 elapsed=1240s
124
+ [qwen] step 1700/3375 loss=1.1298 lr=1.04e-04 elapsed=1253s
125
+ [qwen] step 1700 val_loss=1.3172
126
+ [qwen] step 1720/3375 loss=1.2671 lr=1.02e-04 elapsed=1270s
127
+ [qwen] step 1740/3375 loss=1.4189 lr=9.98e-05 elapsed=1283s
128
+ [qwen] step 1750 val_loss=1.3274
129
+ [qwen] step 1760/3375 loss=1.1769 lr=9.78e-05 elapsed=1301s
130
+ [qwen] step 1780/3375 loss=1.5192 lr=9.59e-05 elapsed=1314s
131
+ [qwen] step 1800/3375 loss=1.4004 lr=9.40e-05 elapsed=1327s
132
+ [qwen] step 1800 val_loss=1.3293
133
+ [qwen] step 1820/3375 loss=1.2495 lr=9.21e-05 elapsed=1344s
134
+ [qwen] step 1840/3375 loss=1.4134 lr=9.02e-05 elapsed=1357s
135
+ [qwen] step 1850 val_loss=1.3086
136
+ [qwen] step 1860/3375 loss=1.2312 lr=8.83e-05 elapsed=1374s
137
+ [qwen] step 1880/3375 loss=1.2386 lr=8.64e-05 elapsed=1387s
138
+ [qwen] step 1900/3375 loss=1.4536 lr=8.45e-05 elapsed=1400s
139
+ [qwen] step 1900 val_loss=1.2963
140
+ [qwen] step 1920/3375 loss=1.1664 lr=8.26e-05 elapsed=1417s
141
+ [qwen] step 1940/3375 loss=1.1863 lr=8.07e-05 elapsed=1431s
142
+ [qwen] step 1950 val_loss=1.2917
143
+ [qwen] step 1960/3375 loss=1.4012 lr=7.88e-05 elapsed=1448s
144
+ [qwen] step 1980/3375 loss=1.2864 lr=7.69e-05 elapsed=1461s
145
+ [qwen] step 2000/3375 loss=1.4254 lr=7.51e-05 elapsed=1474s
146
+ [qwen] step 2000 val_loss=1.2892
147
+ [qwen] step 2020/3375 loss=1.2463 lr=7.32e-05 elapsed=1491s
148
+ [qwen] step 2040/3375 loss=1.2330 lr=7.14e-05 elapsed=1504s
149
+ [qwen] step 2050 val_loss=1.2848
150
+ [qwen] step 2060/3375 loss=1.3204 lr=6.96e-05 elapsed=1521s
151
+ [qwen] step 2080/3375 loss=1.1087 lr=6.77e-05 elapsed=1534s
152
+ [qwen] step 2100/3375 loss=1.3137 lr=6.59e-05 elapsed=1547s
153
+ [qwen] step 2100 val_loss=1.2806
154
+ [qwen] step 2120/3375 loss=1.1110 lr=6.41e-05 elapsed=1565s
155
+ [qwen] step 2140/3375 loss=1.3364 lr=6.23e-05 elapsed=1578s
156
+ [qwen] step 2150 val_loss=1.2771
157
+ [qwen] step 2160/3375 loss=1.2026 lr=6.06e-05 elapsed=1595s
158
+ [qwen] step 2180/3375 loss=1.5130 lr=5.88e-05 elapsed=1608s
159
+ [qwen] step 2200/3375 loss=1.3101 lr=5.71e-05 elapsed=1621s
160
+ [qwen] step 2200 val_loss=1.2735
161
+ [qwen] step 2220/3375 loss=1.1430 lr=5.54e-05 elapsed=1638s
162
+ [qwen] step 2240/3375 loss=1.4382 lr=5.36e-05 elapsed=1651s
163
+ [qwen] step 2250 val_loss=1.2697
164
+ [qwen] step 2260/3375 loss=1.2799 lr=5.20e-05 elapsed=1668s
165
+ [qwen] step 2280/3375 loss=1.2135 lr=5.03e-05 elapsed=1681s
166
+ [qwen] step 2300/3375 loss=1.5658 lr=4.86e-05 elapsed=1694s
167
+ [qwen] step 2300 val_loss=1.2722
168
+ [qwen] step 2320/3375 loss=1.0929 lr=4.70e-05 elapsed=1712s
169
+ [qwen] step 2340/3375 loss=1.3269 lr=4.54e-05 elapsed=1725s
170
+ [qwen] step 2350 val_loss=1.2670
171
+ [qwen] step 2360/3375 loss=1.1419 lr=4.38e-05 elapsed=1742s
172
+ [qwen] step 2380/3375 loss=1.1317 lr=4.22e-05 elapsed=1755s
173
+ [qwen] step 2400/3375 loss=1.3087 lr=4.06e-05 elapsed=1768s
174
+ [qwen] step 2400 val_loss=1.2614
175
+ [qwen] step 2420/3375 loss=1.3487 lr=3.91e-05 elapsed=1785s
176
+ [qwen] step 2440/3375 loss=1.3098 lr=3.76e-05 elapsed=1798s
177
+ [qwen] step 2450 val_loss=1.2615
178
+ [qwen] step 2460/3375 loss=1.0641 lr=3.61e-05 elapsed=1816s
179
+ [qwen] step 2480/3375 loss=1.1441 lr=3.46e-05 elapsed=1829s
180
+ [qwen] step 2500/3375 loss=1.4658 lr=3.32e-05 elapsed=1842s
181
+ [qwen] step 2500 val_loss=1.2598
182
+ [qwen] step 2520/3375 loss=1.0757 lr=3.18e-05 elapsed=1859s
183
+ [qwen] step 2540/3375 loss=1.2793 lr=3.04e-05 elapsed=1872s
184
+ [qwen] step 2550 val_loss=1.2584
185
+ [qwen] step 2560/3375 loss=1.1938 lr=2.90e-05 elapsed=1889s
186
+ [qwen] step 2580/3375 loss=1.2766 lr=2.77e-05 elapsed=1902s
187
+ [qwen] step 2600/3375 loss=1.2427 lr=2.64e-05 elapsed=1915s
188
+ [qwen] step 2600 val_loss=1.2535
189
+ [qwen] step 2620/3375 loss=1.3159 lr=2.51e-05 elapsed=1933s
190
+ [qwen] step 2640/3375 loss=1.2127 lr=2.38e-05 elapsed=1946s
191
+ [qwen] step 2650 val_loss=1.2495
192
+ [qwen] step 2660/3375 loss=1.2671 lr=2.26e-05 elapsed=1963s
193
+ [qwen] step 2680/3375 loss=1.3560 lr=2.14e-05 elapsed=1976s
194
+ [qwen] step 2700/3375 loss=1.3448 lr=2.02e-05 elapsed=1989s
195
+ [qwen] step 2700 val_loss=1.2487
196
+ [qwen] step 2720/3375 loss=1.4807 lr=2.00e-05 elapsed=2006s
197
+ [qwen] step 2740/3375 loss=1.2928 lr=2.00e-05 elapsed=2020s
198
+ [qwen] step 2750 val_loss=1.2438
199
+ [qwen] step 2760/3375 loss=1.4174 lr=2.00e-05 elapsed=2037s
200
+ [qwen] step 2780/3375 loss=1.1116 lr=2.00e-05 elapsed=2050s
201
+ [qwen] step 2800/3375 loss=1.3646 lr=2.00e-05 elapsed=2063s
202
+ [qwen] step 2800 val_loss=1.2422
203
+ [qwen] step 2820/3375 loss=1.1873 lr=2.00e-05 elapsed=2080s
204
+ [qwen] step 2840/3375 loss=1.2777 lr=2.00e-05 elapsed=2093s
205
+ [qwen] step 2850 val_loss=1.2417
206
+ [qwen] step 2860/3375 loss=1.2430 lr=2.00e-05 elapsed=2110s
207
+ [qwen] step 2880/3375 loss=1.3079 lr=2.00e-05 elapsed=2123s
208
+ [qwen] step 2900/3375 loss=1.1310 lr=2.00e-05 elapsed=2136s
209
+ [qwen] step 2900 val_loss=1.2389
210
+ [qwen] step 2920/3375 loss=1.0678 lr=2.00e-05 elapsed=2154s
211
+ [qwen] step 2940/3375 loss=1.4525 lr=2.00e-05 elapsed=2167s
212
+ [qwen] step 2950 val_loss=1.2381
213
+ [qwen] step 2960/3375 loss=1.2042 lr=2.00e-05 elapsed=2184s
214
+ [qwen] step 2980/3375 loss=0.9134 lr=2.00e-05 elapsed=2197s
215
+ [qwen] step 3000/3375 loss=1.2430 lr=2.00e-05 elapsed=2210s
216
+ [qwen] step 3000 val_loss=1.2379
217
+ [qwen] step 3020/3375 loss=1.3611 lr=2.00e-05 elapsed=2227s
218
+ [qwen] step 3040/3375 loss=1.4916 lr=2.00e-05 elapsed=2240s
219
+ [qwen] step 3050 val_loss=1.2373
220
+ [qwen] step 3060/3375 loss=1.2767 lr=2.00e-05 elapsed=2257s
221
+ [qwen] step 3080/3375 loss=1.2944 lr=2.00e-05 elapsed=2271s
222
+ [qwen] step 3100/3375 loss=1.5489 lr=2.00e-05 elapsed=2284s
223
+ [qwen] step 3100 val_loss=1.2352
224
+ [qwen] step 3120/3375 loss=1.3566 lr=2.00e-05 elapsed=2301s
225
+ [qwen] step 3140/3375 loss=1.3777 lr=2.00e-05 elapsed=2314s
226
+ [qwen] step 3150 val_loss=1.2327
227
+ [qwen] step 3160/3375 loss=1.3102 lr=2.00e-05 elapsed=2331s
228
+ [qwen] step 3180/3375 loss=1.3609 lr=2.00e-05 elapsed=2344s
229
+ [qwen] step 3200/3375 loss=1.1203 lr=2.00e-05 elapsed=2357s
230
+ [qwen] step 3200 val_loss=1.2295
231
+ [qwen] step 3220/3375 loss=1.2794 lr=2.00e-05 elapsed=2374s
232
+ [qwen] step 3240/3375 loss=1.3918 lr=2.00e-05 elapsed=2387s
233
+ [qwen] step 3250 val_loss=1.2313
234
+ [qwen] step 3260/3375 loss=1.3777 lr=2.00e-05 elapsed=2405s
235
+ [qwen] step 3280/3375 loss=1.1877 lr=2.00e-05 elapsed=2418s
236
+ [qwen] step 3300/3375 loss=1.0857 lr=2.00e-05 elapsed=2431s
237
+ [qwen] step 3300 val_loss=1.2278
238
+ [qwen] step 3320/3375 loss=1.3276 lr=2.00e-05 elapsed=2448s
239
+ [qwen] step 3340/3375 loss=1.1286 lr=2.00e-05 elapsed=2461s
240
+ [qwen] step 3350 val_loss=1.2278
241
+ [qwen] step 3360/3375 loss=1.1440 lr=2.00e-05 elapsed=2478s
242
+ [qwen] FINAL val_loss=1.2273
243
+ Done. Best val_loss=1.0025 at step 100 path=/data/user_data/jgai/hybrid-code-gen/ckpt/full/02_lr2e-4/02_single_ar/step100-val1.0025
02_lr2e-4/02_single_ar/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model": "Qwen/Qwen2.5-Coder-1.5B",
3
+ "train_jsonl": "/data/user_data/jgai/hybrid-code-gen/data/stack10k/train.jsonl",
4
+ "val_jsonl": "/data/user_data/jgai/hybrid-code-gen/data/stack10k/val.jsonl",
5
+ "output_dir": "/data/user_data/jgai/hybrid-code-gen/ckpt/full/02_lr2e-4/02_single_ar",
6
+ "lora_r": 32,
7
+ "lora_alpha": 64,
8
+ "lora_dropout": 0.05,
9
+ "lora_target_modules": [
10
+ "q_proj",
11
+ "k_proj",
12
+ "v_proj",
13
+ "o_proj",
14
+ "gate_proj",
15
+ "up_proj",
16
+ "down_proj"
17
+ ],
18
+ "max_steps": 3375,
19
+ "batch_size": 1,
20
+ "grad_accum": 1,
21
+ "lr": 0.0002,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 100,
24
+ "grad_clip": 1.0,
25
+ "max_seq_len": 2048,
26
+ "loss_scope": "full",
27
+ "log_interval": 20,
28
+ "eval_interval": 50,
29
+ "save_interval": 1000,
30
+ "seed": 42
31
+ }
02_lr2e-4/02_single_ar/step100-val1.0025/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Coder-1.5B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-Coder-1.5B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.0
02_lr2e-4/02_single_ar/step100-val1.0025/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-Coder-1.5B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 32,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "down_proj",
29
+ "up_proj",
30
+ "v_proj",
31
+ "k_proj",
32
+ "q_proj",
33
+ "o_proj",
34
+ "gate_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
02_lr2e-4/02_single_ar/step100-val1.0025/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab1f0fa2c8956315e741358b1d84df2eba9ed2c6b89e49ad2204e67f34672e5a
3
+ size 1079797096
02_lr2e-4/02_single_ar/step100-val1.0025/added_tokens.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|body_0|>": 151667,
5
+ "<|body_1|>": 151668,
6
+ "<|body_2|>": 151669,
7
+ "<|body_3|>": 151670,
8
+ "<|body_4|>": 151671,
9
+ "<|body_5|>": 151672,
10
+ "<|body_6|>": 151673,
11
+ "<|body_7|>": 151674,
12
+ "<|body_8|>": 151675,
13
+ "<|body_9|>": 151676,
14
+ "<|body_start_0|>": 151677,
15
+ "<|body_start_1|>": 151678,
16
+ "<|body_start_2|>": 151679,
17
+ "<|body_start_3|>": 151680,
18
+ "<|body_start_4|>": 151681,
19
+ "<|body_start_5|>": 151682,
20
+ "<|body_start_6|>": 151683,
21
+ "<|body_start_7|>": 151684,
22
+ "<|body_start_8|>": 151685,
23
+ "<|body_start_9|>": 151686,
24
+ "<|box_end|>": 151649,
25
+ "<|box_start|>": 151648,
26
+ "<|end_body_0|>": 151687,
27
+ "<|end_body_1|>": 151688,
28
+ "<|end_body_2|>": 151689,
29
+ "<|end_body_3|>": 151690,
30
+ "<|end_body_4|>": 151691,
31
+ "<|end_body_5|>": 151692,
32
+ "<|end_body_6|>": 151693,
33
+ "<|end_body_7|>": 151694,
34
+ "<|end_body_8|>": 151695,
35
+ "<|end_body_9|>": 151696,
36
+ "<|end_skeleton|>": 151666,
37
+ "<|endoftext|>": 151643,
38
+ "<|file_sep|>": 151664,
39
+ "<|fim_middle|>": 151660,
40
+ "<|fim_pad|>": 151662,
41
+ "<|fim_prefix|>": 151659,
42
+ "<|fim_suffix|>": 151661,
43
+ "<|im_end|>": 151645,
44
+ "<|im_start|>": 151644,
45
+ "<|image_pad|>": 151655,
46
+ "<|object_ref_end|>": 151647,
47
+ "<|object_ref_start|>": 151646,
48
+ "<|quad_end|>": 151651,
49
+ "<|quad_start|>": 151650,
50
+ "<|repo_name|>": 151663,
51
+ "<|skeleton|>": 151665,
52
+ "<|video_pad|>": 151656,
53
+ "<|vision_end|>": 151653,
54
+ "<|vision_pad|>": 151654,
55
+ "<|vision_start|>": 151652
56
+ }
02_lr2e-4/02_single_ar/step100-val1.0025/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
02_lr2e-4/02_single_ar/step100-val1.0025/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
02_lr2e-4/02_single_ar/step100-val1.0025/special_tokens_map.json ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|skeleton|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|end_skeleton|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|body_0|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|body_1|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<|body_2|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<|body_3|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<|body_4|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<|body_5|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<|body_6|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<|body_7|>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<|body_8|>",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<|body_9|>",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<|body_start_0|>",
89
+ "lstrip": false,
90
+ "normalized": false,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<|body_start_1|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<|body_start_2|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<|body_start_3|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "<|body_start_4|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<|body_start_5|>",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "<|body_start_6|>",
131
+ "lstrip": false,
132
+ "normalized": false,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<|body_start_7|>",
138
+ "lstrip": false,
139
+ "normalized": false,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "<|body_start_8|>",
145
+ "lstrip": false,
146
+ "normalized": false,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "<|body_start_9|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "<|end_body_0|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "<|end_body_1|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "<|end_body_2|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "<|end_body_3|>",
180
+ "lstrip": false,
181
+ "normalized": false,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "<|end_body_4|>",
187
+ "lstrip": false,
188
+ "normalized": false,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "<|end_body_5|>",
194
+ "lstrip": false,
195
+ "normalized": false,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "<|end_body_6|>",
201
+ "lstrip": false,
202
+ "normalized": false,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "<|end_body_7|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "<|end_body_8|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "<|end_body_9|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ }
227
+ ],
228
+ "eos_token": {
229
+ "content": "<|endoftext|>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false
234
+ },
235
+ "pad_token": {
236
+ "content": "<|endoftext|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false
241
+ }
242
+ }
02_lr2e-4/02_single_ar/step100-val1.0025/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e418905c4788c5f5ea0dd1ca15c904591f50fce71e16fc311f1b4d84e7ed50d8
3
+ size 11428087
02_lr2e-4/02_single_ar/step100-val1.0025/tokenizer_config.json ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|skeleton|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|end_skeleton|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|body_0|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|body_1|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|body_2|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|body_3|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|body_4|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|body_5|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|body_6|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|body_7|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|body_8|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<|body_9|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<|body_start_0|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<|body_start_1|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<|body_start_2|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<|body_start_3|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<|body_start_4|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<|body_start_5|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<|body_start_6|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<|body_start_7|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<|body_start_8|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<|body_start_9|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<|end_body_0|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<|end_body_1|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<|end_body_2|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<|end_body_3|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<|end_body_4|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<|end_body_5|>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<|end_body_6|>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<|end_body_7|>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<|end_body_8|>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<|end_body_9|>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ }
437
+ },
438
+ "additional_special_tokens": [
439
+ "<|skeleton|>",
440
+ "<|end_skeleton|>",
441
+ "<|body_0|>",
442
+ "<|body_1|>",
443
+ "<|body_2|>",
444
+ "<|body_3|>",
445
+ "<|body_4|>",
446
+ "<|body_5|>",
447
+ "<|body_6|>",
448
+ "<|body_7|>",
449
+ "<|body_8|>",
450
+ "<|body_9|>",
451
+ "<|body_start_0|>",
452
+ "<|body_start_1|>",
453
+ "<|body_start_2|>",
454
+ "<|body_start_3|>",
455
+ "<|body_start_4|>",
456
+ "<|body_start_5|>",
457
+ "<|body_start_6|>",
458
+ "<|body_start_7|>",
459
+ "<|body_start_8|>",
460
+ "<|body_start_9|>",
461
+ "<|end_body_0|>",
462
+ "<|end_body_1|>",
463
+ "<|end_body_2|>",
464
+ "<|end_body_3|>",
465
+ "<|end_body_4|>",
466
+ "<|end_body_5|>",
467
+ "<|end_body_6|>",
468
+ "<|end_body_7|>",
469
+ "<|end_body_8|>",
470
+ "<|end_body_9|>"
471
+ ],
472
+ "bos_token": null,
473
+ "clean_up_tokenization_spaces": false,
474
+ "eos_token": "<|endoftext|>",
475
+ "errors": "replace",
476
+ "extra_special_tokens": {},
477
+ "model_max_length": 32768,
478
+ "pad_token": "<|endoftext|>",
479
+ "split_special_tokens": false,
480
+ "tokenizer_class": "Qwen2Tokenizer",
481
+ "unk_token": null
482
+ }
02_lr2e-4/02_single_ar/step100-val1.0025/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
02_lr2e-4/02_single_ar/step50-val0.9883/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Coder-1.5B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-Coder-1.5B
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.0
02_lr2e-4/02_single_ar/step50-val0.9883/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-Coder-1.5B",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 32,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "down_proj",
29
+ "up_proj",
30
+ "v_proj",
31
+ "k_proj",
32
+ "q_proj",
33
+ "o_proj",
34
+ "gate_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
02_lr2e-4/02_single_ar/step50-val0.9883/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c2766251903a846ca9d530ee155b11492300064a8ffb5e186789497d37dc06d
3
+ size 1079797096
02_lr2e-4/02_single_ar/step50-val0.9883/added_tokens.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|body_0|>": 151667,
5
+ "<|body_1|>": 151668,
6
+ "<|body_2|>": 151669,
7
+ "<|body_3|>": 151670,
8
+ "<|body_4|>": 151671,
9
+ "<|body_5|>": 151672,
10
+ "<|body_6|>": 151673,
11
+ "<|body_7|>": 151674,
12
+ "<|body_8|>": 151675,
13
+ "<|body_9|>": 151676,
14
+ "<|body_start_0|>": 151677,
15
+ "<|body_start_1|>": 151678,
16
+ "<|body_start_2|>": 151679,
17
+ "<|body_start_3|>": 151680,
18
+ "<|body_start_4|>": 151681,
19
+ "<|body_start_5|>": 151682,
20
+ "<|body_start_6|>": 151683,
21
+ "<|body_start_7|>": 151684,
22
+ "<|body_start_8|>": 151685,
23
+ "<|body_start_9|>": 151686,
24
+ "<|box_end|>": 151649,
25
+ "<|box_start|>": 151648,
26
+ "<|end_body_0|>": 151687,
27
+ "<|end_body_1|>": 151688,
28
+ "<|end_body_2|>": 151689,
29
+ "<|end_body_3|>": 151690,
30
+ "<|end_body_4|>": 151691,
31
+ "<|end_body_5|>": 151692,
32
+ "<|end_body_6|>": 151693,
33
+ "<|end_body_7|>": 151694,
34
+ "<|end_body_8|>": 151695,
35
+ "<|end_body_9|>": 151696,
36
+ "<|end_skeleton|>": 151666,
37
+ "<|endoftext|>": 151643,
38
+ "<|file_sep|>": 151664,
39
+ "<|fim_middle|>": 151660,
40
+ "<|fim_pad|>": 151662,
41
+ "<|fim_prefix|>": 151659,
42
+ "<|fim_suffix|>": 151661,
43
+ "<|im_end|>": 151645,
44
+ "<|im_start|>": 151644,
45
+ "<|image_pad|>": 151655,
46
+ "<|object_ref_end|>": 151647,
47
+ "<|object_ref_start|>": 151646,
48
+ "<|quad_end|>": 151651,
49
+ "<|quad_start|>": 151650,
50
+ "<|repo_name|>": 151663,
51
+ "<|skeleton|>": 151665,
52
+ "<|video_pad|>": 151656,
53
+ "<|vision_end|>": 151653,
54
+ "<|vision_pad|>": 151654,
55
+ "<|vision_start|>": 151652
56
+ }
02_lr2e-4/02_single_ar/step50-val0.9883/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
02_lr2e-4/02_single_ar/step50-val0.9883/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
02_lr2e-4/02_single_ar/step50-val0.9883/special_tokens_map.json ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|skeleton|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|end_skeleton|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|body_0|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<|body_1|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<|body_2|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<|body_3|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "<|body_4|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "<|body_5|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "<|body_6|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "<|body_7|>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "<|body_8|>",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "<|body_9|>",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "<|body_start_0|>",
89
+ "lstrip": false,
90
+ "normalized": false,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "<|body_start_1|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "<|body_start_2|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "<|body_start_3|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "<|body_start_4|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "<|body_start_5|>",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "<|body_start_6|>",
131
+ "lstrip": false,
132
+ "normalized": false,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "<|body_start_7|>",
138
+ "lstrip": false,
139
+ "normalized": false,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "<|body_start_8|>",
145
+ "lstrip": false,
146
+ "normalized": false,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "<|body_start_9|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "<|end_body_0|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "<|end_body_1|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "<|end_body_2|>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "<|end_body_3|>",
180
+ "lstrip": false,
181
+ "normalized": false,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "<|end_body_4|>",
187
+ "lstrip": false,
188
+ "normalized": false,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "<|end_body_5|>",
194
+ "lstrip": false,
195
+ "normalized": false,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "<|end_body_6|>",
201
+ "lstrip": false,
202
+ "normalized": false,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "<|end_body_7|>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "<|end_body_8|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "<|end_body_9|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ }
227
+ ],
228
+ "eos_token": {
229
+ "content": "<|endoftext|>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false
234
+ },
235
+ "pad_token": {
236
+ "content": "<|endoftext|>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false
241
+ }
242
+ }
02_lr2e-4/02_single_ar/step50-val0.9883/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e418905c4788c5f5ea0dd1ca15c904591f50fce71e16fc311f1b4d84e7ed50d8
3
+ size 11428087
02_lr2e-4/02_single_ar/step50-val0.9883/tokenizer_config.json ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|skeleton|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|end_skeleton|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|body_0|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|body_1|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|body_2|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|body_3|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|body_4|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|body_5|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|body_6|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|body_7|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|body_8|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<|body_9|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<|body_start_0|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<|body_start_1|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<|body_start_2|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<|body_start_3|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<|body_start_4|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<|body_start_5|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<|body_start_6|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<|body_start_7|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<|body_start_8|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<|body_start_9|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<|end_body_0|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<|end_body_1|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<|end_body_2|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<|end_body_3|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<|end_body_4|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<|end_body_5|>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<|end_body_6|>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<|end_body_7|>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<|end_body_8|>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<|end_body_9|>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ }
437
+ },
438
+ "additional_special_tokens": [
439
+ "<|skeleton|>",
440
+ "<|end_skeleton|>",
441
+ "<|body_0|>",
442
+ "<|body_1|>",
443
+ "<|body_2|>",
444
+ "<|body_3|>",
445
+ "<|body_4|>",
446
+ "<|body_5|>",
447
+ "<|body_6|>",
448
+ "<|body_7|>",
449
+ "<|body_8|>",
450
+ "<|body_9|>",
451
+ "<|body_start_0|>",
452
+ "<|body_start_1|>",
453
+ "<|body_start_2|>",
454
+ "<|body_start_3|>",
455
+ "<|body_start_4|>",
456
+ "<|body_start_5|>",
457
+ "<|body_start_6|>",
458
+ "<|body_start_7|>",
459
+ "<|body_start_8|>",
460
+ "<|body_start_9|>",
461
+ "<|end_body_0|>",
462
+ "<|end_body_1|>",
463
+ "<|end_body_2|>",
464
+ "<|end_body_3|>",
465
+ "<|end_body_4|>",
466
+ "<|end_body_5|>",
467
+ "<|end_body_6|>",
468
+ "<|end_body_7|>",
469
+ "<|end_body_8|>",
470
+ "<|end_body_9|>"
471
+ ],
472
+ "bos_token": null,
473
+ "clean_up_tokenization_spaces": false,
474
+ "eos_token": "<|endoftext|>",
475
+ "errors": "replace",
476
+ "extra_special_tokens": {},
477
+ "model_max_length": 32768,
478
+ "pad_token": "<|endoftext|>",
479
+ "split_special_tokens": false,
480
+ "tokenizer_class": "Qwen2Tokenizer",
481
+ "unk_token": null
482
+ }
02_lr2e-4/02_single_ar/step50-val0.9883/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
02_lr2e-4/02_single_ar/topk_val.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "rank": 0,
4
+ "val_loss": 1.0024574875831604,
5
+ "step": 100,
6
+ "path": "/data/user_data/jgai/hybrid-code-gen/ckpt/full/02_lr2e-4/02_single_ar/step100-val1.0025"
7
+ },
8
+ {
9
+ "rank": 1,
10
+ "val_loss": 0.9883273065090179,
11
+ "step": 50,
12
+ "path": "/data/user_data/jgai/hybrid-code-gen/ckpt/full/02_lr2e-4/02_single_ar/step50-val0.9883"
13
+ }
14
+ ]