// File size: 138,901 Bytes
// ca97aa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
import { LlamaTokenizer } from "../../../src/tokenizers.js";
import { BASE_TEST_STRINGS, LLAMA_TEST_STRINGS } from "../test_strings.js";

/**
 * Tokenizer class associated with the fixtures in this module; it is the
 * `LlamaTokenizer` imported from `src/tokenizers.js`.
 * NOTE(review): presumably read by a shared test runner together with
 * `TEST_CONFIG` — confirm against whichever module imports this file.
 */
export const TOKENIZER_CLASS = LlamaTokenizer;
export const TEST_CONFIG = {
  "Xenova/llama-tokenizer": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      tokens: ["\u2581How", "\u2581are", "\u2581you", "\u2581doing", "?"],
      ids: [1, 1128, 526, 366, 2599, 29973],
      decoded: "<s> How are you doing?",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["\u2581You", "\u2581should", "'", "ve", "\u2581done", "\u2581this"],
      ids: [1, 887, 881, 29915, 345, 2309, 445],
      decoded: "<s> You should've done this",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["\u2581", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "\u2581", "0", "\u2581", "1", "\u2581", "2", "\u2581", "3", "\u2581", "4", "\u2581", "5", "\u2581", "6", "\u2581", "7", "\u2581", "8", "\u2581", "9", "\u2581", "1", "0", "\u2581", "1", "0", "0", "\u2581", "1", "0", "0", "0"],
      ids: [1, 29871, 29900, 29896, 29906, 29941, 29946, 29945, 29953, 29955, 29947, 29929, 29871, 29900, 29871, 29896, 29871, 29906, 29871, 29941, 29871, 29946, 29871, 29945, 29871, 29953, 29871, 29955, 29871, 29947, 29871, 29929, 29871, 29896, 29900, 29871, 29896, 29900, 29900, 29871, 29896, 29900, 29900, 29900],
      decoded: "<s> 0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["\u2581The", "\u2581company", "\u2581was", "\u2581founded", "\u2581in", "\u2581", "2", "0", "1", "6", "."],
      ids: [1, 450, 5001, 471, 11091, 297, 29871, 29906, 29900, 29896, 29953, 29889],
      decoded: "<s> The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["\u2581A", "<0x0A>", "'", "ll", "\u2581!!", "to", "?'", "d", "''", "d", "\u2581of", ",", "\u2581can", "'", "t", "."],
      ids: [1, 319, 13, 29915, 645, 21443, 517, 17901, 29881, 4907, 29881, 310, 29892, 508, 29915, 29873, 29889],
      decoded: "<s> A\n'll !!to?'d''d of, can't.",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["\u2581def", "\u2581main", "():", "<0x0A>", "<0x09>", "pass"],
      ids: [1, 822, 1667, 7295, 13, 12, 3364],
      decoded: "<s> def main():\n\tpass",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["\u2581let", "\u2581a", "\u2581=", "\u2581obj", ".", "toString", "();", "<0x0A>", "toString", "();"],
      ids: [1, 1235, 263, 353, 5446, 29889, 7711, 890, 13, 7711, 890],
      decoded: "<s> let a = obj.toString();\ntoString();",
    },
    NEWLINES: {
      text: LLAMA_TEST_STRINGS.NEWLINES,
      tokens: ["\u2581ax", "<0x0A>", "####", "<0x0A>", "bo", "o"],
      ids: [1, 4853, 13, 4136, 13, 833, 29877],
      decoded: "<s> ax\n####\nboo",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["\u2581UN", "w", "ant", "\u00e9d", ",", "running"],
      ids: [1, 8291, 29893, 424, 2487, 29892, 21094],
      decoded: "<s> UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["\u2581", "1", "<0x00>", "2", "\ufffd", "3"],
      ids: [1, 29871, 29896, 3, 29906, 30140, 29941],
      decoded: "<s> 1\u00002\ufffd3",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["\u2581Hello", "\u2581World"],
      ids: [1, 15043, 2787],
      decoded: "<s> Hello World",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      tokens: ["\u2581hello", "\u2581world"],
      ids: [1, 22172, 3186],
      decoded: "<s> hello world",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u2581", "\u751f", "\u6d3b", "\u7684", "\u771f", "<0xE8>", "<0xB0>", "<0x9B>", "\u662f"],
      ids: [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392],
      decoded: "<s> \u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u2581\u2581\u2581", "\u2581leading", "\u2581space"],
      ids: [1, 1678, 8236, 2913],
      decoded: "<s>    leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["\u2581trailing", "\u2581space", "\u2581\u2581\u2581"],
      ids: [1, 25053, 2913, 1678],
      decoded: "<s> trailing space   ",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["\u2581Hi", "\u2581", "\u2581Hello"],
      ids: [1, 6324, 29871, 15043],
      decoded: "<s> Hi  Hello",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["\u2581test", "\u2581$", "1", "\u2581R", "2", "\u2581#", "3", "\u2581\u20ac", "4", "\u2581\u00a3", "5", "\u2581", "\u00a5", "6", "\u2581", "<0xE2>", "<0x82>", "<0xA3>", "7", "\u2581", "\u20b9", "8", "\u2581", "<0xE2>", "<0x82>", "<0xB1>", "9", "\u2581test"],
      ids: [1, 1243, 395, 29896, 390, 29906, 396, 29941, 25540, 29946, 15151, 29945, 29871, 30563, 29953, 29871, 229, 133, 166, 29955, 29871, 30620, 29947, 29871, 229, 133, 180, 29929, 1243],
      decoded: "<s> test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["\u2581I", "\u2581bought", "\u2581an", "\u2581apple", "\u2581for", "\u2581$", "1", ".", "0", "0", "\u2581at", "\u2581the", "\u2581store", "."],
      ids: [1, 306, 18093, 385, 26163, 363, 395, 29896, 29889, 29900, 29900, 472, 278, 3787, 29889],
      decoded: "<s> I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["\u2581you", "\u2026", "\u2581\u2581"],
      ids: [1, 366, 30098, 259],
      decoded: "<s> you\u2026  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["\u2581you", "\u2026", "\u00a0\u00a0"],
      ids: [1, 366, 30098, 8655],
      decoded: "<s> you\u2026\u00a0\u00a0",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["\u2581you", "\u2026", "\u00a0\u00a0", "you", "\u2026", "\u00a0\u00a0"],
      ids: [1, 366, 30098, 8655, 6293, 30098, 8655],
      decoded: "<s> you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["\u2581weird", "\u2581", "\uff5e", "\u2581edge", "\u2581", "\uff5e", "\u2581case"],
      ids: [1, 13543, 29871, 30739, 7636, 29871, 30739, 1206],
      decoded: "<s> weird \uff5e edge \uff5e case",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u2581", "\u2581This", "\u2581", "\u2581is", "\u2581", "\u2581a", "\u2581", "\u2581test", "\u2581", "\u2581."],
      ids: [1, 29871, 910, 29871, 338, 29871, 263, 29871, 1243, 29871, 869],
      decoded: "<s>  This  is  a  test  .",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x82>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8D>", "\u2581", "<0xF0>", "<0x9F>", "<0xA4>", "<0xA3>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x8D>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0xAD>", "\u2581", "<0xF0>", "<0x9F>", "<0x8E>", "<0x89>", "\u2581", "<0xF0>", "<0x9F>", "<0x99>", "<0x8F>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x8A>", "\u2581", "<0xF0>", "<0x9F>", "<0x94>", "<0xA5>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x81>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x85>", "\u2581", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x86>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8F>", "\u2581", "<0xE2>", "<0x9D>", "<0xA4>", "\ufe0f", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x9C>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x9A>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x97>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x99>", "\u2581", "<0xF0>", "<0x9F>", "<0x96>", "<0xA4>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x8E>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8C>", "\u2581", "<0xF0>", "<0x9F>", "<0xA5>", "<0xB3>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0xAA>", "\u2581", "<0xE2>", "<0x9C>", "<0xA8>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x89>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x80>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0xAF>", "\u2581", "<0xF0>", "<0x9F>", "<0x8E>", "<0x88>", "\u2581", "<0xF0>", "<0x9F>", "<0x99>", "<0x88>", "\u2581", "<0xF0>", "<0x9F>", "<0x99>", "<0x8C>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x80>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x87>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8B>", "\u2581", "\u2705", "\u2581", "<0xF0>", "<0x9F>", "<0x8E>", "<0x81>", "\u2581", "<0xF0>", "<0x9F>", "<0x8C>", "<0x9E>", "\u2581", "<0xF0>", "<0x9F>", "<0x8C>", "<0xB8>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0xB0>"],
      ids: [1, 29871, 243, 162, 155, 133, 29871, 243, 162, 148, 144, 29871, 243, 162, 167, 166, 29871, 243, 162, 155, 144, 29871, 243, 162, 155, 176, 29871, 243, 162, 145, 140, 29871, 243, 162, 156, 146, 29871, 243, 162, 155, 141, 29871, 243, 162, 151, 168, 29871, 243, 162, 155, 132, 29871, 243, 162, 155, 136, 29871, 243, 162, 167, 154, 29871, 243, 162, 155, 137, 29871, 243, 162, 148, 146, 29871, 229, 160, 167, 30598, 29871, 243, 162, 149, 159, 29871, 243, 162, 149, 157, 29871, 243, 162, 149, 154, 29871, 243, 162, 149, 156, 29871, 243, 162, 153, 167, 29871, 243, 162, 155, 145, 29871, 243, 162, 148, 143, 29871, 243, 162, 168, 182, 29871, 243, 162, 149, 173, 29871, 229, 159, 171, 29871, 243, 162, 148, 140, 29871, 243, 162, 148, 131, 29871, 243, 162, 149, 178, 29871, 243, 162, 145, 139, 29871, 243, 162, 156, 139, 29871, 243, 162, 156, 143, 29871, 243, 162, 149, 131, 29871, 243, 162, 148, 138, 29871, 243, 162, 148, 142, 29871, 31681, 29871, 243, 162, 145, 132, 29871, 243, 162, 143, 161, 29871, 243, 162, 143, 187, 29871, 243, 162, 149, 179],
      decoded: "<s> \ud83d\ude02 \ud83d\udc4d \ud83e\udd23 \ud83d\ude0d \ud83d\ude2d \ud83c\udf89 \ud83d\ude4f \ud83d\ude0a \ud83d\udd25 \ud83d\ude01 \ud83d\ude05 \ud83e\udd17 \ud83d\ude06 \ud83d\udc4f \u2764\ufe0f \ud83d\udc9c \ud83d\udc9a \ud83d\udc97 \ud83d\udc99 \ud83d\udda4 \ud83d\ude0e \ud83d\udc4c \ud83e\udd73 \ud83d\udcaa \u2728 \ud83d\udc49 \ud83d\udc40 \ud83d\udcaf \ud83c\udf88 \ud83d\ude48 \ud83d\ude4c \ud83d\udc80 \ud83d\udc47 \ud83d\udc4b \u2705 \ud83c\udf81 \ud83c\udf1e \ud83c\udf38 \ud83d\udcb0",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u2581", "<0xE2>", "<0x9C>", "<0xA8>", "\u2581", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x81>", "\ufe0f", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xB1>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u2581", "<0xF0>", "<0x9F>", "<0x95>", "<0xB5>", "\u200d", "\u2642", "\ufe0f", "\u2581", "<0xF0>", "<0x9F>", "<0xA7>", "<0x99>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u200d", "\u2642", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u200d", "<0xF0>", "<0x9F>", "<0x8C>", "<0xBE>", "\u2581", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "\u200d", "<0xF0>", "<0x9F>", "<0xA4>", "<0x9D>", "\u200d", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA9>", "\u200d", "<0xE2>", "<0x9D>", "<0xA4>", "\u200d", "<0xF0>", "<0x9F>", "<0x92>", "<0x8B>", "\u200d", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA9>", "\u200d", "<0xF0>", "<0x9F>", "<0x91>", "<0xA9>", "\u200d", "<0xF0>", "<0x9F>", "<0x91>", "<0xA7>", "\u200d", "<0xF0>", "<0x9F>", "<0x91>", "<0xA6>", "\u2581", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u200d", "<0xF0>", "<0x9F>", "<0xA4>", "<0x9D>", "\u200d", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u2581", "<0xF0>", "<0x9F>", "<0x8F>", "<0xB4>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA7>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA2>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA5>", "<0xF3>", "<0xA0>", "<0x81>", "<0xAE>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA7>", "<0xF3>", "<0xA0>", "<0x81>", "<0xBF>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u200d", "<0xE2>", "<0x9D>", "<0xA4>", "\ufe0f", "\u200d", "<0xF0>", "<0x9F>", "<0x92>", "<0x8B>", "\u200d", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBC>"],
      ids: [1, 29871, 229, 159, 171, 29871, 243, 162, 167, 154, 29871, 243, 162, 148, 132, 30598, 29871, 243, 162, 148, 180, 243, 162, 146, 190, 29871, 243, 162, 152, 184, 30722, 31135, 30598, 29871, 243, 162, 170, 156, 243, 162, 146, 190, 30722, 31135, 29871, 243, 162, 148, 171, 243, 162, 146, 190, 30722, 243, 162, 143, 193, 29871, 243, 162, 170, 148, 30722, 243, 162, 167, 160, 30722, 243, 162, 170, 148, 29871, 243, 162, 148, 172, 30722, 229, 160, 167, 30722, 243, 162, 149, 142, 30722, 243, 162, 148, 171, 29871, 243, 162, 148, 172, 30722, 243, 162, 148, 172, 30722, 243, 162, 148, 170, 30722, 243, 162, 148, 169, 29871, 243, 162, 170, 148, 243, 162, 146, 190, 30722, 243, 162, 167, 160, 30722, 243, 162, 170, 148, 243, 162, 146, 190, 29871, 243, 162, 146, 183, 246, 163, 132, 170, 246, 163, 132, 165, 246, 163, 132, 168, 246, 163, 132, 177, 246, 163, 132, 170, 246, 163, 132, 194, 29871, 243, 162, 148, 171, 243, 162, 146, 190, 30722, 229, 160, 167, 30598, 30722, 243, 162, 149, 142, 30722, 243, 162, 148, 171, 243, 162, 146, 191],
      decoded: "<s> \u2728 \ud83e\udd17 \ud83d\udc41\ufe0f \ud83d\udc71\ud83c\udffb \ud83d\udd75\u200d\u2642\ufe0f \ud83e\uddd9\ud83c\udffb\u200d\u2642 \ud83d\udc68\ud83c\udffb\u200d\ud83c\udf3e \ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1 \ud83d\udc69\u200d\u2764\u200d\ud83d\udc8b\u200d\ud83d\udc68 \ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc66 \ud83e\uddd1\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c\udffb \ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f \ud83d\udc68\ud83c\udffb\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d\udc68\ud83c\udffc",
    },
    BPE_SCORES_PRIORITY_1: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_1,
      tokens: ["\u2581gra", "bb", "ed"],
      ids: [1, 2646, 1327, 287],
      decoded: "<s> grabbed",
    },
    BPE_SCORES_PRIORITY_2: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_2,
      tokens: ["\u2581", "\u2581gra", "bb", "ed"],
      ids: [1, 29871, 2646, 1327, 287],
      decoded: "<s>  grabbed",
    },
    BPE_SCORES_PRIORITY_3: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_3,
      tokens: ["\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581\u2581", "\u2581gra", "bb", "ed"],
      ids: [1, 9651, 2646, 1327, 287],
      decoded: "<s>            grabbed",
    },
    NEWLINE: {
      text: LLAMA_TEST_STRINGS.NEWLINE,
      tokens: ["\u2581", "<0x0A>"],
      ids: [1, 29871, 13],
      decoded: "<s> \n",
    },
    NEWLINE_WITH_LEADING_SPACE: {
      text: LLAMA_TEST_STRINGS.NEWLINE_WITH_LEADING_SPACE,
      tokens: ["\u2581\u2581", "<0x0A>"],
      ids: [1, 259, 13],
      decoded: "<s>  \n",
    },
    TABS: {
      text: LLAMA_TEST_STRINGS.TABS,
      tokens: ["\u2581", "<0x09>", "tabs", "<0x09>", "<0x09>", "<0x09>", "<0x09>", "out", "\u2581here"],
      ids: [1, 29871, 12, 21175, 12, 12, 12, 12, 449, 1244],
      decoded: "<s> \ttabs\t\t\t\tout here",
    },
    NEWLINE_AND_TAB: {
      text: LLAMA_TEST_STRINGS.NEWLINE_AND_TAB,
      tokens: ["\u2581", "<0x0A>", "<0x09>", "<0x0A>"],
      ids: [1, 29871, 13, 12, 13],
      decoded: "<s> \n\t\n",
    },
    CHINESE_LETTER: {
      text: LLAMA_TEST_STRINGS.CHINESE_LETTER,
      tokens: ["\u2581", "\u9547"],
      ids: [1, 29871, 30411],
      decoded: "<s> \u9547",
    },
    EMOJIS_1: {
      text: LLAMA_TEST_STRINGS.EMOJIS_1,
      tokens: ["\u2581", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>"],
      ids: [1, 29871, 243, 162, 169, 156],
      decoded: "<s> \ud83e\udd99",
    },
    EMOJIS_2: {
      text: LLAMA_TEST_STRINGS.EMOJIS_2,
      tokens: ["\u2581", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>", "<0xEA>", "<0x99>", "<0x8A>"],
      ids: [1, 29871, 243, 162, 169, 156, 237, 156, 141],
      decoded: "<s> \ud83e\udd99\ua64a",
    },
    EMOJIS_3: {
      text: LLAMA_TEST_STRINGS.EMOJIS_3,
      tokens: ["\u2581", "<0xEA>", "<0x99>", "<0x8A>", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>"],
      ids: [1, 29871, 237, 156, 141, 243, 162, 169, 156],
      decoded: "<s> \ua64a\ud83e\udd99",
    },
    PARAGRAPH: {
      text: LLAMA_TEST_STRINGS.PARAGRAPH,
      tokens: ["\u2581The", "\u2581ll", "ama", "\u2581(/", "\u02c8", "l", "\u0251", "\u02d0", "m", "\u0259", "/", ";", "\u2581", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>", "Span", "ish", "\u2581pron", "unci", "ation", ":", "\u2581[", "\u02c8", "\u028e", "ama", "])", "\u2581(", "L", "ama", "\u2581gl", "ama", ")", "\u2581is", "\u2581a", "\u2581domestic", "ated", "\u2581South", "\u2581American", "\u2581cam", "el", "id", ",", "\u2581widely", "\u2581used", "\u2581as", "\u2581a", "\u2581meat", "\u2581and", "\u2581pack", "\u2581animal", "\u2581by", "\u2581And", "e", "an", "\u2581cult", "ures", "\u2581since", "\u2581the", "\u2581Pre", "-", "Col", "umb", "ian", "\u2581era", ".", "\u2581L", "lam", "as", "\u2581are", "\u2581social", "\u2581animals", "\u2581and", "\u2581live", "\u2581with", "\u2581others", "\u2581as", "\u2581a", "\u2581her", "d", ".", "\u2581Their", "\u2581w", "ool", "\u2581is", "\u2581soft", "\u2581and", "\u2581contains", "\u2581only", "\u2581a", "\u2581small", "\u2581amount", "\u2581of", "\u2581lan", "olin", ".[", "2", "]", "\u2581L", "lam", "as", "\u2581can", "\u2581learn", "\u2581simple", "\u2581tasks", "\u2581after", "\u2581a", "\u2581few", "\u2581repet", "itions", ".", "\u2581When", "\u2581using", "\u2581a", "\u2581pack", ",", "\u2581they", "\u2581can", "\u2581carry", "\u2581about", "\u2581", "2", "5", "\u2581to", "\u2581", "3", "0", "%", "\u2581of", "\u2581their", "\u2581body", "\u2581weight", "\u2581for", "\u2581", "8", "\u2581to", "\u2581", "1", "3", "\u2581km", "\u2581(", "5", "\u2013", "8", "\u2581miles", ").", "[", "3", "]", "\u2581The", "\u2581name", "\u2581ll", "ama", "\u2581(", "in", "\u2581the", "\u2581past", "\u2581also", "\u2581sp", "elled", '\u2581"', "l", "ama", '"', "\u2581or", '\u2581"', "gl", "ama", '")', "\u2581was", "\u2581adopted", "\u2581by", "\u2581European", "\u2581sett", "lers", "\u2581from", "\u2581native", "\u2581Peru", "vi", "ans", ".[", "4", "]", "\u2581The", "\u2581ancest", "ors", "\u2581of", "\u2581llam", "as", "\u2581are", 
"\u2581thought", "\u2581to", "\u2581have", "\u2581origin", "ated", "\u2581from", "\u2581the", "\u2581Great", "\u2581Pla", "ins", "\u2581of", "\u2581North", "\u2581America", "\u2581about", "\u2581", "4", "0", "\u2581million", "\u2581years", "\u2581ago", ",", "\u2581and", "\u2581subsequently", "\u2581migr", "ated", "\u2581to", "\u2581South", "\u2581America", "\u2581about", "\u2581three", "\u2581million", "\u2581years", "\u2581ago", "\u2581during", "\u2581the", "\u2581Great", "\u2581American", "\u2581Inter", "change", ".", "\u2581By", "\u2581the", "\u2581end", "\u2581of", "\u2581the", "\u2581last", "\u2581ice", "\u2581age", "\u2581(", "1", "0", ",", "0", "0", "0", "\u2013", "1", "2", ",", "0", "0", "0", "\u2581years", "\u2581ago", "),", "\u2581cam", "el", "ids", "\u2581were", "\u2581ext", "inct", "\u2581in", "\u2581North", "\u2581America", ".[", "3", "]", "\u2581As", "\u2581of", "\u2581", "2", "0", "0", "7", ",", "\u2581there", "\u2581were", "\u2581over", "\u2581seven", "\u2581million", "\u2581llam", "as", "\u2581and", "\u2581al", "p", "ac", "as", "\u2581in", "\u2581South", "\u2581America", "\u2581and", "\u2581over", "\u2581", "1", "5", "8", ",", "0", "0", "0", "\u2581llam", "as", "\u2581and", "\u2581", "1", "0", "0", ",", "0", "0", "0", "<0xEA>", "<0x99>", "<0x8A>", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>", "\u2581al", "p", "ac", "as", ",", "\u2581desc", "ended", "\u2581from", "\u2581pro", "gen", "itors", "\u2581imported", "\u2581late", "\u2581in", "\u2581the", "\u2581", "2", "0", "th", "\u2581century", ",", "\u2581in", "\u2581the", "\u2581United", "\u2581States", "\u2581and", "\u2581Canada", ".[", "5", "]", "\u2581In", "\u2581A", "ym", "ara", "\u2581myth", "ology", ",", "\u2581llam", "as", "\u2581are", "\u2581important", "\u2581be", "ings", ".", "\u2581The", "\u2581Heaven", "ly", "\u2581L", "l", "ama", "\u2581is", "\u2581said", "\u2581to", "\u2581drink", "\u2581water", "\u2581from", "\u2581the", "\u2581ocean", "\u2581and", "\u2581ur", "in", "ates", "\u2581as", 
"\u2581it", "\u2581ra", "ins", ".[", "6", "]", "\u2581According", "\u2581to", "\u2581A", "ym", "ara", "\u2581es", "chat", "ology", ",", "\u2581llam", "as", "\u2581will", "\u2581return", "\u2581to", "\u2581the", "\u2581water", "\u2581spr", "ings", "\u2581and", "\u2581l", "ago", "ons", "\u2581where", "\u2581they", "\u2581come", "\u2581from", "\u2581at", "\u2581the", "\u2581end", "\u2581of", "\u2581time", ".[", "6", "]"],
      ids: [1, 450, 11148, 3304, 20374, 30176, 29880, 30426, 30215, 29885, 30184, 29914, 29936, 29871, 243, 162, 169, 156, 15495, 728, 11504, 11173, 362, 29901, 518, 30176, 31743, 3304, 2314, 313, 29931, 3304, 3144, 3304, 29897, 338, 263, 21849, 630, 4275, 3082, 3949, 295, 333, 29892, 17644, 1304, 408, 263, 27654, 322, 4870, 13019, 491, 1126, 29872, 273, 4185, 1973, 1951, 278, 4721, 29899, 1625, 3774, 713, 3152, 29889, 365, 5288, 294, 526, 5264, 15006, 322, 5735, 411, 4045, 408, 263, 902, 29881, 29889, 11275, 281, 1507, 338, 4964, 322, 3743, 871, 263, 2319, 5253, 310, 10906, 22878, 7226, 29906, 29962, 365, 5288, 294, 508, 5110, 2560, 9595, 1156, 263, 2846, 21159, 2187, 29889, 1932, 773, 263, 4870, 29892, 896, 508, 8677, 1048, 29871, 29906, 29945, 304, 29871, 29941, 29900, 29995, 310, 1009, 3573, 7688, 363, 29871, 29947, 304, 29871, 29896, 29941, 2383, 313, 29945, 29994, 29947, 7800, 467, 29961, 29941, 29962, 450, 1024, 11148, 3304, 313, 262, 278, 4940, 884, 805, 14356, 376, 29880, 3304, 29908, 470, 376, 3820, 3304, 1159, 471, 16356, 491, 7824, 3604, 9306, 515, 7531, 25493, 1403, 550, 7226, 29946, 29962, 450, 19525, 943, 310, 11829, 294, 526, 2714, 304, 505, 3978, 630, 515, 278, 7027, 13494, 1144, 310, 4644, 6813, 1048, 29871, 29946, 29900, 7284, 2440, 8020, 29892, 322, 17602, 9725, 630, 304, 4275, 6813, 1048, 2211, 7284, 2440, 8020, 2645, 278, 7027, 3082, 4124, 3167, 29889, 2648, 278, 1095, 310, 278, 1833, 14890, 5046, 313, 29896, 29900, 29892, 29900, 29900, 29900, 29994, 29896, 29906, 29892, 29900, 29900, 29900, 2440, 8020, 511, 3949, 295, 4841, 892, 1294, 5562, 297, 4644, 6813, 7226, 29941, 29962, 1094, 310, 29871, 29906, 29900, 29900, 29955, 29892, 727, 892, 975, 9881, 7284, 11829, 294, 322, 394, 29886, 562, 294, 297, 4275, 6813, 322, 975, 29871, 29896, 29945, 29947, 29892, 29900, 29900, 29900, 11829, 294, 322, 29871, 29896, 29900, 29900, 29892, 29900, 29900, 29900, 237, 156, 141, 243, 162, 169, 156, 394, 29886, 562, 294, 29892, 5153, 2760, 515, 410, 1885, 
17259, 19673, 5683, 297, 278, 29871, 29906, 29900, 386, 6462, 29892, 297, 278, 3303, 3900, 322, 7400, 7226, 29945, 29962, 512, 319, 962, 2518, 22082, 3002, 29892, 11829, 294, 526, 4100, 367, 886, 29889, 450, 22977, 368, 365, 29880, 3304, 338, 1497, 304, 13748, 4094, 515, 278, 23474, 322, 5065, 262, 1078, 408, 372, 1153, 1144, 7226, 29953, 29962, 7579, 304, 319, 962, 2518, 831, 13496, 3002, 29892, 11829, 294, 674, 736, 304, 278, 4094, 7689, 886, 322, 301, 4425, 787, 988, 896, 2041, 515, 472, 278, 1095, 310, 931, 7226, 29953, 29962],
      decoded: '<s> The llama (/\u02c8l\u0251\u02d0m\u0259/; \ud83e\udd99Spanish pronunciation: [\u02c8\u028eama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5\u20138 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000\u201312,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000\ua64a\ud83e\udd99 alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]',
    },
  },
  "Xenova/llama3-tokenizer": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      tokens: ["How", "\u0120are", "\u0120you", "\u0120doing", "?"],
      ids: [4438, 527, 499, 3815, 30],
      decoded: "How are you doing?",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["You", "\u0120should", "'ve", "\u0120done", "\u0120this"],
      ids: [2675, 1288, 3077, 2884, 420],
      decoded: "You should've done this",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["012", "345", "678", "9", "\u0120", "0", "\u0120", "1", "\u0120", "2", "\u0120", "3", "\u0120", "4", "\u0120", "5", "\u0120", "6", "\u0120", "7", "\u0120", "8", "\u0120", "9", "\u0120", "10", "\u0120", "100", "\u0120", "100", "0"],
      ids: [11531, 12901, 17458, 24, 220, 15, 220, 16, 220, 17, 220, 18, 220, 19, 220, 20, 220, 21, 220, 22, 220, 23, 220, 24, 220, 605, 220, 1041, 220, 1041, 15],
      decoded: "0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u0120", "201", "6", "."],
      ids: [791, 2883, 574, 18538, 304, 220, 679, 21, 13],
      decoded: "The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["A", "\u010a", "'ll", "\u0120!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'t", "."],
      ids: [32, 198, 3358, 11261, 998, 20837, 67, 4708, 67, 315, 11, 649, 956, 13],
      decoded: "A\n'll!!to?'d''d of, can't.",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["def", "\u0120main", "():\u010a", "\u0109pass"],
      ids: [755, 1925, 4019, 42531],
      decoded: "def main():\n\tpass",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["let", "\u0120a", "\u0120=", "\u0120obj", ".toString", "();\u010a", "toString", "();"],
      ids: [1169, 264, 284, 2909, 5180, 545, 6712, 2178],
      decoded: "let a = obj.toString();\ntoString();",
    },
    NEWLINES: {
      text: LLAMA_TEST_STRINGS.NEWLINES,
      tokens: ["ax", "\u010a", "####\u010a", "boo"],
      ids: [710, 198, 71050, 34093],
      decoded: "ax\n####\nboo",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["UN", "want", "\u00c3\u00a9d", ",", "running"],
      ids: [1899, 53757, 15433, 11, 28272],
      decoded: "UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["1", "\u0100", "2", "\u00ef\u00bf\u00bd", "3"],
      ids: [16, 188, 17, 5809, 18],
      decoded: "1\u00002\ufffd3",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["Hello", "\u0120World"],
      ids: [9906, 4435],
      decoded: "Hello World",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      tokens: ["hello", "\u0120world"],
      ids: [15339, 1917],
      decoded: "hello world",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u00e7\u0136\u0141\u00e6\u00b4\u00bb", "\u00e7\u013c\u0126", "\u00e7\u013e\u0141", "\u00e8\u00b0", "\u013d", "\u00e6\u013a\u00af"],
      ids: [104654, 9554, 89151, 39013, 249, 21043],
      decoded: "\u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u0120\u0120", "\u0120leading", "\u0120space"],
      ids: [256, 6522, 3634],
      decoded: "   leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["tr", "ailing", "\u0120space", "\u0120\u0120\u0120"],
      ids: [376, 14612, 3634, 262],
      decoded: "trailing space   ",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["Hi", "\u0120", "\u0120Hello"],
      ids: [13347, 220, 22691],
      decoded: "Hi  Hello",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["test", "\u0120$", "1", "\u0120R", "2", "\u0120#", "3", "\u0120\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2\u00a5", "6", "\u0120\u00e2\u0124", "\u00a3", "7", "\u0120\u00e2\u0124\u00b9", "8", "\u0120\u00e2\u0124", "\u00b1", "9", "\u0120test"],
      ids: [1985, 400, 16, 432, 17, 674, 18, 13281, 19, 7083, 20, 72588, 21, 113384, 96, 22, 90891, 23, 113384, 109, 24, 1296],
      decoded: "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120$", "1", ".", "00", "\u0120at", "\u0120the", "\u0120store", "."],
      ids: [40, 11021, 459, 24149, 369, 400, 16, 13, 410, 520, 279, 3637, 13],
      decoded: "I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u0120\u0120"],
      ids: [9514, 1981, 256],
      decoded: "you\u2026  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [9514, 1981, 9421],
      decoded: "you\u2026\u00a0\u00a0",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142", "\u00c2\u0142", "you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [9514, 1981, 4194, 4194, 9514, 1981, 9421],
      decoded: "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["we", "ird", "\u0120\u00ef\u00bd\u0140", "\u0120edge", "\u0120\u00ef\u00bd\u0140", "\u0120case"],
      ids: [906, 2668, 111942, 6964, 111942, 1162],
      decoded: "weird \uff5e edge \uff5e case",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u00e2\u0138", "\u0123", "This", "\u0120\u00e2\u0138", "\u0123", "is", "\u0120\u00e2\u0138", "\u0123", "a", "\u0120\u00e2\u0138", "\u0123", "test", "\u0120\u00e2\u0138", "\u0123", "."],
      ids: [10634, 223, 2028, 14860, 223, 285, 14860, 223, 64, 14860, 223, 1985, 14860, 223, 13],
      decoded: "\u2581This \u2581is \u2581a \u2581test \u2581.",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u00f0\u0141\u013a", "\u0124", "\u0120\u00f0\u0141\u0133", "\u012f", "\u0120\u00f0\u0141", "\u00a4", "\u00a3", "\u0120\u00f0\u0141\u013a", "\u012f", "\u0120\u00f0\u0141\u013a", "\u0143", "\u0120\u00f0\u0141", "\u0130", "\u012b", "\u0120\u00f0\u0141", "\u013b", "\u0131", "\u0120\u00f0\u0141\u013a", "\u012c", "\u0120\u00f0\u0141\u0136", "\u00a5", "\u0120\u00f0\u0141\u013a", "\u0123", "\u0120\u00f0\u0141\u013a", "\u0127", "\u0120\u00f0\u0141", "\u00a4", "\u0139", "\u0120\u00f0\u0141\u013a", "\u0128", "\u0120\u00f0\u0141\u0133", "\u0131", "\u0120\u00e2\u013f\u00a4", "\u00ef\u00b8\u0131", "\u0120\u00f0\u0141\u0134", "\u013e", "\u0120\u00f0\u0141\u0134", "\u013c", "\u0120\u00f0\u0141\u0134", "\u0139", "\u0120\u00f0\u0141\u0134", "\u013b", "\u0120\u00f0\u0141", "\u0138", "\u00a4", "\u0120\u00f0\u0141\u013a", "\u0130", "\u0120\u00f0\u0141\u0133", "\u012e", "\u0120\u00f0\u0141", "\u00a5", "\u00b3", "\u0120\u00f0\u0141\u0134", "\u00aa", "\u0120\u00e2\u013e", "\u00a8", "\u0120\u00f0\u0141\u0133", "\u012b", "\u0120\u00f0\u0141\u0133", "\u0122", "\u0120\u00f0\u0141\u0134", "\u00af", "\u0120\u00f0\u0141", "\u0130", "\u012a", "\u0120\u00f0\u0141", "\u013b", "\u012a", "\u0120\u00f0\u0141", "\u013b", "\u012e", "\u0120\u00f0\u0141\u0134", "\u0122", "\u0120\u00f0\u0141\u0133", "\u0129", "\u0120\u00f0\u0141\u0133", "\u012d", "\u0120\u00e2\u013e", "\u0127", "\u0120\u00f0\u0141", "\u0130", "\u0123", "\u0120\u00f0\u0141", "\u012e", "\u0140", "\u0120\u00f0\u0141", "\u012e", "\u00b8", "\u0120\u00f0\u0141\u0134", "\u00b0"],
      ids: [76460, 224, 62904, 235, 11410, 97, 96, 27623, 235, 27623, 255, 11410, 236, 231, 11410, 247, 237, 27623, 232, 96169, 98, 27623, 223, 27623, 227, 11410, 97, 245, 27623, 228, 62904, 237, 71570, 31643, 64139, 250, 64139, 248, 64139, 245, 64139, 247, 11410, 244, 97, 27623, 236, 62904, 234, 11410, 98, 111, 64139, 103, 26602, 101, 62904, 231, 62904, 222, 64139, 107, 11410, 236, 230, 11410, 247, 230, 11410, 247, 234, 64139, 222, 62904, 229, 62904, 233, 26602, 227, 11410, 236, 223, 11410, 234, 252, 11410, 234, 116, 64139, 108],
      decoded: "\ud83d\ude02 \ud83d\udc4d \ud83e\udd23 \ud83d\ude0d \ud83d\ude2d \ud83c\udf89 \ud83d\ude4f \ud83d\ude0a \ud83d\udd25 \ud83d\ude01 \ud83d\ude05 \ud83e\udd17 \ud83d\ude06 \ud83d\udc4f \u2764\ufe0f \ud83d\udc9c \ud83d\udc9a \ud83d\udc97 \ud83d\udc99 \ud83d\udda4 \ud83d\ude0e \ud83d\udc4c \ud83e\udd73 \ud83d\udcaa \u2728 \ud83d\udc49 \ud83d\udc40 \ud83d\udcaf \ud83c\udf88 \ud83d\ude48 \ud83d\ude4c \ud83d\udc80 \ud83d\udc47 \ud83d\udc4b \u2705 \ud83c\udf81 \ud83c\udf1e \ud83c\udf38 \ud83d\udcb0",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u00e2\u013e", "\u00a8", "\u0120\u00f0\u0141", "\u00a4", "\u0139", "\u0120\u00f0\u0141\u0133", "\u0123", "\u00ef\u00b8\u0131", "\u0120\u00f0\u0141\u0133", "\u00b1", "\u00f0\u0141", "\u0131", "\u00bb", "\u0120\u00f0\u0141", "\u0137", "\u00b5", "\u00e2\u0122\u012f", "\u00e2\u013b", "\u0124", "\u00ef\u00b8\u0131", "\u0120\u00f0\u0141", "\u00a7", "\u013b", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00e2\u013b", "\u0124", "\u0120\u00f0\u0141\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u012e", "\u00be", "\u0120\u00f0\u0141", "\u00a7", "\u0133", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a4", "\u013f", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a7", "\u0133", "\u0120\u00f0\u0141\u0133", "\u00a9", "\u00e2\u0122\u012f", "\u00e2\u013f\u00a4", "\u00e2\u0122\u012f", "\u00f0\u0141\u0134", "\u012d", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a8", "\u0120\u00f0\u0141\u0133", "\u00a9", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a9", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a7", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a6", "\u0120\u00f0\u0141", "\u00a7", "\u0133", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a4", "\u013f", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a7", "\u0133", "\u00f0\u0141", "\u0131", "\u00bb", "\u0120\u00f0\u0141", "\u0131", "\u00b4", "\u00f3", "\u0142\u0123", "\u00a7", "\u00f3", "\u0142\u0123", "\u00a2", "\u00f3", "\u0142\u0123", "\u00a5", "\u00f3", "\u0142\u0123", "\u00ae", "\u00f3", "\u0142\u0123", "\u00a7", "\u00f3", "\u0142\u0123", "\u00bf", "\u0120\u00f0\u0141\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00e2\u013f\u00a4", "\u00ef\u00b8\u0131", "\u00e2\u0122\u012f", "\u00f0\u0141\u0134", "\u012d", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bc"],
      ids: [38798, 101, 11410, 97, 245, 62904, 223, 31643, 62904, 109, 9468, 237, 119, 11410, 243, 113, 102470, 17245, 224, 31643, 11410, 100, 247, 9468, 237, 119, 102470, 17245, 224, 62904, 101, 9468, 237, 119, 102470, 9468, 234, 122, 11410, 100, 239, 102470, 9468, 97, 251, 102470, 9468, 100, 239, 62904, 102, 102470, 121643, 102470, 93273, 233, 102470, 9468, 239, 101, 62904, 102, 102470, 9468, 239, 102, 102470, 9468, 239, 100, 102470, 9468, 239, 99, 11410, 100, 239, 9468, 237, 119, 102470, 9468, 97, 251, 102470, 9468, 100, 239, 9468, 237, 119, 11410, 237, 112, 175, 16050, 100, 175, 16050, 95, 175, 16050, 98, 175, 16050, 106, 175, 16050, 100, 175, 16050, 123, 62904, 101, 9468, 237, 119, 102470, 121643, 31643, 102470, 93273, 233, 102470, 9468, 239, 101, 9468, 237, 120],
      decoded: "\u2728 \ud83e\udd17 \ud83d\udc41\ufe0f \ud83d\udc71\ud83c\udffb \ud83d\udd75\u200d\u2642\ufe0f \ud83e\uddd9\ud83c\udffb\u200d\u2642 \ud83d\udc68\ud83c\udffb\u200d\ud83c\udf3e \ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1 \ud83d\udc69\u200d\u2764\u200d\ud83d\udc8b\u200d\ud83d\udc68 \ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc66 \ud83e\uddd1\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c\udffb \ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f \ud83d\udc68\ud83c\udffb\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d\udc68\ud83c\udffc",
    },
    BPE_SCORES_PRIORITY_1: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_1,
      tokens: ["grab", "bed"],
      ids: [59312, 2788],
      decoded: "grabbed",
    },
    BPE_SCORES_PRIORITY_2: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_2,
      tokens: ["\u0120grabbed"],
      ids: [30418],
      decoded: " grabbed",
    },
    BPE_SCORES_PRIORITY_3: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_3,
      tokens: ["\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120", "\u0120grabbed"],
      ids: [1881, 30418],
      decoded: "           grabbed",
    },
    NEWLINE: {
      text: LLAMA_TEST_STRINGS.NEWLINE,
      tokens: ["\u010a"],
      ids: [198],
      decoded: "\n",
    },
    NEWLINE_WITH_LEADING_SPACE: {
      text: LLAMA_TEST_STRINGS.NEWLINE_WITH_LEADING_SPACE,
      tokens: ["\u0120\u010a"],
      ids: [720],
      decoded: " \n",
    },
    TABS: {
      text: LLAMA_TEST_STRINGS.TABS,
      tokens: ["\u0109t", "abs", "\u0109\u0109\u0109", "\u0109out", "\u0120here"],
      ids: [3324, 3518, 573, 14294, 1618],
      decoded: "\ttabs\t\t\t\tout here",
    },
    NEWLINE_AND_TAB: {
      text: LLAMA_TEST_STRINGS.NEWLINE_AND_TAB,
      tokens: ["\u010a\u0109\u010a"],
      ids: [18108],
      decoded: "\n\t\n",
    },
    CHINESE_LETTER: {
      text: LLAMA_TEST_STRINGS.CHINESE_LETTER,
      tokens: ["\u00e9\u0137\u0129"],
      ids: [104643],
      decoded: "\u9547",
    },
    EMOJIS_1: {
      text: LLAMA_TEST_STRINGS.EMOJIS_1,
      tokens: ["\u00f0\u0141", "\u00a6", "\u013b"],
      ids: [9468, 99, 247],
      decoded: "\ud83e\udd99",
    },
    EMOJIS_2: {
      text: LLAMA_TEST_STRINGS.EMOJIS_2,
      tokens: ["\u00f0\u0141", "\u00a6", "\u013b", "\u00ea", "\u013b", "\u012c"],
      ids: [9468, 99, 247, 166, 247, 232],
      decoded: "\ud83e\udd99\ua64a",
    },
    EMOJIS_3: {
      text: LLAMA_TEST_STRINGS.EMOJIS_3,
      tokens: ["\u00ea", "\u013b", "\u012c", "\u00f0\u0141", "\u00a6", "\u013b"],
      ids: [166, 247, 232, 9468, 99, 247],
      decoded: "\ua64a\ud83e\udd99",
    },
    PARAGRAPH: {
      text: LLAMA_TEST_STRINGS.PARAGRAPH,
      tokens: ["The", "\u0120llama", "\u0120(/", "\u00cb", "\u012a", "l", "\u00c9", "\u0133", "\u00cb", "\u0132", "m", "\u00c9\u013b", "/", ";", "\u0120\u00f0\u0141", "\u00a6", "\u013b", "Spanish", "\u0120pronunciation", ":", "\u0120[", "\u00cb", "\u012a", "\u00ca", "\u0130", "ama", "])", "\u0120(", "L", "ama", "\u0120gl", "ama", ")", "\u0120is", "\u0120a", "\u0120domestic", "ated", "\u0120South", "\u0120American", "\u0120camel", "id", ",", "\u0120widely", "\u0120used", "\u0120as", "\u0120a", "\u0120meat", "\u0120and", "\u0120pack", "\u0120animal", "\u0120by", "\u0120And", "ean", "\u0120cultures", "\u0120since", "\u0120the", "\u0120Pre", "-C", "olum", "bian", "\u0120era", ".", "\u0120L", "lam", "as", "\u0120are", "\u0120social", "\u0120animals", "\u0120and", "\u0120live", "\u0120with", "\u0120others", "\u0120as", "\u0120a", "\u0120herd", ".", "\u0120Their", "\u0120wool", "\u0120is", "\u0120soft", "\u0120and", "\u0120contains", "\u0120only", "\u0120a", "\u0120small", "\u0120amount", "\u0120of", "\u0120lan", "olin", ".[", "2", "]", "\u0120L", "lam", "as", "\u0120can", "\u0120learn", "\u0120simple", "\u0120tasks", "\u0120after", "\u0120a", "\u0120few", "\u0120repetitions", ".", "\u0120When", "\u0120using", "\u0120a", "\u0120pack", ",", "\u0120they", "\u0120can", "\u0120carry", "\u0120about", "\u0120", "25", "\u0120to", "\u0120", "30", "%", "\u0120of", "\u0120their", "\u0120body", "\u0120weight", "\u0120for", "\u0120", "8", "\u0120to", "\u0120", "13", "\u0120km", "\u0120(", "5", "\u00e2\u0122\u0135", "8", "\u0120miles", ").[", "3", "]", "\u0120The", "\u0120name", "\u0120llama", "\u0120(", "in", "\u0120the", "\u0120past", "\u0120also", "\u0120spelled", '\u0120"', "lama", '"', "\u0120or", '\u0120"', "gl", "ama", '")', "\u0120was", "\u0120adopted", "\u0120by", "\u0120European", "\u0120settlers", "\u0120from", "\u0120native", "\u0120Per", "uv", "ians", ".[", "4", "]", "\u0120The", "\u0120ancestors", "\u0120of", "\u0120ll", "amas", "\u0120are", "\u0120thought", "\u0120to", 
"\u0120have", "\u0120originated", "\u0120from", "\u0120the", "\u0120Great", "\u0120Plains", "\u0120of", "\u0120North", "\u0120America", "\u0120about", "\u0120", "40", "\u0120million", "\u0120years", "\u0120ago", ",", "\u0120and", "\u0120subsequently", "\u0120migrated", "\u0120to", "\u0120South", "\u0120America", "\u0120about", "\u0120three", "\u0120million", "\u0120years", "\u0120ago", "\u0120during", "\u0120the", "\u0120Great", "\u0120American", "\u0120Inter", "change", ".", "\u0120By", "\u0120the", "\u0120end", "\u0120of", "\u0120the", "\u0120last", "\u0120ice", "\u0120age", "\u0120(", "10", ",", "000", "\u00e2\u0122\u0135", "12", ",", "000", "\u0120years", "\u0120ago", "),", "\u0120camel", "ids", "\u0120were", "\u0120extinct", "\u0120in", "\u0120North", "\u0120America", ".[", "3", "]", "\u0120As", "\u0120of", "\u0120", "200", "7", ",", "\u0120there", "\u0120were", "\u0120over", "\u0120seven", "\u0120million", "\u0120ll", "amas", "\u0120and", "\u0120al", "pac", "as", "\u0120in", "\u0120South", "\u0120America", "\u0120and", "\u0120over", "\u0120", "158", ",", "000", "\u0120ll", "amas", "\u0120and", "\u0120", "100", ",", "000", "\u00ea", "\u013b", "\u012c", "\u00f0\u0141", "\u00a6", "\u013b", "\u0120al", "pac", "as", ",", "\u0120descended", "\u0120from", "\u0120progen", "itors", "\u0120imported", "\u0120late", "\u0120in", "\u0120the", "\u0120", "20", "th", "\u0120century", ",", "\u0120in", "\u0120the", "\u0120United", "\u0120States", "\u0120and", "\u0120Canada", ".[", "5", "]", "\u0120In", "\u0120A", "ym", "ara", "\u0120mythology", ",", "\u0120ll", "amas", "\u0120are", "\u0120important", "\u0120beings", ".", "\u0120The", "\u0120Heavenly", "\u0120L", "lama", "\u0120is", "\u0120said", "\u0120to", "\u0120drink", "\u0120water", "\u0120from", "\u0120the", "\u0120ocean", "\u0120and", "\u0120ur", "in", "ates", "\u0120as", "\u0120it", "\u0120rains", ".[", "6", "]", "\u0120According", "\u0120to", "\u0120A", "ym", "ara", "\u0120es", "chat", "ology", ",", "\u0120ll", "amas", 
"\u0120will", "\u0120return", "\u0120to", "\u0120the", "\u0120water", "\u0120springs", "\u0120and", "\u0120l", "ago", "ons", "\u0120where", "\u0120they", "\u0120come", "\u0120from", "\u0120at", "\u0120the", "\u0120end", "\u0120of", "\u0120time", ".[", "6", "]"],
      ids: [791, 94776, 47325, 135, 230, 75, 133, 239, 135, 238, 76, 99638, 14, 26, 11410, 99, 247, 62897, 71722, 25, 510, 135, 230, 134, 236, 3105, 2526, 320, 43, 3105, 2840, 3105, 8, 374, 264, 13018, 660, 4987, 3778, 50252, 307, 11, 13882, 1511, 439, 264, 13339, 323, 3854, 10065, 555, 1628, 5420, 27833, 2533, 279, 5075, 7813, 1152, 13464, 11639, 13, 445, 24705, 300, 527, 3674, 10099, 323, 3974, 449, 3885, 439, 264, 59213, 13, 11205, 39640, 374, 8579, 323, 5727, 1193, 264, 2678, 3392, 315, 31791, 37737, 8032, 17, 60, 445, 24705, 300, 649, 4048, 4382, 9256, 1306, 264, 2478, 86066, 13, 3277, 1701, 264, 3854, 11, 814, 649, 6920, 922, 220, 914, 311, 220, 966, 4, 315, 872, 2547, 4785, 369, 220, 23, 311, 220, 1032, 13437, 320, 20, 4235, 23, 8931, 94638, 18, 60, 578, 836, 94776, 320, 258, 279, 3347, 1101, 68918, 330, 81101, 1, 477, 330, 6200, 3105, 909, 574, 18306, 555, 7665, 61107, 505, 10068, 3700, 12328, 5493, 8032, 19, 60, 578, 38618, 315, 9507, 29189, 527, 3463, 311, 617, 44853, 505, 279, 8681, 63911, 315, 4892, 5270, 922, 220, 1272, 3610, 1667, 4227, 11, 323, 28520, 73691, 311, 4987, 5270, 922, 2380, 3610, 1667, 4227, 2391, 279, 8681, 3778, 5783, 3455, 13, 3296, 279, 842, 315, 279, 1566, 10054, 4325, 320, 605, 11, 931, 4235, 717, 11, 931, 1667, 4227, 705, 50252, 3447, 1051, 69918, 304, 4892, 5270, 8032, 18, 60, 1666, 315, 220, 1049, 22, 11, 1070, 1051, 927, 8254, 3610, 9507, 29189, 323, 453, 46051, 300, 304, 4987, 5270, 323, 927, 220, 11286, 11, 931, 9507, 29189, 323, 220, 1041, 11, 931, 166, 247, 232, 9468, 99, 247, 453, 46051, 300, 11, 58842, 505, 84360, 12170, 25973, 3389, 304, 279, 220, 508, 339, 9478, 11, 304, 279, 3723, 4273, 323, 7008, 8032, 20, 60, 763, 362, 1631, 5169, 59492, 11, 9507, 29189, 527, 3062, 23837, 13, 578, 88150, 445, 81101, 374, 1071, 311, 7172, 3090, 505, 279, 18435, 323, 4433, 258, 988, 439, 433, 62555, 8032, 21, 60, 10771, 311, 362, 1631, 5169, 1560, 9884, 2508, 11, 9507, 29189, 690, 471, 311, 279, 3090, 42242, 323, 326, 6438, 2439, 1405, 
814, 2586, 505, 520, 279, 842, 315, 892, 8032, 21, 60],
      decoded: 'The llama (/\u02c8l\u0251\u02d0m\u0259/; \ud83e\udd99Spanish pronunciation: [\u02c8\u028eama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5\u20138 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000\u201312,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000\ua64a\ud83e\udd99 alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]',
    },
  },

  // - Sequence PostProcessor
  // - "ignore_merges": true
  "Xenova/llama3-tokenizer-new": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      tokens: ["How", "\u0120are", "\u0120you", "\u0120doing", "?"],
      ids: [128000, 4438, 527, 499, 3815, 30],
      decoded: "<|begin_of_text|>How are you doing?",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["You", "\u0120should", "'ve", "\u0120done", "\u0120this"],
      ids: [128000, 2675, 1288, 3077, 2884, 420],
      decoded: "<|begin_of_text|>You should've done this",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["012", "345", "678", "9", "\u0120", "0", "\u0120", "1", "\u0120", "2", "\u0120", "3", "\u0120", "4", "\u0120", "5", "\u0120", "6", "\u0120", "7", "\u0120", "8", "\u0120", "9", "\u0120", "10", "\u0120", "100", "\u0120", "100", "0"],
      ids: [128000, 11531, 12901, 17458, 24, 220, 15, 220, 16, 220, 17, 220, 18, 220, 19, 220, 20, 220, 21, 220, 22, 220, 23, 220, 24, 220, 605, 220, 1041, 220, 1041, 15],
      decoded: "<|begin_of_text|>0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u0120", "201", "6", "."],
      ids: [128000, 791, 2883, 574, 18538, 304, 220, 679, 21, 13],
      decoded: "<|begin_of_text|>The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["A", "\u010a", "'ll", "\u0120!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'t", "."],
      ids: [128000, 32, 198, 3358, 11261, 998, 20837, 67, 4708, 67, 315, 11, 649, 956, 13],
      decoded: "<|begin_of_text|>A\n'll!!to?'d''d of, can't.",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["def", "\u0120main", "():\u010a", "\u0109pass"],
      ids: [128000, 755, 1925, 4019, 42531],
      decoded: "<|begin_of_text|>def main():\n\tpass",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["let", "\u0120a", "\u0120=", "\u0120obj", ".toString", "();\u010a", "toString", "();"],
      ids: [128000, 1169, 264, 284, 2909, 5180, 545, 6712, 2178],
      decoded: "<|begin_of_text|>let a = obj.toString();\ntoString();",
    },
    NEWLINES: {
      text: LLAMA_TEST_STRINGS.NEWLINES,
      tokens: ["ax", "\u010a", "####\u010a", "boo"],
      ids: [128000, 710, 198, 71050, 34093],
      decoded: "<|begin_of_text|>ax\n####\nboo",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["UN", "want", "\u00c3\u00a9d", ",", "running"],
      ids: [128000, 1899, 53757, 15433, 11, 28272],
      decoded: "<|begin_of_text|>UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["1", "\u0100", "2", "\u00ef\u00bf\u00bd", "3"],
      ids: [128000, 16, 188, 17, 5809, 18],
      decoded: "<|begin_of_text|>1\u00002\ufffd3",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["Hello", "\u0120World"],
      ids: [128000, 9906, 4435],
      decoded: "<|begin_of_text|>Hello World",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      tokens: ["hello", "\u0120world"],
      ids: [128000, 15339, 1917],
      decoded: "<|begin_of_text|>hello world",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u00e7\u0136\u0141\u00e6\u00b4\u00bb", "\u00e7\u013c\u0126", "\u00e7\u013e\u0141", "\u00e8\u00b0", "\u013d", "\u00e6\u013a\u00af"],
      ids: [128000, 104654, 9554, 89151, 39013, 249, 21043],
      decoded: "<|begin_of_text|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u0120\u0120", "\u0120leading", "\u0120space"],
      ids: [128000, 256, 6522, 3634],
      decoded: "<|begin_of_text|>   leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["tr", "ailing", "\u0120space", "\u0120\u0120\u0120"],
      ids: [128000, 376, 14612, 3634, 262],
      decoded: "<|begin_of_text|>trailing space   ",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["Hi", "\u0120", "\u0120Hello"],
      ids: [128000, 13347, 220, 22691],
      decoded: "<|begin_of_text|>Hi  Hello",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["test", "\u0120$", "1", "\u0120R", "2", "\u0120#", "3", "\u0120\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2\u00a5", "6", "\u0120\u00e2\u0124", "\u00a3", "7", "\u0120\u00e2\u0124\u00b9", "8", "\u0120\u00e2\u0124", "\u00b1", "9", "\u0120test"],
      ids: [128000, 1985, 400, 16, 432, 17, 674, 18, 13281, 19, 7083, 20, 72588, 21, 113384, 96, 22, 90891, 23, 113384, 109, 24, 1296],
      decoded: "<|begin_of_text|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120$", "1", ".", "00", "\u0120at", "\u0120the", "\u0120store", "."],
      ids: [128000, 40, 11021, 459, 24149, 369, 400, 16, 13, 410, 520, 279, 3637, 13],
      decoded: "<|begin_of_text|>I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u0120\u0120"],
      ids: [128000, 9514, 1981, 256],
      decoded: "<|begin_of_text|>you\u2026  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [128000, 9514, 1981, 9421],
      decoded: "<|begin_of_text|>you\u2026\u00a0\u00a0",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142", "\u00c2\u0142", "you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [128000, 9514, 1981, 4194, 4194, 9514, 1981, 9421],
      decoded: "<|begin_of_text|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["we", "ird", "\u0120\u00ef\u00bd\u0140", "\u0120edge", "\u0120\u00ef\u00bd\u0140", "\u0120case"],
      ids: [128000, 906, 2668, 111942, 6964, 111942, 1162],
      decoded: "<|begin_of_text|>weird \uff5e edge \uff5e case",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u00e2\u0138", "\u0123", "This", "\u0120\u00e2\u0138", "\u0123", "is", "\u0120\u00e2\u0138", "\u0123", "a", "\u0120\u00e2\u0138", "\u0123", "test", "\u0120\u00e2\u0138", "\u0123", "."],
      ids: [128000, 10634, 223, 2028, 14860, 223, 285, 14860, 223, 64, 14860, 223, 1985, 14860, 223, 13],
      decoded: "<|begin_of_text|>\u2581This \u2581is \u2581a \u2581test \u2581.",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u00f0\u0141\u013a", "\u0124", "\u0120\u00f0\u0141\u0133", "\u012f", "\u0120\u00f0\u0141", "\u00a4", "\u00a3", "\u0120\u00f0\u0141\u013a", "\u012f", "\u0120\u00f0\u0141\u013a", "\u0143", "\u0120\u00f0\u0141", "\u0130", "\u012b", "\u0120\u00f0\u0141", "\u013b", "\u0131", "\u0120\u00f0\u0141\u013a", "\u012c", "\u0120\u00f0\u0141\u0136", "\u00a5", "\u0120\u00f0\u0141\u013a", "\u0123", "\u0120\u00f0\u0141\u013a", "\u0127", "\u0120\u00f0\u0141", "\u00a4", "\u0139", "\u0120\u00f0\u0141\u013a", "\u0128", "\u0120\u00f0\u0141\u0133", "\u0131", "\u0120\u00e2\u013f\u00a4", "\u00ef\u00b8\u0131", "\u0120\u00f0\u0141\u0134", "\u013e", "\u0120\u00f0\u0141\u0134", "\u013c", "\u0120\u00f0\u0141\u0134", "\u0139", "\u0120\u00f0\u0141\u0134", "\u013b", "\u0120\u00f0\u0141", "\u0138", "\u00a4", "\u0120\u00f0\u0141\u013a", "\u0130", "\u0120\u00f0\u0141\u0133", "\u012e", "\u0120\u00f0\u0141", "\u00a5", "\u00b3", "\u0120\u00f0\u0141\u0134", "\u00aa", "\u0120\u00e2\u013e", "\u00a8", "\u0120\u00f0\u0141\u0133", "\u012b", "\u0120\u00f0\u0141\u0133", "\u0122", "\u0120\u00f0\u0141\u0134", "\u00af", "\u0120\u00f0\u0141", "\u0130", "\u012a", "\u0120\u00f0\u0141", "\u013b", "\u012a", "\u0120\u00f0\u0141", "\u013b", "\u012e", "\u0120\u00f0\u0141\u0134", "\u0122", "\u0120\u00f0\u0141\u0133", "\u0129", "\u0120\u00f0\u0141\u0133", "\u012d", "\u0120\u00e2\u013e", "\u0127", "\u0120\u00f0\u0141", "\u0130", "\u0123", "\u0120\u00f0\u0141", "\u012e", "\u0140", "\u0120\u00f0\u0141", "\u012e", "\u00b8", "\u0120\u00f0\u0141\u0134", "\u00b0"],
      ids: [128000, 76460, 224, 62904, 235, 11410, 97, 96, 27623, 235, 27623, 255, 11410, 236, 231, 11410, 247, 237, 27623, 232, 96169, 98, 27623, 223, 27623, 227, 11410, 97, 245, 27623, 228, 62904, 237, 71570, 31643, 64139, 250, 64139, 248, 64139, 245, 64139, 247, 11410, 244, 97, 27623, 236, 62904, 234, 11410, 98, 111, 64139, 103, 26602, 101, 62904, 231, 62904, 222, 64139, 107, 11410, 236, 230, 11410, 247, 230, 11410, 247, 234, 64139, 222, 62904, 229, 62904, 233, 26602, 227, 11410, 236, 223, 11410, 234, 252, 11410, 234, 116, 64139, 108],
      decoded: "<|begin_of_text|>\ud83d\ude02 \ud83d\udc4d \ud83e\udd23 \ud83d\ude0d \ud83d\ude2d \ud83c\udf89 \ud83d\ude4f \ud83d\ude0a \ud83d\udd25 \ud83d\ude01 \ud83d\ude05 \ud83e\udd17 \ud83d\ude06 \ud83d\udc4f \u2764\ufe0f \ud83d\udc9c \ud83d\udc9a \ud83d\udc97 \ud83d\udc99 \ud83d\udda4 \ud83d\ude0e \ud83d\udc4c \ud83e\udd73 \ud83d\udcaa \u2728 \ud83d\udc49 \ud83d\udc40 \ud83d\udcaf \ud83c\udf88 \ud83d\ude48 \ud83d\ude4c \ud83d\udc80 \ud83d\udc47 \ud83d\udc4b \u2705 \ud83c\udf81 \ud83c\udf1e \ud83c\udf38 \ud83d\udcb0",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u00e2\u013e", "\u00a8", "\u0120\u00f0\u0141", "\u00a4", "\u0139", "\u0120\u00f0\u0141\u0133", "\u0123", "\u00ef\u00b8\u0131", "\u0120\u00f0\u0141\u0133", "\u00b1", "\u00f0\u0141", "\u0131", "\u00bb", "\u0120\u00f0\u0141", "\u0137", "\u00b5", "\u00e2\u0122\u012f", "\u00e2\u013b", "\u0124", "\u00ef\u00b8\u0131", "\u0120\u00f0\u0141", "\u00a7", "\u013b", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00e2\u013b", "\u0124", "\u0120\u00f0\u0141\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u012e", "\u00be", "\u0120\u00f0\u0141", "\u00a7", "\u0133", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a4", "\u013f", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a7", "\u0133", "\u0120\u00f0\u0141\u0133", "\u00a9", "\u00e2\u0122\u012f", "\u00e2\u013f\u00a4", "\u00e2\u0122\u012f", "\u00f0\u0141\u0134", "\u012d", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a8", "\u0120\u00f0\u0141\u0133", "\u00a9", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a9", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a7", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a6", "\u0120\u00f0\u0141", "\u00a7", "\u0133", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a4", "\u013f", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u00a7", "\u0133", "\u00f0\u0141", "\u0131", "\u00bb", "\u0120\u00f0\u0141", "\u0131", "\u00b4", "\u00f3", "\u0142\u0123", "\u00a7", "\u00f3", "\u0142\u0123", "\u00a2", "\u00f3", "\u0142\u0123", "\u00a5", "\u00f3", "\u0142\u0123", "\u00ae", "\u00f3", "\u0142\u0123", "\u00a7", "\u00f3", "\u0142\u0123", "\u00bf", "\u0120\u00f0\u0141\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122\u012f", "\u00e2\u013f\u00a4", "\u00ef\u00b8\u0131", "\u00e2\u0122\u012f", "\u00f0\u0141\u0134", "\u012d", "\u00e2\u0122\u012f", "\u00f0\u0141", "\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bc"],
      ids: [128000, 38798, 101, 11410, 97, 245, 62904, 223, 31643, 62904, 109, 9468, 237, 119, 11410, 243, 113, 102470, 17245, 224, 31643, 11410, 100, 247, 9468, 237, 119, 102470, 17245, 224, 62904, 101, 9468, 237, 119, 102470, 9468, 234, 122, 11410, 100, 239, 102470, 9468, 97, 251, 102470, 9468, 100, 239, 62904, 102, 102470, 121643, 102470, 93273, 233, 102470, 9468, 239, 101, 62904, 102, 102470, 9468, 239, 102, 102470, 9468, 239, 100, 102470, 9468, 239, 99, 11410, 100, 239, 9468, 237, 119, 102470, 9468, 97, 251, 102470, 9468, 100, 239, 9468, 237, 119, 11410, 237, 112, 175, 16050, 100, 175, 16050, 95, 175, 16050, 98, 175, 16050, 106, 175, 16050, 100, 175, 16050, 123, 62904, 101, 9468, 237, 119, 102470, 121643, 31643, 102470, 93273, 233, 102470, 9468, 239, 101, 9468, 237, 120],
      decoded: "<|begin_of_text|>\u2728 \ud83e\udd17 \ud83d\udc41\ufe0f \ud83d\udc71\ud83c\udffb \ud83d\udd75\u200d\u2642\ufe0f \ud83e\uddd9\ud83c\udffb\u200d\u2642 \ud83d\udc68\ud83c\udffb\u200d\ud83c\udf3e \ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1 \ud83d\udc69\u200d\u2764\u200d\ud83d\udc8b\u200d\ud83d\udc68 \ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc66 \ud83e\uddd1\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c\udffb \ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f \ud83d\udc68\ud83c\udffb\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d\udc68\ud83c\udffc",
    },
    BPE_SCORES_PRIORITY_1: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_1,
      tokens: ["grab", "bed"],
      ids: [128000, 59312, 2788],
      decoded: "<|begin_of_text|>grabbed",
    },
    BPE_SCORES_PRIORITY_2: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_2,
      tokens: ["\u0120grabbed"],
      ids: [128000, 30418],
      decoded: "<|begin_of_text|> grabbed",
    },
    BPE_SCORES_PRIORITY_3: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_3,
      tokens: ["\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120", "\u0120grabbed"],
      ids: [128000, 1881, 30418],
      decoded: "<|begin_of_text|>           grabbed",
    },
    NEWLINE: {
      text: LLAMA_TEST_STRINGS.NEWLINE,
      tokens: ["\u010a"],
      ids: [128000, 198],
      decoded: "<|begin_of_text|>\n",
    },
    NEWLINE_WITH_LEADING_SPACE: {
      text: LLAMA_TEST_STRINGS.NEWLINE_WITH_LEADING_SPACE,
      tokens: ["\u0120\u010a"],
      ids: [128000, 720],
      decoded: "<|begin_of_text|> \n",
    },
    TABS: {
      text: LLAMA_TEST_STRINGS.TABS,
      tokens: ["\u0109t", "abs", "\u0109\u0109\u0109", "\u0109out", "\u0120here"],
      ids: [128000, 3324, 3518, 573, 14294, 1618],
      decoded: "<|begin_of_text|>\ttabs\t\t\t\tout here",
    },
    NEWLINE_AND_TAB: {
      text: LLAMA_TEST_STRINGS.NEWLINE_AND_TAB,
      tokens: ["\u010a\u0109\u010a"],
      ids: [128000, 18108],
      decoded: "<|begin_of_text|>\n\t\n",
    },
    CHINESE_LETTER: {
      text: LLAMA_TEST_STRINGS.CHINESE_LETTER,
      tokens: ["\u00e9\u0137\u0129"],
      ids: [128000, 104643],
      decoded: "<|begin_of_text|>\u9547",
    },
    EMOJIS_1: {
      text: LLAMA_TEST_STRINGS.EMOJIS_1,
      tokens: ["\u00f0\u0141", "\u00a6", "\u013b"],
      ids: [128000, 9468, 99, 247],
      decoded: "<|begin_of_text|>\ud83e\udd99",
    },
    EMOJIS_2: {
      text: LLAMA_TEST_STRINGS.EMOJIS_2,
      tokens: ["\u00f0\u0141", "\u00a6", "\u013b", "\u00ea", "\u013b", "\u012c"],
      ids: [128000, 9468, 99, 247, 166, 247, 232],
      decoded: "<|begin_of_text|>\ud83e\udd99\ua64a",
    },
    EMOJIS_3: {
      text: LLAMA_TEST_STRINGS.EMOJIS_3,
      tokens: ["\u00ea", "\u013b", "\u012c", "\u00f0\u0141", "\u00a6", "\u013b"],
      ids: [128000, 166, 247, 232, 9468, 99, 247],
      decoded: "<|begin_of_text|>\ua64a\ud83e\udd99",
    },
    PARAGRAPH: {
      text: LLAMA_TEST_STRINGS.PARAGRAPH,
      tokens: ["The", "\u0120llama", "\u0120(/", "\u00cb", "\u012a", "l", "\u00c9", "\u0133", "\u00cb", "\u0132", "m", "\u00c9\u013b", "/", ";", "\u0120\u00f0\u0141", "\u00a6", "\u013b", "Spanish", "\u0120pronunciation", ":", "\u0120[", "\u00cb", "\u012a", "\u00ca", "\u0130", "ama", "])", "\u0120(", "L", "ama", "\u0120gl", "ama", ")", "\u0120is", "\u0120a", "\u0120domestic", "ated", "\u0120South", "\u0120American", "\u0120camel", "id", ",", "\u0120widely", "\u0120used", "\u0120as", "\u0120a", "\u0120meat", "\u0120and", "\u0120pack", "\u0120animal", "\u0120by", "\u0120And", "ean", "\u0120cultures", "\u0120since", "\u0120the", "\u0120Pre", "-C", "olum", "bian", "\u0120era", ".", "\u0120L", "lam", "as", "\u0120are", "\u0120social", "\u0120animals", "\u0120and", "\u0120live", "\u0120with", "\u0120others", "\u0120as", "\u0120a", "\u0120herd", ".", "\u0120Their", "\u0120wool", "\u0120is", "\u0120soft", "\u0120and", "\u0120contains", "\u0120only", "\u0120a", "\u0120small", "\u0120amount", "\u0120of", "\u0120lan", "olin", ".[", "2", "]", "\u0120L", "lam", "as", "\u0120can", "\u0120learn", "\u0120simple", "\u0120tasks", "\u0120after", "\u0120a", "\u0120few", "\u0120repetitions", ".", "\u0120When", "\u0120using", "\u0120a", "\u0120pack", ",", "\u0120they", "\u0120can", "\u0120carry", "\u0120about", "\u0120", "25", "\u0120to", "\u0120", "30", "%", "\u0120of", "\u0120their", "\u0120body", "\u0120weight", "\u0120for", "\u0120", "8", "\u0120to", "\u0120", "13", "\u0120km", "\u0120(", "5", "\u00e2\u0122\u0135", "8", "\u0120miles", ").[", "3", "]", "\u0120The", "\u0120name", "\u0120llama", "\u0120(", "in", "\u0120the", "\u0120past", "\u0120also", "\u0120spelled", '\u0120"', "lama", '"', "\u0120or", '\u0120"', "gl", "ama", '")', "\u0120was", "\u0120adopted", "\u0120by", "\u0120European", "\u0120settlers", "\u0120from", "\u0120native", "\u0120Per", "uv", "ians", ".[", "4", "]", "\u0120The", "\u0120ancestors", "\u0120of", "\u0120ll", "amas", "\u0120are", "\u0120thought", "\u0120to", 
"\u0120have", "\u0120originated", "\u0120from", "\u0120the", "\u0120Great", "\u0120Plains", "\u0120of", "\u0120North", "\u0120America", "\u0120about", "\u0120", "40", "\u0120million", "\u0120years", "\u0120ago", ",", "\u0120and", "\u0120subsequently", "\u0120migrated", "\u0120to", "\u0120South", "\u0120America", "\u0120about", "\u0120three", "\u0120million", "\u0120years", "\u0120ago", "\u0120during", "\u0120the", "\u0120Great", "\u0120American", "\u0120Inter", "change", ".", "\u0120By", "\u0120the", "\u0120end", "\u0120of", "\u0120the", "\u0120last", "\u0120ice", "\u0120age", "\u0120(", "10", ",", "000", "\u00e2\u0122\u0135", "12", ",", "000", "\u0120years", "\u0120ago", "),", "\u0120camel", "ids", "\u0120were", "\u0120extinct", "\u0120in", "\u0120North", "\u0120America", ".[", "3", "]", "\u0120As", "\u0120of", "\u0120", "200", "7", ",", "\u0120there", "\u0120were", "\u0120over", "\u0120seven", "\u0120million", "\u0120ll", "amas", "\u0120and", "\u0120al", "pac", "as", "\u0120in", "\u0120South", "\u0120America", "\u0120and", "\u0120over", "\u0120", "158", ",", "000", "\u0120ll", "amas", "\u0120and", "\u0120", "100", ",", "000", "\u00ea", "\u013b", "\u012c", "\u00f0\u0141", "\u00a6", "\u013b", "\u0120al", "pac", "as", ",", "\u0120descended", "\u0120from", "\u0120progen", "itors", "\u0120imported", "\u0120late", "\u0120in", "\u0120the", "\u0120", "20", "th", "\u0120century", ",", "\u0120in", "\u0120the", "\u0120United", "\u0120States", "\u0120and", "\u0120Canada", ".[", "5", "]", "\u0120In", "\u0120A", "ym", "ara", "\u0120mythology", ",", "\u0120ll", "amas", "\u0120are", "\u0120important", "\u0120beings", ".", "\u0120The", "\u0120Heavenly", "\u0120L", "lama", "\u0120is", "\u0120said", "\u0120to", "\u0120drink", "\u0120water", "\u0120from", "\u0120the", "\u0120ocean", "\u0120and", "\u0120ur", "in", "ates", "\u0120as", "\u0120it", "\u0120rains", ".[", "6", "]", "\u0120According", "\u0120to", "\u0120A", "ym", "ara", "\u0120es", "chat", "ology", ",", "\u0120ll", "amas", 
"\u0120will", "\u0120return", "\u0120to", "\u0120the", "\u0120water", "\u0120springs", "\u0120and", "\u0120l", "ago", "ons", "\u0120where", "\u0120they", "\u0120come", "\u0120from", "\u0120at", "\u0120the", "\u0120end", "\u0120of", "\u0120time", ".[", "6", "]"],
      ids: [128000, 791, 94776, 47325, 135, 230, 75, 133, 239, 135, 238, 76, 99638, 14, 26, 11410, 99, 247, 62897, 71722, 25, 510, 135, 230, 134, 236, 3105, 2526, 320, 43, 3105, 2840, 3105, 8, 374, 264, 13018, 660, 4987, 3778, 50252, 307, 11, 13882, 1511, 439, 264, 13339, 323, 3854, 10065, 555, 1628, 5420, 27833, 2533, 279, 5075, 7813, 1152, 13464, 11639, 13, 445, 24705, 300, 527, 3674, 10099, 323, 3974, 449, 3885, 439, 264, 59213, 13, 11205, 39640, 374, 8579, 323, 5727, 1193, 264, 2678, 3392, 315, 31791, 37737, 8032, 17, 60, 445, 24705, 300, 649, 4048, 4382, 9256, 1306, 264, 2478, 86066, 13, 3277, 1701, 264, 3854, 11, 814, 649, 6920, 922, 220, 914, 311, 220, 966, 4, 315, 872, 2547, 4785, 369, 220, 23, 311, 220, 1032, 13437, 320, 20, 4235, 23, 8931, 94638, 18, 60, 578, 836, 94776, 320, 258, 279, 3347, 1101, 68918, 330, 81101, 1, 477, 330, 6200, 3105, 909, 574, 18306, 555, 7665, 61107, 505, 10068, 3700, 12328, 5493, 8032, 19, 60, 578, 38618, 315, 9507, 29189, 527, 3463, 311, 617, 44853, 505, 279, 8681, 63911, 315, 4892, 5270, 922, 220, 1272, 3610, 1667, 4227, 11, 323, 28520, 73691, 311, 4987, 5270, 922, 2380, 3610, 1667, 4227, 2391, 279, 8681, 3778, 5783, 3455, 13, 3296, 279, 842, 315, 279, 1566, 10054, 4325, 320, 605, 11, 931, 4235, 717, 11, 931, 1667, 4227, 705, 50252, 3447, 1051, 69918, 304, 4892, 5270, 8032, 18, 60, 1666, 315, 220, 1049, 22, 11, 1070, 1051, 927, 8254, 3610, 9507, 29189, 323, 453, 46051, 300, 304, 4987, 5270, 323, 927, 220, 11286, 11, 931, 9507, 29189, 323, 220, 1041, 11, 931, 166, 247, 232, 9468, 99, 247, 453, 46051, 300, 11, 58842, 505, 84360, 12170, 25973, 3389, 304, 279, 220, 508, 339, 9478, 11, 304, 279, 3723, 4273, 323, 7008, 8032, 20, 60, 763, 362, 1631, 5169, 59492, 11, 9507, 29189, 527, 3062, 23837, 13, 578, 88150, 445, 81101, 374, 1071, 311, 7172, 3090, 505, 279, 18435, 323, 4433, 258, 988, 439, 433, 62555, 8032, 21, 60, 10771, 311, 362, 1631, 5169, 1560, 9884, 2508, 11, 9507, 29189, 690, 471, 311, 279, 3090, 42242, 323, 326, 6438, 
2439, 1405, 814, 2586, 505, 520, 279, 842, 315, 892, 8032, 21, 60],
      decoded: '<|begin_of_text|>The llama (/\u02c8l\u0251\u02d0m\u0259/; \ud83e\udd99Spanish pronunciation: [\u02c8\u028eama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5\u20138 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000\u201312,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000\ua64a\ud83e\udd99 alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]',
    },
  },
  "Xenova/TinyLLama-v0": {
    NEWLINES: {
      text: LLAMA_TEST_STRINGS.NEWLINES,
      tokens: ["\u2581ax", "<0x0A>", "####", "<0x0A>", "b", "oo"],
      ids: [1, 9013, 13, 20411, 13, 31842, 2742],
      decoded: "<s> ax\n####\nboo",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u2581", "<0xE7>", "<0x94>", "<0x9F>", "<0xE6>", "<0xB4>", "<0xBB>", "<0xE7>", "<0x9A>", "<0x84>", "<0xE7>", "<0x9C>", "<0x9F>", "<0xE8>", "<0xB0>", "<0x9B>", "<0xE6>", "<0x98>", "<0xAF>"],
      ids: [1, 31822, 234, 151, 162, 233, 183, 190, 234, 157, 135, 234, 159, 162, 235, 179, 158, 233, 155, 178],
      decoded: "<s> \u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["\u2581trailing", "\u2581space", "\u2581", "\u2581", "\u2581"],
      ids: [1, 30174, 2138, 31822, 31822, 31822],
      decoded: "<s> trailing space   ",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["\u2581test", "\u2581$", "1", "\u2581R", "2", "\u2581#", "3", "\u2581\u20ac", "4", "\u2581\u00a3", "5", "\u2581", "<0xC2>", "<0xA5>", "6", "\u2581", "<0xE2>", "<0x82>", "<0xA3>", "7", "\u2581", "<0xE2>", "<0x82>", "<0xB9>", "8", "\u2581", "<0xE2>", "<0x82>", "<0xB1>", "9", "\u2581test"],
      ids: [1, 1397, 569, 31853, 360, 31855, 1257, 31878, 9390, 31882, 3922, 31880, 31822, 197, 168, 31887, 31822, 229, 133, 166, 31888, 31822, 229, 133, 188, 31886, 31822, 229, 133, 180, 31877, 1397],
      decoded: "<s> test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["\u2581you", "\u2026", "\u2581", "\u2581"],
      ids: [1, 365, 31925, 31822, 31822],
      decoded: "<s> you\u2026  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["\u2581you", "\u2026", "\u00a0", "\u00a0"],
      ids: [1, 365, 31925, 31963, 31963],
      decoded: "<s> you\u2026\u00a0\u00a0",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["\u2581you", "\u2026", "\u00a0", "\u00a0", "you", "\u2026", "\u00a0", "\u00a0"],
      ids: [1, 365, 31925, 31963, 31963, 7936, 31925, 31963, 31963],
      decoded: "<s> you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["\u2581weird", "\u2581", "<0xEF>", "<0xBD>", "<0x9E>", "\u2581edge", "\u2581", "<0xEF>", "<0xBD>", "<0x9E>", "\u2581case"],
      ids: [1, 9907, 31822, 242, 192, 161, 5991, 31822, 242, 192, 161, 1372],
      decoded: "<s> weird \uff5e edge \uff5e case",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x82>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8D>", "\u2581", "<0xF0>", "<0x9F>", "<0xA4>", "<0xA3>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x8D>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0xAD>", "\u2581", "<0xF0>", "<0x9F>", "<0x8E>", "<0x89>", "\u2581", "<0xF0>", "<0x9F>", "<0x99>", "<0x8F>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x8A>", "\u2581", "<0xF0>", "<0x9F>", "<0x94>", "<0xA5>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x81>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x85>", "\u2581", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x86>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8F>", "\u2581", "<0xE2>", "<0x9D>", "<0xA4>", "<0xEF>", "<0xB8>", "<0x8F>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x9C>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x9A>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x97>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x99>", "\u2581", "<0xF0>", "<0x9F>", "<0x96>", "<0xA4>", "\u2581", "<0xF0>", "<0x9F>", "<0x98>", "<0x8E>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8C>", "\u2581", "<0xF0>", "<0x9F>", "<0xA5>", "<0xB3>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0xAA>", "\u2581", "<0xE2>", "<0x9C>", "<0xA8>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x89>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x80>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0xAF>", "\u2581", "<0xF0>", "<0x9F>", "<0x8E>", "<0x88>", "\u2581", "<0xF0>", "<0x9F>", "<0x99>", "<0x88>", "\u2581", "<0xF0>", "<0x9F>", "<0x99>", "<0x8C>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0x80>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x87>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x8B>", "\u2581", "<0xE2>", "<0x9C>", "<0x85>", "\u2581", "<0xF0>", "<0x9F>", "<0x8E>", "<0x81>", "\u2581", "<0xF0>", "<0x9F>", "<0x8C>", "<0x9E>", "\u2581", "<0xF0>", "<0x9F>", "<0x8C>", "<0xB8>", "\u2581", "<0xF0>", "<0x9F>", "<0x92>", "<0xB0>"],
      ids: [1, 31822, 243, 162, 155, 133, 31822, 243, 162, 148, 144, 31822, 243, 162, 167, 166, 31822, 243, 162, 155, 144, 31822, 243, 162, 155, 176, 31822, 243, 162, 145, 140, 31822, 243, 162, 156, 146, 31822, 243, 162, 155, 141, 31822, 243, 162, 151, 168, 31822, 243, 162, 155, 132, 31822, 243, 162, 155, 136, 31822, 243, 162, 167, 154, 31822, 243, 162, 155, 137, 31822, 243, 162, 148, 146, 31822, 229, 160, 167, 242, 187, 146, 31822, 243, 162, 149, 159, 31822, 243, 162, 149, 157, 31822, 243, 162, 149, 154, 31822, 243, 162, 149, 156, 31822, 243, 162, 153, 167, 31822, 243, 162, 155, 145, 31822, 243, 162, 148, 143, 31822, 243, 162, 168, 182, 31822, 243, 162, 149, 173, 31822, 229, 159, 171, 31822, 243, 162, 148, 140, 31822, 243, 162, 148, 131, 31822, 243, 162, 149, 178, 31822, 243, 162, 145, 139, 31822, 243, 162, 156, 139, 31822, 243, 162, 156, 143, 31822, 243, 162, 149, 131, 31822, 243, 162, 148, 138, 31822, 243, 162, 148, 142, 31822, 229, 159, 136, 31822, 243, 162, 145, 132, 31822, 243, 162, 143, 161, 31822, 243, 162, 143, 187, 31822, 243, 162, 149, 179],
      decoded: "<s> \ud83d\ude02 \ud83d\udc4d \ud83e\udd23 \ud83d\ude0d \ud83d\ude2d \ud83c\udf89 \ud83d\ude4f \ud83d\ude0a \ud83d\udd25 \ud83d\ude01 \ud83d\ude05 \ud83e\udd17 \ud83d\ude06 \ud83d\udc4f \u2764\ufe0f \ud83d\udc9c \ud83d\udc9a \ud83d\udc97 \ud83d\udc99 \ud83d\udda4 \ud83d\ude0e \ud83d\udc4c \ud83e\udd73 \ud83d\udcaa \u2728 \ud83d\udc49 \ud83d\udc40 \ud83d\udcaf \ud83c\udf88 \ud83d\ude48 \ud83d\ude4c \ud83d\udc80 \ud83d\udc47 \ud83d\udc4b \u2705 \ud83c\udf81 \ud83c\udf1e \ud83c\udf38 \ud83d\udcb0",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u2581", "<0xE2>", "<0x9C>", "<0xA8>", "\u2581", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0x81>", "<0xEF>", "<0xB8>", "<0x8F>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xB1>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u2581", "<0xF0>", "<0x9F>", "<0x95>", "<0xB5>", "<0xE2>", "<0x80>", "<0x8D>", "<0xE2>", "<0x99>", "<0x82>", "<0xEF>", "<0xB8>", "<0x8F>", "\u2581", "<0xF0>", "<0x9F>", "<0xA7>", "<0x99>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "<0xE2>", "<0x80>", "<0x8D>", "<0xE2>", "<0x99>", "<0x82>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x8C>", "<0xBE>", "\u2581", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0xA4>", "<0x9D>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA9>", "<0xE2>", "<0x80>", "<0x8D>", "<0xE2>", "<0x9D>", "<0xA4>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x92>", "<0x8B>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "\u2581", "<0xF0>", "<0x9F>", "<0x91>", "<0xA9>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x91>", "<0xA9>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x91>", "<0xA7>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x91>", "<0xA6>", "\u2581", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0xA4>", "<0x9D>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0xA7>", "<0x91>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "\u2581", "<0xF0>", "<0x9F>", "<0x8F>", "<0xB4>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA7>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA2>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA5>", "<0xF3>", "<0xA0>", "<0x81>", "<0xAE>", "<0xF3>", "<0xA0>", "<0x81>", "<0xA7>", "<0xF3>", "<0xA0>", "<0x81>", "<0xBF>", "\u2581", 
"<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBB>", "<0xE2>", "<0x80>", "<0x8D>", "<0xE2>", "<0x9D>", "<0xA4>", "<0xEF>", "<0xB8>", "<0x8F>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x92>", "<0x8B>", "<0xE2>", "<0x80>", "<0x8D>", "<0xF0>", "<0x9F>", "<0x91>", "<0xA8>", "<0xF0>", "<0x9F>", "<0x8F>", "<0xBC>"],
      ids: [1, 31822, 229, 159, 171, 31822, 243, 162, 167, 154, 31822, 243, 162, 148, 132, 242, 187, 146, 31822, 243, 162, 148, 180, 243, 162, 146, 190, 31822, 243, 162, 152, 184, 229, 131, 144, 229, 156, 133, 242, 187, 146, 31822, 243, 162, 170, 156, 243, 162, 146, 190, 229, 131, 144, 229, 156, 133, 31822, 243, 162, 148, 171, 243, 162, 146, 190, 229, 131, 144, 243, 162, 143, 193, 31822, 243, 162, 170, 148, 229, 131, 144, 243, 162, 167, 160, 229, 131, 144, 243, 162, 170, 148, 31822, 243, 162, 148, 172, 229, 131, 144, 229, 160, 167, 229, 131, 144, 243, 162, 149, 142, 229, 131, 144, 243, 162, 148, 171, 31822, 243, 162, 148, 172, 229, 131, 144, 243, 162, 148, 172, 229, 131, 144, 243, 162, 148, 170, 229, 131, 144, 243, 162, 148, 169, 31822, 243, 162, 170, 148, 243, 162, 146, 190, 229, 131, 144, 243, 162, 167, 160, 229, 131, 144, 243, 162, 170, 148, 243, 162, 146, 190, 31822, 243, 162, 146, 183, 246, 163, 132, 170, 246, 163, 132, 165, 246, 163, 132, 168, 246, 163, 132, 177, 246, 163, 132, 170, 246, 163, 132, 194, 31822, 243, 162, 148, 171, 243, 162, 146, 190, 229, 131, 144, 229, 160, 167, 242, 187, 146, 229, 131, 144, 243, 162, 149, 142, 229, 131, 144, 243, 162, 148, 171, 243, 162, 146, 191],
      decoded: "<s> \u2728 \ud83e\udd17 \ud83d\udc41\ufe0f \ud83d\udc71\ud83c\udffb \ud83d\udd75\u200d\u2642\ufe0f \ud83e\uddd9\ud83c\udffb\u200d\u2642 \ud83d\udc68\ud83c\udffb\u200d\ud83c\udf3e \ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1 \ud83d\udc69\u200d\u2764\u200d\ud83d\udc8b\u200d\ud83d\udc68 \ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc66 \ud83e\uddd1\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c\udffb \ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f \ud83d\udc68\ud83c\udffb\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d\udc68\ud83c\udffc",
    },
    NEWLINE_WITH_LEADING_SPACE: {
      text: LLAMA_TEST_STRINGS.NEWLINE_WITH_LEADING_SPACE,
      tokens: ["\u2581", "\u2581", "<0x0A>"],
      ids: [1, 31822, 31822, 13],
      decoded: "<s>  \n",
    },
    CHINESE_LETTER: {
      text: LLAMA_TEST_STRINGS.CHINESE_LETTER,
      tokens: ["\u2581", "<0xE9>", "<0x95>", "<0x87>"],
      ids: [1, 31822, 236, 152, 138],
      decoded: "<s> \u9547",
    },
    PARAGRAPH: {
      text: LLAMA_TEST_STRINGS.PARAGRAPH,
      tokens: ["\u2581The", "\u2581ll", "ama", "\u2581(", "/", "<0xCB>", "<0x88>", "l", "<0xC9>", "<0x91>", "<0xCB>", "<0x90>", "m", "<0xC9>", "<0x99>", "/", ";", "\u2581", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>", "Sp", "anish", "\u2581pron", "unciation", ":", "\u2581[", "<0xCB>", "<0x88>", "<0xCA>", "<0x8E>", "ama", "])", "\u2581(", "L", "ama", "\u2581gl", "ama", ")", "\u2581is", "\u2581a", "\u2581domest", "icated", "\u2581South", "\u2581American", "\u2581cam", "el", "id", ",", "\u2581widely", "\u2581used", "\u2581as", "\u2581a", "\u2581meat", "\u2581and", "\u2581pack", "\u2581animal", "\u2581by", "\u2581And", "ean", "\u2581cultures", "\u2581since", "\u2581the", "\u2581Pre", "-", "Col", "umb", "ian", "\u2581era", ".", "\u2581L", "lam", "as", "\u2581are", "\u2581social", "\u2581animals", "\u2581and", "\u2581live", "\u2581with", "\u2581others", "\u2581as", "\u2581a", "\u2581herd", ".", "\u2581Their", "\u2581wool", "\u2581is", "\u2581soft", "\u2581and", "\u2581contains", "\u2581only", "\u2581a", "\u2581small", "\u2581amount", "\u2581of", "\u2581l", "anol", "in", ".[", "2", "]", "\u2581L", "lam", "as", "\u2581can", "\u2581learn", "\u2581simple", "\u2581", "t", "asks", "\u2581after", "\u2581a", "\u2581few", "\u2581repet", "itions", ".", "\u2581When", "\u2581using", "\u2581a", "\u2581pack", ",", "\u2581they", "\u2581can", "\u2581carry", "\u2581about", "\u2581", "2", "5", "\u2581to", "\u2581", "3", "0", "%", "\u2581of", "\u2581their", "\u2581body", "\u2581weight", "\u2581for", "\u2581", "8", "\u2581to", "\u2581", "1", "3", "\u2581km", "\u2581(", "5", "\u2013", "8", "\u2581miles", ").", "[", "3", "]", "\u2581The", "\u2581name", "\u2581ll", "ama", "\u2581(", "in", "\u2581the", "\u2581past", "\u2581also", "\u2581sp", "elled", '\u2581"', "l", "ama", '"', "\u2581or", '\u2581"', "gl", "ama", '")', "\u2581was", "\u2581adopted", "\u2581by", "\u2581European", "\u2581settlers", "\u2581from", "\u2581native", "\u2581Per", "uv", "ians", ".[", "4", "]", "\u2581The", "\u2581ancestors", 
"\u2581of", "\u2581l", "lam", "as", "\u2581are", "\u2581thought", "\u2581to", "\u2581have", "\u2581originated", "\u2581from", "\u2581the", "\u2581Great", "\u2581Plains", "\u2581of", "\u2581North", "\u2581America", "\u2581about", "\u2581", "4", "0", "\u2581million", "\u2581years", "\u2581ago", ",", "\u2581and", "\u2581subsequently", "\u2581mig", "rated", "\u2581to", "\u2581South", "\u2581America", "\u2581about", "\u2581three", "\u2581million", "\u2581years", "\u2581ago", "\u2581during", "\u2581the", "\u2581Great", "\u2581American", "\u2581Inter", "change", ".", "\u2581By", "\u2581the", "\u2581end", "\u2581of", "\u2581the", "\u2581last", "\u2581ice", "\u2581age", "\u2581(", "1", "0", ",", "0", "0", "0", "\u2013", "1", "2", ",", "0", "0", "0", "\u2581years", "\u2581ago", "),", "\u2581cam", "el", "ids", "\u2581were", "\u2581extinct", "\u2581in", "\u2581North", "\u2581America", ".[", "3", "]", "\u2581As", "\u2581of", "\u2581", "2", "0", "0", "7", ",", "\u2581there", "\u2581were", "\u2581over", "\u2581seven", "\u2581million", "\u2581l", "lam", "as", "\u2581and", "\u2581al", "p", "ac", "as", "\u2581in", "\u2581South", "\u2581America", "\u2581and", "\u2581over", "\u2581", "1", "5", "8", ",", "0", "0", "0", "\u2581l", "lam", "as", "\u2581and", "\u2581", "1", "0", "0", ",", "0", "0", "0", "<0xEA>", "<0x99>", "<0x8A>", "<0xF0>", "<0x9F>", "<0xA6>", "<0x99>", "\u2581al", "p", "ac", "as", ",", "\u2581descended", "\u2581from", "\u2581pro", "gen", "itors", "\u2581imported", "\u2581late", "\u2581in", "\u2581the", "\u2581", "2", "0", "th", "\u2581century", ",", "\u2581in", "\u2581the", "\u2581United", "\u2581States", "\u2581and", "\u2581Canada", ".[", "5", "]", "\u2581In", "\u2581A", "ym", "ara", "\u2581mythology", ",", "\u2581l", "lam", "as", "\u2581are", "\u2581important", "\u2581beings", ".", "\u2581The", "\u2581Heaven", "ly", "\u2581Ll", "ama", "\u2581is", "\u2581said", "\u2581to", "\u2581drink", "\u2581water", "\u2581from", "\u2581the", "\u2581ocean", "\u2581and", "\u2581ur", 
"inates", "\u2581as", "\u2581it", "\u2581rains", ".[", "6", "]", "\u2581According", "\u2581to", "\u2581A", "ym", "ara", "\u2581es", "chat", "ology", ",", "\u2581l", "lam", "as", "\u2581will", "\u2581return", "\u2581to", "\u2581the", "\u2581water", "\u2581springs", "\u2581and", "\u2581l", "ago", "ons", "\u2581where", "\u2581they", "\u2581come", "\u2581from", "\u2581at", "\u2581the", "\u2581end", "\u2581of", "\u2581time", ".[", "6", "]"],
      ids: [1, 347, 31763, 2269, 352, 31873, 206, 139, 31832, 204, 148, 206, 147, 31836, 204, 156, 31873, 31891, 31822, 243, 162, 169, 156, 8889, 5817, 11155, 26128, 31871, 836, 206, 139, 205, 145, 2269, 9772, 352, 31867, 2269, 1192, 2269, 31861, 322, 260, 27940, 2672, 1897, 1454, 3764, 307, 317, 31844, 7055, 1065, 362, 260, 8659, 291, 2667, 6075, 417, 787, 14083, 10775, 1314, 266, 2345, 31854, 4848, 2234, 620, 5998, 31843, 372, 3082, 295, 397, 1619, 5220, 291, 1983, 351, 1892, 362, 260, 27172, 31843, 4585, 22729, 322, 2647, 291, 5140, 744, 260, 1435, 2399, 287, 309, 18426, 261, 3564, 31855, 31908, 372, 3082, 295, 473, 1977, 3102, 31822, 31824, 5577, 768, 260, 1346, 17042, 1479, 31843, 1408, 1340, 260, 2667, 31844, 526, 473, 3875, 562, 31822, 31855, 31880, 289, 31822, 31878, 31852, 31914, 287, 518, 2108, 4182, 329, 31822, 31886, 289, 31822, 31853, 31878, 6512, 352, 31880, 31906, 31886, 4465, 656, 31907, 31878, 31908, 347, 1382, 31763, 2269, 352, 261, 266, 1646, 615, 612, 5902, 495, 31832, 2269, 31875, 405, 495, 4261, 2269, 4290, 393, 7574, 417, 2821, 23343, 427, 6412, 2083, 10099, 1580, 3564, 31882, 31908, 347, 18294, 287, 309, 3082, 295, 397, 1991, 289, 435, 20355, 427, 266, 3172, 26744, 287, 1975, 2139, 562, 31822, 31882, 31852, 1577, 778, 2236, 31844, 291, 11786, 21052, 3397, 289, 1897, 2139, 562, 1166, 1577, 778, 2236, 1177, 266, 3172, 1454, 3029, 3604, 31843, 1433, 266, 928, 287, 266, 1060, 5707, 2253, 352, 31853, 31852, 31844, 31852, 31852, 31852, 31906, 31853, 31855, 31844, 31852, 31852, 31852, 778, 2236, 698, 3764, 307, 1982, 577, 30610, 288, 1975, 2139, 3564, 31878, 31908, 717, 287, 31822, 31855, 31852, 31852, 31888, 31844, 635, 577, 648, 3931, 1577, 309, 3082, 295, 291, 366, 31837, 380, 295, 288, 1897, 2139, 291, 648, 31822, 31853, 31880, 31886, 31844, 31852, 31852, 31852, 309, 3082, 295, 291, 31822, 31853, 31852, 31852, 31844, 31852, 31852, 31852, 237, 156, 141, 243, 162, 169, 156, 366, 31837, 380, 295, 31844, 27627, 427, 375, 3353, 4705, 17798, 2732, 
288, 266, 31822, 31855, 31852, 388, 3373, 31844, 288, 266, 1494, 1769, 291, 3008, 3564, 31880, 31908, 455, 308, 1276, 2776, 24143, 31844, 309, 3082, 295, 397, 1480, 11844, 31843, 347, 15836, 326, 11321, 2269, 322, 664, 289, 5065, 1579, 427, 266, 8622, 291, 4328, 11466, 362, 357, 28738, 3564, 31887, 31908, 3252, 289, 308, 1276, 2776, 1582, 20068, 1058, 31844, 309, 3082, 295, 482, 1199, 289, 266, 1579, 24250, 291, 309, 3405, 680, 804, 526, 1412, 427, 389, 266, 928, 287, 647, 3564, 31887, 31908],
      decoded: '<s> The llama (/\u02c8l\u0251\u02d0m\u0259/; \ud83e\udd99Spanish pronunciation: [\u02c8\u028eama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5\u20138 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000\u201312,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000\ua64a\ud83e\udd99 alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]',
    },
  },
  "Xenova/deepseek-coder-1.3b-instruct": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      tokens: ["How", "\u0120are", "\u0120you", "\u0120doing", "?"],
      ids: [32013, 2808, 417, 340, 3207, 30],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>How are you doing?",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["You", "\u0120should", "'", "ve", "\u0120done", "\u0120this"],
      ids: [32013, 2042, 1020, 6, 312, 2359, 437],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>You should've done this",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "\u0120", "0", "\u0120", "1", "\u0120", "2", "\u0120", "3", "\u0120", "4", "\u0120", "5", "\u0120", "6", "\u0120", "7", "\u0120", "8", "\u0120", "9", "\u0120", "1", "0", "\u0120", "1", "0", "0", "\u0120", "1", "0", "0", "0"],
      ids: [32013, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 207, 15, 207, 16, 207, 17, 207, 18, 207, 19, 207, 20, 207, 21, 207, 22, 207, 23, 207, 24, 207, 16, 15, 207, 16, 15, 15, 207, 16, 15, 15, 15],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u0120", "2", "0", "1", "6", "."],
      ids: [32013, 546, 2595, 438, 16316, 279, 207, 17, 15, 16, 21, 13],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["A", "\u010a", "'", "ll", "\u0120!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'", "t", "."],
      ids: [32013, 32, 185, 6, 642, 24466, 577, 11665, 67, 4191, 67, 280, 11, 482, 6, 83, 13],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>A\n'll !!to?'d''d of, can't.",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["def", "\u0120main", "():", "\u010a", "\u0109", "pass"],
      ids: [32013, 1551, 1959, 10942, 185, 184, 4805],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>def main():\n\tpass",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["let", "\u0120a", "\u0120=", "\u0120obj", ".", "toString", "();", "\u010a", "toString", "();"],
      ids: [32013, 1160, 245, 405, 6528, 13, 12617, 1293, 185, 12617, 1293],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>let a = obj.toString();\ntoString();",
    },
    NEWLINES: {
      text: LLAMA_TEST_STRINGS.NEWLINES,
      tokens: ["ax", "\u010a", "####", "\u010a", "bo", "o"],
      ids: [32013, 1099, 185, 3576, 185, 952, 78],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>ax\n####\nboo",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["UN", "want", "\u00c3\u00a9d", ",", "running"],
      ids: [32013, 4348, 28626, 31898, 11, 22785],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["1", "\u0100", "2", "\u00ef\u00bf\u00bd", "3"],
      ids: [32013, 16, 175, 17, 10006, 18],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>1\u00002\ufffd3",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["Hello", "\u0120World"],
      ids: [32013, 17535, 5414],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>Hello World",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      tokens: ["hello", "\u0120world"],
      ids: [32013, 31702, 1835],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>hello world",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u00e7\u0136\u0141\u00e6\u00b4\u00bb\u00e7\u013c\u0126", "\u00e7\u013e\u0141", "\u00e8\u00b0", "\u013d", "\u00e6\u013a\u00af"],
      ids: [32013, 23393, 2651, 1534, 236, 502],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u0120\u0120", "\u0120leading", "\u0120space"],
      ids: [32013, 243, 5877, 2507],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>   leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["tra", "iling", "\u0120space", "\u0120\u0120\u0120"],
      ids: [32013, 7246, 5964, 2507, 315],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>trailing space   ",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["Hi", "\u0120", "\u0120H", "ello"],
      ids: [32013, 11041, 207, 414, 9489],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>Hi  Hello",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["test", "\u0120$", "1", "\u0120R", "2", "\u0120#", "3", "\u0120", "\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2", "\u00a5", "6", "\u0120", "\u00e2\u0124", "\u00a3", "7", "\u0120", "\u00e2\u0124", "\u00b9", "8", "\u0120", "\u00e2\u0124", "\u00b1", "9", "\u0120test"],
      ids: [32013, 2806, 371, 16, 432, 17, 1494, 18, 207, 11010, 19, 8761, 20, 2688, 98, 21, 207, 7935, 96, 22, 207, 7935, 117, 23, 207, 7935, 109, 24, 1719],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120$", "1", ".", "0", "0", "\u0120at", "\u0120the", "\u0120store", "."],
      ids: [32013, 40, 8942, 274, 15902, 327, 371, 16, 13, 15, 15, 429, 254, 4730, 13],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u0120\u0120"],
      ids: [32013, 4209, 2484, 243],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>you\u2026  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [32013, 4209, 2484, 10447],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>you\u2026\u00a0\u00a0",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142", "\u00c2\u0142", "you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [32013, 4209, 2484, 1200, 1200, 4209, 2484, 10447],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["we", "ird", "\u0120", "\u00ef", "\u00bd", "\u0140", "\u0120edge", "\u0120", "\u00ef", "\u00bd", "\u0140", "\u0120case"],
      ids: [32013, 828, 2369, 207, 169, 121, 239, 5935, 207, 169, 121, 239, 1452],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>weird \uff5e edge \uff5e case",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u00f0\u0141", "\u013a", "\u0124", "\u0120\u00f0\u0141", "\u0133", "\u012f", "\u0120\u00f0\u0141", "\u00a4", "\u00a3", "\u0120\u00f0\u0141", "\u013a", "\u012f", "\u0120\u00f0\u0141", "\u013a", "\u0143", "\u0120\u00f0\u0141", "\u0130", "\u012b", "\u0120\u00f0\u0141\u013b", "\u0131", "\u0120\u00f0\u0141", "\u013a", "\u012c", "\u0120\u00f0\u0141", "\u0136", "\u00a5", "\u0120\u00f0\u0141", "\u013a", "\u0123", "\u0120\u00f0\u0141", "\u013a", "\u0127", "\u0120\u00f0\u0141", "\u00a4", "\u0139", "\u0120\u00f0\u0141", "\u013a", "\u0128", "\u0120\u00f0\u0141", "\u0133", "\u0131", "\u0120", "\u00e2", "\u013f", "\u00a4", "\u00ef", "\u00b8", "\u0131", "\u0120\u00f0\u0141", "\u0134", "\u013e", "\u0120\u00f0\u0141", "\u0134", "\u013c", "\u0120\u00f0\u0141", "\u0134", "\u0139", "\u0120\u00f0\u0141", "\u0134", "\u013b", "\u0120\u00f0\u0141", "\u0138", "\u00a4", "\u0120\u00f0\u0141", "\u013a", "\u0130", "\u0120\u00f0\u0141", "\u0133", "\u012e", "\u0120\u00f0\u0141", "\u00a5", "\u00b3", "\u0120\u00f0\u0141", "\u0134", "\u00aa", "\u0120", "\u00e2", "\u013e", "\u00a8", "\u0120\u00f0\u0141", "\u0133", "\u012b", "\u0120\u00f0\u0141", "\u0133", "\u0122", "\u0120\u00f0\u0141", "\u0134", "\u00af", "\u0120\u00f0\u0141", "\u0130", "\u012a", "\u0120\u00f0\u0141\u013b", "\u012a", "\u0120\u00f0\u0141\u013b", "\u012e", "\u0120\u00f0\u0141", "\u0134", "\u0122", "\u0120\u00f0\u0141", "\u0133", "\u0129", "\u0120\u00f0\u0141", "\u0133", "\u012d", "\u0120", "\u00e2", "\u013e", "\u0127", "\u0120\u00f0\u0141", "\u0130", "\u0123", "\u0120\u00f0\u0141", "\u012e", "\u0140", "\u0120\u00f0\u0141", "\u012e", "\u00b8", "\u0120\u00f0\u0141", "\u0134", "\u00b0"],
      ids: [32013, 10047, 233, 211, 12394, 226, 222, 12394, 97, 96, 12394, 233, 222, 12394, 233, 242, 12394, 223, 218, 22709, 224, 12394, 233, 219, 12394, 229, 98, 12394, 233, 210, 12394, 233, 214, 12394, 97, 232, 12394, 233, 215, 12394, 226, 224, 207, 156, 238, 97, 169, 116, 224, 12394, 227, 237, 12394, 227, 235, 12394, 227, 232, 12394, 227, 234, 12394, 231, 97, 12394, 233, 223, 12394, 226, 221, 12394, 98, 111, 12394, 227, 103, 207, 156, 237, 101, 12394, 226, 218, 12394, 226, 209, 12394, 227, 107, 12394, 223, 217, 22709, 217, 22709, 221, 12394, 227, 209, 12394, 226, 216, 12394, 226, 220, 207, 156, 237, 214, 12394, 223, 210, 12394, 221, 239, 12394, 221, 116, 12394, 227, 108],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\ud83d\ude02 \ud83d\udc4d \ud83e\udd23 \ud83d\ude0d \ud83d\ude2d \ud83c\udf89 \ud83d\ude4f \ud83d\ude0a \ud83d\udd25 \ud83d\ude01 \ud83d\ude05 \ud83e\udd17 \ud83d\ude06 \ud83d\udc4f \u2764\ufe0f \ud83d\udc9c \ud83d\udc9a \ud83d\udc97 \ud83d\udc99 \ud83d\udda4 \ud83d\ude0e \ud83d\udc4c \ud83e\udd73 \ud83d\udcaa \u2728 \ud83d\udc49 \ud83d\udc40 \ud83d\udcaf \ud83c\udf88 \ud83d\ude48 \ud83d\ude4c \ud83d\udc80 \ud83d\udc47 \ud83d\udc4b \u2705 \ud83c\udf81 \ud83c\udf1e \ud83c\udf38 \ud83d\udcb0",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u00e2", "\u013e", "\u00a8", "\u0120\u00f0\u0141", "\u00a4", "\u0139", "\u0120\u00f0\u0141", "\u0133", "\u0123", "\u00ef", "\u00b8", "\u0131", "\u0120\u00f0\u0141", "\u0133", "\u00b1", "\u00f0\u0141", "\u0131", "\u00bb", "\u0120\u00f0\u0141", "\u0137", "\u00b5", "\u00e2\u0122", "\u012f", "\u00e2", "\u013b", "\u0124", "\u00ef", "\u00b8", "\u0131", "\u0120\u00f0\u0141", "\u00a7", "\u013b", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122", "\u012f", "\u00e2", "\u013b", "\u0124", "\u0120\u00f0\u0141", "\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u012e", "\u00be", "\u0120\u00f0\u0141", "\u00a7", "\u0133", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u00a4", "\u013f", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u00a7", "\u0133", "\u0120\u00f0\u0141", "\u0133", "\u00a9", "\u00e2\u0122", "\u012f", "\u00e2", "\u013f", "\u00a4", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0134", "\u012d", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0133", "\u00a8", "\u0120\u00f0\u0141", "\u0133", "\u00a9", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0133", "\u00a9", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0133", "\u00a7", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0133", "\u00a6", "\u0120\u00f0\u0141", "\u00a7", "\u0133", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u00a4", "\u013f", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u00a7", "\u0133", "\u00f0\u0141", "\u0131", "\u00bb", "\u0120\u00f0\u0141", "\u0131", "\u00b4", "\u00f3", "\u0142", "\u0123", "\u00a7", "\u00f3", "\u0142", "\u0123", "\u00a2", "\u00f3", "\u0142", "\u0123", "\u00a5", "\u00f3", "\u0142", "\u0123", "\u00ae", "\u00f3", "\u0142", "\u0123", "\u00a7", "\u00f3", "\u0142", "\u0123", "\u00bf", "\u0120\u00f0\u0141", "\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bb", "\u00e2\u0122", "\u012f", "\u00e2", "\u013f", "\u00a4", "\u00ef", "\u00b8", "\u0131", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0134", 
"\u012d", "\u00e2\u0122", "\u012f", "\u00f0\u0141", "\u0133", "\u00a8", "\u00f0\u0141", "\u0131", "\u00bc"],
      ids: [32013, 156, 237, 101, 12394, 97, 232, 12394, 226, 210, 169, 116, 224, 12394, 226, 109, 10047, 224, 119, 12394, 230, 113, 350, 222, 156, 234, 211, 169, 116, 224, 12394, 100, 234, 10047, 224, 119, 350, 222, 156, 234, 211, 12394, 226, 101, 10047, 224, 119, 350, 222, 10047, 221, 122, 12394, 100, 226, 350, 222, 10047, 97, 238, 350, 222, 10047, 100, 226, 12394, 226, 102, 350, 222, 156, 238, 97, 350, 222, 10047, 227, 220, 350, 222, 10047, 226, 101, 12394, 226, 102, 350, 222, 10047, 226, 102, 350, 222, 10047, 226, 100, 350, 222, 10047, 226, 99, 12394, 100, 226, 10047, 224, 119, 350, 222, 10047, 97, 238, 350, 222, 10047, 100, 226, 10047, 224, 119, 12394, 224, 112, 173, 241, 210, 100, 173, 241, 210, 95, 173, 241, 210, 98, 173, 241, 210, 106, 173, 241, 210, 100, 173, 241, 210, 123, 12394, 226, 101, 10047, 224, 119, 350, 222, 156, 238, 97, 169, 116, 224, 350, 222, 10047, 227, 220, 350, 222, 10047, 226, 101, 10047, 224, 120],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\u2728 \ud83e\udd17 \ud83d\udc41\ufe0f \ud83d\udc71\ud83c\udffb \ud83d\udd75\u200d\u2642\ufe0f \ud83e\uddd9\ud83c\udffb\u200d\u2642 \ud83d\udc68\ud83c\udffb\u200d\ud83c\udf3e \ud83e\uddd1\u200d\ud83e\udd1d\u200d\ud83e\uddd1 \ud83d\udc69\u200d\u2764\u200d\ud83d\udc8b\u200d\ud83d\udc68 \ud83d\udc69\u200d\ud83d\udc69\u200d\ud83d\udc67\u200d\ud83d\udc66 \ud83e\uddd1\ud83c\udffb\u200d\ud83e\udd1d\u200d\ud83e\uddd1\ud83c\udffb \ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f \ud83d\udc68\ud83c\udffb\u200d\u2764\ufe0f\u200d\ud83d\udc8b\u200d\ud83d\udc68\ud83c\udffc",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u00e2\u0138", "\u0123", "This", "\u0120", "\u00e2\u0138", "\u0123", "is", "\u0120", "\u00e2\u0138", "\u0123", "a", "\u0120", "\u00e2\u0138", "\u0123", "test", "\u0120", "\u00e2\u0138", "\u0123", "."],
      ids: [32013, 11028, 210, 1559, 207, 11028, 210, 262, 207, 11028, 210, 64, 207, 11028, 210, 2806, 207, 11028, 210, 13],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\u2581This \u2581is \u2581a \u2581test \u2581.",
    },
    BPE_SCORES_PRIORITY_1: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_1,
      tokens: ["gr", "ab", "bed"],
      ids: [32013, 877, 356, 3861],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>grabbed",
    },
    BPE_SCORES_PRIORITY_2: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_2,
      tokens: ["\u0120grab", "bed"],
      ids: [32013, 14596, 3861],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c> grabbed",
    },
    BPE_SCORES_PRIORITY_3: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_3,
      tokens: ["\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120\u0120", "\u0120grab", "bed"],
      ids: [32013, 3137, 14596, 3861],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>           grabbed",
    },
    NEWLINE: {
      text: LLAMA_TEST_STRINGS.NEWLINE,
      tokens: ["\u010a"],
      ids: [32013, 185],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\n",
    },
    NEWLINE_WITH_LEADING_SPACE: {
      text: LLAMA_TEST_STRINGS.NEWLINE_WITH_LEADING_SPACE,
      tokens: ["\u0120", "\u010a"],
      ids: [32013, 207, 185],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c> \n",
    },
    TABS: {
      text: LLAMA_TEST_STRINGS.TABS,
      tokens: ["\u0109", "tabs", "\u0109\u0109\u0109", "\u0109", "out", "\u0120here"],
      ids: [32013, 184, 20611, 1749, 184, 406, 1283],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\ttabs\t\t\t\tout here",
    },
    NEWLINE_AND_TAB: {
      text: LLAMA_TEST_STRINGS.NEWLINE_AND_TAB,
      tokens: ["\u010a", "\u0109", "\u010a"],
      ids: [32013, 185, 184, 185],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\n\t\n",
    },
    CHINESE_LETTER: {
      text: LLAMA_TEST_STRINGS.CHINESE_LETTER,
      tokens: ["\u00e9\u0137\u0129"],
      ids: [32013, 6759],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\u9547",
    },
    EMOJIS_1: {
      text: LLAMA_TEST_STRINGS.EMOJIS_1,
      tokens: ["\u00f0\u0141", "\u00a6", "\u013b"],
      ids: [32013, 10047, 99, 234],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\ud83e\udd99",
    },
    EMOJIS_2: {
      text: LLAMA_TEST_STRINGS.EMOJIS_2,
      tokens: ["\u00f0\u0141", "\u00a6", "\u013b", "\u00ea", "\u013b", "\u012c"],
      ids: [32013, 10047, 99, 234, 164, 234, 219],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\ud83e\udd99\ua64a",
    },
    EMOJIS_3: {
      text: LLAMA_TEST_STRINGS.EMOJIS_3,
      tokens: ["\u00ea", "\u013b", "\u012c", "\u00f0\u0141", "\u00a6", "\u013b"],
      ids: [32013, 164, 234, 219, 10047, 99, 234],
      decoded: "<\uff5cbegin\u2581of\u2581sentence\uff5c>\ua64a\ud83e\udd99",
    },
    PARAGRAPH: {
      text: LLAMA_TEST_STRINGS.PARAGRAPH,
      tokens: ["The", "\u0120ll", "ama", "\u0120(/", "\u00cb\u012a", "l", "\u00c9", "\u0133", "\u00cb", "\u0132", "m", "\u00c9\u013b", "/", ";", "\u0120\u00f0\u0141", "\u00a6", "\u013b", "Span", "ish", "\u0120pron", "unciation", ":", "\u0120[", "\u00cb\u012a", "\u00ca", "\u0130", "ama", "])", "\u0120(", "L", "ama", "\u0120gl", "ama", ")", "\u0120is", "\u0120a", "\u0120domestic", "ated", "\u0120South", "\u0120American", "\u0120cam", "el", "id", ",", "\u0120widely", "\u0120used", "\u0120as", "\u0120a", "\u0120meat", "\u0120and", "\u0120pack", "\u0120animal", "\u0120by", "\u0120And", "ean", "\u0120cultures", "\u0120since", "\u0120the", "\u0120Pre", "-", "Col", "umb", "ian", "\u0120era", ".", "\u0120L", "lam", "as", "\u0120are", "\u0120social", "\u0120animals", "\u0120and", "\u0120live", "\u0120with", "\u0120others", "\u0120as", "\u0120a", "\u0120her", "d", ".", "\u0120Their", "\u0120wool", "\u0120is", "\u0120soft", "\u0120and", "\u0120contains", "\u0120only", "\u0120a", "\u0120small", "\u0120amount", "\u0120of", "\u0120lan", "ol", "in", ".[", "2", "]", "\u0120L", "lam", "as", "\u0120can", "\u0120learn", "\u0120simple", "\u0120tasks", "\u0120after", "\u0120a", "\u0120few", "\u0120repet", "itions", ".", "\u0120When", "\u0120using", "\u0120a", "\u0120pack", ",", "\u0120they", "\u0120can", "\u0120carry", "\u0120about", "\u0120", "2", "5", "\u0120to", "\u0120", "3", "0", "%", "\u0120of", "\u0120their", "\u0120body", "\u0120weight", "\u0120for", "\u0120", "8", "\u0120to", "\u0120", "1", "3", "\u0120km", "\u0120(", "5", "\u00e2\u0122\u0135", "8", "\u0120miles", ").", "[", "3", "]", "\u0120The", "\u0120name", "\u0120ll", "ama", "\u0120(", "in", "\u0120the", "\u0120past", "\u0120also", "\u0120sp", "elled", '\u0120"', "l", "ama", '"', "\u0120or", '\u0120"', "gl", "ama", '")', "\u0120was", "\u0120adopted", "\u0120by", "\u0120European", "\u0120sett", "lers", "\u0120from", "\u0120native", "\u0120Per", "uv", "ians", ".[", "4", "]", "\u0120The", "\u0120ancest", "ors", "\u0120of", 
"\u0120llam", "as", "\u0120are", "\u0120thought", "\u0120to", "\u0120have", "\u0120origin", "ated", "\u0120from", "\u0120the", "\u0120Great", "\u0120Pl", "ains", "\u0120of", "\u0120North", "\u0120America", "\u0120about", "\u0120", "4", "0", "\u0120million", "\u0120years", "\u0120ago", ",", "\u0120and", "\u0120subsequently", "\u0120mig", "rated", "\u0120to", "\u0120South", "\u0120America", "\u0120about", "\u0120three", "\u0120million", "\u0120years", "\u0120ago", "\u0120during", "\u0120the", "\u0120Great", "\u0120American", "\u0120Inter", "change", ".", "\u0120By", "\u0120the", "\u0120end", "\u0120of", "\u0120the", "\u0120last", "\u0120ice", "\u0120age", "\u0120(", "1", "0", ",", "0", "0", "0", "\u00e2\u0122\u0135", "1", "2", ",", "0", "0", "0", "\u0120years", "\u0120ago", "),", "\u0120cam", "el", "ids", "\u0120were", "\u0120ext", "inct", "\u0120in", "\u0120North", "\u0120America", ".[", "3", "]", "\u0120As", "\u0120of", "\u0120", "2", "0", "0", "7", ",", "\u0120there", "\u0120were", "\u0120over", "\u0120seven", "\u0120million", "\u0120llam", "as", "\u0120and", "\u0120al", "p", "ac", "as", "\u0120in", "\u0120South", "\u0120America", "\u0120and", "\u0120over", "\u0120", "1", "5", "8", ",", "0", "0", "0", "\u0120llam", "as", "\u0120and", "\u0120", "1", "0", "0", ",", "0", "0", "0", "\u00ea", "\u013b", "\u012c", "\u00f0\u0141", "\u00a6", "\u013b", "\u0120al", "p", "ac", "as", ",", "\u0120desc", "ended", "\u0120from", "\u0120pro", "gen", "itors", "\u0120imported", "\u0120late", "\u0120in", "\u0120the", "\u0120", "2", "0", "th", "\u0120century", ",", "\u0120in", "\u0120the", "\u0120United", "\u0120States", "\u0120and", "\u0120Canada", ".[", "5", "]", "\u0120In", "\u0120A", "ym", "ara", "\u0120myth", "ology", ",", "\u0120llam", "as", "\u0120are", "\u0120important", "\u0120beings", ".", "\u0120The", "\u0120Heaven", "ly", "\u0120Ll", "ama", "\u0120is", "\u0120said", "\u0120to", "\u0120drink", "\u0120water", "\u0120from", "\u0120the", "\u0120ocean", "\u0120and", "\u0120ur", 
"in", "ates", "\u0120as", "\u0120it", "\u0120ra", "ins", ".[", "6", "]", "\u0120According", "\u0120to", "\u0120A", "ym", "ara", "\u0120es", "chat", "ology", ",", "\u0120llam", "as", "\u0120will", "\u0120return", "\u0120to", "\u0120the", "\u0120water", "\u0120springs", "\u0120and", "\u0120l", "ago", "ons", "\u0120where", "\u0120they", "\u0120come", "\u0120from", "\u0120at", "\u0120the", "\u0120end", "\u0120of", "\u0120time", ".[", "6", "]"],
      ids: [32013, 546, 1703, 4204, 31905, 31459, 75, 131, 226, 133, 225, 76, 28747, 14, 26, 12394, 99, 234, 20786, 840, 9119, 25307, 25, 821, 31459, 132, 223, 4204, 5589, 334, 43, 4204, 1649, 4204, 8, 317, 245, 13569, 612, 5168, 4115, 4370, 282, 304, 11, 13620, 1219, 372, 245, 12342, 285, 2379, 9542, 457, 1306, 24391, 24783, 1952, 254, 7606, 12, 2608, 4313, 987, 2895, 13, 412, 8265, 281, 417, 3601, 8469, 285, 3516, 365, 3060, 372, 245, 706, 67, 13, 9195, 24547, 317, 2829, 285, 5396, 885, 245, 1752, 3733, 280, 27264, 313, 246, 9469, 17, 60, 412, 8265, 281, 482, 3059, 2966, 9227, 1164, 245, 1853, 15747, 2160, 13, 2463, 1242, 245, 2379, 11, 653, 482, 5642, 782, 207, 17, 20, 276, 207, 18, 15, 4, 280, 699, 3110, 4285, 327, 207, 23, 276, 207, 16, 18, 9004, 334, 20, 887, 23, 6595, 628, 58, 18, 60, 428, 1208, 1703, 4204, 334, 246, 254, 2872, 835, 731, 6679, 440, 75, 4204, 1, 409, 440, 2521, 4204, 2456, 438, 13509, 457, 8717, 6762, 12104, 473, 8118, 3043, 12466, 3091, 9469, 19, 60, 428, 18901, 710, 280, 15410, 281, 417, 2207, 276, 463, 6948, 612, 473, 254, 6984, 2284, 2200, 280, 5216, 6092, 782, 207, 19, 15, 4866, 1547, 4074, 11, 285, 23909, 8290, 9831, 276, 5168, 6092, 782, 1846, 4866, 1547, 4074, 2310, 254, 6984, 4115, 6660, 4865, 13, 3550, 254, 1223, 280, 254, 1554, 9405, 4489, 334, 16, 15, 11, 15, 15, 15, 887, 16, 17, 11, 15, 15, 15, 1547, 4074, 650, 4370, 282, 2929, 773, 1309, 5729, 279, 5216, 6092, 9469, 18, 60, 1725, 280, 207, 17, 15, 15, 22, 11, 741, 773, 851, 7970, 4866, 15410, 281, 285, 360, 79, 305, 281, 279, 5168, 6092, 285, 851, 207, 16, 20, 23, 11, 15, 15, 15, 15410, 281, 285, 207, 16, 15, 15, 11, 15, 15, 15, 164, 234, 219, 10047, 99, 234, 360, 79, 305, 281, 11, 1774, 2611, 473, 381, 4920, 6041, 26357, 5179, 279, 254, 207, 17, 15, 392, 8299, 11, 279, 254, 4783, 5098, 285, 8905, 9469, 20, 60, 680, 338, 1254, 3367, 25157, 2333, 11, 15410, 281, 417, 2364, 22792, 13, 428, 18933, 326, 9140, 4204, 317, 989, 276, 7371, 2345, 473, 254, 15439, 285, 8580, 246, 980, 
372, 359, 1809, 1231, 9469, 21, 60, 10068, 276, 338, 1254, 3367, 707, 24570, 2333, 11, 15410, 281, 540, 967, 276, 254, 2345, 30851, 285, 284, 5980, 875, 1064, 653, 1857, 473, 429, 254, 1223, 280, 761, 9469, 21, 60],
      decoded: '<\uff5cbegin\u2581of\u2581sentence\uff5c>The llama (/\u02c8l\u0251\u02d0m\u0259/; \ud83e\udd99Spanish pronunciation: [\u02c8\u028eama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5\u20138 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000\u201312,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000\ua64a\ud83e\udd99 alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]',
    },
  },
  "Xenova/tamillama_tiny_30m": {
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["\u2581The", "\u2581company", "\u2581was", "\u2581found", "ed", "\u2581in", "\u2581", "2", "0", "1", "6", "."],
      ids: [1, 147, 10984, 139, 949, 78, 198, 31654, 13, 21, 12, 17, 34],
      decoded: "<s> The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["\u2581A", "\n", "'", "ll", "\u2581", "!", "!", "to", "?", "'", "d", "'", "'", "d", "\u2581of", ",", "\u2581can", "'", "t", "."],
      ids: [1, 231, 5, 31, 370, 31654, 31715, 31715, 5140, 31725, 31, 31679, 31, 31, 31679, 251, 35, 645, 31, 31665, 34],
      decoded: "<s> A\n'll !!to?'d''d of, can't.",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["\u2581def", "\u2581main", "(", ")", ":", "\n", "<unk>", "p", "ass"],
      ids: [1, 12849, 17375, 32, 33, 29, 5, 0, 31694, 1917],
      decoded: "<s> def main():\n<unk>pass",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["\u2581let", "\u2581a", "\u2581", "=", "\u2581ob", "j", ".", "to", "St", "ring", "(", ")", ";", "\n", "to", "St", "ring", "(", ")", ";"],
      ids: [1, 1996, 48, 31654, 25, 4083, 31733, 34, 5140, 23417, 6631, 32, 33, 30, 5, 5140, 23417, 6631, 32, 33, 30],
      decoded: "<s> let a = obj.toString();\ntoString();",
    },
    NEWLINES: {
      text: LLAMA_TEST_STRINGS.NEWLINES,
      tokens: ["\u2581ax", "\n", "#", "#", "#", "#", "\n", "boo"],
      ids: [1, 11441, 5, 22, 22, 22, 22, 5, 21260],
      decoded: "<s> ax\n####\nboo",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["\u2581U", "N", "w", "ant", "\u00e9", "d", ",", "r", "un", "ning"],
      ids: [1, 5841, 31748, 31689, 1027, 31771, 31679, 35, 31678, 367, 1855],
      decoded: "<s> UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["\u2581", "1", "<unk>", "2", "<unk>", "3"],
      ids: [1, 31654, 12, 0, 13, 0, 14],
      decoded: "<s> 1<unk>2<unk>3",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["\u2581H", "ello", "\u2581World"],
      ids: [1, 207, 3589, 25544],
      decoded: "<s> Hello World",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u2581", "<unk>"],
      ids: [1, 31654, 0],
      decoded: "<s> <unk>",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u2581", "\u2581", "\u2581", "\u2581leading", "\u2581space"],
      ids: [1, 31654, 31654, 31654, 7951, 7259],
      decoded: "<s>    leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["\u2581tra", "iling", "\u2581space", "\u2581", "\u2581", "\u2581"],
      ids: [1, 2036, 9850, 7259, 31654, 31654, 31654],
      decoded: "<s> trailing space   ",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["\u2581H", "i", "\u2581", "\u2581H", "ello"],
      ids: [1, 207, 31673, 31654, 207, 3589],
      decoded: "<s> Hi  Hello",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["\u2581test", "\u2581", "$", "1", "\u2581R", "2", "\u2581", "#", "3", "\u2581", "\u20ac", "4", "\u2581", "\u00a3", "5", "\u2581", "<unk>", "6", "\u2581", "<unk>", "7", "\u2581", "\u20b9", "8", "\u2581", "<unk>", "9", "\u2581test"],
      ids: [1, 6370, 31654, 9, 12, 947, 13, 31654, 22, 14, 31654, 31746, 15, 31654, 31792, 16, 31654, 0, 17, 31654, 0, 18, 31654, 31999, 19, 31654, 0, 20, 6370],
      decoded: "<s> test $1 R2 #3 \u20ac4 \u00a35 <unk>6 <unk>7 \u20b98 <unk>9 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["\u2581I", "\u2581bought", "\u2581an", "\u2581apple", "\u2581for", "\u2581", "$", "1", ".", "0", "0", "\u2581at", "\u2581the", "\u2581store", "."],
      ids: [1, 320, 4685, 446, 4223, 347, 31654, 9, 12, 34, 21, 21, 586, 70, 2023, 34],
      decoded: "<s> I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["\u2581you", "<unk>", "\u2581", "\u2581"],
      ids: [1, 356, 0, 31654, 31654],
      decoded: "<s> you<unk>  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["\u2581you", "<unk>"],
      ids: [1, 356, 0],
      decoded: "<s> you<unk>",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["\u2581you", "<unk>", "you", "<unk>"],
      ids: [1, 356, 0, 21984, 0],
      decoded: "<s> you<unk>you<unk>",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["\u2581weird", "\u2581", "<unk>", "\u2581edge", "\u2581", "<unk>", "\u2581case"],
      ids: [1, 7865, 31654, 0, 11148, 31654, 0, 10143],
      decoded: "<s> weird <unk> edge <unk> case",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u2581", "\u2581This", "\u2581", "\u2581is", "\u2581", "\u2581a", "\u2581", "\u2581test", "\u2581", "\u2581", "."],
      ids: [1, 31654, 3827, 31654, 344, 31654, 48, 31654, 6370, 31654, 31654, 34],
      decoded: "<s>  This  is  a  test  .",
    },
    POPULAR_EMOJIS: {
      text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
      tokens: ["\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>"],
      ids: [1, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0],
      decoded: "<s> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>",
    },
    MULTIBYTE_EMOJIS: {
      text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
      tokens: ["\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u2581", "<unk>", "\u2581", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>", "\u200d", "<unk>"],
      ids: [1, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31654, 0, 31928, 0, 31654, 0, 31928, 0, 31654, 0, 31928, 0, 31654, 0, 31928, 0, 31928, 0, 31654, 0, 31928, 0, 31928, 0, 31928, 0, 31654, 0, 31928, 0, 31928, 0, 31928, 0, 31654, 0, 31928, 0, 31928, 0, 31654, 0, 31654, 0, 31928, 0, 31928, 0, 31928, 0],
      decoded: "<s> <unk> <unk> <unk> <unk> <unk>\u200d<unk> <unk>\u200d<unk> <unk>\u200d<unk> <unk>\u200d<unk>\u200d<unk> <unk>\u200d<unk>\u200d<unk>\u200d<unk> <unk>\u200d<unk>\u200d<unk>\u200d<unk> <unk>\u200d<unk>\u200d<unk> <unk> <unk>\u200d<unk>\u200d<unk>\u200d<unk>",
    },
    BPE_SCORES_PRIORITY_1: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_1,
      tokens: ["\u2581grabbed"],
      ids: [1, 3618],
      decoded: "<s> grabbed",
    },
    BPE_SCORES_PRIORITY_2: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_2,
      tokens: ["\u2581", "\u2581grabbed"],
      ids: [1, 31654, 3618],
      decoded: "<s>  grabbed",
    },
    BPE_SCORES_PRIORITY_3: {
      text: LLAMA_TEST_STRINGS.BPE_SCORES_PRIORITY_3,
      tokens: ["\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581", "\u2581grabbed"],
      ids: [1, 31654, 31654, 31654, 31654, 31654, 31654, 31654, 31654, 31654, 31654, 31654, 3618],
      decoded: "<s>            grabbed",
    },
    NEWLINE: {
      text: LLAMA_TEST_STRINGS.NEWLINE,
      tokens: ["\u2581", "\n"],
      ids: [1, 31654, 5],
      decoded: "<s> \n",
    },
    NEWLINE_WITH_LEADING_SPACE: {
      text: LLAMA_TEST_STRINGS.NEWLINE_WITH_LEADING_SPACE,
      tokens: ["\u2581", "\u2581", "\n"],
      ids: [1, 31654, 31654, 5],
      decoded: "<s>  \n",
    },
    TABS: {
      text: LLAMA_TEST_STRINGS.TABS,
      tokens: ["\u2581", "<unk>", "t", "ab", "s", "<unk>", "out", "\u2581here"],
      ids: [1, 31654, 0, 31665, 878, 31675, 0, 415, 3278],
      decoded: "<s> <unk>tabs<unk>out here",
    },
    NEWLINE_AND_TAB: {
      text: LLAMA_TEST_STRINGS.NEWLINE_AND_TAB,
      tokens: ["\u2581", "\n", "<unk>", "\n"],
      ids: [1, 31654, 5, 0, 5],
      decoded: "<s> \n<unk>\n",
    },
    CHINESE_LETTER: {
      text: LLAMA_TEST_STRINGS.CHINESE_LETTER,
      tokens: ["\u2581", "<unk>"],
      ids: [1, 31654, 0],
      decoded: "<s> <unk>",
    },
    EMOJIS_1: {
      text: LLAMA_TEST_STRINGS.EMOJIS_1,
      tokens: ["\u2581", "<unk>"],
      ids: [1, 31654, 0],
      decoded: "<s> <unk>",
    },
    EMOJIS_2: {
      text: LLAMA_TEST_STRINGS.EMOJIS_2,
      tokens: ["\u2581", "<unk>"],
      ids: [1, 31654, 0],
      decoded: "<s> <unk>",
    },
    EMOJIS_3: {
      text: LLAMA_TEST_STRINGS.EMOJIS_3,
      tokens: ["\u2581", "<unk>"],
      ids: [1, 31654, 0],
      decoded: "<s> <unk>",
    },
    PARAGRAPH: {
      text: LLAMA_TEST_STRINGS.PARAGRAPH,
      tokens: ["\u2581The", "\u2581l", "l", "ama", "\u2581", "(", "/", "\u02c8", "l", "\u0251", "\u02d0", "m", "\u0259", "/", ";", "\u2581", "<unk>", "Sp", "an", "ish", "\u2581pr", "on", "un", "ci", "ation", ":", "\u2581", "[", "\u02c8", "<unk>", "ama", "]", ")", "\u2581", "(", "L", "ama", "\u2581gl", "ama", ")", "\u2581is", "\u2581a", "\u2581d", "om", "est", "ic", "ated", "\u2581South", "\u2581American", "\u2581cam", "el", "id", ",", "\u2581wid", "ely", "\u2581used", "\u2581as", "\u2581a", "\u2581meat", "\u2581and", "\u2581pack", "\u2581animal", "\u2581by", "\u2581And", "e", "an", "\u2581c", "ult", "ures", "\u2581since", "\u2581the", "\u2581P", "re", "-", "C", "ol", "umb", "ian", "\u2581", "era", ".", "\u2581L", "l", "am", "as", "\u2581are", "\u2581social", "\u2581animals", "\u2581and", "\u2581live", "\u2581with", "\u2581others", "\u2581as", "\u2581a", "\u2581her", "d", ".", "\u2581Their", "\u2581wool", "\u2581is", "\u2581soft", "\u2581and", "\u2581contains", "\u2581only", "\u2581a", "\u2581small", "\u2581amount", "\u2581of", "\u2581l", "an", "ol", "in", ".", "[", "2", "]", "\u2581L", "l", "am", "as", "\u2581can", "\u2581learn", "\u2581simple", "\u2581tasks", "\u2581after", "\u2581a", "\u2581few", "\u2581rep", "et", "itions", ".", "\u2581When", "\u2581using", "\u2581a", "\u2581pack", ",", "\u2581they", "\u2581can", "\u2581carry", "\u2581about", "\u2581", "2", "5", "\u2581to", "\u2581", "3", "0", "%", "\u2581of", "\u2581their", "\u2581body", "\u2581weight", "\u2581for", "\u2581", "8", "\u2581to", "\u2581", "1", "3", "\u2581km", "\u2581", "(", "5", "\u2013", "8", "\u2581miles", ")", ".", "[", "3", "]", "\u2581The", "\u2581name", "\u2581l", "l", "ama", "\u2581", "(", "in", "\u2581the", "\u2581past", "\u2581also", "\u2581spell", "ed", '\u2581"', "l", "ama", '"', "\u2581or", '\u2581"', "gl", "ama", '"', ")", "\u2581was", "\u2581adop", "ted", "\u2581by", "\u2581E", "urope", "an", "\u2581sett", "l", "ers", "\u2581from", "\u2581n", "ative", "\u2581Per", "u", "v", "ians", 
".", "[", "4", "]", "\u2581The", "\u2581an", "c", "est", "ors", "\u2581of", "\u2581l", "l", "am", "as", "\u2581are", "\u2581thought", "\u2581to", "\u2581have", "\u2581origin", "ated", "\u2581from", "\u2581the", "\u2581Great", "\u2581Pl", "ain", "s", "\u2581of", "\u2581North", "\u2581America", "\u2581about", "\u2581", "4", "0", "\u2581million", "\u2581years", "\u2581ago", ",", "\u2581and", "\u2581sub", "sequ", "ently", "\u2581m", "ig", "r", "ated", "\u2581to", "\u2581South", "\u2581America", "\u2581about", "\u2581three", "\u2581million", "\u2581years", "\u2581ago", "\u2581during", "\u2581the", "\u2581Great", "\u2581American", "\u2581Int", "er", "ch", "ange", ".", "\u2581By", "\u2581the", "\u2581end", "\u2581of", "\u2581the", "\u2581last", "\u2581ice", "\u2581age", "\u2581", "(", "1", "0", ",", "0", "0", "0", "\u2013", "1", "2", ",", "0", "0", "0", "\u2581years", "\u2581ago", ")", ",", "\u2581cam", "el", "ids", "\u2581were", "\u2581ext", "inct", "\u2581in", "\u2581North", "\u2581America", ".", "[", "3", "]", "\u2581As", "\u2581of", "\u2581", "2", "0", "0", "7", ",", "\u2581there", "\u2581were", "\u2581over", "\u2581seven", "\u2581million", "\u2581l", "l", "am", "as", "\u2581and", "\u2581al", "p", "ac", "as", "\u2581in", "\u2581South", "\u2581America", "\u2581and", "\u2581over", "\u2581", "1", "5", "8", ",", "0", "0", "0", "\u2581l", "l", "am", "as", "\u2581and", "\u2581", "1", "0", "0", ",", "0", "0", "0", "<unk>", "\u2581al", "p", "ac", "as", ",", "\u2581des", "ce", "nd", "ed", "\u2581from", "\u2581pro", "gen", "it", "ors", "\u2581import", "ed", "\u2581late", "\u2581in", "\u2581the", "\u2581", "2", "0", "th", "\u2581cent", "ury", ",", "\u2581in", "\u2581the", "\u2581United", "\u2581States", "\u2581and", "\u2581Can", "ada", ".", "[", "5", "]", "\u2581In", "\u2581A", "ym", "ara", "\u2581my", "th", "ology", ",", "\u2581l", "l", "am", "as", "\u2581are", "\u2581important", "\u2581be", "ings", ".", "\u2581The", "\u2581He", "aven", "ly", "\u2581L", "l", "ama", "\u2581is", 
"\u2581said", "\u2581to", "\u2581drink", "\u2581water", "\u2581from", "\u2581the", "\u2581ocean", "\u2581and", "\u2581ur", "in", "ates", "\u2581as", "\u2581it", "\u2581rains", ".", "[", "6", "]", "\u2581Acc", "ord", "ing", "\u2581to", "\u2581A", "ym", "ara", "\u2581es", "ch", "at", "ology", ",", "\u2581l", "l", "am", "as", "\u2581will", "\u2581return", "\u2581to", "\u2581the", "\u2581water", "\u2581spr", "ings", "\u2581and", "\u2581l", "ag", "oons", "\u2581where", "\u2581they", "\u2581come", "\u2581from", "\u2581at", "\u2581the", "\u2581end", "\u2581of", "\u2581time", ".", "[", "6", "]"],
      ids: [1, 147, 105, 31683, 4464, 31654, 32, 31753, 31774, 31683, 31813, 31779, 31687, 31781, 31753, 30, 31654, 0, 30106, 142, 531, 1823, 111, 367, 8762, 633, 29, 31654, 31778, 31774, 0, 4464, 31780, 33, 31654, 32, 31717, 4464, 1861, 4464, 33, 344, 48, 108, 120, 504, 515, 3062, 29052, 18424, 8829, 256, 153, 35, 20517, 2001, 2680, 488, 48, 9910, 83, 4314, 1448, 1015, 1736, 31660, 142, 103, 3441, 605, 13397, 70, 1629, 86, 7, 31739, 819, 4618, 1685, 31654, 7129, 34, 218, 31683, 235, 691, 617, 23632, 1707, 83, 5860, 249, 2905, 488, 48, 192, 31679, 34, 5290, 11964, 344, 3077, 83, 12959, 2859, 48, 1388, 7238, 251, 105, 142, 819, 81, 34, 31778, 13, 31780, 218, 31683, 235, 691, 645, 907, 16188, 22936, 1609, 48, 4505, 4706, 183, 29049, 34, 1354, 5247, 48, 4314, 35, 338, 645, 4923, 1096, 31654, 13, 16, 84, 31654, 14, 21, 10, 251, 626, 6011, 9152, 347, 31654, 19, 84, 31654, 12, 14, 29496, 31654, 32, 16, 31760, 19, 7843, 33, 34, 31778, 14, 31780, 147, 3516, 105, 31683, 4464, 31654, 32, 81, 70, 4829, 2320, 9948, 78, 245, 31683, 4464, 31690, 1187, 245, 686, 4464, 31690, 33, 139, 25228, 2490, 1015, 465, 25799, 142, 16405, 31683, 983, 825, 152, 12724, 24466, 31688, 31711, 26361, 34, 31778, 15, 31780, 147, 446, 31692, 504, 4166, 251, 105, 31683, 235, 691, 617, 1302, 84, 649, 7206, 3062, 825, 70, 27718, 12966, 588, 31675, 251, 26698, 27393, 1096, 31654, 15, 21, 23109, 3514, 17246, 35, 83, 5097, 17541, 19560, 114, 258, 31678, 3062, 84, 29052, 27393, 1096, 2765, 23109, 3514, 17246, 5823, 70, 27718, 18424, 25473, 98, 345, 3292, 34, 15498, 70, 1645, 251, 70, 6103, 2802, 13463, 31654, 32, 12, 21, 35, 21, 21, 21, 31760, 12, 13, 35, 21, 21, 21, 3514, 17246, 33, 35, 8829, 256, 16185, 579, 7522, 21465, 198, 26698, 27393, 34, 31778, 14, 31780, 1822, 251, 31654, 13, 21, 21, 18, 35, 478, 579, 1407, 20358, 23109, 105, 31683, 235, 691, 83, 789, 31694, 1324, 691, 198, 29052, 27393, 83, 1407, 31654, 12, 16, 19, 35, 21, 21, 21, 105, 31683, 235, 691, 83, 31654, 12, 21, 21, 35, 21, 21, 21, 0, 789, 
31694, 1324, 691, 35, 3601, 215, 65, 78, 825, 2482, 8170, 93, 4166, 1777, 78, 5359, 198, 70, 31654, 13, 21, 1671, 11823, 11325, 35, 198, 70, 17562, 18843, 83, 3226, 19507, 34, 31778, 16, 31780, 2266, 231, 10586, 1362, 1286, 1671, 25316, 35, 105, 31683, 235, 691, 617, 2288, 233, 826, 34, 147, 264, 21794, 321, 218, 31683, 4464, 344, 309, 84, 4057, 1357, 825, 70, 5187, 83, 9947, 81, 4897, 488, 182, 24761, 34, 31778, 17, 31780, 28616, 4173, 127, 84, 231, 10586, 1362, 4469, 345, 122, 25316, 35, 105, 31683, 235, 691, 1214, 3520, 84, 70, 1357, 12312, 826, 83, 105, 762, 31431, 1930, 338, 1909, 825, 586, 70, 1645, 251, 470, 34, 31778, 17, 31780],
      decoded: '<s> The llama (/\u02c8l\u0251\u02d0m\u0259/; <unk>Spanish pronunciation: [\u02c8<unk>ama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5\u20138 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000\u201312,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000<unk> alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]',
    },
  },
};

// Timeout argument handed to every `it(...)` registered by CUSTOM_TESTS.
// NOTE(review): presumably milliseconds (Jest-style runner) — confirm against the test runner in use.
const MAX_EXECUTION_TIME = 10 * 1_000;
/**
 * Registers the "hard-coded" Llama tokenizer regression suite.
 *
 * Each tokenizer name listed below gets exactly one `it(...)` case (with
 * `MAX_EXECUTION_TIME` as its timeout argument). Inside that case every scenario
 * loads the tokenizer with the scenario's `legacy` option, optionally applies an
 * `override` hook, encodes each input with `add_special_tokens: false`, and
 * compares the resulting ids with the pinned expectation. Scenarios flagged
 * `reversible` additionally check that decoding the ids yields the input text.
 *
 * The expectations are pinned on purpose: the correct tokenization must be
 * returned no matter what, since there are sometimes bugs in the transformers
 * library.
 */
export const CUSTOM_TESTS = () => {
  describe("hard-coded", () => {
    // Scenarios used verbatim by both "Xenova/llama-tokenizer_new" and
    // "Xenova/llama2-tokenizer" (the very same array is referenced twice).
    const NON_LEGACY_SCENARIOS = [
      {
        // legacy=false
        legacy: null,
        reversible: true,
        data: {
          " </s> 1  2   3    4   ": [259, 2, 29871, 29896, 259, 29906, 1678, 29941, 268, 29946, 1678],
          "<s>\n": [1, 13],
          "</s>test</s>": [2, 1688, 2],
          " </s> test </s> ": [259, 2, 1243, 29871, 2, 29871],
          "A\n'll": [319, 13, 29915, 645],
          "Hey </s>. how are you": [18637, 29871, 2, 29889, 920, 526, 366],
          "  Hi  Hello  ": [259, 6324, 29871, 15043, 259],
        },
      },
      {
        // legacy=true forced: the ids are incorrect, but must match legacy behaviour
        legacy: true,
        data: {
          "<s>\n": [1, 29871, 13],
        },
      },
    ];

    const TESTS = {
      // Legacy compatibility checks
      "Xenova/llama-tokenizer": [
        {
          // legacy left unset => treated as legacy=true
          // NOTE: While incorrect, it is necessary to match legacy behaviour
          legacy: null,
          data: {
            "<s>\n": [1, 29871, 13],
          },
        },
        {
          // legacy=true forced (same ids as the unset case)
          legacy: true,
          data: {
            "<s>\n": [1, 29871, 13],
          },
        },
        {
          // legacy=false forced (fixed ids)
          legacy: false,
          data: {
            "<s>\n": [1, 13],
          },
        },
      ],

      "Xenova/llama-tokenizer_new": NON_LEGACY_SCENARIOS,

      // New serialization format (tokenizers >= 0.20.0):
      // BPE merges are stored as [string, string][] rather than string[]
      "Xenova/Llama-3.2-Tokenizer": [
        {
          reversible: true,
          data: {
            "hello world": [15339, 1917],
            " belirtilen": [120909],
          },
        },
        {
          // Same inputs with ignore_merges switched off on the loaded model
          reversible: true,
          override: (tokenizer) => {
            tokenizer.model.ignore_merges = false;
          },
          data: {
            "hello world": [15339, 1917],
            " belirtilen": [101664, 1678, 268],
          },
        },
      ],

      // The llama2 tokenizer re-uses the llama-tokenizer_new scenarios
      "Xenova/llama2-tokenizer": NON_LEGACY_SCENARIOS,
    };

    // Executes a single scenario: load, optional override, then per-input checks.
    const checkScenario = async (tokenizerName, scenario) => {
      const { data, reversible, legacy, override } = scenario;

      const tokenizer = await LlamaTokenizer.from_pretrained(tokenizerName, { legacy });
      if (override) {
        override(tokenizer);
      }

      Object.keys(data).forEach((text) => {
        const token_ids = tokenizer.encode(text, { add_special_tokens: false });
        expect(token_ids).toEqual(data[text]);

        if (!reversible) {
          return;
        }

        // Reversible scenarios must decode back to the original input
        const roundTrip = tokenizer.decode(token_ids);
        expect(roundTrip).toEqual(text);
      });
    };

    Object.entries(TESTS).forEach(([tokenizerName, scenarios]) => {
      it(
        tokenizerName,
        async () => {
          // Scenarios run strictly one after another
          for (const scenario of scenarios) {
            await checkScenario(tokenizerName, scenario);
          }
        },
        MAX_EXECUTION_TIME,
      );
    });
  });
};