rubert-base-collection3-ner-coreml / tokenizer_fixtures.json
smkrv's picture
fp16 Core ML conversion: ruBert-base Collection3 NER (PER/ORG/LOC), parity 99.98% tokens
0fe9ca9 verified
Raw
History Blame Contribute Delete
12.1 kB
[
{
"text": "Добрый день, меня зовут Колесникова Мария, чем могу помочь?",
"ids": [
48311,
378,
1336,
121,
1024,
10160,
23759,
9511,
14982,
390,
121,
1009,
1385,
4881,
161
],
"tokens": [
"добры",
"##и",
"день",
",",
"меня",
"зовут",
"колес",
"##никова",
"мари",
"##я",
",",
"чем",
"могу",
"помочь",
"?"
],
"offsets": [
[
0,
5
],
[
5,
6
],
[
7,
11
],
[
11,
12
],
[
13,
17
],
[
18,
23
],
[
24,
29
],
[
29,
35
],
[
36,
40
],
[
40,
41
],
[
41,
42
],
[
43,
46
],
[
47,
51
],
[
52,
58
],
[
58,
59
]
]
},
{
"text": "Счёт выставлен на ООО \"ПК МОРОШКА ДВ\", оплата ожидается до пятницы.",
"ids": [
3189,
65269,
660,
83963,
152,
117,
385,
10867,
1780,
116,
384,
152,
121,
45616,
11114,
708,
39716,
126
],
"tokens": [
"счет",
"выставлен",
"на",
"ооо",
"\"",
"п",
"##к",
"моро",
"##шка",
"д",
"##в",
"\"",
",",
"оплата",
"ожидается",
"до",
"пятницы",
"."
],
"offsets": [
[
0,
4
],
[
5,
14
],
[
15,
17
],
[
18,
21
],
[
22,
23
],
[
23,
24
],
[
24,
25
],
[
26,
30
],
[
30,
33
],
[
34,
35
],
[
35,
36
],
[
36,
37
],
[
37,
38
],
[
39,
45
],
[
46,
55
],
[
56,
58
],
[
59,
66
],
[
66,
67
]
]
},
{
"text": "Мы подключили Game Park (МагазинВидеоИгр ООО) к облачной кассе.",
"ids": [
945,
100208,
108282,
41337,
486,
160,
10001,
79511,
378,
12980,
83963,
158,
114,
87079,
378,
54650,
126
],
"tokens": [
"мы",
"подключили",
"game",
"par",
"##k",
"(",
"магазин",
"##видео",
"##и",
"##гр",
"ооо",
")",
"к",
"облачно",
"##и",
"кассе",
"."
],
"offsets": [
[
0,
2
],
[
3,
13
],
[
14,
18
],
[
19,
22
],
[
22,
23
],
[
24,
25
],
[
25,
32
],
[
32,
37
],
[
37,
38
],
[
38,
40
],
[
41,
44
],
[
44,
45
],
[
46,
47
],
[
48,
55
],
[
55,
56
],
[
57,
62
],
[
62,
63
]
]
},
{
"text": "ИП Мкртчян Елена Тухаировна, ИНН 772456789012, телефон +7 (912) 345-67-89.",
"ids": [
90161,
119309,
1246,
398,
1731,
13475,
669,
92674,
377,
27009,
10239,
669,
121,
880,
379,
9444,
9968,
44295,
38383,
19359,
12626,
121,
5431,
250,
182,
160,
14013,
420,
158,
71951,
133,
9807,
133,
12447,
126
],
"tokens": [
"ип",
"мк",
"##рт",
"##ч",
"##ян",
"еле",
"##на",
"тух",
"##а",
"##ир",
"##ов",
"##на",
",",
"ин",
"##н",
"77",
"##24",
"##56",
"##78",
"##90",
"##12",
",",
"телефон",
"+",
"7",
"(",
"91",
"##2",
")",
"345",
"-",
"67",
"-",
"89",
"."
],
"offsets": [
[
0,
2
],
[
3,
5
],
[
5,
7
],
[
7,
8
],
[
8,
10
],
[
11,
14
],
[
14,
16
],
[
17,
20
],
[
20,
21
],
[
21,
23
],
[
23,
25
],
[
25,
27
],
[
27,
28
],
[
29,
31
],
[
31,
32
],
[
33,
35
],
[
35,
37
],
[
37,
39
],
[
39,
41
],
[
41,
43
],
[
43,
45
],
[
45,
46
],
[
47,
54
],
[
55,
56
],
[
56,
57
],
[
58,
59
],
[
59,
61
],
[
61,
62
],
[
62,
63
],
[
64,
67
],
[
67,
68
],
[
68,
70
],
[
70,
71
],
[
71,
73
],
[
73,
74
]
]
},
{
"text": "Адрес: 193312, г. Санкт-Петербург, пр-кт Солидарности, д. 19, кв. 82.",
"ids": [
6546,
162,
14785,
12626,
121,
122,
126,
64100,
133,
9663,
121,
12417,
133,
114,
380,
36274,
121,
116,
126,
855,
121,
10650,
126,
11609,
126
],
"tokens": [
"адрес",
":",
"1933",
"##12",
",",
"г",
".",
"санкт",
"-",
"петербург",
",",
"пр",
"-",
"к",
"##т",
"солидарности",
",",
"д",
".",
"19",
",",
"кв",
".",
"82",
"."
],
"offsets": [
[
0,
5
],
[
5,
6
],
[
7,
11
],
[
11,
13
],
[
13,
14
],
[
15,
16
],
[
16,
17
],
[
18,
23
],
[
23,
24
],
[
24,
33
],
[
33,
34
],
[
35,
37
],
[
37,
38
],
[
38,
39
],
[
39,
40
],
[
41,
53
],
[
53,
54
],
[
55,
56
],
[
56,
57
],
[
58,
60
],
[
60,
61
],
[
62,
64
],
[
64,
65
],
[
66,
68
],
[
68,
69
]
]
},
{
"text": "ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ «СМАЙЛ ДЕНТ»",
"ids": [
7626,
110,
103859,
378,
49222,
151,
6798,
1181,
19871,
380,
150
],
"tokens": [
"общество",
"с",
"ограниченно",
"##и",
"ответственностью",
"«",
"сма",
"##ил",
"ден",
"##т",
"»"
],
"offsets": [
[
0,
8
],
[
9,
10
],
[
11,
22
],
[
22,
23
],
[
24,
40
],
[
41,
42
],
[
42,
45
],
[
45,
47
],
[
48,
51
],
[
51,
52
],
[
52,
53
]
]
},
{
"text": "передайте марии ивановне что договор готов",
"ids": [
1568,
42699,
14982,
378,
104691,
823,
691,
693,
7737,
3075
],
"tokens": [
"переда",
"##ите",
"мари",
"##и",
"ива",
"##нов",
"##не",
"что",
"договор",
"готов"
],
"offsets": [
[
0,
6
],
[
6,
9
],
[
10,
14
],
[
14,
15
],
[
16,
19
],
[
19,
22
],
[
22,
24
],
[
25,
28
],
[
29,
36
],
[
37,
42
]
]
},
{
"text": "Сёмга, ёжик и Алёна Васильёва-Кузнецова приехали в Орехово-Зуево.",
"ids": [
1544,
741,
121,
76994,
385,
107,
29336,
669,
100295,
8983,
377,
133,
59637,
97208,
12166,
113,
64353,
375,
133,
4263,
19128,
126
],
"tokens": [
"сем",
"##га",
",",
"ежи",
"##к",
"и",
"але",
"##на",
"васи",
"##льев",
"##а",
"-",
"кузнец",
"##ова",
"приехали",
"в",
"орехов",
"##о",
"-",
"зу",
"##ево",
"."
],
"offsets": [
[
0,
3
],
[
3,
5
],
[
5,
6
],
[
7,
10
],
[
10,
11
],
[
12,
13
],
[
14,
17
],
[
17,
19
],
[
20,
24
],
[
24,
28
],
[
28,
29
],
[
29,
30
],
[
30,
36
],
[
36,
39
],
[
40,
48
],
[
49,
50
],
[
51,
57
],
[
57,
58
],
[
58,
59
],
[
59,
61
],
[
61,
64
],
[
64,
65
]
]
},
{
"text": "email info@servisspb.ru и сайт http://servisspb.ru",
"ids": [
43096,
7364,
27046,
270,
83848,
13645,
472,
483,
126,
4551,
107,
798,
1301,
10843,
162,
197,
197,
83848,
13645,
472,
483,
126,
4551
],
"tokens": [
"em",
"##ail",
"info",
"@",
"serv",
"##iss",
"##p",
"##b",
".",
"ru",
"и",
"са",
"##ит",
"http",
":",
"/",
"/",
"serv",
"##iss",
"##p",
"##b",
".",
"ru"
],
"offsets": [
[
0,
2
],
[
2,
5
],
[
6,
10
],
[
10,
11
],
[
11,
15
],
[
15,
18
],
[
18,
19
],
[
19,
20
],
[
20,
21
],
[
21,
23
],
[
24,
25
],
[
26,
28
],
[
28,
30
],
[
31,
35
],
[
35,
36
],
[
36,
37
],
[
37,
38
],
[
38,
42
],
[
42,
45
],
[
45,
46
],
[
46,
47
],
[
47,
48
],
[
48,
50
]
]
},
{
"text": "Hello John Smith from New York, card 4276 1600 1234 5678.",
"ids": [
67124,
70471,
263,
30374,
31269,
10834,
18801,
52976,
207,
11267,
121,
81111,
90621,
451,
34196,
118430,
7889,
38383,
126
],
"tokens": [
"hel",
"##lo",
"j",
"##ohn",
"sm",
"##ith",
"from",
"new",
"y",
"##ork",
",",
"card",
"427",
"##6",
"1600",
"1234",
"56",
"##78",
"."
],
"offsets": [
[
0,
3
],
[
3,
5
],
[
6,
7
],
[
7,
10
],
[
11,
13
],
[
13,
16
],
[
17,
21
],
[
22,
25
],
[
26,
27
],
[
27,
30
],
[
30,
31
],
[
32,
36
],
[
37,
40
],
[
40,
41
],
[
42,
46
],
[
47,
51
],
[
52,
54
],
[
54,
56
],
[
56,
57
]
]
},
{
"text": "строка с    несколькими   пробелами и\tтабом",
"ids": [
63594,
110,
14010,
32813,
14236,
107,
22360,
6536
],
"tokens": [
"строка",
"с",
"несколькими",
"пробел",
"##ами",
"и",
"таб",
"##ом"
],
"offsets": [
[
0,
6
],
[
7,
8
],
[
12,
23
],
[
26,
32
],
[
32,
35
],
[
36,
37
],
[
38,
41
],
[
41,
43
]
]
},
{
"text": "深圳 unknown glyphs здесь",
"ids": [
100,
100,
16832,
93384,
20848,
45344,
71055,
466,
454,
1640
],
"tokens": [
"[UNK]",
"[UNK]",
"un",
"##kn",
"##own",
"gl",
"##yp",
"##h",
"##s",
"здесь"
],
"offsets": [
[
0,
1
],
[
1,
2
],
[
3,
5
],
[
5,
7
],
[
7,
10
],
[
11,
13
],
[
13,
15
],
[
15,
16
],
[
16,
17
],
[
18,
23
]
]
},
{
"text": "",
"ids": [],
"tokens": [],
"offsets": []
},
{
"text": "а",
"ids": [
106
],
"tokens": [
"а"
],
"offsets": [
[
0,
1
]
]
}
]