alikirec committed on
Commit
0f04f61
·
verified ·
1 Parent(s): 94932bd

Upload processor

Browse files
added_tokens.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<end_of_utterance>": 49279,
3
+ "<fake_token_around_image>": 49189,
4
+ "<global-img>": 49152,
5
+ "<image>": 49190,
6
+ "<row_1_col_1>": 49153,
7
+ "<row_1_col_2>": 49154,
8
+ "<row_1_col_3>": 49155,
9
+ "<row_1_col_4>": 49156,
10
+ "<row_1_col_5>": 49157,
11
+ "<row_1_col_6>": 49158,
12
+ "<row_2_col_1>": 49159,
13
+ "<row_2_col_2>": 49160,
14
+ "<row_2_col_3>": 49161,
15
+ "<row_2_col_4>": 49162,
16
+ "<row_2_col_5>": 49163,
17
+ "<row_2_col_6>": 49164,
18
+ "<row_3_col_1>": 49165,
19
+ "<row_3_col_2>": 49166,
20
+ "<row_3_col_3>": 49167,
21
+ "<row_3_col_4>": 49168,
22
+ "<row_3_col_5>": 49169,
23
+ "<row_3_col_6>": 49170,
24
+ "<row_4_col_1>": 49171,
25
+ "<row_4_col_2>": 49172,
26
+ "<row_4_col_3>": 49173,
27
+ "<row_4_col_4>": 49174,
28
+ "<row_4_col_5>": 49175,
29
+ "<row_4_col_6>": 49176,
30
+ "<row_5_col_1>": 49177,
31
+ "<row_5_col_2>": 49178,
32
+ "<row_5_col_3>": 49179,
33
+ "<row_5_col_4>": 49180,
34
+ "<row_5_col_5>": 49181,
35
+ "<row_5_col_6>": 49182,
36
+ "<row_6_col_1>": 49183,
37
+ "<row_6_col_2>": 49184,
38
+ "<row_6_col_3>": 49185,
39
+ "<row_6_col_4>": 49186,
40
+ "<row_6_col_5>": 49187,
41
+ "<row_6_col_6>": 49188,
42
+ "<|reserved_special_token_0|>": 49191,
43
+ "<|reserved_special_token_10|>": 49201,
44
+ "<|reserved_special_token_11|>": 49202,
45
+ "<|reserved_special_token_12|>": 49203,
46
+ "<|reserved_special_token_13|>": 49204,
47
+ "<|reserved_special_token_14|>": 49205,
48
+ "<|reserved_special_token_15|>": 49206,
49
+ "<|reserved_special_token_16|>": 49207,
50
+ "<|reserved_special_token_17|>": 49208,
51
+ "<|reserved_special_token_18|>": 49209,
52
+ "<|reserved_special_token_19|>": 49210,
53
+ "<|reserved_special_token_1|>": 49192,
54
+ "<|reserved_special_token_20|>": 49211,
55
+ "<|reserved_special_token_21|>": 49212,
56
+ "<|reserved_special_token_22|>": 49213,
57
+ "<|reserved_special_token_23|>": 49214,
58
+ "<|reserved_special_token_24|>": 49215,
59
+ "<|reserved_special_token_25|>": 49216,
60
+ "<|reserved_special_token_26|>": 49217,
61
+ "<|reserved_special_token_27|>": 49218,
62
+ "<|reserved_special_token_28|>": 49219,
63
+ "<|reserved_special_token_29|>": 49220,
64
+ "<|reserved_special_token_2|>": 49193,
65
+ "<|reserved_special_token_30|>": 49221,
66
+ "<|reserved_special_token_31|>": 49222,
67
+ "<|reserved_special_token_32|>": 49223,
68
+ "<|reserved_special_token_33|>": 49224,
69
+ "<|reserved_special_token_34|>": 49225,
70
+ "<|reserved_special_token_35|>": 49226,
71
+ "<|reserved_special_token_36|>": 49227,
72
+ "<|reserved_special_token_37|>": 49228,
73
+ "<|reserved_special_token_38|>": 49229,
74
+ "<|reserved_special_token_39|>": 49230,
75
+ "<|reserved_special_token_3|>": 49194,
76
+ "<|reserved_special_token_40|>": 49231,
77
+ "<|reserved_special_token_41|>": 49232,
78
+ "<|reserved_special_token_42|>": 49233,
79
+ "<|reserved_special_token_43|>": 49234,
80
+ "<|reserved_special_token_44|>": 49235,
81
+ "<|reserved_special_token_45|>": 49236,
82
+ "<|reserved_special_token_46|>": 49237,
83
+ "<|reserved_special_token_47|>": 49238,
84
+ "<|reserved_special_token_48|>": 49239,
85
+ "<|reserved_special_token_49|>": 49240,
86
+ "<|reserved_special_token_4|>": 49195,
87
+ "<|reserved_special_token_50|>": 49241,
88
+ "<|reserved_special_token_51|>": 49242,
89
+ "<|reserved_special_token_52|>": 49243,
90
+ "<|reserved_special_token_53|>": 49244,
91
+ "<|reserved_special_token_54|>": 49245,
92
+ "<|reserved_special_token_55|>": 49246,
93
+ "<|reserved_special_token_56|>": 49247,
94
+ "<|reserved_special_token_57|>": 49248,
95
+ "<|reserved_special_token_58|>": 49249,
96
+ "<|reserved_special_token_59|>": 49250,
97
+ "<|reserved_special_token_5|>": 49196,
98
+ "<|reserved_special_token_60|>": 49251,
99
+ "<|reserved_special_token_61|>": 49252,
100
+ "<|reserved_special_token_62|>": 49253,
101
+ "<|reserved_special_token_63|>": 49254,
102
+ "<|reserved_special_token_64|>": 49255,
103
+ "<|reserved_special_token_65|>": 49256,
104
+ "<|reserved_special_token_66|>": 49257,
105
+ "<|reserved_special_token_67|>": 49258,
106
+ "<|reserved_special_token_68|>": 49259,
107
+ "<|reserved_special_token_69|>": 49260,
108
+ "<|reserved_special_token_6|>": 49197,
109
+ "<|reserved_special_token_70|>": 49261,
110
+ "<|reserved_special_token_71|>": 49262,
111
+ "<|reserved_special_token_72|>": 49263,
112
+ "<|reserved_special_token_73|>": 49264,
113
+ "<|reserved_special_token_74|>": 49265,
114
+ "<|reserved_special_token_75|>": 49266,
115
+ "<|reserved_special_token_76|>": 49267,
116
+ "<|reserved_special_token_77|>": 49268,
117
+ "<|reserved_special_token_78|>": 49269,
118
+ "<|reserved_special_token_79|>": 49270,
119
+ "<|reserved_special_token_7|>": 49198,
120
+ "<|reserved_special_token_80|>": 49271,
121
+ "<|reserved_special_token_81|>": 49272,
122
+ "<|reserved_special_token_82|>": 49273,
123
+ "<|reserved_special_token_83|>": 49274,
124
+ "<|reserved_special_token_84|>": 49275,
125
+ "<|reserved_special_token_85|>": 49276,
126
+ "<|reserved_special_token_86|>": 49277,
127
+ "<|reserved_special_token_87|>": 49278,
128
+ "<|reserved_special_token_8|>": 49199,
129
+ "<|reserved_special_token_9|>": 49200
130
+ }
chat_template.json CHANGED
@@ -1,3 +1,3 @@
1
  {
2
  "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
3
- }
 
1
  {
2
  "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
3
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -25,4 +25,4 @@
25
  "size": {
26
  "longest_edge": 2048
27
  }
28
- }
 
25
  "size": {
26
  "longest_edge": 2048
27
  }
28
+ }
processor_config.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
  "image_seq_len": 64,
3
  "processor_class": "Idefics3Processor"
4
- }
 
1
  {
2
  "image_seq_len": 64,
3
  "processor_class": "Idefics3Processor"
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<fake_token_around_image>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<image>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<end_of_utterance>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ ],
25
+ "bos_token": {
26
+ "content": "<|im_start|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "eos_token": {
33
+ "content": "<end_of_utterance>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<|im_end|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "unk_token": {
47
+ "content": "<|endoftext|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ }
53
+ }
tokenizer_config.json CHANGED
@@ -1171,6 +1171,7 @@
1171
  "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
1172
  "clean_up_tokenization_spaces": false,
1173
  "eos_token": "<end_of_utterance>",
 
1174
  "legacy": false,
1175
  "model_max_length": 8192,
1176
  "pad_token": "<|im_end|>",
@@ -1178,5 +1179,5 @@
1178
  "tokenizer_class": "GPT2Tokenizer",
1179
  "truncation_side": "left",
1180
  "unk_token": "<|endoftext|>",
1181
- "vocab_size": 49280
1182
- }
 
1171
  "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '<image>' }}{% endif %}{% endfor %}<end_of_utterance>\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
1172
  "clean_up_tokenization_spaces": false,
1173
  "eos_token": "<end_of_utterance>",
1174
+ "extra_special_tokens": {},
1175
  "legacy": false,
1176
  "model_max_length": 8192,
1177
  "pad_token": "<|im_end|>",
 
1179
  "tokenizer_class": "GPT2Tokenizer",
1180
  "truncation_side": "left",
1181
  "unk_token": "<|endoftext|>",
1182
+ "vocab_size": 49152
1183
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff