ningpp committed on
Commit
46f2559
·
verified ·
1 Parent(s): cd8b459

Upload 6 files

Browse files
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: image-to-text
3
+ library_name: onnxruntime
4
+ tags:
5
+ - falcon
6
+ - ocr
7
+ - vision-language
8
+ - document-understanding
9
+ license: apache-2.0
10
+ ---
11
+
12
+
13
+ # ONNX model for [Falcon-OCR](https://huggingface.co/tiiuae/Falcon-OCR)
14
+
15
+ ## Try with [ningpp/flux](https://github.com/ningpp/flux)
16
+
17
+ Flux is a Java-based OCR project that can be used to run this model.
18
+
19
+
20
+
21
+ # Falcon OCR
22
+
23
+ Falcon OCR is a 300M-parameter early-fusion vision-language model for document OCR. Given an image, it can produce plain text, LaTeX for formulas, or HTML for tables, depending on the requested output format.
24
+
25
+ Most OCR VLM systems are built as a pipeline with a vision encoder feeding a separate text decoder, plus additional task-specific glue. Falcon OCR takes a different approach: a single Transformer processes image patches and text tokens in a shared parameter space from the first layer, using a hybrid attention mask where image tokens attend bidirectionally and text tokens decode causally conditioned on the image.
26
+
27
+ We built it this way for two practical reasons. First, it keeps the interface simple: one backbone, one decoding path, and task switching through prompts rather than a growing set of modules. Second, a 0.3B model has a lower latency and cost footprint than 0.9B-class OCR VLMs, and in our vLLM-based serving setup this translates into higher throughput, often 2–3× faster depending on sequence lengths and batch configuration. To our knowledge, this is one of the first attempts to apply this early-fusion single-stack recipe directly to competitive document OCR at this scale.
28
+
29
+
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "FalconOCRForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_falcon_ocr.FalconOCRConfig",
7
+ "AutoModelForCausalLM": "modeling_falcon_ocr.FalconOCRForCausalLM"
8
+ },
9
+ "model_type": "falcon_ocr",
10
+ "torch_dtype": "float32",
11
+ "dim": 768,
12
+ "n_layers": 22,
13
+ "n_heads": 16,
14
+ "head_dim": 64,
15
+ "n_kv_heads": 8,
16
+ "vocab_size": 65536,
17
+ "ffn_dim": 2304,
18
+ "norm_eps": 1e-05,
19
+ "max_seq_len": 8192,
20
+ "rope_theta": 10000,
21
+ "channel_size": 3,
22
+ "spatial_patch_size": 16,
23
+ "temporal_patch_size": 1,
24
+ "eos_id": 11,
25
+ "img_id": 227,
26
+ "image_cls_token_id": 244,
27
+ "image_reg_1_token_id": 245,
28
+ "image_reg_2_token_id": 246,
29
+ "image_reg_3_token_id": 247,
30
+ "image_reg_4_token_id": 248,
31
+ "img_end_id": 230
32
+ }
falcon_ocr_kv_token.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e31c4feadd5dfdb1fb1321c7e4e46f3db6db917054b48ba5a47b71cb4f9b94d5
3
+ size 1081568222
special_tokens_map.json ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|pad|>",
4
+ ">>ABSTRACT<<",
5
+ ">>INTRODUCTION<<",
6
+ ">>SUMMARY<<",
7
+ ">>COMMENT<<",
8
+ ">>ANSWER<<",
9
+ ">>QUESTION<<",
10
+ ">>DOMAIN<<",
11
+ ">>PREFIX<<",
12
+ ">>SUFFIX<<",
13
+ ">>MIDDLE<<",
14
+ "<|finetune_right_pad_id|>",
15
+ "<|start_header_id|>",
16
+ "<|end_header_id|>",
17
+ "<|eom_id|>",
18
+ "<|eot_id|>",
19
+ "<|begin_of_text|>",
20
+ ">>TITLE<<",
21
+ "<tool_response>",
22
+ "</tool_response>",
23
+ "<tool_call>",
24
+ "</tool_call>",
25
+ "<schema>",
26
+ "</schema>",
27
+ "<scratch_pad>",
28
+ "</scratch_pad>",
29
+ "<thinking>",
30
+ "</thinking>",
31
+ "<explanation>",
32
+ "</explanation>",
33
+ "<file_sep>",
34
+ "<repo_name>",
35
+ "<tr>",
36
+ "</tr>",
37
+ "<|image|>",
38
+ "<|image_row_sep|>",
39
+ "<|start_of_image|>",
40
+ "<|end_of_image|>",
41
+ "<|start_of_video|>",
42
+ "<|end_of_video|>",
43
+ "<|frame_sep|>",
44
+ "<|start_of_turn|>",
45
+ "<|end_of_turn|>",
46
+ "<|start_of_diffusion_query|>",
47
+ "<|end_of_diffusion_query|>",
48
+ "<|diffusion_query|>",
49
+ "<|object|>",
50
+ "<|coord|>",
51
+ "<|size|>",
52
+ "<|perceive|>",
53
+ "<|image_mask_token|>",
54
+ "<|image_cls|>",
55
+ "<|image_reg_1|>",
56
+ "<|image_reg_2|>",
57
+ "<|image_reg_3|>",
58
+ "<|image_reg_4|>",
59
+ "<|image_reg_5|>",
60
+ "<|image_reg_6|>",
61
+ "<|image_reg_7|>",
62
+ "<|image_reg_8|>",
63
+ "<|DET|>",
64
+ "<|POINTING|>",
65
+ "<|OCR_GROUNDING|>",
66
+ "<|OCR_DOC_PARSER|>",
67
+ "<|OCR_PLAIN|>",
68
+ "<|REF_SEG|>",
69
+ "<|POINT_REF_SEG|>",
70
+ "<|CAPTION|>",
71
+ "<|DETAILED_CAPTION|>",
72
+ "<|seg|>",
73
+ "<|end_of_query|>",
74
+ "<|start_of_query|>",
75
+ "<|task_sep|>",
76
+ "<|QA|>",
77
+ "<|LAYOUT_DETECTION|>",
78
+ "<|category_sep|>",
79
+ "<td>",
80
+ "</td>",
81
+ "<th>",
82
+ "</th>",
83
+ ">>UNUSED_261<<",
84
+ ">>UNUSED_262<<",
85
+ ">>UNUSED_263<<",
86
+ ">>UNUSED_264<<",
87
+ ">>UNUSED_265<<",
88
+ ">>UNUSED_266<<",
89
+ ">>UNUSED_267<<",
90
+ ">>UNUSED_268<<",
91
+ ">>UNUSED_269<<",
92
+ ">>UNUSED_270<<",
93
+ ">>UNUSED_271<<",
94
+ ">>UNUSED_272<<",
95
+ ">>UNUSED_273<<",
96
+ ">>UNUSED_274<<",
97
+ ">>UNUSED_275<<",
98
+ ">>UNUSED_276<<",
99
+ ">>UNUSED_277<<",
100
+ ">>UNUSED_278<<",
101
+ ">>UNUSED_279<<",
102
+ ">>UNUSED_280<<",
103
+ ">>UNUSED_281<<",
104
+ ">>UNUSED_282<<",
105
+ ">>UNUSED_283<<",
106
+ ">>UNUSED_284<<",
107
+ ">>UNUSED_285<<",
108
+ ">>UNUSED_286<<",
109
+ ">>UNUSED_287<<",
110
+ ">>UNUSED_288<<",
111
+ ">>UNUSED_289<<",
112
+ ">>UNUSED_290<<",
113
+ ">>UNUSED_291<<",
114
+ ">>UNUSED_292<<",
115
+ ">>UNUSED_293<<",
116
+ ">>UNUSED_294<<",
117
+ ">>UNUSED_295<<",
118
+ ">>UNUSED_296<<",
119
+ ">>UNUSED_297<<",
120
+ ">>UNUSED_298<<",
121
+ ">>UNUSED_299<<",
122
+ ">>UNUSED_300<<",
123
+ ">>UNUSED_301<<",
124
+ ">>UNUSED_302<<",
125
+ ">>UNUSED_303<<",
126
+ ">>UNUSED_304<<",
127
+ ">>UNUSED_305<<",
128
+ ">>UNUSED_306<<",
129
+ ">>UNUSED_307<<",
130
+ ">>UNUSED_308<<",
131
+ ">>UNUSED_309<<",
132
+ ">>UNUSED_310<<",
133
+ ">>UNUSED_311<<",
134
+ ">>UNUSED_312<<",
135
+ ">>UNUSED_313<<",
136
+ ">>UNUSED_314<<",
137
+ ">>UNUSED_315<<",
138
+ ">>UNUSED_316<<",
139
+ ">>UNUSED_317<<",
140
+ ">>UNUSED_318<<",
141
+ ">>UNUSED_319<<",
142
+ ">>UNUSED_320<<",
143
+ ">>UNUSED_321<<",
144
+ ">>UNUSED_322<<",
145
+ ">>UNUSED_323<<",
146
+ ">>UNUSED_324<<",
147
+ ">>UNUSED_325<<",
148
+ ">>UNUSED_326<<",
149
+ ">>UNUSED_327<<",
150
+ ">>UNUSED_328<<",
151
+ ">>UNUSED_329<<",
152
+ ">>UNUSED_330<<",
153
+ ">>UNUSED_331<<",
154
+ ">>UNUSED_332<<",
155
+ ">>UNUSED_333<<",
156
+ ">>UNUSED_334<<",
157
+ ">>UNUSED_335<<",
158
+ ">>UNUSED_336<<",
159
+ ">>UNUSED_337<<",
160
+ ">>UNUSED_338<<",
161
+ ">>UNUSED_339<<",
162
+ ">>UNUSED_340<<",
163
+ ">>UNUSED_341<<",
164
+ ">>UNUSED_342<<",
165
+ ">>UNUSED_343<<",
166
+ ">>UNUSED_344<<",
167
+ ">>UNUSED_345<<",
168
+ ">>UNUSED_346<<",
169
+ ">>UNUSED_347<<",
170
+ ">>UNUSED_348<<",
171
+ ">>UNUSED_349<<",
172
+ ">>UNUSED_350<<",
173
+ ">>UNUSED_351<<",
174
+ ">>UNUSED_352<<",
175
+ ">>UNUSED_353<<",
176
+ ">>UNUSED_354<<",
177
+ ">>UNUSED_355<<",
178
+ ">>UNUSED_356<<",
179
+ ">>UNUSED_357<<",
180
+ ">>UNUSED_358<<",
181
+ ">>UNUSED_359<<",
182
+ ">>UNUSED_360<<",
183
+ ">>UNUSED_361<<",
184
+ ">>UNUSED_362<<",
185
+ ">>UNUSED_363<<",
186
+ ">>UNUSED_364<<",
187
+ ">>UNUSED_365<<",
188
+ ">>UNUSED_366<<",
189
+ ">>UNUSED_367<<",
190
+ ">>UNUSED_368<<",
191
+ ">>UNUSED_369<<",
192
+ ">>UNUSED_370<<",
193
+ ">>UNUSED_371<<",
194
+ ">>UNUSED_372<<",
195
+ ">>UNUSED_373<<",
196
+ ">>UNUSED_374<<",
197
+ ">>UNUSED_375<<",
198
+ ">>UNUSED_376<<",
199
+ ">>UNUSED_377<<",
200
+ ">>UNUSED_378<<",
201
+ ">>UNUSED_379<<",
202
+ ">>UNUSED_380<<",
203
+ ">>UNUSED_381<<",
204
+ ">>UNUSED_382<<",
205
+ ">>UNUSED_383<<",
206
+ ">>UNUSED_384<<",
207
+ ">>UNUSED_385<<",
208
+ ">>UNUSED_386<<",
209
+ ">>UNUSED_387<<",
210
+ ">>UNUSED_388<<",
211
+ ">>UNUSED_389<<",
212
+ ">>UNUSED_390<<",
213
+ ">>UNUSED_391<<",
214
+ ">>UNUSED_392<<",
215
+ ">>UNUSED_393<<",
216
+ ">>UNUSED_394<<",
217
+ ">>UNUSED_395<<",
218
+ ">>UNUSED_396<<",
219
+ ">>UNUSED_397<<",
220
+ ">>UNUSED_398<<",
221
+ ">>UNUSED_399<<",
222
+ ">>UNUSED_400<<",
223
+ ">>UNUSED_401<<",
224
+ ">>UNUSED_402<<",
225
+ ">>UNUSED_403<<",
226
+ ">>UNUSED_404<<",
227
+ ">>UNUSED_405<<",
228
+ ">>UNUSED_406<<",
229
+ ">>UNUSED_407<<",
230
+ ">>UNUSED_408<<",
231
+ ">>UNUSED_409<<",
232
+ ">>UNUSED_410<<",
233
+ ">>UNUSED_411<<",
234
+ ">>UNUSED_412<<",
235
+ ">>UNUSED_413<<",
236
+ ">>UNUSED_414<<",
237
+ ">>UNUSED_415<<",
238
+ ">>UNUSED_416<<",
239
+ ">>UNUSED_417<<",
240
+ ">>UNUSED_418<<",
241
+ ">>UNUSED_419<<",
242
+ ">>UNUSED_420<<",
243
+ ">>UNUSED_421<<",
244
+ ">>UNUSED_422<<",
245
+ ">>UNUSED_423<<",
246
+ ">>UNUSED_424<<",
247
+ ">>UNUSED_425<<",
248
+ ">>UNUSED_426<<",
249
+ ">>UNUSED_427<<",
250
+ ">>UNUSED_428<<",
251
+ ">>UNUSED_429<<",
252
+ ">>UNUSED_430<<",
253
+ ">>UNUSED_431<<",
254
+ ">>UNUSED_432<<",
255
+ ">>UNUSED_433<<",
256
+ ">>UNUSED_434<<",
257
+ ">>UNUSED_435<<",
258
+ ">>UNUSED_436<<",
259
+ ">>UNUSED_437<<",
260
+ ">>UNUSED_438<<",
261
+ ">>UNUSED_439<<",
262
+ ">>UNUSED_440<<",
263
+ ">>UNUSED_441<<",
264
+ ">>UNUSED_442<<",
265
+ ">>UNUSED_443<<",
266
+ ">>UNUSED_444<<",
267
+ ">>UNUSED_445<<",
268
+ ">>UNUSED_446<<",
269
+ ">>UNUSED_447<<",
270
+ ">>UNUSED_448<<",
271
+ ">>UNUSED_449<<",
272
+ ">>UNUSED_450<<",
273
+ ">>UNUSED_451<<",
274
+ ">>UNUSED_452<<",
275
+ ">>UNUSED_453<<",
276
+ ">>UNUSED_454<<",
277
+ ">>UNUSED_455<<",
278
+ ">>UNUSED_456<<",
279
+ ">>UNUSED_457<<",
280
+ ">>UNUSED_458<<",
281
+ ">>UNUSED_459<<",
282
+ ">>UNUSED_460<<",
283
+ ">>UNUSED_461<<",
284
+ ">>UNUSED_462<<",
285
+ ">>UNUSED_463<<",
286
+ ">>UNUSED_464<<",
287
+ ">>UNUSED_465<<",
288
+ ">>UNUSED_466<<",
289
+ ">>UNUSED_467<<",
290
+ ">>UNUSED_468<<",
291
+ ">>UNUSED_469<<",
292
+ ">>UNUSED_470<<",
293
+ ">>UNUSED_471<<",
294
+ ">>UNUSED_472<<",
295
+ ">>UNUSED_473<<",
296
+ ">>UNUSED_474<<",
297
+ ">>UNUSED_475<<",
298
+ ">>UNUSED_476<<",
299
+ ">>UNUSED_477<<",
300
+ ">>UNUSED_478<<",
301
+ ">>UNUSED_479<<",
302
+ ">>UNUSED_480<<",
303
+ ">>UNUSED_481<<",
304
+ ">>UNUSED_482<<",
305
+ ">>UNUSED_483<<",
306
+ ">>UNUSED_484<<",
307
+ ">>UNUSED_485<<",
308
+ ">>UNUSED_486<<",
309
+ ">>UNUSED_487<<",
310
+ ">>UNUSED_488<<",
311
+ ">>UNUSED_489<<",
312
+ ">>UNUSED_490<<",
313
+ ">>UNUSED_491<<",
314
+ ">>UNUSED_492<<",
315
+ ">>UNUSED_493<<",
316
+ ">>UNUSED_494<<",
317
+ ">>UNUSED_495<<",
318
+ ">>UNUSED_496<<",
319
+ ">>UNUSED_497<<",
320
+ ">>UNUSED_498<<",
321
+ ">>UNUSED_499<<",
322
+ ">>UNUSED_500<<",
323
+ ">>UNUSED_501<<",
324
+ ">>UNUSED_502<<",
325
+ ">>UNUSED_503<<",
326
+ ">>UNUSED_504<<",
327
+ ">>UNUSED_505<<",
328
+ ">>UNUSED_506<<",
329
+ ">>UNUSED_507<<",
330
+ ">>UNUSED_508<<",
331
+ ">>UNUSED_509<<",
332
+ ">>UNUSED_510<<",
333
+ ">>UNUSED_511<<"
334
+ ],
335
+ "eos_token": {
336
+ "content": "<|end_of_text|>",
337
+ "lstrip": false,
338
+ "normalized": false,
339
+ "rstrip": false,
340
+ "single_word": false
341
+ },
342
+ "image_token": "<|image|>",
343
+ "image_cls_token": "<|image_cls|>",
344
+ "image_reg_1_token": "<|image_reg_1|>",
345
+ "image_reg_2_token": "<|image_reg_2|>",
346
+ "image_reg_3_token": "<|image_reg_3|>",
347
+ "image_reg_4_token": "<|image_reg_4|>",
348
+ "image_reg_5_token": "<|image_reg_5|>",
349
+ "image_reg_6_token": "<|image_reg_6|>",
350
+ "image_reg_7_token": "<|image_reg_7|>",
351
+ "image_reg_8_token": "<|image_reg_8|>",
352
+ "image_row_sep_token": "<|image_row_sep|>",
353
+ "start_of_image_token": "<|start_of_image|>",
354
+ "end_of_image_token": "<|end_of_image|>",
355
+ "start_of_video_token": "<|start_of_video|>",
356
+ "end_of_video_token": "<|end_of_video|>",
357
+ "frame_sep_token": "<|frame_sep|>",
358
+ "start_of_turn_token": "<|start_of_turn|>",
359
+ "end_of_turn_token": "<|end_of_turn|>",
360
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
361
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
362
+ "diffusion_query_token": "<|diffusion_query|>",
363
+ "object_token": "<|object|>",
364
+ "coord_token": "<|coord|>",
365
+ "size_token": "<|size|>",
366
+ "perceive_token": "<|perceive|>",
367
+ "image_mask_token": "<|image_mask_token|>",
368
+ "det_token": "<|DET|>",
369
+ "pointing_token": "<|POINTING|>",
370
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
371
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
372
+ "ocr_plain_token": "<|OCR_PLAIN|>",
373
+ "ref_seg_token": "<|REF_SEG|>",
374
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
375
+ "caption_token": "<|CAPTION|>",
376
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
377
+ "seg_token": "<|seg|>",
378
+ "start_of_query_token": "<|start_of_query|>",
379
+ "end_of_query_token": "<|end_of_query|>",
380
+ "task_sep_token": "<|task_sep|>",
381
+ "qa_token": "<|QA|>",
382
+ "layout_detection_token": "<|LAYOUT_DETECTION|>",
383
+ "category_sep_token": "<|category_sep|>",
384
+ "table_row_start_token": "<tr>",
385
+ "table_row_end_token": "</tr>",
386
+ "table_data_start_token": "<td>",
387
+ "table_data_end_token": "</td>",
388
+ "table_header_start_token": "<th>",
389
+ "table_header_end_token": "</th>"
390
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "caption_token": "<|CAPTION|>",
4
+ "category_sep_token": "<|category_sep|>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "coord_token": "<|coord|>",
7
+ "det_token": "<|DET|>",
8
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
9
+ "diffusion_query_token": "<|diffusion_query|>",
10
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
11
+ "end_of_image_token": "<|end_of_image|>",
12
+ "end_of_query_token": "<|end_of_query|>",
13
+ "end_of_turn_token": "<|end_of_turn|>",
14
+ "end_of_video_token": "<|end_of_video|>",
15
+ "eos_token": "<|end_of_text|>",
16
+ "frame_sep_token": "<|frame_sep|>",
17
+ "image_cls_token": "<|image_cls|>",
18
+ "image_mask_token": "<|image_mask_token|>",
19
+ "image_reg_1_token": "<|image_reg_1|>",
20
+ "image_reg_2_token": "<|image_reg_2|>",
21
+ "image_reg_3_token": "<|image_reg_3|>",
22
+ "image_reg_4_token": "<|image_reg_4|>",
23
+ "image_reg_5_token": "<|image_reg_5|>",
24
+ "image_reg_6_token": "<|image_reg_6|>",
25
+ "image_reg_7_token": "<|image_reg_7|>",
26
+ "image_reg_8_token": "<|image_reg_8|>",
27
+ "image_row_sep_token": "<|image_row_sep|>",
28
+ "image_token": "<|image|>",
29
+ "is_local": true,
30
+ "layout_detection_token": "<|LAYOUT_DETECTION|>",
31
+ "model_input_names": [
32
+ "input_ids",
33
+ "attention_mask"
34
+ ],
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "model_specific_special_tokens": {
37
+ "caption_token": "<|CAPTION|>",
38
+ "category_sep_token": "<|category_sep|>",
39
+ "coord_token": "<|coord|>",
40
+ "det_token": "<|DET|>",
41
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
42
+ "diffusion_query_token": "<|diffusion_query|>",
43
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
44
+ "end_of_image_token": "<|end_of_image|>",
45
+ "end_of_query_token": "<|end_of_query|>",
46
+ "end_of_turn_token": "<|end_of_turn|>",
47
+ "end_of_video_token": "<|end_of_video|>",
48
+ "frame_sep_token": "<|frame_sep|>",
49
+ "image_cls_token": "<|image_cls|>",
50
+ "image_mask_token": "<|image_mask_token|>",
51
+ "image_reg_1_token": "<|image_reg_1|>",
52
+ "image_reg_2_token": "<|image_reg_2|>",
53
+ "image_reg_3_token": "<|image_reg_3|>",
54
+ "image_reg_4_token": "<|image_reg_4|>",
55
+ "image_reg_5_token": "<|image_reg_5|>",
56
+ "image_reg_6_token": "<|image_reg_6|>",
57
+ "image_reg_7_token": "<|image_reg_7|>",
58
+ "image_reg_8_token": "<|image_reg_8|>",
59
+ "image_row_sep_token": "<|image_row_sep|>",
60
+ "image_token": "<|image|>",
61
+ "layout_detection_token": "<|LAYOUT_DETECTION|>",
62
+ "object_token": "<|object|>",
63
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
64
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
65
+ "ocr_plain_token": "<|OCR_PLAIN|>",
66
+ "perceive_token": "<|perceive|>",
67
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
68
+ "pointing_token": "<|POINTING|>",
69
+ "qa_token": "<|QA|>",
70
+ "ref_seg_token": "<|REF_SEG|>",
71
+ "seg_token": "<|seg|>",
72
+ "size_token": "<|size|>",
73
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
74
+ "start_of_image_token": "<|start_of_image|>",
75
+ "start_of_query_token": "<|start_of_query|>",
76
+ "start_of_turn_token": "<|start_of_turn|>",
77
+ "start_of_video_token": "<|start_of_video|>",
78
+ "table_data_end_token": "</td>",
79
+ "table_data_start_token": "<td>",
80
+ "table_header_end_token": "</th>",
81
+ "table_header_start_token": "<th>",
82
+ "table_row_end_token": "</tr>",
83
+ "table_row_start_token": "<tr>",
84
+ "task_sep_token": "<|task_sep|>"
85
+ },
86
+ "object_token": "<|object|>",
87
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
88
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
89
+ "ocr_plain_token": "<|OCR_PLAIN|>",
90
+ "perceive_token": "<|perceive|>",
91
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
92
+ "pointing_token": "<|POINTING|>",
93
+ "qa_token": "<|QA|>",
94
+ "ref_seg_token": "<|REF_SEG|>",
95
+ "seg_token": "<|seg|>",
96
+ "size_token": "<|size|>",
97
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
98
+ "start_of_image_token": "<|start_of_image|>",
99
+ "start_of_query_token": "<|start_of_query|>",
100
+ "start_of_turn_token": "<|start_of_turn|>",
101
+ "start_of_video_token": "<|start_of_video|>",
102
+ "table_data_end_token": "</td>",
103
+ "table_data_start_token": "<td>",
104
+ "table_header_end_token": "</th>",
105
+ "table_header_start_token": "<th>",
106
+ "table_row_end_token": "</tr>",
107
+ "table_row_start_token": "<tr>",
108
+ "task_sep_token": "<|task_sep|>",
109
+ "tokenizer_class": "TokenizersBackend"
110
+ }