MattyMroz committed
Commit 4a48dbc (verified)
1 Parent(s): 4668426

magiv3 with bugs
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,11 @@
+ # Usage
+
+ ```python
+ model = AutoModelForCausalLM.from_pretrained("ragavsachdeva/magiv3", torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
+ processor = AutoProcessor.from_pretrained("ragavsachdeva/magiv3", trust_remote_code=True)
+
+ model.predict_detections_and_associations(images, processor)
+ model.predict_ocr(images, processor)
+ model.predict_character_grounding(images, captions, processor)
+
+ ```
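The README snippet above omits its imports and input preparation. A minimal end-to-end sketch of the same calls, with the file path and caption as purely hypothetical placeholders (the `predict_*` method names come from the README itself):

```python
# Minimal usage sketch; assumes a CUDA GPU and the packages torch, Pillow, transformers.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model = AutoModelForCausalLM.from_pretrained(
    "ragavsachdeva/magiv3", torch_dtype=torch.float16, trust_remote_code=True
).cuda().eval()
processor = AutoProcessor.from_pretrained("ragavsachdeva/magiv3", trust_remote_code=True)

# Inputs: a list of page images and, for grounding, one caption per image (hypothetical examples).
images = [Image.open("page_1.png").convert("RGB")]
captions = ["A spiky-haired boy in a school uniform"]

with torch.no_grad():
    detections = model.predict_detections_and_associations(images, processor)
    ocr_results = model.predict_ocr(images, processor)
    grounding = model.predict_character_grounding(images, captions, processor)
```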
added_tokens.json ADDED
@@ -0,0 +1,1030 @@
+ {
+ "</cap>": 51270,
+ "</dcap>": 51274,
+ "</grounding>": 51276,
+ "</ncap>": 51272,
+ "</ocr>": 50268,
+ "</od>": 50266,
+ "</poly>": 51287,
+ "</proposal>": 51285,
+ "</region_cap>": 51281,
+ "</region_to_desciption>": 51283,
+ "</seg>": 51278,
+ "<and>": 51288,
+ "<cap>": 51269,
+ "<character>": 51291,
+ "<dcap>": 51273,
+ "<grounding>": 51275,
+ "<loc_0>": 50269,
+ "<loc_100>": 50369,
+ "<loc_101>": 50370,
+ "<loc_102>": 50371,
+ "<loc_103>": 50372,
+ "<loc_104>": 50373,
+ "<loc_105>": 50374,
+ "<loc_106>": 50375,
+ "<loc_107>": 50376,
+ "<loc_108>": 50377,
+ "<loc_109>": 50378,
+ "<loc_10>": 50279,
+ "<loc_110>": 50379,
+ "<loc_111>": 50380,
+ "<loc_112>": 50381,
+ "<loc_113>": 50382,
+ "<loc_114>": 50383,
+ "<loc_115>": 50384,
+ "<loc_116>": 50385,
+ "<loc_117>": 50386,
+ "<loc_118>": 50387,
+ "<loc_119>": 50388,
+ "<loc_11>": 50280,
+ "<loc_120>": 50389,
+ "<loc_121>": 50390,
+ "<loc_122>": 50391,
+ "<loc_123>": 50392,
+ "<loc_124>": 50393,
+ "<loc_125>": 50394,
+ "<loc_126>": 50395,
+ "<loc_127>": 50396,
+ "<loc_128>": 50397,
+ "<loc_129>": 50398,
+ "<loc_12>": 50281,
+ "<loc_130>": 50399,
+ "<loc_131>": 50400,
+ "<loc_132>": 50401,
+ "<loc_133>": 50402,
+ "<loc_134>": 50403,
+ "<loc_135>": 50404,
+ "<loc_136>": 50405,
+ "<loc_137>": 50406,
+ "<loc_138>": 50407,
+ "<loc_139>": 50408,
+ "<loc_13>": 50282,
+ "<loc_140>": 50409,
+ "<loc_141>": 50410,
+ "<loc_142>": 50411,
+ "<loc_143>": 50412,
+ "<loc_144>": 50413,
+ "<loc_145>": 50414,
+ "<loc_146>": 50415,
+ "<loc_147>": 50416,
+ "<loc_148>": 50417,
+ "<loc_149>": 50418,
+ "<loc_14>": 50283,
+ "<loc_150>": 50419,
+ "<loc_151>": 50420,
+ "<loc_152>": 50421,
+ "<loc_153>": 50422,
+ "<loc_154>": 50423,
+ "<loc_155>": 50424,
+ "<loc_156>": 50425,
+ "<loc_157>": 50426,
+ "<loc_158>": 50427,
+ "<loc_159>": 50428,
+ "<loc_15>": 50284,
+ "<loc_160>": 50429,
+ "<loc_161>": 50430,
+ "<loc_162>": 50431,
+ "<loc_163>": 50432,
+ "<loc_164>": 50433,
+ "<loc_165>": 50434,
+ "<loc_166>": 50435,
+ "<loc_167>": 50436,
+ "<loc_168>": 50437,
+ "<loc_169>": 50438,
+ "<loc_16>": 50285,
+ "<loc_170>": 50439,
+ "<loc_171>": 50440,
+ "<loc_172>": 50441,
+ "<loc_173>": 50442,
+ "<loc_174>": 50443,
+ "<loc_175>": 50444,
+ "<loc_176>": 50445,
+ "<loc_177>": 50446,
+ "<loc_178>": 50447,
+ "<loc_179>": 50448,
+ "<loc_17>": 50286,
+ "<loc_180>": 50449,
+ "<loc_181>": 50450,
+ "<loc_182>": 50451,
+ "<loc_183>": 50452,
+ "<loc_184>": 50453,
+ "<loc_185>": 50454,
+ "<loc_186>": 50455,
+ "<loc_187>": 50456,
+ "<loc_188>": 50457,
+ "<loc_189>": 50458,
+ "<loc_18>": 50287,
+ "<loc_190>": 50459,
+ "<loc_191>": 50460,
+ "<loc_192>": 50461,
+ "<loc_193>": 50462,
+ "<loc_194>": 50463,
+ "<loc_195>": 50464,
+ "<loc_196>": 50465,
+ "<loc_197>": 50466,
+ "<loc_198>": 50467,
+ "<loc_199>": 50468,
+ "<loc_19>": 50288,
+ "<loc_1>": 50270,
+ "<loc_200>": 50469,
+ "<loc_201>": 50470,
+ "<loc_202>": 50471,
+ "<loc_203>": 50472,
+ "<loc_204>": 50473,
+ "<loc_205>": 50474,
+ "<loc_206>": 50475,
+ "<loc_207>": 50476,
+ "<loc_208>": 50477,
+ "<loc_209>": 50478,
+ "<loc_20>": 50289,
+ "<loc_210>": 50479,
+ "<loc_211>": 50480,
+ "<loc_212>": 50481,
+ "<loc_213>": 50482,
+ "<loc_214>": 50483,
+ "<loc_215>": 50484,
+ "<loc_216>": 50485,
+ "<loc_217>": 50486,
+ "<loc_218>": 50487,
+ "<loc_219>": 50488,
+ "<loc_21>": 50290,
+ "<loc_220>": 50489,
+ "<loc_221>": 50490,
+ "<loc_222>": 50491,
+ "<loc_223>": 50492,
+ "<loc_224>": 50493,
+ "<loc_225>": 50494,
+ "<loc_226>": 50495,
+ "<loc_227>": 50496,
+ "<loc_228>": 50497,
+ "<loc_229>": 50498,
+ "<loc_22>": 50291,
+ "<loc_230>": 50499,
+ "<loc_231>": 50500,
+ "<loc_232>": 50501,
+ "<loc_233>": 50502,
+ "<loc_234>": 50503,
+ "<loc_235>": 50504,
+ "<loc_236>": 50505,
+ "<loc_237>": 50506,
+ "<loc_238>": 50507,
+ "<loc_239>": 50508,
+ "<loc_23>": 50292,
+ "<loc_240>": 50509,
+ "<loc_241>": 50510,
+ "<loc_242>": 50511,
+ "<loc_243>": 50512,
+ "<loc_244>": 50513,
+ "<loc_245>": 50514,
+ "<loc_246>": 50515,
+ "<loc_247>": 50516,
+ "<loc_248>": 50517,
+ "<loc_249>": 50518,
+ "<loc_24>": 50293,
+ "<loc_250>": 50519,
+ "<loc_251>": 50520,
+ "<loc_252>": 50521,
+ "<loc_253>": 50522,
+ "<loc_254>": 50523,
+ "<loc_255>": 50524,
+ "<loc_256>": 50525,
+ "<loc_257>": 50526,
+ "<loc_258>": 50527,
+ "<loc_259>": 50528,
+ "<loc_25>": 50294,
+ "<loc_260>": 50529,
+ "<loc_261>": 50530,
+ "<loc_262>": 50531,
+ "<loc_263>": 50532,
+ "<loc_264>": 50533,
+ "<loc_265>": 50534,
+ "<loc_266>": 50535,
+ "<loc_267>": 50536,
+ "<loc_268>": 50537,
+ "<loc_269>": 50538,
+ "<loc_26>": 50295,
+ "<loc_270>": 50539,
+ "<loc_271>": 50540,
+ "<loc_272>": 50541,
+ "<loc_273>": 50542,
+ "<loc_274>": 50543,
+ "<loc_275>": 50544,
+ "<loc_276>": 50545,
+ "<loc_277>": 50546,
+ "<loc_278>": 50547,
+ "<loc_279>": 50548,
+ "<loc_27>": 50296,
+ "<loc_280>": 50549,
+ "<loc_281>": 50550,
+ "<loc_282>": 50551,
+ "<loc_283>": 50552,
+ "<loc_284>": 50553,
+ "<loc_285>": 50554,
+ "<loc_286>": 50555,
+ "<loc_287>": 50556,
+ "<loc_288>": 50557,
+ "<loc_289>": 50558,
+ "<loc_28>": 50297,
+ "<loc_290>": 50559,
+ "<loc_291>": 50560,
+ "<loc_292>": 50561,
+ "<loc_293>": 50562,
+ "<loc_294>": 50563,
+ "<loc_295>": 50564,
+ "<loc_296>": 50565,
+ "<loc_297>": 50566,
+ "<loc_298>": 50567,
+ "<loc_299>": 50568,
+ "<loc_29>": 50298,
+ "<loc_2>": 50271,
+ "<loc_300>": 50569,
+ "<loc_301>": 50570,
+ "<loc_302>": 50571,
+ "<loc_303>": 50572,
+ "<loc_304>": 50573,
+ "<loc_305>": 50574,
+ "<loc_306>": 50575,
+ "<loc_307>": 50576,
+ "<loc_308>": 50577,
+ "<loc_309>": 50578,
+ "<loc_30>": 50299,
+ "<loc_310>": 50579,
+ "<loc_311>": 50580,
+ "<loc_312>": 50581,
+ "<loc_313>": 50582,
+ "<loc_314>": 50583,
+ "<loc_315>": 50584,
+ "<loc_316>": 50585,
+ "<loc_317>": 50586,
+ "<loc_318>": 50587,
+ "<loc_319>": 50588,
+ "<loc_31>": 50300,
+ "<loc_320>": 50589,
+ "<loc_321>": 50590,
+ "<loc_322>": 50591,
+ "<loc_323>": 50592,
+ "<loc_324>": 50593,
+ "<loc_325>": 50594,
+ "<loc_326>": 50595,
+ "<loc_327>": 50596,
+ "<loc_328>": 50597,
+ "<loc_329>": 50598,
+ "<loc_32>": 50301,
+ "<loc_330>": 50599,
+ "<loc_331>": 50600,
+ "<loc_332>": 50601,
+ "<loc_333>": 50602,
+ "<loc_334>": 50603,
+ "<loc_335>": 50604,
+ "<loc_336>": 50605,
+ "<loc_337>": 50606,
+ "<loc_338>": 50607,
+ "<loc_339>": 50608,
+ "<loc_33>": 50302,
+ "<loc_340>": 50609,
+ "<loc_341>": 50610,
+ "<loc_342>": 50611,
+ "<loc_343>": 50612,
+ "<loc_344>": 50613,
+ "<loc_345>": 50614,
+ "<loc_346>": 50615,
+ "<loc_347>": 50616,
+ "<loc_348>": 50617,
+ "<loc_349>": 50618,
+ "<loc_34>": 50303,
+ "<loc_350>": 50619,
+ "<loc_351>": 50620,
+ "<loc_352>": 50621,
+ "<loc_353>": 50622,
+ "<loc_354>": 50623,
+ "<loc_355>": 50624,
+ "<loc_356>": 50625,
+ "<loc_357>": 50626,
+ "<loc_358>": 50627,
+ "<loc_359>": 50628,
+ "<loc_35>": 50304,
+ "<loc_360>": 50629,
+ "<loc_361>": 50630,
+ "<loc_362>": 50631,
+ "<loc_363>": 50632,
+ "<loc_364>": 50633,
+ "<loc_365>": 50634,
+ "<loc_366>": 50635,
+ "<loc_367>": 50636,
+ "<loc_368>": 50637,
+ "<loc_369>": 50638,
+ "<loc_36>": 50305,
+ "<loc_370>": 50639,
+ "<loc_371>": 50640,
+ "<loc_372>": 50641,
+ "<loc_373>": 50642,
+ "<loc_374>": 50643,
+ "<loc_375>": 50644,
+ "<loc_376>": 50645,
+ "<loc_377>": 50646,
+ "<loc_378>": 50647,
+ "<loc_379>": 50648,
+ "<loc_37>": 50306,
+ "<loc_380>": 50649,
+ "<loc_381>": 50650,
+ "<loc_382>": 50651,
+ "<loc_383>": 50652,
+ "<loc_384>": 50653,
+ "<loc_385>": 50654,
+ "<loc_386>": 50655,
+ "<loc_387>": 50656,
+ "<loc_388>": 50657,
+ "<loc_389>": 50658,
+ "<loc_38>": 50307,
+ "<loc_390>": 50659,
+ "<loc_391>": 50660,
+ "<loc_392>": 50661,
+ "<loc_393>": 50662,
+ "<loc_394>": 50663,
+ "<loc_395>": 50664,
+ "<loc_396>": 50665,
+ "<loc_397>": 50666,
+ "<loc_398>": 50667,
+ "<loc_399>": 50668,
+ "<loc_39>": 50308,
+ "<loc_3>": 50272,
+ "<loc_400>": 50669,
+ "<loc_401>": 50670,
+ "<loc_402>": 50671,
+ "<loc_403>": 50672,
+ "<loc_404>": 50673,
+ "<loc_405>": 50674,
+ "<loc_406>": 50675,
+ "<loc_407>": 50676,
+ "<loc_408>": 50677,
+ "<loc_409>": 50678,
+ "<loc_40>": 50309,
+ "<loc_410>": 50679,
+ "<loc_411>": 50680,
+ "<loc_412>": 50681,
+ "<loc_413>": 50682,
+ "<loc_414>": 50683,
+ "<loc_415>": 50684,
+ "<loc_416>": 50685,
+ "<loc_417>": 50686,
+ "<loc_418>": 50687,
+ "<loc_419>": 50688,
+ "<loc_41>": 50310,
+ "<loc_420>": 50689,
+ "<loc_421>": 50690,
+ "<loc_422>": 50691,
+ "<loc_423>": 50692,
+ "<loc_424>": 50693,
+ "<loc_425>": 50694,
+ "<loc_426>": 50695,
+ "<loc_427>": 50696,
+ "<loc_428>": 50697,
+ "<loc_429>": 50698,
+ "<loc_42>": 50311,
+ "<loc_430>": 50699,
+ "<loc_431>": 50700,
+ "<loc_432>": 50701,
+ "<loc_433>": 50702,
+ "<loc_434>": 50703,
+ "<loc_435>": 50704,
+ "<loc_436>": 50705,
+ "<loc_437>": 50706,
+ "<loc_438>": 50707,
+ "<loc_439>": 50708,
+ "<loc_43>": 50312,
+ "<loc_440>": 50709,
+ "<loc_441>": 50710,
+ "<loc_442>": 50711,
+ "<loc_443>": 50712,
+ "<loc_444>": 50713,
+ "<loc_445>": 50714,
+ "<loc_446>": 50715,
+ "<loc_447>": 50716,
+ "<loc_448>": 50717,
+ "<loc_449>": 50718,
+ "<loc_44>": 50313,
+ "<loc_450>": 50719,
+ "<loc_451>": 50720,
+ "<loc_452>": 50721,
+ "<loc_453>": 50722,
+ "<loc_454>": 50723,
+ "<loc_455>": 50724,
+ "<loc_456>": 50725,
+ "<loc_457>": 50726,
+ "<loc_458>": 50727,
+ "<loc_459>": 50728,
+ "<loc_45>": 50314,
+ "<loc_460>": 50729,
+ "<loc_461>": 50730,
+ "<loc_462>": 50731,
+ "<loc_463>": 50732,
+ "<loc_464>": 50733,
+ "<loc_465>": 50734,
+ "<loc_466>": 50735,
+ "<loc_467>": 50736,
+ "<loc_468>": 50737,
+ "<loc_469>": 50738,
+ "<loc_46>": 50315,
+ "<loc_470>": 50739,
+ "<loc_471>": 50740,
+ "<loc_472>": 50741,
+ "<loc_473>": 50742,
+ "<loc_474>": 50743,
+ "<loc_475>": 50744,
+ "<loc_476>": 50745,
+ "<loc_477>": 50746,
+ "<loc_478>": 50747,
+ "<loc_479>": 50748,
+ "<loc_47>": 50316,
+ "<loc_480>": 50749,
+ "<loc_481>": 50750,
+ "<loc_482>": 50751,
+ "<loc_483>": 50752,
+ "<loc_484>": 50753,
+ "<loc_485>": 50754,
+ "<loc_486>": 50755,
+ "<loc_487>": 50756,
+ "<loc_488>": 50757,
+ "<loc_489>": 50758,
+ "<loc_48>": 50317,
+ "<loc_490>": 50759,
+ "<loc_491>": 50760,
+ "<loc_492>": 50761,
+ "<loc_493>": 50762,
+ "<loc_494>": 50763,
+ "<loc_495>": 50764,
+ "<loc_496>": 50765,
+ "<loc_497>": 50766,
+ "<loc_498>": 50767,
+ "<loc_499>": 50768,
+ "<loc_49>": 50318,
+ "<loc_4>": 50273,
+ "<loc_500>": 50769,
+ "<loc_501>": 50770,
+ "<loc_502>": 50771,
+ "<loc_503>": 50772,
+ "<loc_504>": 50773,
+ "<loc_505>": 50774,
+ "<loc_506>": 50775,
+ "<loc_507>": 50776,
+ "<loc_508>": 50777,
+ "<loc_509>": 50778,
+ "<loc_50>": 50319,
+ "<loc_510>": 50779,
+ "<loc_511>": 50780,
+ "<loc_512>": 50781,
+ "<loc_513>": 50782,
+ "<loc_514>": 50783,
+ "<loc_515>": 50784,
+ "<loc_516>": 50785,
+ "<loc_517>": 50786,
+ "<loc_518>": 50787,
+ "<loc_519>": 50788,
+ "<loc_51>": 50320,
+ "<loc_520>": 50789,
+ "<loc_521>": 50790,
+ "<loc_522>": 50791,
+ "<loc_523>": 50792,
+ "<loc_524>": 50793,
+ "<loc_525>": 50794,
+ "<loc_526>": 50795,
+ "<loc_527>": 50796,
+ "<loc_528>": 50797,
+ "<loc_529>": 50798,
+ "<loc_52>": 50321,
+ "<loc_530>": 50799,
+ "<loc_531>": 50800,
+ "<loc_532>": 50801,
+ "<loc_533>": 50802,
+ "<loc_534>": 50803,
+ "<loc_535>": 50804,
+ "<loc_536>": 50805,
+ "<loc_537>": 50806,
+ "<loc_538>": 50807,
+ "<loc_539>": 50808,
+ "<loc_53>": 50322,
+ "<loc_540>": 50809,
+ "<loc_541>": 50810,
+ "<loc_542>": 50811,
+ "<loc_543>": 50812,
+ "<loc_544>": 50813,
+ "<loc_545>": 50814,
+ "<loc_546>": 50815,
+ "<loc_547>": 50816,
+ "<loc_548>": 50817,
+ "<loc_549>": 50818,
+ "<loc_54>": 50323,
+ "<loc_550>": 50819,
+ "<loc_551>": 50820,
+ "<loc_552>": 50821,
+ "<loc_553>": 50822,
+ "<loc_554>": 50823,
+ "<loc_555>": 50824,
+ "<loc_556>": 50825,
+ "<loc_557>": 50826,
+ "<loc_558>": 50827,
+ "<loc_559>": 50828,
+ "<loc_55>": 50324,
+ "<loc_560>": 50829,
+ "<loc_561>": 50830,
+ "<loc_562>": 50831,
+ "<loc_563>": 50832,
+ "<loc_564>": 50833,
+ "<loc_565>": 50834,
+ "<loc_566>": 50835,
+ "<loc_567>": 50836,
+ "<loc_568>": 50837,
+ "<loc_569>": 50838,
+ "<loc_56>": 50325,
+ "<loc_570>": 50839,
+ "<loc_571>": 50840,
+ "<loc_572>": 50841,
+ "<loc_573>": 50842,
+ "<loc_574>": 50843,
+ "<loc_575>": 50844,
+ "<loc_576>": 50845,
+ "<loc_577>": 50846,
+ "<loc_578>": 50847,
+ "<loc_579>": 50848,
+ "<loc_57>": 50326,
+ "<loc_580>": 50849,
+ "<loc_581>": 50850,
+ "<loc_582>": 50851,
+ "<loc_583>": 50852,
+ "<loc_584>": 50853,
+ "<loc_585>": 50854,
+ "<loc_586>": 50855,
+ "<loc_587>": 50856,
+ "<loc_588>": 50857,
+ "<loc_589>": 50858,
+ "<loc_58>": 50327,
+ "<loc_590>": 50859,
+ "<loc_591>": 50860,
+ "<loc_592>": 50861,
+ "<loc_593>": 50862,
+ "<loc_594>": 50863,
+ "<loc_595>": 50864,
+ "<loc_596>": 50865,
+ "<loc_597>": 50866,
+ "<loc_598>": 50867,
+ "<loc_599>": 50868,
+ "<loc_59>": 50328,
+ "<loc_5>": 50274,
+ "<loc_600>": 50869,
+ "<loc_601>": 50870,
+ "<loc_602>": 50871,
+ "<loc_603>": 50872,
+ "<loc_604>": 50873,
+ "<loc_605>": 50874,
+ "<loc_606>": 50875,
+ "<loc_607>": 50876,
+ "<loc_608>": 50877,
+ "<loc_609>": 50878,
+ "<loc_60>": 50329,
+ "<loc_610>": 50879,
+ "<loc_611>": 50880,
+ "<loc_612>": 50881,
+ "<loc_613>": 50882,
+ "<loc_614>": 50883,
+ "<loc_615>": 50884,
+ "<loc_616>": 50885,
+ "<loc_617>": 50886,
+ "<loc_618>": 50887,
+ "<loc_619>": 50888,
+ "<loc_61>": 50330,
+ "<loc_620>": 50889,
+ "<loc_621>": 50890,
+ "<loc_622>": 50891,
+ "<loc_623>": 50892,
+ "<loc_624>": 50893,
+ "<loc_625>": 50894,
+ "<loc_626>": 50895,
+ "<loc_627>": 50896,
+ "<loc_628>": 50897,
+ "<loc_629>": 50898,
+ "<loc_62>": 50331,
+ "<loc_630>": 50899,
+ "<loc_631>": 50900,
+ "<loc_632>": 50901,
+ "<loc_633>": 50902,
+ "<loc_634>": 50903,
+ "<loc_635>": 50904,
+ "<loc_636>": 50905,
+ "<loc_637>": 50906,
+ "<loc_638>": 50907,
+ "<loc_639>": 50908,
+ "<loc_63>": 50332,
+ "<loc_640>": 50909,
+ "<loc_641>": 50910,
+ "<loc_642>": 50911,
+ "<loc_643>": 50912,
+ "<loc_644>": 50913,
+ "<loc_645>": 50914,
+ "<loc_646>": 50915,
+ "<loc_647>": 50916,
+ "<loc_648>": 50917,
+ "<loc_649>": 50918,
+ "<loc_64>": 50333,
+ "<loc_650>": 50919,
+ "<loc_651>": 50920,
+ "<loc_652>": 50921,
+ "<loc_653>": 50922,
+ "<loc_654>": 50923,
+ "<loc_655>": 50924,
+ "<loc_656>": 50925,
+ "<loc_657>": 50926,
+ "<loc_658>": 50927,
+ "<loc_659>": 50928,
+ "<loc_65>": 50334,
+ "<loc_660>": 50929,
+ "<loc_661>": 50930,
+ "<loc_662>": 50931,
+ "<loc_663>": 50932,
+ "<loc_664>": 50933,
+ "<loc_665>": 50934,
+ "<loc_666>": 50935,
+ "<loc_667>": 50936,
+ "<loc_668>": 50937,
+ "<loc_669>": 50938,
+ "<loc_66>": 50335,
+ "<loc_670>": 50939,
+ "<loc_671>": 50940,
+ "<loc_672>": 50941,
+ "<loc_673>": 50942,
+ "<loc_674>": 50943,
+ "<loc_675>": 50944,
+ "<loc_676>": 50945,
+ "<loc_677>": 50946,
+ "<loc_678>": 50947,
+ "<loc_679>": 50948,
+ "<loc_67>": 50336,
+ "<loc_680>": 50949,
+ "<loc_681>": 50950,
+ "<loc_682>": 50951,
+ "<loc_683>": 50952,
+ "<loc_684>": 50953,
+ "<loc_685>": 50954,
+ "<loc_686>": 50955,
+ "<loc_687>": 50956,
+ "<loc_688>": 50957,
+ "<loc_689>": 50958,
+ "<loc_68>": 50337,
+ "<loc_690>": 50959,
+ "<loc_691>": 50960,
+ "<loc_692>": 50961,
+ "<loc_693>": 50962,
+ "<loc_694>": 50963,
+ "<loc_695>": 50964,
+ "<loc_696>": 50965,
+ "<loc_697>": 50966,
+ "<loc_698>": 50967,
+ "<loc_699>": 50968,
+ "<loc_69>": 50338,
+ "<loc_6>": 50275,
+ "<loc_700>": 50969,
+ "<loc_701>": 50970,
+ "<loc_702>": 50971,
+ "<loc_703>": 50972,
+ "<loc_704>": 50973,
+ "<loc_705>": 50974,
+ "<loc_706>": 50975,
+ "<loc_707>": 50976,
+ "<loc_708>": 50977,
+ "<loc_709>": 50978,
+ "<loc_70>": 50339,
+ "<loc_710>": 50979,
+ "<loc_711>": 50980,
+ "<loc_712>": 50981,
+ "<loc_713>": 50982,
+ "<loc_714>": 50983,
+ "<loc_715>": 50984,
+ "<loc_716>": 50985,
+ "<loc_717>": 50986,
+ "<loc_718>": 50987,
+ "<loc_719>": 50988,
+ "<loc_71>": 50340,
+ "<loc_720>": 50989,
+ "<loc_721>": 50990,
+ "<loc_722>": 50991,
+ "<loc_723>": 50992,
+ "<loc_724>": 50993,
+ "<loc_725>": 50994,
+ "<loc_726>": 50995,
+ "<loc_727>": 50996,
+ "<loc_728>": 50997,
+ "<loc_729>": 50998,
+ "<loc_72>": 50341,
+ "<loc_730>": 50999,
+ "<loc_731>": 51000,
+ "<loc_732>": 51001,
+ "<loc_733>": 51002,
+ "<loc_734>": 51003,
+ "<loc_735>": 51004,
+ "<loc_736>": 51005,
+ "<loc_737>": 51006,
+ "<loc_738>": 51007,
+ "<loc_739>": 51008,
+ "<loc_73>": 50342,
+ "<loc_740>": 51009,
+ "<loc_741>": 51010,
+ "<loc_742>": 51011,
+ "<loc_743>": 51012,
+ "<loc_744>": 51013,
+ "<loc_745>": 51014,
+ "<loc_746>": 51015,
+ "<loc_747>": 51016,
+ "<loc_748>": 51017,
+ "<loc_749>": 51018,
+ "<loc_74>": 50343,
+ "<loc_750>": 51019,
+ "<loc_751>": 51020,
+ "<loc_752>": 51021,
+ "<loc_753>": 51022,
+ "<loc_754>": 51023,
+ "<loc_755>": 51024,
+ "<loc_756>": 51025,
+ "<loc_757>": 51026,
+ "<loc_758>": 51027,
+ "<loc_759>": 51028,
+ "<loc_75>": 50344,
+ "<loc_760>": 51029,
+ "<loc_761>": 51030,
+ "<loc_762>": 51031,
+ "<loc_763>": 51032,
+ "<loc_764>": 51033,
+ "<loc_765>": 51034,
+ "<loc_766>": 51035,
+ "<loc_767>": 51036,
+ "<loc_768>": 51037,
+ "<loc_769>": 51038,
+ "<loc_76>": 50345,
+ "<loc_770>": 51039,
+ "<loc_771>": 51040,
+ "<loc_772>": 51041,
+ "<loc_773>": 51042,
+ "<loc_774>": 51043,
+ "<loc_775>": 51044,
+ "<loc_776>": 51045,
+ "<loc_777>": 51046,
+ "<loc_778>": 51047,
+ "<loc_779>": 51048,
+ "<loc_77>": 50346,
+ "<loc_780>": 51049,
+ "<loc_781>": 51050,
+ "<loc_782>": 51051,
+ "<loc_783>": 51052,
+ "<loc_784>": 51053,
+ "<loc_785>": 51054,
+ "<loc_786>": 51055,
+ "<loc_787>": 51056,
+ "<loc_788>": 51057,
+ "<loc_789>": 51058,
+ "<loc_78>": 50347,
+ "<loc_790>": 51059,
+ "<loc_791>": 51060,
+ "<loc_792>": 51061,
+ "<loc_793>": 51062,
+ "<loc_794>": 51063,
+ "<loc_795>": 51064,
+ "<loc_796>": 51065,
+ "<loc_797>": 51066,
+ "<loc_798>": 51067,
+ "<loc_799>": 51068,
+ "<loc_79>": 50348,
+ "<loc_7>": 50276,
+ "<loc_800>": 51069,
+ "<loc_801>": 51070,
+ "<loc_802>": 51071,
+ "<loc_803>": 51072,
+ "<loc_804>": 51073,
+ "<loc_805>": 51074,
+ "<loc_806>": 51075,
+ "<loc_807>": 51076,
+ "<loc_808>": 51077,
+ "<loc_809>": 51078,
+ "<loc_80>": 50349,
+ "<loc_810>": 51079,
+ "<loc_811>": 51080,
+ "<loc_812>": 51081,
+ "<loc_813>": 51082,
+ "<loc_814>": 51083,
+ "<loc_815>": 51084,
+ "<loc_816>": 51085,
+ "<loc_817>": 51086,
+ "<loc_818>": 51087,
+ "<loc_819>": 51088,
+ "<loc_81>": 50350,
+ "<loc_820>": 51089,
+ "<loc_821>": 51090,
+ "<loc_822>": 51091,
+ "<loc_823>": 51092,
+ "<loc_824>": 51093,
+ "<loc_825>": 51094,
+ "<loc_826>": 51095,
+ "<loc_827>": 51096,
+ "<loc_828>": 51097,
+ "<loc_829>": 51098,
+ "<loc_82>": 50351,
+ "<loc_830>": 51099,
+ "<loc_831>": 51100,
+ "<loc_832>": 51101,
+ "<loc_833>": 51102,
+ "<loc_834>": 51103,
+ "<loc_835>": 51104,
+ "<loc_836>": 51105,
+ "<loc_837>": 51106,
+ "<loc_838>": 51107,
+ "<loc_839>": 51108,
+ "<loc_83>": 50352,
+ "<loc_840>": 51109,
+ "<loc_841>": 51110,
+ "<loc_842>": 51111,
+ "<loc_843>": 51112,
+ "<loc_844>": 51113,
+ "<loc_845>": 51114,
+ "<loc_846>": 51115,
+ "<loc_847>": 51116,
+ "<loc_848>": 51117,
+ "<loc_849>": 51118,
+ "<loc_84>": 50353,
+ "<loc_850>": 51119,
+ "<loc_851>": 51120,
+ "<loc_852>": 51121,
+ "<loc_853>": 51122,
+ "<loc_854>": 51123,
+ "<loc_855>": 51124,
+ "<loc_856>": 51125,
+ "<loc_857>": 51126,
+ "<loc_858>": 51127,
+ "<loc_859>": 51128,
+ "<loc_85>": 50354,
+ "<loc_860>": 51129,
+ "<loc_861>": 51130,
+ "<loc_862>": 51131,
+ "<loc_863>": 51132,
+ "<loc_864>": 51133,
+ "<loc_865>": 51134,
+ "<loc_866>": 51135,
+ "<loc_867>": 51136,
+ "<loc_868>": 51137,
+ "<loc_869>": 51138,
+ "<loc_86>": 50355,
+ "<loc_870>": 51139,
+ "<loc_871>": 51140,
+ "<loc_872>": 51141,
+ "<loc_873>": 51142,
+ "<loc_874>": 51143,
+ "<loc_875>": 51144,
+ "<loc_876>": 51145,
+ "<loc_877>": 51146,
+ "<loc_878>": 51147,
+ "<loc_879>": 51148,
+ "<loc_87>": 50356,
+ "<loc_880>": 51149,
+ "<loc_881>": 51150,
+ "<loc_882>": 51151,
+ "<loc_883>": 51152,
+ "<loc_884>": 51153,
+ "<loc_885>": 51154,
+ "<loc_886>": 51155,
+ "<loc_887>": 51156,
+ "<loc_888>": 51157,
+ "<loc_889>": 51158,
+ "<loc_88>": 50357,
+ "<loc_890>": 51159,
+ "<loc_891>": 51160,
+ "<loc_892>": 51161,
+ "<loc_893>": 51162,
+ "<loc_894>": 51163,
+ "<loc_895>": 51164,
+ "<loc_896>": 51165,
+ "<loc_897>": 51166,
+ "<loc_898>": 51167,
+ "<loc_899>": 51168,
+ "<loc_89>": 50358,
+ "<loc_8>": 50277,
+ "<loc_900>": 51169,
+ "<loc_901>": 51170,
+ "<loc_902>": 51171,
+ "<loc_903>": 51172,
+ "<loc_904>": 51173,
+ "<loc_905>": 51174,
+ "<loc_906>": 51175,
+ "<loc_907>": 51176,
+ "<loc_908>": 51177,
+ "<loc_909>": 51178,
+ "<loc_90>": 50359,
+ "<loc_910>": 51179,
+ "<loc_911>": 51180,
+ "<loc_912>": 51181,
+ "<loc_913>": 51182,
+ "<loc_914>": 51183,
+ "<loc_915>": 51184,
+ "<loc_916>": 51185,
+ "<loc_917>": 51186,
+ "<loc_918>": 51187,
+ "<loc_919>": 51188,
+ "<loc_91>": 50360,
+ "<loc_920>": 51189,
+ "<loc_921>": 51190,
+ "<loc_922>": 51191,
+ "<loc_923>": 51192,
+ "<loc_924>": 51193,
+ "<loc_925>": 51194,
+ "<loc_926>": 51195,
+ "<loc_927>": 51196,
+ "<loc_928>": 51197,
+ "<loc_929>": 51198,
+ "<loc_92>": 50361,
+ "<loc_930>": 51199,
+ "<loc_931>": 51200,
+ "<loc_932>": 51201,
+ "<loc_933>": 51202,
+ "<loc_934>": 51203,
+ "<loc_935>": 51204,
+ "<loc_936>": 51205,
+ "<loc_937>": 51206,
+ "<loc_938>": 51207,
+ "<loc_939>": 51208,
+ "<loc_93>": 50362,
+ "<loc_940>": 51209,
+ "<loc_941>": 51210,
+ "<loc_942>": 51211,
+ "<loc_943>": 51212,
+ "<loc_944>": 51213,
+ "<loc_945>": 51214,
+ "<loc_946>": 51215,
+ "<loc_947>": 51216,
+ "<loc_948>": 51217,
+ "<loc_949>": 51218,
+ "<loc_94>": 50363,
+ "<loc_950>": 51219,
+ "<loc_951>": 51220,
+ "<loc_952>": 51221,
+ "<loc_953>": 51222,
+ "<loc_954>": 51223,
+ "<loc_955>": 51224,
+ "<loc_956>": 51225,
+ "<loc_957>": 51226,
+ "<loc_958>": 51227,
+ "<loc_959>": 51228,
+ "<loc_95>": 50364,
+ "<loc_960>": 51229,
+ "<loc_961>": 51230,
+ "<loc_962>": 51231,
+ "<loc_963>": 51232,
+ "<loc_964>": 51233,
+ "<loc_965>": 51234,
+ "<loc_966>": 51235,
+ "<loc_967>": 51236,
+ "<loc_968>": 51237,
+ "<loc_969>": 51238,
+ "<loc_96>": 50365,
+ "<loc_970>": 51239,
+ "<loc_971>": 51240,
+ "<loc_972>": 51241,
+ "<loc_973>": 51242,
+ "<loc_974>": 51243,
+ "<loc_975>": 51244,
+ "<loc_976>": 51245,
+ "<loc_977>": 51246,
+ "<loc_978>": 51247,
+ "<loc_979>": 51248,
+ "<loc_97>": 50366,
+ "<loc_980>": 51249,
+ "<loc_981>": 51250,
+ "<loc_982>": 51251,
+ "<loc_983>": 51252,
+ "<loc_984>": 51253,
+ "<loc_985>": 51254,
+ "<loc_986>": 51255,
+ "<loc_987>": 51256,
+ "<loc_988>": 51257,
+ "<loc_989>": 51258,
+ "<loc_98>": 50367,
+ "<loc_990>": 51259,
+ "<loc_991>": 51260,
+ "<loc_992>": 51261,
+ "<loc_993>": 51262,
+ "<loc_994>": 51263,
+ "<loc_995>": 51264,
+ "<loc_996>": 51265,
+ "<loc_997>": 51266,
+ "<loc_998>": 51267,
+ "<loc_999>": 51268,
+ "<loc_99>": 50368,
+ "<loc_9>": 50278,
+ "<ncap>": 51271,
+ "<ocr>": 50267,
+ "<od>": 50265,
+ "<panel>": 51289,
+ "<poly>": 51286,
+ "<proposal>": 51284,
+ "<region_cap>": 51280,
+ "<region_to_desciption>": 51282,
+ "<seg>": 51277,
+ "<sep>": 51279,
+ "<tail>": 51292,
+ "<text>": 51290
+ }
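Most of these additions are the 1,000 coordinate-bin tokens `<loc_0>`…`<loc_999>` (in this vocabulary `<loc_N>` maps to id `50269 + N`). A sketch of how a pixel box might be rendered into such tokens, consistent with the `BoxQuantizer(mode='floor', bins=(1000, 1000))` set up in `processing_florence2.py` below; the helper itself is illustrative, not the repo's code:

```python
def box_to_loc_tokens(box, image_size, bins=(1000, 1000)):
    """Illustrative floor-quantization of an (x1, y1, x2, y2) box into <loc_*> tokens."""
    w, h = image_size
    bins_w, bins_h = bins
    x1, y1, x2, y2 = box
    tokens = []
    for value, extent, n_bins in ((x1, w, bins_w), (y1, h, bins_h), (x2, w, bins_w), (y2, h, bins_h)):
        bin_idx = min(int(value / extent * n_bins), n_bins - 1)  # 'floor' mode, clamped to the last bin
        tokens.append(f"<loc_{bin_idx}>")
    return "".join(tokens)

print(box_to_loc_tokens((128.0, 64.0, 512.0, 700.0), image_size=(768, 768)))
# -> "<loc_166><loc_83><loc_666><loc_911>"
```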
config.json ADDED
@@ -0,0 +1,238 @@
+ {
+ "_name_or_path": "magiv3",
+ "architectures": [
+ "Florence2ForConditionalGeneration"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_florence2.Florence2Config",
+ "AutoModel": "modeling_florence2.Florence2ForConditionalGeneration",
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+ },
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "ignore_index": -100,
+ "is_encoder_decoder": true,
+ "model_type": "florence2",
+ "pad_token_id": 1,
+ "projection_dim": 1024,
+ "text_config": {
+ "_name_or_path": "",
+ "activation_dropout": 0.1,
+ "activation_function": "gelu",
+ "add_bias_logits": false,
+ "add_cross_attention": false,
+ "add_final_layer_norm": false,
+ "architectures": null,
+ "attention_dropout": 0.1,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": 0,
+ "chunk_size_feed_forward": 0,
+ "classif_dropout": 0.1,
+ "classifier_dropout": 0.0,
+ "cross_attention_hidden_size": null,
+ "d_model": 1024,
+ "decoder_attention_heads": 16,
+ "decoder_ffn_dim": 4096,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 12,
+ "decoder_start_token_id": 2,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout": 0.1,
+ "early_stopping": true,
+ "encoder_attention_heads": 16,
+ "encoder_ffn_dim": 4096,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 12,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": 2,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": 0,
+ "forced_eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2"
+ },
+ "init_std": 0.02,
+ "is_decoder": false,
+ "is_encoder_decoder": true,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 1024,
+ "min_length": 0,
+ "model_type": "florence2_language",
+ "no_repeat_ngram_size": 3,
+ "normalize_before": false,
+ "num_beam_groups": 1,
+ "num_beams": 3,
+ "num_hidden_layers": 12,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 1,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "scale_embedding": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": true,
+ "vocab_size": 51293
+ },
+ "torch_dtype": "float16",
+ "transformers_version": "4.45.2",
+ "vision_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "depths": [
+ 1,
+ 1,
+ 9,
+ 1
+ ],
+ "dim_embed": [
+ 256,
+ 512,
+ 1024,
+ 2048
+ ],
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "drop_path_rate": 0.1,
+ "early_stopping": false,
+ "enable_checkpoint": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "image_feature_source": [
+ "spatial_avg_pool",
+ "temporal_avg_pool"
+ ],
+ "image_pos_embed": {
+ "max_pos_embeddings": 50,
+ "type": "learned_abs_2d"
+ },
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_groups": [
+ 8,
+ 16,
+ 32,
+ 64
+ ],
+ "num_heads": [
+ 8,
+ 16,
+ 32,
+ 64
+ ],
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_padding": [
+ 3,
+ 1,
+ 1,
+ 1
+ ],
+ "patch_prenorm": [
+ false,
+ true,
+ true,
+ true
+ ],
+ "patch_size": [
+ 7,
+ 3,
+ 3,
+ 3
+ ],
+ "patch_stride": [
+ 4,
+ 2,
+ 2,
+ 2
+ ],
+ "prefix": null,
+ "problem_type": null,
+ "projection_dim": 1024,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "visual_temporal_embedding": {
+ "max_temporal_embeddings": 100,
+ "type": "COSINE"
+ },
+ "window_size": 12
+ },
+ "vocab_size": 51293
+ }
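Since `auto_map` points at the custom classes, the config (like the model and processor) resolves through the Auto classes with `trust_remote_code=True`. A small sketch of inspecting it:

```python
from transformers import AutoConfig

# Loads Florence2Config via the auto_map entry in config.json above.
config = AutoConfig.from_pretrained("ragavsachdeva/magiv3", trust_remote_code=True)
print(config.model_type)           # "florence2"
print(config.vocab_size)           # 51293
print(config.text_config.d_model)  # 1024
```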
configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
+ # coding=utf-8
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Florence-2 configuration"""
+
+ import warnings
+ from typing import Optional
+
+ from transformers import AutoConfig
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ class Florence2VisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+     according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         drop_path_rate (`float`, *optional*, defaults to 0.1):
+             The dropout rate of the drop path layer.
+         patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+             The patch size of the image.
+         patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+             The patch stride of the image.
+         patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+             The patch padding of the image.
+         patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
+             Whether to apply layer normalization before the patch embedding layer.
+         enable_checkpoint (`bool`, *optional*, defaults to False):
+             Whether to enable checkpointing.
+         dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+             The dimension of the embedding layer.
+         num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+             The number of attention heads.
+         num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+             The number of groups.
+         depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+             The depth of the model.
+         window_size (`int`, *optional*, defaults to 12):
+             The window size of the model.
+         projection_dim (`int`, *optional*, defaults to 1024):
+             The dimension of the projection layer.
+         visual_temporal_embedding (`dict`, *optional*):
+             The configuration of the visual temporal embedding.
+         image_pos_embed (`dict`, *optional*):
+             The configuration of the image position embedding.
+         image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+             The source of the image feature.
+     Example:
+
+     ```python
+     >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+
+     >>> # Initializing a Florence2 Vision style configuration
+     >>> configuration = Florence2VisionConfig()
+
+     >>> # Initializing a model (with random weights)
+     >>> model = Florence2VisionModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "florence2_vision"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         drop_path_rate=0.1,
+         patch_size=[7, 3, 3, 3],
+         patch_stride=[4, 2, 2, 2],
+         patch_padding=[3, 1, 1, 1],
+         patch_prenorm=[False, True, True, True],
+         enable_checkpoint=False,
+         dim_embed=[256, 512, 1024, 2048],
+         num_heads=[8, 16, 32, 64],
+         num_groups=[8, 16, 32, 64],
+         depths=[1, 1, 9, 1],
+         window_size=12,
+         projection_dim=1024,
+         visual_temporal_embedding=None,
+         image_pos_embed=None,
+         image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
+         **kwargs,
+     ):
+         self.drop_path_rate = drop_path_rate
+         self.patch_size = patch_size
+         self.patch_stride = patch_stride
+         self.patch_padding = patch_padding
+         self.patch_prenorm = patch_prenorm
+         self.enable_checkpoint = enable_checkpoint
+         self.dim_embed = dim_embed
+         self.num_heads = num_heads
+         self.num_groups = num_groups
+         self.depths = depths
+         self.window_size = window_size
+         self.projection_dim = projection_dim
+         self.visual_temporal_embedding = visual_temporal_embedding
+         self.image_pos_embed = image_pos_embed
+         self.image_feature_source = image_feature_source
+
+         super().__init__(**kwargs)
+
+
+ class Florence2LanguageConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the BART
+     [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 51289):
+             Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`Florence2LanguageModel`].
+         d_model (`int`, *optional*, defaults to 1024):
+             Dimensionality of the layers and the pooler layer.
+         encoder_layers (`int`, *optional*, defaults to 12):
+             Number of encoder layers.
+         decoder_layers (`int`, *optional*, defaults to 12):
+             Number of decoder layers.
+         encoder_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         decoder_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+         encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
+         activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+         dropout (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         activation_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for activations inside the fully connected layer.
+         classifier_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the classifier.
+         max_position_embeddings (`int`, *optional*, defaults to 1024):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         init_std (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+             for more details.
+         decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+             for more details.
+         scale_embedding (`bool`, *optional*, defaults to `False`):
+             Scale embeddings by dividing by sqrt(d_model).
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         num_labels (`int`, *optional*, defaults to 3):
+             The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+         forced_eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+             `eos_token_id`.
+
+     Example:
+
+     ```python
+     >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
+
+     >>> # Initializing a Florence2 Language style configuration
+     >>> configuration = Florence2LanguageConfig()
+
+     >>> # Initializing a model (with random weights)
+     >>> model = Florence2LanguageModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "florence2_language"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+     def __init__(
+         self,
+         vocab_size=51289,
+         max_position_embeddings=1024,
+         encoder_layers=12,
+         encoder_ffn_dim=4096,
+         encoder_attention_heads=16,
+         decoder_layers=12,
+         decoder_ffn_dim=4096,
+         decoder_attention_heads=16,
+         encoder_layerdrop=0.0,
+         decoder_layerdrop=0.0,
+         activation_function="gelu",
+         d_model=1024,
+         dropout=0.1,
+         attention_dropout=0.0,
+         activation_dropout=0.0,
+         init_std=0.02,
+         classifier_dropout=0.0,
+         scale_embedding=False,
+         use_cache=True,
+         num_labels=3,
+         pad_token_id=1,
+         bos_token_id=0,
+         eos_token_id=2,
+         is_encoder_decoder=True,
+         decoder_start_token_id=2,
+         forced_eos_token_id=2,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.d_model = d_model
+         self.encoder_ffn_dim = encoder_ffn_dim
+         self.encoder_layers = encoder_layers
+         self.encoder_attention_heads = encoder_attention_heads
+         self.decoder_ffn_dim = decoder_ffn_dim
+         self.decoder_layers = decoder_layers
+         self.decoder_attention_heads = decoder_attention_heads
+         self.dropout = dropout
+         self.attention_dropout = attention_dropout
+         self.activation_dropout = activation_dropout
+         self.activation_function = activation_function
+         self.init_std = init_std
+         self.encoder_layerdrop = encoder_layerdrop
+         self.decoder_layerdrop = decoder_layerdrop
+         self.classifier_dropout = classifier_dropout
+         self.use_cache = use_cache
+         self.num_hidden_layers = encoder_layers
+         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+         super().__init__(
+             num_labels=num_labels,
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             is_encoder_decoder=is_encoder_decoder,
+             decoder_start_token_id=decoder_start_token_id,
+             forced_eos_token_id=forced_eos_token_id,
+             **kwargs,
+         )
+
+         # ensure backward compatibility for BART CNN models
+         if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+             self.forced_bos_token_id = self.bos_token_id
+             warnings.warn(
+                 f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                 "The config can simply be saved and uploaded again to be fixed."
+             )
+
+
+ class Florence2Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
+     Florence-2 model according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vision_config (`Florence2VisionConfig`, *optional*):
+             Custom vision config or dict.
+         text_config (`Union[AutoConfig, dict]`, *optional*):
+             The config object of the text backbone.
+         ignore_index (`int`, *optional*, defaults to -100):
+             The ignore index for the loss function.
+         vocab_size (`int`, *optional*, defaults to 51289):
+             Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`].
+         projection_dim (`int`, *optional*, defaults to 1024):
+             Dimension of the multimodal projection space.
+
+     Example:
+
+     ```python
+     >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
+
+     >>> # Initializing a clip-like vision config
+     >>> vision_config = CLIPVisionConfig()
+
+     >>> # Initializing a Bart config
+     >>> text_config = BartConfig()
+
+     >>> # Initializing a Florence-2 configuration
+     >>> configuration = Florence2Config(vision_config, text_config)
+
+     >>> # Initializing a model from the florence-2 configuration
+     >>> model = Florence2ForConditionalGeneration(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "florence2"
+     is_composition = False
+
+     def __init__(
+         self,
+         vision_config=None,
+         text_config=None,
+         ignore_index=-100,
+         vocab_size=51289,
+         projection_dim=1024,
+         **kwargs,
+     ):
+         self.ignore_index = ignore_index
+         self.vocab_size = vocab_size
+         self.projection_dim = projection_dim
+         if vision_config is not None:
+             vision_config = Florence2VisionConfig(**vision_config)
+         self.vision_config = vision_config
+
+         self.text_config = text_config
+         if text_config is not None:
+             self.text_config = Florence2LanguageConfig(**text_config)
+
+         super().__init__(**kwargs)
+
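A local sketch of composing these classes by hand, mirroring the values in config.json above; it assumes `configuration_florence2.py` is importable from the working directory:

```python
# Hypothetical local usage; requires configuration_florence2.py on the Python path.
from configuration_florence2 import Florence2Config, Florence2LanguageConfig, Florence2VisionConfig

text_config = Florence2LanguageConfig(vocab_size=51293, d_model=1024)
vision_config = Florence2VisionConfig(window_size=12, projection_dim=1024)

config = Florence2Config(
    vision_config=vision_config.to_dict(),  # __init__ expects dicts for sub-configs
    text_config=text_config.to_dict(),
    vocab_size=51293,
)
print(config.text_config.encoder_layers)  # 12
```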
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "num_beams": 3,
+ "transformers_version": "4.45.2"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:922cf0a84284cfa3ddf7f487482040afce2d976c363070c5c13cccb4d62c6469
+ size 1665460218
modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "auto_map": {
+ "AutoProcessor": "processing_florence2.Florence2Processor"
+ },
+ "crop_size": {
+ "height": 768,
+ "width": 768
+ },
+ "do_center_crop": false,
+ "do_convert_rgb": null,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.485,
+ 0.456,
+ 0.406
+ ],
+ "image_processor_type": "CLIPImageProcessor",
+ "image_seq_length": 577,
+ "image_std": [
+ 0.229,
+ 0.224,
+ 0.225
+ ],
+ "processor_class": "Florence2Processor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "height": 768,
+ "width": 768
+ }
+ }
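The image side of preprocessing is plain CLIP-style: resize to 768×768, rescale by 1/255, normalize with ImageNet mean/std. A sketch reconstructing just that branch from the config above (the image path is a placeholder):

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Mirrors the settings in preprocessor_config.json above.
image_processor = CLIPImageProcessor(
    do_resize=True, size={"height": 768, "width": 768},
    do_center_crop=False, do_rescale=True, rescale_factor=1 / 255,
    do_normalize=True,
    image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225],
    resample=3,  # PIL bicubic
)
pixel_values = image_processor(Image.open("page_1.png").convert("RGB"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 768, 768])
```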
processing_florence2.py ADDED
@@ -0,0 +1,393 @@
+ # coding=utf-8
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for Florence-2.
+ """
+
+ import re
+ import logging
+ from typing import List, Optional, Union
+ import numpy as np
+
+ import torch
+ import PIL.Image
+
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import (
+     PaddingStrategy,
+     TextInput,
+     TruncationStrategy,
+ )
+ from transformers.utils import TensorType
+
+ logger = logging.getLogger(__name__)
+
+
+ class Florence2Processor(ProcessorMixin):
+     attributes = ["image_processor", "tokenizer"]
+     image_processor_class = "CLIPImageProcessor"
+     tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+     ):
+         if image_processor is None:
+             raise ValueError("You need to specify an `image_processor`.")
+         if tokenizer is None:
+             raise ValueError("You need to specify a `tokenizer`.")
+         if not hasattr(image_processor, "image_seq_length"):
+             raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+
+         self.image_seq_length = image_processor.image_seq_length
+
+         tokens_to_add = {
+             'additional_special_tokens':
+                 tokenizer.additional_special_tokens
+                 + ['<od>', '</od>', '<ocr>', '</ocr>']
+                 + [f'<loc_{x}>' for x in range(1000)]
+                 + ['<cap>', '</cap>', '<ncap>', '</ncap>', '<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
+                 + ['<panel>', '<text>', '<character>', '<tail>']
+         }
+         tokenizer.add_special_tokens(tokens_to_add)
+         self.decoder_start_token_id = 2
+
+         self.box_quantizer = BoxQuantizer(
+             mode='floor',
+             bins=(1000, 1000),
+         )
+
+         super().__init__(image_processor, tokenizer)
+
+     def __call__(
+         self,
+         batch_input_text: List[TextInput] = None,
+         batch_input_list_of_list_of_bboxes: List[List[List[List[float]]]] = None,
+         batch_output_text: List[TextInput] = None,
+         batch_output_list_of_list_of_bboxes: List[List[List[List[float]]]] = None,
+         batch_images: ImageInput = None,
+         batch_character_cluster_labels=None,
+         batch_text_character_association_labels=None,
+         batch_text_tail_association_labels=None,
+         batch_is_essential_text_labels=None,
+         batch_tail_character_association_labels=None,
+         padding: Union[bool, str, PaddingStrategy] = None,
+         truncation: Union[bool, str, TruncationStrategy] = None,
+         max_input_length_including_image_tokens=None,
+         max_output_length=None,
+         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+         do_resize: bool = None,
+         do_normalize: bool = None,
+         image_mean: Optional[Union[float, List[float]]] = None,
+         image_std: Optional[Union[float, List[float]]] = None,
+         data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
+         input_data_format: Optional[
+             Union[str, "ChannelDimension"]  # noqa: F821
+         ] = None,
+         resample: "PILImageResampling" = None,  # noqa: F821
+         do_convert_rgb: bool = None,
+         dtype: torch.dtype = None,
+         device: torch.device = None,
+     ) -> BatchFeature:
+
+         assert batch_images is not None, "`batch_images` is expected as an argument to a `Florence2Processor` instance."
+         assert batch_input_text is not None, "`batch_input_text` is expected as an argument to a `Florence2Processor` instance."
+         if batch_input_list_of_list_of_bboxes is None:
+             batch_input_list_of_list_of_bboxes = [[] for _ in range(len(batch_input_text))]
+         assert len(batch_input_text) == len(batch_input_list_of_list_of_bboxes) == len(batch_images), "`batch_input_text`, `batch_input_list_of_list_of_bboxes` and `batch_images` have different lengths."
+         if batch_output_text is None:
+             assert batch_output_list_of_list_of_bboxes is None, "`batch_output_text` and `batch_output_list_of_list_of_bboxes` should be provided together."
+         else:
+             if batch_output_list_of_list_of_bboxes is None:
+                 batch_output_list_of_list_of_bboxes = [[] for _ in range(len(batch_output_text))]
+             assert len(batch_output_text) == len(batch_output_list_of_list_of_bboxes) == len(batch_images), "`batch_output_text`, `batch_output_list_of_list_of_bboxes` and `batch_images` have different lengths."
+
+         max_input_length = max_input_length_including_image_tokens - self.image_seq_length if max_input_length_including_image_tokens is not None else None
+         batch_input_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(batch_input_text, batch_input_list_of_list_of_bboxes, batch_images)]
+         inputs = self.tokenizer(
+             batch_input_texts,
+             return_tensors=return_tensors,
+             padding=padding,
+             truncation=False,
+         )
+         # Truncating manually because I don't want a </s> token at the end of truncated sequences, which is the default behavior
+         if max_input_length is not None and inputs["input_ids"].shape[1] > max_input_length:
+             inputs["input_ids"] = inputs["input_ids"][:, :max_input_length]
+             inputs["attention_mask"] = inputs["attention_mask"][:, :max_input_length]
+
+         if batch_output_text is not None:
+             batch_output_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(batch_output_text, batch_output_list_of_list_of_bboxes, batch_images)]
+             decoder_inputs = self.tokenizer(
+                 batch_output_texts,
+                 return_tensors=return_tensors,
+                 padding=padding,
+                 truncation=False,
+             )
+             # Truncating manually, for the same reason as above
+             if max_output_length is not None and decoder_inputs["input_ids"].shape[1] > max_output_length:
+                 decoder_inputs["input_ids"] = decoder_inputs["input_ids"][:, :max_output_length]
+                 decoder_inputs["attention_mask"] = decoder_inputs["attention_mask"][:, :max_output_length]
+
+         pixel_values = self.image_processor(
+             batch_images,
+             do_resize=do_resize,
+             do_normalize=do_normalize,
+             return_tensors=return_tensors,
+             image_mean=image_mean,
+             image_std=image_std,
+             input_data_format=input_data_format,
+             data_format=data_format,
+             resample=resample,
+             do_convert_rgb=do_convert_rgb,
+         )["pixel_values"]
+
+         if dtype is not None:
+             pixel_values = pixel_values.to(dtype)
+
+         return_data = {**inputs, "pixel_values": pixel_values}
+
+         if batch_output_text is not None:
+             labels = decoder_inputs["input_ids"]
+             # Shift labels one position to the right to build the decoder inputs
+             decoder_input_ids = labels.new_zeros(labels.shape)
+             decoder_input_ids[:, 1:] = labels[:, :-1].clone()
+             decoder_input_ids[:, 0] = self.decoder_start_token_id
+             decoder_attention_mask = decoder_inputs["attention_mask"].new_ones(decoder_input_ids.shape)
+             decoder_attention_mask[:, 1:] = decoder_inputs["attention_mask"][:, :-1].clone()
+             # Mask fill labels to replace pad token ID with -100
+             labels.masked_fill_(labels == self.tokenizer.pad_token_id, -100)
+             return_data.update({
+                 "labels": labels,
+                 "decoder_input_ids": decoder_input_ids,
+                 "decoder_attention_mask": decoder_attention_mask,
+             })
+
+         if device is not None:
+             for key, value in return_data.items():
+                 if isinstance(value, torch.Tensor):
+                     return_data[key] = value.to(device)
+
+         if batch_character_cluster_labels is not None:
+             return_data["character_cluster_labels"] = batch_character_cluster_labels
+         if batch_text_character_association_labels is not None:
+             return_data["text_character_association_labels"] = batch_text_character_association_labels
+         if batch_text_tail_association_labels is not None:
+             return_data["text_tail_association_labels"] = batch_text_tail_association_labels
+         if batch_is_essential_text_labels is not None:
+             return_data["is_essential_text_labels"] = batch_is_essential_text_labels
+         if batch_tail_character_association_labels is not None:
+             return_data["tail_character_association_labels"] = batch_tail_character_association_labels
+
+         return_data["tokenizer"] = self.tokenizer
+         return BatchFeature(data=return_data)
+
+     def cleanup_generated_text(self, generated_text):
+         return generated_text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+
+     def postprocess_output(self, generated_ids, images):
+         generated_ids.masked_fill_(generated_ids == -100, self.tokenizer.pad_token_id)  # only for some testing purposes
+         batch_decoded_texts = self.batch_decode(generated_ids, skip_special_tokens=False)
+         batch_decoded_texts = [self.cleanup_generated_text(text) for text in batch_decoded_texts]
+         batch_list_of_list_of_bboxes = []
+         batch_indices_of_bboxes_in_new_string = []
+         batch_new_texts = []
+         for text, image in zip(batch_decoded_texts, images):
+             size_wh = self._get_image_size_wh(image)
+             parsed_text, list_of_stringified_bboxes, start_end_in_new_string = self._parse_text_with_bboxes(text)
+             list_of_list_of_bboxes = [self.box_quantizer.dequantize_from_stringified_bboxes(stringified_bbox, size_wh) for stringified_bbox in list_of_stringified_bboxes]
+             batch_list_of_list_of_bboxes.append(list_of_list_of_bboxes)
+             batch_indices_of_bboxes_in_new_string.append(start_end_in_new_string)
+             batch_new_texts.append(parsed_text)
+         return batch_new_texts, batch_list_of_list_of_bboxes, batch_indices_of_bboxes_in_new_string
+
+     def _parse_text_with_bboxes(self, text):
+         loc_pattern = r'((?:<loc_\d+>){4}(?:,(?:<loc_\d+>){4})*)'
+         grounding_pattern = r'<grounding>(.*?)</grounding>' + loc_pattern
+
+         list_of_stringified_bboxes = []
+         start_end_in_new_string = []
+         new_text = ""
+         original_pos = 0
+         new_pos = 0
+
+         for match in re.finditer(grounding_pattern + '|' + loc_pattern, text):
+             # Add text before the match
+             new_text += text[original_pos:match.start()]
+             new_pos += match.start() - original_pos
+
+             if match.group(0).startswith('<grounding>'):
+                 # Handle grounding pattern
+                 grounding_text = match.group(1)
+                 locs = match.group(2)
+                 new_text += grounding_text
+                 list_of_stringified_bboxes.append(locs)
+                 start_end_in_new_string.append((new_pos, new_pos + len(grounding_text)))
+                 new_pos += len(grounding_text)
+             else:
+                 # Handle loc pattern
+                 locs = match.group(0)
+                 replacement = ""
+                 new_text += replacement
+                 list_of_stringified_bboxes.append(locs)
+                 start_end_in_new_string.append((new_pos, new_pos + len(replacement)))
+                 new_pos += len(replacement)
+
+             original_pos = match.end()
+
+         # Add any remaining text
+         new_text += text[original_pos:]
+
+         return new_text, list_of_stringified_bboxes, start_end_in_new_string
+
+     def _format_text_with_bboxes(self, text, list_of_list_of_bboxes, image):
+         size_wh = self._get_image_size_wh(image)
+         quantized_bbox_lists = []
+         for list_of_bboxes in list_of_list_of_bboxes:
+             quantized_bboxes = self.box_quantizer.quantize(list_of_bboxes, size_wh=size_wh)
+             stringified_bboxes = [f"<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>" for x1, y1, x2, y2 in quantized_bboxes]
+             stringified_bboxes = ",".join(stringified_bboxes)
+             quantized_bbox_lists.append(stringified_bboxes)
+         return text.format(*quantized_bbox_lists)
+
+     def _get_image_size_wh(self, image):
+         # Get size_wh from image based on its type
+         if isinstance(image, torch.Tensor):
+             # For PyTorch tensor
+             if image.dim() == 3:
+                 size_wh = (image.shape[2], image.shape[1])  # (width, height)
+             elif image.dim() == 4:
+                 size_wh = (image.shape[3], image.shape[2])  # (width, height)
+             else:
+                 raise ValueError("Unsupported tensor dimensions")
+         elif isinstance(image, np.ndarray):
+             # For NumPy array
+             if image.ndim == 2:
+                 size_wh = (image.shape[1], image.shape[0])  # (width, height)
+             elif image.ndim == 3:
+                 size_wh = (image.shape[1], image.shape[0])  # (width, height)
+             else:
+                 raise ValueError("Unsupported array dimensions")
+         elif isinstance(image, PIL.Image.Image):
+             # For PIL Image
+             size_wh = image.size  # Already in (width, height) format
+         else:
+             raise TypeError("Unsupported image type")
+         return size_wh
+
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
+     def batch_decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+         refer to the docstring of this method for more information.
+         """
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
+     def decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+         the docstring of this method for more information.
+         """
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @property
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+ class BoxQuantizer(object):
+     def __init__(self, mode, bins):
+         self.mode = mode
+         self.bins = bins
+
+     def quantize(self, boxes, size_wh):
+         if not isinstance(boxes, torch.Tensor):
+             boxes = torch.tensor(boxes)
+         bins_w, bins_h = self.bins  # Quantization bins.
+         size_w, size_h = size_wh  # Original image size.
+         size_per_bin_w = size_w / bins_w
+         size_per_bin_h = size_h / bins_h
+         xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].
+
+         if self.mode == 'floor':
+             quantized_xmin = (xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
+             quantized_ymin = (ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
+             quantized_xmax = (xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
+             quantized_ymax = (ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
+
+         elif self.mode == 'round':
+             raise NotImplementedError()
+
+         else:
+             raise ValueError('Incorrect quantization type.')
+
+         quantized_boxes = torch.cat(
+             (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
+         ).int()
+
+         return quantized_boxes.tolist()
+
+     def dequantize_from_stringified_bboxes(self, stringified_bboxes, size_wh):
+         bboxes = stringified_bboxes.split(',')
+
+         def parse_bbox(bbox_string):
+             pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
+             match = re.match(pattern, bbox_string)
+             if match:
+                 return [int(match.group(i)) for i in range(1, 5)]
+             else:
+                 raise ValueError(f"Invalid bbox string format: {bbox_string}")
+
+         parsed_bboxes = [parse_bbox(bbox) for bbox in bboxes]
+         return self.dequantize(parsed_bboxes, size_wh).tolist()
+
+     def dequantize(self, boxes: torch.Tensor, size):
+         if not isinstance(boxes, torch.Tensor):
+             boxes = torch.tensor(boxes)
+         bins_w, bins_h = self.bins  # Quantization bins.
+         size_w, size_h = size  # Original image size.
+         size_per_bin_w = size_w / bins_w
+         size_per_bin_h = size_h / bins_h
+         xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].
+
+         if self.mode == 'floor':
+             # Add 0.5 to use the center position of the bin as the coordinate.
+             dequantized_xmin = (xmin + 0.5) * size_per_bin_w
+             dequantized_ymin = (ymin + 0.5) * size_per_bin_h
+             dequantized_xmax = (xmax + 0.5) * size_per_bin_w
+             dequantized_ymax = (ymax + 0.5) * size_per_bin_h
+
+         elif self.mode == 'round':
+             raise NotImplementedError()
+
+         else:
+             raise ValueError('Incorrect quantization type.')
+
+         dequantized_boxes = torch.cat(
+             (dequantized_xmin, dequantized_ymin,
+              dequantized_xmax, dequantized_ymax), dim=-1
+         )
+
+         return dequantized_boxes
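To make the coordinate scheme concrete, here is a small round-trip through the `BoxQuantizer` defined above (a standalone sketch; the image size and box are illustrative):

```
quantizer = BoxQuantizer(mode='floor', bins=(1000, 1000))
size_wh = (768, 1024)  # example (width, height)

# Pixel-space box -> bin indices, which become <loc_*> tokens
quantized = quantizer.quantize([[100.0, 200.0, 300.0, 400.0]], size_wh)
# quantized == [[130, 195, 390, 390]]

# <loc_*> string -> approximate pixel-space box (bin centres)
recovered = quantizer.dequantize_from_stringified_bboxes(
    "<loc_130><loc_195><loc_390><loc_390>", size_wh
)
# recovered is roughly [[100.2, 200.2, 299.9, 399.9]]
```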
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "auto_map": {
+     "AutoProcessor": "processing_florence2.Florence2Processor"
+   },
+   "processor_class": "Florence2Processor"
+ }
special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,413 @@
+ import torch
+ import numpy as np
+ import random
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from shapely.geometry import Point, box
+ import networkx as nx
+ from copy import deepcopy
+ from itertools import groupby
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
+
+ def move_to_device(inputs, device):
+     if hasattr(inputs, "keys"):
+         return {k: move_to_device(v, device) for k, v in inputs.items()}
+     elif isinstance(inputs, list):
+         return [move_to_device(v, device) for v in inputs]
+     elif isinstance(inputs, tuple):
+         return tuple([move_to_device(v, device) for v in inputs])
+     elif isinstance(inputs, np.ndarray):
+         return torch.from_numpy(inputs).to(device)
+     else:
+         return inputs.to(device)
+
+ class UnionFind:
+     def __init__(self, n):
+         self.parent = list(range(n))
+         self.size = [1] * n
+         self.num_components = n
+
+     @classmethod
+     def from_adj_matrix(cls, adj_matrix):
+         ufds = cls(adj_matrix.shape[0])
+         for i in range(adj_matrix.shape[0]):
+             for j in range(adj_matrix.shape[1]):
+                 if adj_matrix[i, j] > 0:
+                     ufds.unite(i, j)
+         return ufds
+
+     @classmethod
+     def from_adj_list(cls, adj_list):
+         ufds = cls(len(adj_list))
+         for i in range(len(adj_list)):
+             for j in adj_list[i]:
+                 ufds.unite(i, j)
+         return ufds
+
+     @classmethod
+     def from_edge_list(cls, edge_list, num_nodes):
+         ufds = cls(num_nodes)
+         for edge in edge_list:
+             ufds.unite(edge[0], edge[1])
+         return ufds
+
+     def find(self, x):
+         if self.parent[x] == x:
+             return x
+         self.parent[x] = self.find(self.parent[x])  # path compression
+         return self.parent[x]
+
+     def unite(self, x, y):
+         x = self.find(x)
+         y = self.find(y)
+         if x != y:
+             # union by size: attach the smaller tree under the larger one
+             if self.size[x] < self.size[y]:
+                 x, y = y, x
+             self.parent[y] = x
+             self.size[x] += self.size[y]
+             self.num_components -= 1
+
+     def get_components_of(self, x):
+         x = self.find(x)
+         return [i for i in range(len(self.parent)) if self.find(i) == x]
+
+     def are_connected(self, x, y):
+         return self.find(x) == self.find(y)
+
+     def get_size(self, x):
+         return self.size[self.find(x)]
+
+     def get_num_components(self):
+         return self.num_components
+
+     def get_labels_for_connected_components(self):
+         map_parent_to_label = {}
+         labels = []
+         for i in range(len(self.parent)):
+             parent = self.find(i)
+             if parent not in map_parent_to_label:
+                 map_parent_to_label[parent] = len(map_parent_to_label)
+             labels.append(map_parent_to_label[parent])
+         return labels
+
+ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
+     h, w = image_as_np_array.shape[:2]
+     if h > w:
+         figure, subplot = plt.subplots(1, 1, figsize=(10, 10 * h / w))
+     else:
+         figure, subplot = plt.subplots(1, 1, figsize=(10 * w / h, 10))
+     subplot.imshow(image_as_np_array)
+     plot_bboxes(subplot, predictions["panels"], color="green")
+     plot_bboxes(subplot, predictions["texts"], color="red", add_index=True)
+     plot_bboxes(subplot, predictions["characters"], color="blue")
+
+     COLOURS = [
+         "#b7ff51",  # green
+         "#f50a8f",  # pink
+         "#4b13b6",  # purple
+         "#ddaa34",  # orange
+         "#bea2a2",  # brown
+     ]
+     colour_index = 0
+     character_cluster_labels = predictions["character_cluster_labels"]
+     unique_label_sorted_by_frequency = sorted(list(set(character_cluster_labels)), key=lambda x: character_cluster_labels.count(x), reverse=True)
+     for label in unique_label_sorted_by_frequency:
+         root = None
+         others = []
+         for i in range(len(predictions["characters"])):
+             if character_cluster_labels[i] == label:
+                 if root is None:
+                     root = i
+                 else:
+                     others.append(i)
+         if colour_index >= len(COLOURS):
+             # preset colours exhausted; draw a fresh random colour
+             random_colour = COLOURS[0]
+             while random_colour in COLOURS:
+                 random_colour = "#" + "".join([random.choice("0123456789ABCDEF") for j in range(6)])
+         else:
+             random_colour = COLOURS[colour_index]
+             colour_index += 1
+         bbox_i = predictions["characters"][root]
+         x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
+         y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
+         subplot.plot([x1], [y1], color=random_colour, marker="o", markersize=5)
+         for j in others:
+             # draw line from centre of bbox i to centre of bbox j
+             bbox_j = predictions["characters"][j]
+             x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
+             y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
+             x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
+             y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
+             subplot.plot([x1, x2], [y1, y2], color=random_colour, linewidth=2)
+             subplot.plot([x2], [y2], color=random_colour, marker="o", markersize=5)
+
+     for (i, j) in predictions["text_character_associations"]:
+         score = predictions["dialog_confidences"][i]
+         bbox_i = predictions["texts"][i]
+         bbox_j = predictions["characters"][j]
+         x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
+         y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
+         x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
+         y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
+         subplot.plot([x1, x2], [y1, y2], color="red", linewidth=2, linestyle="dashed", alpha=score)
+
+     subplot.axis("off")
+     if filename is not None:
+         plt.savefig(filename, bbox_inches="tight", pad_inches=0)
+
+     figure.canvas.draw()
+     image = np.array(figure.canvas.renderer._renderer)
+     plt.close()
+     return image
+
+ def plot_bboxes(subplot, bboxes, color="red", add_index=False):
+     for idx, bbox in enumerate(bboxes):
+         w = bbox[2] - bbox[0]
+         h = bbox[3] - bbox[1]
+         rect = patches.Rectangle(
+             bbox[:2], w, h, linewidth=1, edgecolor=color, facecolor="none", linestyle="solid"
+         )
+         subplot.add_patch(rect)
+         if add_index:
+             cx, cy = bbox[0] + w / 2, bbox[1] + h / 2
+             subplot.text(cx, cy, str(idx), color=color, fontsize=10, ha="center", va="center")
+
+ def sort_panels(rects):
+     before_rects = convert_to_list_of_lists(rects)
+     # slightly erode all rectangles initially to account for imperfect detections
+     rects = [erode_rectangle(rect, 0.05) for rect in before_rects]
+     G = nx.DiGraph()
+     G.add_nodes_from(range(len(rects)))
+     for i in range(len(rects)):
+         for j in range(len(rects)):
+             if i == j:
+                 continue
+             if is_there_a_directed_edge(i, j, rects):
+                 G.add_edge(i, j, weight=get_distance(rects[i], rects[j]))
+             else:
+                 G.add_edge(j, i, weight=get_distance(rects[i], rects[j]))
+     while True:
+         # cycle enumeration can blow up on pathological layouts, so run it with a timeout
+         with ThreadPoolExecutor(max_workers=1) as executor:
+             future = executor.submit(list, nx.simple_cycles(G))
+             try:
+                 cycles = future.result(timeout=60)
+             except TimeoutError:
+                 print("Cycle finding timed out after 60 seconds")
+                 return list(range(len(rects)))
+         cycles = [cycle for cycle in cycles if len(cycle) > 1]
+         if len(cycles) == 0:
+             break
+         # break the cycle by removing its heaviest (most distant) edge
+         cycle = cycles[0]
+         edges = [e for e in zip(cycle, cycle[1:] + cycle[:1])]
+         max_cyclic_edge = max(edges, key=lambda x: G.edges[x]["weight"])
+         G.remove_edge(*max_cyclic_edge)
+     return list(nx.topological_sort(G))
+
+ def is_strictly_above(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return y2A < y1B
+
+ def is_strictly_below(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return y2B < y1A
+
+ def is_strictly_left_of(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return x2A < x1B
+
+ def is_strictly_right_of(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return x2B < x1A
+
+ def intersects(rectA, rectB):
+     return box(*rectA).intersects(box(*rectB))
+
+ def is_there_a_directed_edge(a, b, rects):
+     rectA = rects[a]
+     rectB = rects[b]
+     centre_of_A = [rectA[0] + (rectA[2] - rectA[0]) / 2, rectA[1] + (rectA[3] - rectA[1]) / 2]
+     centre_of_B = [rectB[0] + (rectB[2] - rectB[0]) / 2, rectB[1] + (rectB[3] - rectB[1]) / 2]
+     if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
+         return box(*rectA).area > (box(*rectB)).area
+     copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
+     copy_B = [rectB[0], rectB[1], rectB[2], rectB[3]]
+     while True:
+         if is_strictly_above(copy_A, copy_B) and not is_strictly_left_of(copy_A, copy_B):
+             return 1
+         if is_strictly_above(copy_B, copy_A) and not is_strictly_left_of(copy_B, copy_A):
+             return 0
+         if is_strictly_right_of(copy_A, copy_B) and not is_strictly_below(copy_A, copy_B):
+             return 1
+         if is_strictly_right_of(copy_B, copy_A) and not is_strictly_below(copy_B, copy_A):
+             return 0
+         if is_strictly_below(copy_A, copy_B) and is_strictly_right_of(copy_A, copy_B):
+             return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
+         if is_strictly_below(copy_B, copy_A) and is_strictly_right_of(copy_B, copy_A):
+             return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
+         # otherwise they intersect; erode both and try again
+         copy_A = erode_rectangle(copy_A, 0.05)
+         copy_B = erode_rectangle(copy_B, 0.05)
+
+ def get_distance(rectA, rectB):
+     return box(rectA[0], rectA[1], rectA[2], rectA[3]).distance(box(rectB[0], rectB[1], rectB[2], rectB[3]))
+
+ def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
+     rects = deepcopy(rects)
+     while True:
+         xmin, ymin, xmax, ymax = min(rects[a][0], rects[b][0]), min(rects[a][1], rects[b][1]), max(rects[a][2], rects[b][2]), max(rects[a][3], rects[b][3])
+         rect_index = [i for i in range(len(rects)) if intersects(rects[i], [xmin, ymin, xmax, ymax])]
+         rects_copy = [rect for rect in rects if intersects(rect, [xmin, ymin, xmax, ymax])]
+
+         # try to split the panels using "horizontal" lines
+         overlapping_y_ranges = merge_overlapping_ranges([(y1, y2) for x1, y1, x2, y2 in rects_copy])
+         panel_index_to_split = {}
+         for split_index, (y1, y2) in enumerate(overlapping_y_ranges):
+             for i, index in enumerate(rect_index):
+                 if y1 <= rects_copy[i][1] <= rects_copy[i][3] <= y2:
+                     panel_index_to_split[index] = split_index
+
+         if panel_index_to_split[a] != panel_index_to_split[b]:
+             return panel_index_to_split[a] < panel_index_to_split[b]
+
+         # try to split the panels using "vertical" lines
+         overlapping_x_ranges = merge_overlapping_ranges([(x1, x2) for x1, y1, x2, y2 in rects_copy])
+         panel_index_to_split = {}
+         for split_index, (x1, x2) in enumerate(overlapping_x_ranges[::-1]):
+             for i, index in enumerate(rect_index):
+                 if x1 <= rects_copy[i][0] <= rects_copy[i][2] <= x2:
+                     panel_index_to_split[index] = split_index
+         if panel_index_to_split[a] != panel_index_to_split[b]:
+             return panel_index_to_split[a] < panel_index_to_split[b]
+
+         # otherwise, erode the rectangles and try again
+         rects = [erode_rectangle(rect, 0.05) for rect in rects]
+
+ def erode_rectangle(bbox, erosion_factor):
+     x1, y1, x2, y2 = bbox
+     w, h = x2 - x1, y2 - y1
+     cx, cy = x1 + w / 2, y1 + h / 2
+     if w < h:
+         aspect_ratio = w / h
+         erosion_factor_width = erosion_factor * aspect_ratio
+         erosion_factor_height = erosion_factor
+     else:
+         aspect_ratio = h / w
+         erosion_factor_width = erosion_factor
+         erosion_factor_height = erosion_factor * aspect_ratio
+     w = w - w * erosion_factor_width
+     h = h - h * erosion_factor_height
+     x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
+     return [x1, y1, x2, y2]
+
+ def merge_overlapping_ranges(ranges):
+     """
+     ranges: list of tuples (x1, x2)
+     """
+     if len(ranges) == 0:
+         return []
+     ranges = sorted(ranges, key=lambda x: x[0])
+     merged_ranges = []
+     for i, r in enumerate(ranges):
+         if i == 0:
+             prev_x1, prev_x2 = r
+             continue
+         x1, x2 = r
+         if x1 > prev_x2:
+             merged_ranges.append((prev_x1, prev_x2))
+             prev_x1, prev_x2 = x1, x2
+         else:
+             prev_x2 = max(prev_x2, x2)
+     merged_ranges.append((prev_x1, prev_x2))
+     return merged_ranges
+
+ def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
+     text_bboxes = convert_to_list_of_lists(text_bboxes)
+     sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)
+
+     if len(text_bboxes) == 0:
+         return []
+
+     def indices_of_same_elements(nums):
+         groups = groupby(range(len(nums)), key=lambda i: nums[i])
+         return [list(indices) for _, indices in groups]
+
+     panel_id_for_text = get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes)
+     indices_of_texts = list(range(len(text_bboxes)))
+     indices_of_texts, panel_id_for_text = zip(*sorted(zip(indices_of_texts, panel_id_for_text), key=lambda x: x[1]))
+     indices_of_texts = list(indices_of_texts)
+     grouped_indices = indices_of_same_elements(panel_id_for_text)
+     for group in grouped_indices:
+         subset_of_text_indices = [indices_of_texts[i] for i in group]
+         text_bboxes_of_subset = [text_bboxes[i] for i in subset_of_text_indices]
+         sorted_subset_indices = sort_texts_within_panel(text_bboxes_of_subset)
+         indices_of_texts[group[0] : group[-1] + 1] = [subset_of_text_indices[i] for i in sorted_subset_indices]
+     return indices_of_texts
+
+ def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
+     text_to_panel_mapping = []
+     for text_bbox in text_bboxes:
+         shapely_text_polygon = box(*text_bbox)
+         all_intersections = []
+         all_distances = []
+         if len(sorted_panel_bboxes) == 0:
+             text_to_panel_mapping.append(-1)
+             continue
+         for j, annotation in enumerate(sorted_panel_bboxes):
+             shapely_annotation_polygon = box(*annotation)
+             if shapely_text_polygon.intersects(shapely_annotation_polygon):
+                 all_intersections.append((shapely_text_polygon.intersection(shapely_annotation_polygon).area, j))
+             all_distances.append((shapely_text_polygon.distance(shapely_annotation_polygon), j))
+         if len(all_intersections) == 0:
+             text_to_panel_mapping.append(min(all_distances, key=lambda x: x[0])[1])
+         else:
+             text_to_panel_mapping.append(max(all_intersections, key=lambda x: x[0])[1])
+     return text_to_panel_mapping
+
+ def sort_texts_within_panel(rects):
+     smallest_y = float("inf")
+     greatest_x = float("-inf")
+     for i, rect in enumerate(rects):
+         x1, y1, x2, y2 = rect
+         smallest_y = min(smallest_y, y1)
+         greatest_x = max(greatest_x, x2)
+
+     reference_point = Point(greatest_x, smallest_y)
+
+     polygons_and_index = []
+     for i, rect in enumerate(rects):
+         x1, y1, x2, y2 = rect
+         polygons_and_index.append((box(x1, y1, x2, y2), i))
+     # sort boxes by distance to the reference point (top-right corner of the panel region)
+     polygons_and_index = sorted(polygons_and_index, key=lambda x: reference_point.distance(x[0]))
+     indices = [x[1] for x in polygons_and_index]
+     return indices
+
+ def force_to_be_valid_bboxes(bboxes):
+     if len(bboxes) == 0:
+         return bboxes
+     bboxes_as_xywh = [[x1, y1, x2 - x1, y2 - y1] for x1, y1, x2, y2 in bboxes]
+     bboxes_as_xywh = torch.tensor(bboxes_as_xywh)
+     bboxes_as_xywh[:, 2] = torch.clamp(bboxes_as_xywh[:, 2], min=1)
+     bboxes_as_xywh[:, 3] = torch.clamp(bboxes_as_xywh[:, 3], min=1)
+     bboxes_as_xywh = bboxes_as_xywh.tolist()
+     bboxes_as_xyxy = [[x1, y1, x1 + w, y1 + h] for x1, y1, w, h in bboxes_as_xywh]
+     return bboxes_as_xyxy
+
+ def x1y1wh_to_x1y1x2y2(bbox):
+     x1, y1, w, h = bbox
+     return [x1, y1, x1 + w, y1 + h]
+
+ def x1y1x2y2_to_xywh(bbox):
+     x1, y1, x2, y2 = bbox
+     return [x1, y1, x2 - x1, y2 - y1]
+
+ def convert_to_list_of_lists(rects):
+     if isinstance(rects, torch.Tensor):
+         return rects.tolist()
+     if isinstance(rects, np.ndarray):
+         return rects.tolist()
+     return [[a, b, c, d] for a, b, c, d in rects]
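As a quick illustration of the `UnionFind` helper above (a standalone sketch; the edges are illustrative):

```
# Edges connect nodes 0-1 and 2-3; node 4 stays on its own
ufds = UnionFind.from_edge_list([(0, 1), (2, 3)], num_nodes=5)
print(ufds.get_num_components())                   # 3
print(ufds.get_labels_for_connected_components())  # [0, 0, 1, 1, 2]
print(ufds.are_connected(0, 4))                    # False
```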
vocab.json ADDED
The diff for this file is too large to render. See raw diff