MattyMroz committed
Commit 4a48dbc (verified)
1 Parent(s): 4668426

magiv3 with bugs
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,11 @@
+ # Usage
+
+ ```python
+ model = AutoModelForCausalLM.from_pretrained("ragavsachdeva/magiv3", torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
+ processor = AutoProcessor.from_pretrained("ragavsachdeva/magiv3", trust_remote_code=True)
+
+ model.predict_detections_and_associations(images, processor)
+ model.predict_ocr(images, processor)
+ model.predict_character_grounding(images, captions, processor)
+
+ ```
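The README snippet above omits its imports and input preparation. A minimal end-to-end sketch of the same calls, with the file path and caption as purely hypothetical placeholders (the `predict_*` method names come from the README itself):

```python
# Minimal usage sketch; assumes a CUDA GPU and the packages torch, Pillow, transformers.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model = AutoModelForCausalLM.from_pretrained(
    "ragavsachdeva/magiv3", torch_dtype=torch.float16, trust_remote_code=True
).cuda().eval()
processor = AutoProcessor.from_pretrained("ragavsachdeva/magiv3", trust_remote_code=True)

# Inputs: a list of page images and, for grounding, one caption per image (hypothetical examples).
images = [Image.open("page_1.png").convert("RGB")]
captions = ["A spiky-haired boy in a school uniform"]

with torch.no_grad():
    detections = model.predict_detections_and_associations(images, processor)
    ocr_results = model.predict_ocr(images, processor)
    grounding = model.predict_character_grounding(images, captions, processor)
```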
added_tokens.json ADDED
@@ -0,0 +1,1030 @@
+ {
+ "</cap>": 51270,
+ "</dcap>": 51274,
+ "</grounding>": 51276,
+ "</ncap>": 51272,
+ "</ocr>": 50268,
+ "</od>": 50266,
+ "</poly>": 51287,
+ "</proposal>": 51285,
+ "</region_cap>": 51281,
+ "</region_to_desciption>": 51283,
+ "</seg>": 51278,
+ "<and>": 51288,
+ "<cap>": 51269,
+ "<character>": 51291,
+ "<dcap>": 51273,
+ "<grounding>": 51275,
+ "<loc_0>": 50269,
+ "<loc_100>": 50369,
+ "<loc_101>": 50370,
+ "<loc_102>": 50371,
+ "<loc_103>": 50372,
+ "<loc_104>": 50373,
+ "<loc_105>": 50374,
+ "<loc_106>": 50375,
+ "<loc_107>": 50376,
+ "<loc_108>": 50377,
+ "<loc_109>": 50378,
+ "<loc_10>": 50279,
+ "<loc_110>": 50379,
+ "<loc_111>": 50380,
+ "<loc_112>": 50381,
+ "<loc_113>": 50382,
+ "<loc_114>": 50383,
+ "<loc_115>": 50384,
+ "<loc_116>": 50385,
+ "<loc_117>": 50386,
+ "<loc_118>": 50387,
+ "<loc_119>": 50388,
+ "<loc_11>": 50280,
+ "<loc_120>": 50389,
+ "<loc_121>": 50390,
+ "<loc_122>": 50391,
+ "<loc_123>": 50392,
+ "<loc_124>": 50393,
+ "<loc_125>": 50394,
+ "<loc_126>": 50395,
+ "<loc_127>": 50396,
+ "<loc_128>": 50397,
+ "<loc_129>": 50398,
+ "<loc_12>": 50281,
+ "<loc_130>": 50399,
+ "<loc_131>": 50400,
+ "<loc_132>": 50401,
+ "<loc_133>": 50402,
+ "<loc_134>": 50403,
+ "<loc_135>": 50404,
+ "<loc_136>": 50405,
+ "<loc_137>": 50406,
+ "<loc_138>": 50407,
+ "<loc_139>": 50408,
+ "<loc_13>": 50282,
+ "<loc_140>": 50409,
+ "<loc_141>": 50410,
+ "<loc_142>": 50411,
+ "<loc_143>": 50412,
+ "<loc_144>": 50413,
+ "<loc_145>": 50414,
+ "<loc_146>": 50415,
+ "<loc_147>": 50416,
+ "<loc_148>": 50417,
+ "<loc_149>": 50418,
+ "<loc_14>": 50283,
+ "<loc_150>": 50419,
+ "<loc_151>": 50420,
+ "<loc_152>": 50421,
+ "<loc_153>": 50422,
+ "<loc_154>": 50423,
+ "<loc_155>": 50424,
+ "<loc_156>": 50425,
+ "<loc_157>": 50426,
+ "<loc_158>": 50427,
+ "<loc_159>": 50428,
+ "<loc_15>": 50284,
+ "<loc_160>": 50429,
+ "<loc_161>": 50430,
+ "<loc_162>": 50431,
+ "<loc_163>": 50432,
+ "<loc_164>": 50433,
+ "<loc_165>": 50434,
+ "<loc_166>": 50435,
+ "<loc_167>": 50436,
+ "<loc_168>": 50437,
+ "<loc_169>": 50438,
+ "<loc_16>": 50285,
+ "<loc_170>": 50439,
+ "<loc_171>": 50440,
+ "<loc_172>": 50441,
+ "<loc_173>": 50442,
+ "<loc_174>": 50443,
+ "<loc_175>": 50444,
+ "<loc_176>": 50445,
+ "<loc_177>": 50446,
+ "<loc_178>": 50447,
+ "<loc_179>": 50448,
+ "<loc_17>": 50286,
+ "<loc_180>": 50449,
+ "<loc_181>": 50450,
+ "<loc_182>": 50451,
+ "<loc_183>": 50452,
+ "<loc_184>": 50453,
+ "<loc_185>": 50454,
+ "<loc_186>": 50455,
+ "<loc_187>": 50456,
+ "<loc_188>": 50457,
+ "<loc_189>": 50458,
+ "<loc_18>": 50287,
+ "<loc_190>": 50459,
+ "<loc_191>": 50460,
+ "<loc_192>": 50461,
+ "<loc_193>": 50462,
+ "<loc_194>": 50463,
+ "<loc_195>": 50464,
+ "<loc_196>": 50465,
+ "<loc_197>": 50466,
+ "<loc_198>": 50467,
+ "<loc_199>": 50468,
+ "<loc_19>": 50288,
+ "<loc_1>": 50270,
+ "<loc_200>": 50469,
+ "<loc_201>": 50470,
+ "<loc_202>": 50471,
+ "<loc_203>": 50472,
+ "<loc_204>": 50473,
+ "<loc_205>": 50474,
+ "<loc_206>": 50475,
+ "<loc_207>": 50476,
+ "<loc_208>": 50477,
+ "<loc_209>": 50478,
+ "<loc_20>": 50289,
+ "<loc_210>": 50479,
+ "<loc_211>": 50480,
+ "<loc_212>": 50481,
+ "<loc_213>": 50482,
+ "<loc_214>": 50483,
+ "<loc_215>": 50484,
+ "<loc_216>": 50485,
+ "<loc_217>": 50486,
+ "<loc_218>": 50487,
+ "<loc_219>": 50488,
+ "<loc_21>": 50290,
+ "<loc_220>": 50489,
+ "<loc_221>": 50490,
+ "<loc_222>": 50491,
+ "<loc_223>": 50492,
+ "<loc_224>": 50493,
+ "<loc_225>": 50494,
+ "<loc_226>": 50495,
+ "<loc_227>": 50496,
+ "<loc_228>": 50497,
+ "<loc_229>": 50498,
+ "<loc_22>": 50291,
+ "<loc_230>": 50499,
+ "<loc_231>": 50500,
+ "<loc_232>": 50501,
+ "<loc_233>": 50502,
+ "<loc_234>": 50503,
+ "<loc_235>": 50504,
+ "<loc_236>": 50505,
+ "<loc_237>": 50506,
+ "<loc_238>": 50507,
+ "<loc_239>": 50508,
+ "<loc_23>": 50292,
+ "<loc_240>": 50509,
+ "<loc_241>": 50510,
+ "<loc_242>": 50511,
+ "<loc_243>": 50512,
+ "<loc_244>": 50513,
+ "<loc_245>": 50514,
+ "<loc_246>": 50515,
+ "<loc_247>": 50516,
+ "<loc_248>": 50517,
+ "<loc_249>": 50518,
+ "<loc_24>": 50293,
+ "<loc_250>": 50519,
+ "<loc_251>": 50520,
+ "<loc_252>": 50521,
+ "<loc_253>": 50522,
+ "<loc_254>": 50523,
+ "<loc_255>": 50524,
+ "<loc_256>": 50525,
+ "<loc_257>": 50526,
+ "<loc_258>": 50527,
+ "<loc_259>": 50528,
+ "<loc_25>": 50294,
+ "<loc_260>": 50529,
+ "<loc_261>": 50530,
+ "<loc_262>": 50531,
+ "<loc_263>": 50532,
+ "<loc_264>": 50533,
+ "<loc_265>": 50534,
+ "<loc_266>": 50535,
+ "<loc_267>": 50536,
+ "<loc_268>": 50537,
+ "<loc_269>": 50538,
+ "<loc_26>": 50295,
+ "<loc_270>": 50539,
+ "<loc_271>": 50540,
+ "<loc_272>": 50541,
+ "<loc_273>": 50542,
+ "<loc_274>": 50543,
+ "<loc_275>": 50544,
+ "<loc_276>": 50545,
+ "<loc_277>": 50546,
+ "<loc_278>": 50547,
+ "<loc_279>": 50548,
+ "<loc_27>": 50296,
+ "<loc_280>": 50549,
+ "<loc_281>": 50550,
+ "<loc_282>": 50551,
+ "<loc_283>": 50552,
+ "<loc_284>": 50553,
+ "<loc_285>": 50554,
+ "<loc_286>": 50555,
+ "<loc_287>": 50556,
+ "<loc_288>": 50557,
+ "<loc_289>": 50558,
+ "<loc_28>": 50297,
+ "<loc_290>": 50559,
+ "<loc_291>": 50560,
+ "<loc_292>": 50561,
+ "<loc_293>": 50562,
+ "<loc_294>": 50563,
+ "<loc_295>": 50564,
+ "<loc_296>": 50565,
+ "<loc_297>": 50566,
+ "<loc_298>": 50567,
+ "<loc_299>": 50568,
+ "<loc_29>": 50298,
+ "<loc_2>": 50271,
+ "<loc_300>": 50569,
+ "<loc_301>": 50570,
+ "<loc_302>": 50571,
+ "<loc_303>": 50572,
+ "<loc_304>": 50573,
+ "<loc_305>": 50574,
+ "<loc_306>": 50575,
+ "<loc_307>": 50576,
+ "<loc_308>": 50577,
+ "<loc_309>": 50578,
+ "<loc_30>": 50299,
+ "<loc_310>": 50579,
+ "<loc_311>": 50580,
+ "<loc_312>": 50581,
+ "<loc_313>": 50582,
+ "<loc_314>": 50583,
+ "<loc_315>": 50584,
+ "<loc_316>": 50585,
+ "<loc_317>": 50586,
+ "<loc_318>": 50587,
+ "<loc_319>": 50588,
+ "<loc_31>": 50300,
+ "<loc_320>": 50589,
+ "<loc_321>": 50590,
+ "<loc_322>": 50591,
+ "<loc_323>": 50592,
+ "<loc_324>": 50593,
+ "<loc_325>": 50594,
+ "<loc_326>": 50595,
+ "<loc_327>": 50596,
+ "<loc_328>": 50597,
+ "<loc_329>": 50598,
+ "<loc_32>": 50301,
+ "<loc_330>": 50599,
+ "<loc_331>": 50600,
+ "<loc_332>": 50601,
+ "<loc_333>": 50602,
+ "<loc_334>": 50603,
+ "<loc_335>": 50604,
+ "<loc_336>": 50605,
+ "<loc_337>": 50606,
+ "<loc_338>": 50607,
+ "<loc_339>": 50608,
+ "<loc_33>": 50302,
+ "<loc_340>": 50609,
+ "<loc_341>": 50610,
+ "<loc_342>": 50611,
+ "<loc_343>": 50612,
+ "<loc_344>": 50613,
+ "<loc_345>": 50614,
+ "<loc_346>": 50615,
+ "<loc_347>": 50616,
+ "<loc_348>": 50617,
+ "<loc_349>": 50618,
+ "<loc_34>": 50303,
+ "<loc_350>": 50619,
+ "<loc_351>": 50620,
+ "<loc_352>": 50621,
+ "<loc_353>": 50622,
+ "<loc_354>": 50623,
+ "<loc_355>": 50624,
+ "<loc_356>": 50625,
+ "<loc_357>": 50626,
+ "<loc_358>": 50627,
+ "<loc_359>": 50628,
+ "<loc_35>": 50304,
+ "<loc_360>": 50629,
+ "<loc_361>": 50630,
+ "<loc_362>": 50631,
+ "<loc_363>": 50632,
+ "<loc_364>": 50633,
+ "<loc_365>": 50634,
+ "<loc_366>": 50635,
+ "<loc_367>": 50636,
+ "<loc_368>": 50637,
+ "<loc_369>": 50638,
+ "<loc_36>": 50305,
+ "<loc_370>": 50639,
+ "<loc_371>": 50640,
+ "<loc_372>": 50641,
+ "<loc_373>": 50642,
+ "<loc_374>": 50643,
+ "<loc_375>": 50644,
+ "<loc_376>": 50645,
+ "<loc_377>": 50646,
+ "<loc_378>": 50647,
+ "<loc_379>": 50648,
+ "<loc_37>": 50306,
+ "<loc_380>": 50649,
+ "<loc_381>": 50650,
+ "<loc_382>": 50651,
+ "<loc_383>": 50652,
+ "<loc_384>": 50653,
+ "<loc_385>": 50654,
+ "<loc_386>": 50655,
+ "<loc_387>": 50656,
+ "<loc_388>": 50657,
+ "<loc_389>": 50658,
+ "<loc_38>": 50307,
+ "<loc_390>": 50659,
+ "<loc_391>": 50660,
+ "<loc_392>": 50661,
+ "<loc_393>": 50662,
+ "<loc_394>": 50663,
+ "<loc_395>": 50664,
+ "<loc_396>": 50665,
+ "<loc_397>": 50666,
+ "<loc_398>": 50667,
+ "<loc_399>": 50668,
+ "<loc_39>": 50308,
+ "<loc_3>": 50272,
+ "<loc_400>": 50669,
+ "<loc_401>": 50670,
+ "<loc_402>": 50671,
+ "<loc_403>": 50672,
+ "<loc_404>": 50673,
+ "<loc_405>": 50674,
+ "<loc_406>": 50675,
+ "<loc_407>": 50676,
+ "<loc_408>": 50677,
+ "<loc_409>": 50678,
+ "<loc_40>": 50309,
+ "<loc_410>": 50679,
+ "<loc_411>": 50680,
+ "<loc_412>": 50681,
+ "<loc_413>": 50682,
+ "<loc_414>": 50683,
+ "<loc_415>": 50684,
+ "<loc_416>": 50685,
+ "<loc_417>": 50686,
+ "<loc_418>": 50687,
+ "<loc_419>": 50688,
+ "<loc_41>": 50310,
+ "<loc_420>": 50689,
+ "<loc_421>": 50690,
+ "<loc_422>": 50691,
+ "<loc_423>": 50692,
+ "<loc_424>": 50693,
+ "<loc_425>": 50694,
+ "<loc_426>": 50695,
+ "<loc_427>": 50696,
+ "<loc_428>": 50697,
+ "<loc_429>": 50698,
+ "<loc_42>": 50311,
+ "<loc_430>": 50699,
+ "<loc_431>": 50700,
+ "<loc_432>": 50701,
+ "<loc_433>": 50702,
+ "<loc_434>": 50703,
+ "<loc_435>": 50704,
+ "<loc_436>": 50705,
+ "<loc_437>": 50706,
+ "<loc_438>": 50707,
+ "<loc_439>": 50708,
+ "<loc_43>": 50312,
+ "<loc_440>": 50709,
+ "<loc_441>": 50710,
+ "<loc_442>": 50711,
+ "<loc_443>": 50712,
+ "<loc_444>": 50713,
+ "<loc_445>": 50714,
+ "<loc_446>": 50715,
+ "<loc_447>": 50716,
+ "<loc_448>": 50717,
+ "<loc_449>": 50718,
+ "<loc_44>": 50313,
+ "<loc_450>": 50719,
+ "<loc_451>": 50720,
+ "<loc_452>": 50721,
+ "<loc_453>": 50722,
+ "<loc_454>": 50723,
+ "<loc_455>": 50724,
+ "<loc_456>": 50725,
+ "<loc_457>": 50726,
+ "<loc_458>": 50727,
+ "<loc_459>": 50728,
+ "<loc_45>": 50314,
+ "<loc_460>": 50729,
+ "<loc_461>": 50730,
+ "<loc_462>": 50731,
+ "<loc_463>": 50732,
+ "<loc_464>": 50733,
+ "<loc_465>": 50734,
+ "<loc_466>": 50735,
+ "<loc_467>": 50736,
+ "<loc_468>": 50737,
+ "<loc_469>": 50738,
+ "<loc_46>": 50315,
+ "<loc_470>": 50739,
+ "<loc_471>": 50740,
+ "<loc_472>": 50741,
+ "<loc_473>": 50742,
+ "<loc_474>": 50743,
+ "<loc_475>": 50744,
+ "<loc_476>": 50745,
+ "<loc_477>": 50746,
+ "<loc_478>": 50747,
+ "<loc_479>": 50748,
+ "<loc_47>": 50316,
+ "<loc_480>": 50749,
+ "<loc_481>": 50750,
+ "<loc_482>": 50751,
+ "<loc_483>": 50752,
+ "<loc_484>": 50753,
+ "<loc_485>": 50754,
+ "<loc_486>": 50755,
+ "<loc_487>": 50756,
+ "<loc_488>": 50757,
+ "<loc_489>": 50758,
+ "<loc_48>": 50317,
+ "<loc_490>": 50759,
+ "<loc_491>": 50760,
+ "<loc_492>": 50761,
+ "<loc_493>": 50762,
+ "<loc_494>": 50763,
+ "<loc_495>": 50764,
+ "<loc_496>": 50765,
+ "<loc_497>": 50766,
+ "<loc_498>": 50767,
+ "<loc_499>": 50768,
+ "<loc_49>": 50318,
+ "<loc_4>": 50273,
+ "<loc_500>": 50769,
+ "<loc_501>": 50770,
+ "<loc_502>": 50771,
+ "<loc_503>": 50772,
+ "<loc_504>": 50773,
+ "<loc_505>": 50774,
+ "<loc_506>": 50775,
+ "<loc_507>": 50776,
+ "<loc_508>": 50777,
+ "<loc_509>": 50778,
+ "<loc_50>": 50319,
+ "<loc_510>": 50779,
+ "<loc_511>": 50780,
+ "<loc_512>": 50781,
+ "<loc_513>": 50782,
+ "<loc_514>": 50783,
+ "<loc_515>": 50784,
+ "<loc_516>": 50785,
+ "<loc_517>": 50786,
+ "<loc_518>": 50787,
+ "<loc_519>": 50788,
+ "<loc_51>": 50320,
+ "<loc_520>": 50789,
+ "<loc_521>": 50790,
+ "<loc_522>": 50791,
+ "<loc_523>": 50792,
+ "<loc_524>": 50793,
+ "<loc_525>": 50794,
+ "<loc_526>": 50795,
+ "<loc_527>": 50796,
+ "<loc_528>": 50797,
+ "<loc_529>": 50798,
+ "<loc_52>": 50321,
+ "<loc_530>": 50799,
+ "<loc_531>": 50800,
+ "<loc_532>": 50801,
+ "<loc_533>": 50802,
+ "<loc_534>": 50803,
+ "<loc_535>": 50804,
+ "<loc_536>": 50805,
+ "<loc_537>": 50806,
+ "<loc_538>": 50807,
+ "<loc_539>": 50808,
+ "<loc_53>": 50322,
+ "<loc_540>": 50809,
+ "<loc_541>": 50810,
+ "<loc_542>": 50811,
+ "<loc_543>": 50812,
+ "<loc_544>": 50813,
+ "<loc_545>": 50814,
+ "<loc_546>": 50815,
+ "<loc_547>": 50816,
+ "<loc_548>": 50817,
+ "<loc_549>": 50818,
+ "<loc_54>": 50323,
+ "<loc_550>": 50819,
+ "<loc_551>": 50820,
+ "<loc_552>": 50821,
+ "<loc_553>": 50822,
+ "<loc_554>": 50823,
+ "<loc_555>": 50824,
+ "<loc_556>": 50825,
+ "<loc_557>": 50826,
+ "<loc_558>": 50827,
+ "<loc_559>": 50828,
+ "<loc_55>": 50324,
+ "<loc_560>": 50829,
+ "<loc_561>": 50830,
+ "<loc_562>": 50831,
+ "<loc_563>": 50832,
+ "<loc_564>": 50833,
+ "<loc_565>": 50834,
+ "<loc_566>": 50835,
+ "<loc_567>": 50836,
+ "<loc_568>": 50837,
+ "<loc_569>": 50838,
+ "<loc_56>": 50325,
+ "<loc_570>": 50839,
+ "<loc_571>": 50840,
+ "<loc_572>": 50841,
+ "<loc_573>": 50842,
+ "<loc_574>": 50843,
+ "<loc_575>": 50844,
+ "<loc_576>": 50845,
+ "<loc_577>": 50846,
+ "<loc_578>": 50847,
+ "<loc_579>": 50848,
+ "<loc_57>": 50326,
+ "<loc_580>": 50849,
+ "<loc_581>": 50850,
+ "<loc_582>": 50851,
+ "<loc_583>": 50852,
+ "<loc_584>": 50853,
+ "<loc_585>": 50854,
+ "<loc_586>": 50855,
+ "<loc_587>": 50856,
+ "<loc_588>": 50857,
+ "<loc_589>": 50858,
+ "<loc_58>": 50327,
+ "<loc_590>": 50859,
+ "<loc_591>": 50860,
+ "<loc_592>": 50861,
+ "<loc_593>": 50862,
+ "<loc_594>": 50863,
+ "<loc_595>": 50864,
+ "<loc_596>": 50865,
+ "<loc_597>": 50866,
+ "<loc_598>": 50867,
+ "<loc_599>": 50868,
+ "<loc_59>": 50328,
+ "<loc_5>": 50274,
+ "<loc_600>": 50869,
+ "<loc_601>": 50870,
+ "<loc_602>": 50871,
+ "<loc_603>": 50872,
+ "<loc_604>": 50873,
+ "<loc_605>": 50874,
+ "<loc_606>": 50875,
+ "<loc_607>": 50876,
+ "<loc_608>": 50877,
+ "<loc_609>": 50878,
+ "<loc_60>": 50329,
+ "<loc_610>": 50879,
+ "<loc_611>": 50880,
+ "<loc_612>": 50881,
+ "<loc_613>": 50882,
+ "<loc_614>": 50883,
+ "<loc_615>": 50884,
+ "<loc_616>": 50885,
+ "<loc_617>": 50886,
+ "<loc_618>": 50887,
+ "<loc_619>": 50888,
+ "<loc_61>": 50330,
+ "<loc_620>": 50889,
+ "<loc_621>": 50890,
+ "<loc_622>": 50891,
+ "<loc_623>": 50892,
+ "<loc_624>": 50893,
+ "<loc_625>": 50894,
+ "<loc_626>": 50895,
+ "<loc_627>": 50896,
+ "<loc_628>": 50897,
+ "<loc_629>": 50898,
+ "<loc_62>": 50331,
+ "<loc_630>": 50899,
+ "<loc_631>": 50900,
+ "<loc_632>": 50901,
+ "<loc_633>": 50902,
+ "<loc_634>": 50903,
+ "<loc_635>": 50904,
+ "<loc_636>": 50905,
+ "<loc_637>": 50906,
+ "<loc_638>": 50907,
+ "<loc_639>": 50908,
+ "<loc_63>": 50332,
+ "<loc_640>": 50909,
+ "<loc_641>": 50910,
+ "<loc_642>": 50911,
+ "<loc_643>": 50912,
+ "<loc_644>": 50913,
+ "<loc_645>": 50914,
+ "<loc_646>": 50915,
+ "<loc_647>": 50916,
+ "<loc_648>": 50917,
+ "<loc_649>": 50918,
+ "<loc_64>": 50333,
+ "<loc_650>": 50919,
+ "<loc_651>": 50920,
+ "<loc_652>": 50921,
+ "<loc_653>": 50922,
+ "<loc_654>": 50923,
+ "<loc_655>": 50924,
+ "<loc_656>": 50925,
+ "<loc_657>": 50926,
+ "<loc_658>": 50927,
+ "<loc_659>": 50928,
+ "<loc_65>": 50334,
+ "<loc_660>": 50929,
+ "<loc_661>": 50930,
+ "<loc_662>": 50931,
+ "<loc_663>": 50932,
+ "<loc_664>": 50933,
+ "<loc_665>": 50934,
+ "<loc_666>": 50935,
+ "<loc_667>": 50936,
+ "<loc_668>": 50937,
+ "<loc_669>": 50938,
+ "<loc_66>": 50335,
+ "<loc_670>": 50939,
+ "<loc_671>": 50940,
+ "<loc_672>": 50941,
+ "<loc_673>": 50942,
+ "<loc_674>": 50943,
+ "<loc_675>": 50944,
+ "<loc_676>": 50945,
+ "<loc_677>": 50946,
+ "<loc_678>": 50947,
+ "<loc_679>": 50948,
+ "<loc_67>": 50336,
+ "<loc_680>": 50949,
+ "<loc_681>": 50950,
+ "<loc_682>": 50951,
+ "<loc_683>": 50952,
+ "<loc_684>": 50953,
+ "<loc_685>": 50954,
+ "<loc_686>": 50955,
+ "<loc_687>": 50956,
+ "<loc_688>": 50957,
+ "<loc_689>": 50958,
+ "<loc_68>": 50337,
+ "<loc_690>": 50959,
+ "<loc_691>": 50960,
+ "<loc_692>": 50961,
+ "<loc_693>": 50962,
+ "<loc_694>": 50963,
+ "<loc_695>": 50964,
+ "<loc_696>": 50965,
+ "<loc_697>": 50966,
+ "<loc_698>": 50967,
+ "<loc_699>": 50968,
+ "<loc_69>": 50338,
+ "<loc_6>": 50275,
+ "<loc_700>": 50969,
+ "<loc_701>": 50970,
+ "<loc_702>": 50971,
+ "<loc_703>": 50972,
+ "<loc_704>": 50973,
+ "<loc_705>": 50974,
+ "<loc_706>": 50975,
+ "<loc_707>": 50976,
+ "<loc_708>": 50977,
+ "<loc_709>": 50978,
+ "<loc_70>": 50339,
+ "<loc_710>": 50979,
+ "<loc_711>": 50980,
+ "<loc_712>": 50981,
+ "<loc_713>": 50982,
+ "<loc_714>": 50983,
+ "<loc_715>": 50984,
+ "<loc_716>": 50985,
+ "<loc_717>": 50986,
+ "<loc_718>": 50987,
+ "<loc_719>": 50988,
+ "<loc_71>": 50340,
+ "<loc_720>": 50989,
+ "<loc_721>": 50990,
+ "<loc_722>": 50991,
+ "<loc_723>": 50992,
+ "<loc_724>": 50993,
+ "<loc_725>": 50994,
+ "<loc_726>": 50995,
+ "<loc_727>": 50996,
+ "<loc_728>": 50997,
+ "<loc_729>": 50998,
+ "<loc_72>": 50341,
+ "<loc_730>": 50999,
+ "<loc_731>": 51000,
+ "<loc_732>": 51001,
+ "<loc_733>": 51002,
+ "<loc_734>": 51003,
+ "<loc_735>": 51004,
+ "<loc_736>": 51005,
+ "<loc_737>": 51006,
+ "<loc_738>": 51007,
+ "<loc_739>": 51008,
+ "<loc_73>": 50342,
+ "<loc_740>": 51009,
+ "<loc_741>": 51010,
+ "<loc_742>": 51011,
+ "<loc_743>": 51012,
+ "<loc_744>": 51013,
+ "<loc_745>": 51014,
+ "<loc_746>": 51015,
+ "<loc_747>": 51016,
+ "<loc_748>": 51017,
+ "<loc_749>": 51018,
+ "<loc_74>": 50343,
+ "<loc_750>": 51019,
+ "<loc_751>": 51020,
+ "<loc_752>": 51021,
+ "<loc_753>": 51022,
+ "<loc_754>": 51023,
+ "<loc_755>": 51024,
+ "<loc_756>": 51025,
+ "<loc_757>": 51026,
+ "<loc_758>": 51027,
+ "<loc_759>": 51028,
+ "<loc_75>": 50344,
+ "<loc_760>": 51029,
+ "<loc_761>": 51030,
+ "<loc_762>": 51031,
+ "<loc_763>": 51032,
+ "<loc_764>": 51033,
+ "<loc_765>": 51034,
+ "<loc_766>": 51035,
+ "<loc_767>": 51036,
+ "<loc_768>": 51037,
+ "<loc_769>": 51038,
+ "<loc_76>": 50345,
+ "<loc_770>": 51039,
+ "<loc_771>": 51040,
+ "<loc_772>": 51041,
+ "<loc_773>": 51042,
+ "<loc_774>": 51043,
+ "<loc_775>": 51044,
+ "<loc_776>": 51045,
+ "<loc_777>": 51046,
+ "<loc_778>": 51047,
+ "<loc_779>": 51048,
+ "<loc_77>": 50346,
+ "<loc_780>": 51049,
+ "<loc_781>": 51050,
+ "<loc_782>": 51051,
+ "<loc_783>": 51052,
+ "<loc_784>": 51053,
+ "<loc_785>": 51054,
+ "<loc_786>": 51055,
+ "<loc_787>": 51056,
+ "<loc_788>": 51057,
+ "<loc_789>": 51058,
+ "<loc_78>": 50347,
+ "<loc_790>": 51059,
+ "<loc_791>": 51060,
+ "<loc_792>": 51061,
+ "<loc_793>": 51062,
+ "<loc_794>": 51063,
+ "<loc_795>": 51064,
+ "<loc_796>": 51065,
+ "<loc_797>": 51066,
+ "<loc_798>": 51067,
+ "<loc_799>": 51068,
+ "<loc_79>": 50348,
+ "<loc_7>": 50276,
+ "<loc_800>": 51069,
+ "<loc_801>": 51070,
+ "<loc_802>": 51071,
+ "<loc_803>": 51072,
+ "<loc_804>": 51073,
+ "<loc_805>": 51074,
+ "<loc_806>": 51075,
+ "<loc_807>": 51076,
+ "<loc_808>": 51077,
+ "<loc_809>": 51078,
+ "<loc_80>": 50349,
+ "<loc_810>": 51079,
+ "<loc_811>": 51080,
+ "<loc_812>": 51081,
+ "<loc_813>": 51082,
+ "<loc_814>": 51083,
+ "<loc_815>": 51084,
+ "<loc_816>": 51085,
+ "<loc_817>": 51086,
+ "<loc_818>": 51087,
+ "<loc_819>": 51088,
+ "<loc_81>": 50350,
+ "<loc_820>": 51089,
+ "<loc_821>": 51090,
+ "<loc_822>": 51091,
+ "<loc_823>": 51092,
+ "<loc_824>": 51093,
+ "<loc_825>": 51094,
+ "<loc_826>": 51095,
+ "<loc_827>": 51096,
+ "<loc_828>": 51097,
+ "<loc_829>": 51098,
+ "<loc_82>": 50351,
+ "<loc_830>": 51099,
+ "<loc_831>": 51100,
+ "<loc_832>": 51101,
+ "<loc_833>": 51102,
+ "<loc_834>": 51103,
+ "<loc_835>": 51104,
+ "<loc_836>": 51105,
+ "<loc_837>": 51106,
+ "<loc_838>": 51107,
+ "<loc_839>": 51108,
+ "<loc_83>": 50352,
+ "<loc_840>": 51109,
+ "<loc_841>": 51110,
+ "<loc_842>": 51111,
+ "<loc_843>": 51112,
+ "<loc_844>": 51113,
+ "<loc_845>": 51114,
+ "<loc_846>": 51115,
+ "<loc_847>": 51116,
+ "<loc_848>": 51117,
+ "<loc_849>": 51118,
+ "<loc_84>": 50353,
+ "<loc_850>": 51119,
+ "<loc_851>": 51120,
+ "<loc_852>": 51121,
+ "<loc_853>": 51122,
+ "<loc_854>": 51123,
+ "<loc_855>": 51124,
+ "<loc_856>": 51125,
+ "<loc_857>": 51126,
+ "<loc_858>": 51127,
+ "<loc_859>": 51128,
+ "<loc_85>": 50354,
+ "<loc_860>": 51129,
+ "<loc_861>": 51130,
+ "<loc_862>": 51131,
+ "<loc_863>": 51132,
+ "<loc_864>": 51133,
+ "<loc_865>": 51134,
+ "<loc_866>": 51135,
+ "<loc_867>": 51136,
+ "<loc_868>": 51137,
+ "<loc_869>": 51138,
+ "<loc_86>": 50355,
+ "<loc_870>": 51139,
+ "<loc_871>": 51140,
+ "<loc_872>": 51141,
+ "<loc_873>": 51142,
+ "<loc_874>": 51143,
+ "<loc_875>": 51144,
+ "<loc_876>": 51145,
+ "<loc_877>": 51146,
+ "<loc_878>": 51147,
+ "<loc_879>": 51148,
+ "<loc_87>": 50356,
+ "<loc_880>": 51149,
+ "<loc_881>": 51150,
+ "<loc_882>": 51151,
+ "<loc_883>": 51152,
+ "<loc_884>": 51153,
+ "<loc_885>": 51154,
+ "<loc_886>": 51155,
+ "<loc_887>": 51156,
+ "<loc_888>": 51157,
+ "<loc_889>": 51158,
+ "<loc_88>": 50357,
+ "<loc_890>": 51159,
+ "<loc_891>": 51160,
+ "<loc_892>": 51161,
+ "<loc_893>": 51162,
+ "<loc_894>": 51163,
+ "<loc_895>": 51164,
+ "<loc_896>": 51165,
+ "<loc_897>": 51166,
+ "<loc_898>": 51167,
+ "<loc_899>": 51168,
+ "<loc_89>": 50358,
+ "<loc_8>": 50277,
+ "<loc_900>": 51169,
+ "<loc_901>": 51170,
+ "<loc_902>": 51171,
+ "<loc_903>": 51172,
+ "<loc_904>": 51173,
+ "<loc_905>": 51174,
+ "<loc_906>": 51175,
+ "<loc_907>": 51176,
+ "<loc_908>": 51177,
+ "<loc_909>": 51178,
+ "<loc_90>": 50359,
+ "<loc_910>": 51179,
+ "<loc_911>": 51180,
+ "<loc_912>": 51181,
+ "<loc_913>": 51182,
+ "<loc_914>": 51183,
+ "<loc_915>": 51184,
+ "<loc_916>": 51185,
+ "<loc_917>": 51186,
+ "<loc_918>": 51187,
+ "<loc_919>": 51188,
+ "<loc_91>": 50360,
+ "<loc_920>": 51189,
+ "<loc_921>": 51190,
+ "<loc_922>": 51191,
+ "<loc_923>": 51192,
+ "<loc_924>": 51193,
+ "<loc_925>": 51194,
+ "<loc_926>": 51195,
+ "<loc_927>": 51196,
+ "<loc_928>": 51197,
+ "<loc_929>": 51198,
+ "<loc_92>": 50361,
+ "<loc_930>": 51199,
+ "<loc_931>": 51200,
+ "<loc_932>": 51201,
+ "<loc_933>": 51202,
+ "<loc_934>": 51203,
+ "<loc_935>": 51204,
+ "<loc_936>": 51205,
+ "<loc_937>": 51206,
+ "<loc_938>": 51207,
+ "<loc_939>": 51208,
+ "<loc_93>": 50362,
+ "<loc_940>": 51209,
+ "<loc_941>": 51210,
+ "<loc_942>": 51211,
+ "<loc_943>": 51212,
+ "<loc_944>": 51213,
+ "<loc_945>": 51214,
+ "<loc_946>": 51215,
+ "<loc_947>": 51216,
+ "<loc_948>": 51217,
+ "<loc_949>": 51218,
+ "<loc_94>": 50363,
+ "<loc_950>": 51219,
+ "<loc_951>": 51220,
+ "<loc_952>": 51221,
+ "<loc_953>": 51222,
+ "<loc_954>": 51223,
+ "<loc_955>": 51224,
+ "<loc_956>": 51225,
+ "<loc_957>": 51226,
+ "<loc_958>": 51227,
+ "<loc_959>": 51228,
+ "<loc_95>": 50364,
+ "<loc_960>": 51229,
+ "<loc_961>": 51230,
+ "<loc_962>": 51231,
+ "<loc_963>": 51232,
+ "<loc_964>": 51233,
+ "<loc_965>": 51234,
+ "<loc_966>": 51235,
+ "<loc_967>": 51236,
+ "<loc_968>": 51237,
+ "<loc_969>": 51238,
+ "<loc_96>": 50365,
+ "<loc_970>": 51239,
+ "<loc_971>": 51240,
+ "<loc_972>": 51241,
+ "<loc_973>": 51242,
+ "<loc_974>": 51243,
+ "<loc_975>": 51244,
+ "<loc_976>": 51245,
+ "<loc_977>": 51246,
+ "<loc_978>": 51247,
+ "<loc_979>": 51248,
+ "<loc_97>": 50366,
+ "<loc_980>": 51249,
+ "<loc_981>": 51250,
+ "<loc_982>": 51251,
+ "<loc_983>": 51252,
+ "<loc_984>": 51253,
+ "<loc_985>": 51254,
+ "<loc_986>": 51255,
+ "<loc_987>": 51256,
+ "<loc_988>": 51257,
+ "<loc_989>": 51258,
+ "<loc_98>": 50367,
+ "<loc_990>": 51259,
+ "<loc_991>": 51260,
+ "<loc_992>": 51261,
+ "<loc_993>": 51262,
+ "<loc_994>": 51263,
+ "<loc_995>": 51264,
+ "<loc_996>": 51265,
+ "<loc_997>": 51266,
+ "<loc_998>": 51267,
+ "<loc_999>": 51268,
+ "<loc_99>": 50368,
+ "<loc_9>": 50278,
+ "<ncap>": 51271,
+ "<ocr>": 50267,
+ "<od>": 50265,
+ "<panel>": 51289,
+ "<poly>": 51286,
+ "<proposal>": 51284,
+ "<region_cap>": 51280,
+ "<region_to_desciption>": 51282,
+ "<seg>": 51277,
+ "<sep>": 51279,
+ "<tail>": 51292,
+ "<text>": 51290
+ }
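Most of these additions are the 1,000 coordinate-bin tokens `<loc_0>`…`<loc_999>` (in this vocabulary `<loc_N>` maps to id `50269 + N`). A sketch of how a pixel box might be rendered into such tokens, consistent with the `BoxQuantizer(mode='floor', bins=(1000, 1000))` set up in `processing_florence2.py` below; the helper itself is illustrative, not the repo's code:

```python
def box_to_loc_tokens(box, image_size, bins=(1000, 1000)):
    """Illustrative floor-quantization of an (x1, y1, x2, y2) box into <loc_*> tokens."""
    w, h = image_size
    bins_w, bins_h = bins
    x1, y1, x2, y2 = box
    tokens = []
    for value, extent, n_bins in ((x1, w, bins_w), (y1, h, bins_h), (x2, w, bins_w), (y2, h, bins_h)):
        bin_idx = min(int(value / extent * n_bins), n_bins - 1)  # 'floor' mode, clamped to the last bin
        tokens.append(f"<loc_{bin_idx}>")
    return "".join(tokens)

print(box_to_loc_tokens((128.0, 64.0, 512.0, 700.0), image_size=(768, 768)))
# -> "<loc_166><loc_83><loc_666><loc_911>"
```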
config.json ADDED
@@ -0,0 +1,238 @@
+ {
+ "_name_or_path": "magiv3",
+ "architectures": [
+ "Florence2ForConditionalGeneration"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_florence2.Florence2Config",
+ "AutoModel": "modeling_florence2.Florence2ForConditionalGeneration",
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+ },
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "ignore_index": -100,
+ "is_encoder_decoder": true,
+ "model_type": "florence2",
+ "pad_token_id": 1,
+ "projection_dim": 1024,
+ "text_config": {
+ "_name_or_path": "",
+ "activation_dropout": 0.1,
+ "activation_function": "gelu",
+ "add_bias_logits": false,
+ "add_cross_attention": false,
+ "add_final_layer_norm": false,
+ "architectures": null,
+ "attention_dropout": 0.1,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": 0,
+ "chunk_size_feed_forward": 0,
+ "classif_dropout": 0.1,
+ "classifier_dropout": 0.0,
+ "cross_attention_hidden_size": null,
+ "d_model": 1024,
+ "decoder_attention_heads": 16,
+ "decoder_ffn_dim": 4096,
+ "decoder_layerdrop": 0.0,
+ "decoder_layers": 12,
+ "decoder_start_token_id": 2,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout": 0.1,
+ "early_stopping": true,
+ "encoder_attention_heads": 16,
+ "encoder_ffn_dim": 4096,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 12,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": 2,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": 0,
+ "forced_eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1",
+ "2": "LABEL_2"
+ },
+ "init_std": 0.02,
+ "is_decoder": false,
+ "is_encoder_decoder": true,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1,
+ "LABEL_2": 2
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 1024,
+ "min_length": 0,
+ "model_type": "florence2_language",
+ "no_repeat_ngram_size": 3,
+ "normalize_before": false,
+ "num_beam_groups": 1,
+ "num_beams": 3,
+ "num_hidden_layers": 12,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 1,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "scale_embedding": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": true,
+ "vocab_size": 51293
+ },
+ "torch_dtype": "float16",
+ "transformers_version": "4.45.2",
+ "vision_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "depths": [
+ 1,
+ 1,
+ 9,
+ 1
+ ],
+ "dim_embed": [
+ 256,
+ 512,
+ 1024,
+ 2048
+ ],
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "drop_path_rate": 0.1,
+ "early_stopping": false,
+ "enable_checkpoint": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "image_feature_source": [
+ "spatial_avg_pool",
+ "temporal_avg_pool"
+ ],
+ "image_pos_embed": {
+ "max_pos_embeddings": 50,
+ "type": "learned_abs_2d"
+ },
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_groups": [
+ 8,
+ 16,
+ 32,
+ 64
+ ],
+ "num_heads": [
+ 8,
+ 16,
+ 32,
+ 64
+ ],
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_padding": [
+ 3,
+ 1,
+ 1,
+ 1
+ ],
+ "patch_prenorm": [
+ false,
+ true,
+ true,
+ true
+ ],
+ "patch_size": [
+ 7,
+ 3,
+ 3,
+ 3
+ ],
+ "patch_stride": [
+ 4,
+ 2,
+ 2,
+ 2
+ ],
+ "prefix": null,
+ "problem_type": null,
+ "projection_dim": 1024,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "visual_temporal_embedding": {
+ "max_temporal_embeddings": 100,
+ "type": "COSINE"
+ },
+ "window_size": 12
+ },
+ "vocab_size": 51293
+ }
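Since `auto_map` points at the custom classes, the config (like the model and processor) resolves through the Auto classes with `trust_remote_code=True`. A small sketch of inspecting it:

```python
from transformers import AutoConfig

# Loads Florence2Config via the auto_map entry in config.json above.
config = AutoConfig.from_pretrained("ragavsachdeva/magiv3", trust_remote_code=True)
print(config.model_type)           # "florence2"
print(config.vocab_size)           # 51293
print(config.text_config.d_model)  # 1024
```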
configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
+ # coding=utf-8
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Florence-2 configuration"""
+
+ import warnings
+ from typing import Optional
+
+ from transformers import AutoConfig
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ class Florence2VisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+     according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         drop_path_rate (`float`, *optional*, defaults to 0.1):
+             The dropout rate of the drop path layer.
+         patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+             The patch size of the image.
+         patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+             The patch stride of the image.
+         patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+             The patch padding of the image.
+         patch_prenorm (`List[bool]`, *optional*, defaults to [False, True, True, True]):
+             Whether to apply layer normalization before the patch embedding layer.
+         enable_checkpoint (`bool`, *optional*, defaults to False):
+             Whether to enable checkpointing.
+         dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+             The dimension of the embedding layer.
+         num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+             The number of attention heads.
+         num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+             The number of groups.
+         depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+             The depth of the model.
+         window_size (`int`, *optional*, defaults to 12):
+             The window size of the model.
+         projection_dim (`int`, *optional*, defaults to 1024):
+             The dimension of the projection layer.
+         visual_temporal_embedding (`dict`, *optional*):
+             The configuration of the visual temporal embedding.
+         image_pos_embed (`dict`, *optional*):
+             The configuration of the image position embedding.
+         image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+             The source of the image feature.
+     Example:
+
+     ```python
+     >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+
+     >>> # Initializing a Florence2 Vision style configuration
+     >>> configuration = Florence2VisionConfig()
+
+     >>> # Initializing a model (with random weights)
+     >>> model = Florence2VisionModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "florence2_vision"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         drop_path_rate=0.1,
+         patch_size=[7, 3, 3, 3],
+         patch_stride=[4, 2, 2, 2],
+         patch_padding=[3, 1, 1, 1],
+         patch_prenorm=[False, True, True, True],
+         enable_checkpoint=False,
+         dim_embed=[256, 512, 1024, 2048],
+         num_heads=[8, 16, 32, 64],
+         num_groups=[8, 16, 32, 64],
+         depths=[1, 1, 9, 1],
+         window_size=12,
+         projection_dim=1024,
+         visual_temporal_embedding=None,
+         image_pos_embed=None,
+         image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
+         **kwargs,
+     ):
+         self.drop_path_rate = drop_path_rate
+         self.patch_size = patch_size
+         self.patch_stride = patch_stride
+         self.patch_padding = patch_padding
+         self.patch_prenorm = patch_prenorm
+         self.enable_checkpoint = enable_checkpoint
+         self.dim_embed = dim_embed
+         self.num_heads = num_heads
+         self.num_groups = num_groups
+         self.depths = depths
+         self.window_size = window_size
+         self.projection_dim = projection_dim
+         self.visual_temporal_embedding = visual_temporal_embedding
+         self.image_pos_embed = image_pos_embed
+         self.image_feature_source = image_feature_source
+
+         super().__init__(**kwargs)
+
+
+ class Florence2LanguageConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the BART
+     [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 51289):
+             Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`Florence2LanguageModel`].
+         d_model (`int`, *optional*, defaults to 1024):
+             Dimensionality of the layers and the pooler layer.
+         encoder_layers (`int`, *optional*, defaults to 12):
+             Number of encoder layers.
+         decoder_layers (`int`, *optional*, defaults to 12):
+             Number of decoder layers.
+         encoder_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         decoder_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+         encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+             Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
+         activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+         dropout (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         activation_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for activations inside the fully connected layer.
+         classifier_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the classifier.
+         max_position_embeddings (`int`, *optional*, defaults to 1024):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         init_std (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+             for more details.
+         decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+             The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+             for more details.
+         scale_embedding (`bool`, *optional*, defaults to `False`):
+             Scale embeddings by dividing by sqrt(d_model).
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         num_labels (`int`, *optional*, defaults to 3):
+             The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+         forced_eos_token_id (`int`, *optional*, defaults to 2):
+             The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+             `eos_token_id`.
+
+     Example:
+
+     ```python
+     >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
+
+     >>> # Initializing a Florence2 Language style configuration
+     >>> configuration = Florence2LanguageConfig()
+
+     >>> # Initializing a model (with random weights)
+     >>> model = Florence2LanguageModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "florence2_language"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+     def __init__(
+         self,
+         vocab_size=51289,
+         max_position_embeddings=1024,
+         encoder_layers=12,
+         encoder_ffn_dim=4096,
+         encoder_attention_heads=16,
+         decoder_layers=12,
+         decoder_ffn_dim=4096,
+         decoder_attention_heads=16,
+         encoder_layerdrop=0.0,
+         decoder_layerdrop=0.0,
+         activation_function="gelu",
+         d_model=1024,
+         dropout=0.1,
+         attention_dropout=0.0,
+         activation_dropout=0.0,
+         init_std=0.02,
+         classifier_dropout=0.0,
+         scale_embedding=False,
+         use_cache=True,
+         num_labels=3,
+         pad_token_id=1,
+         bos_token_id=0,
+         eos_token_id=2,
+         is_encoder_decoder=True,
+         decoder_start_token_id=2,
+         forced_eos_token_id=2,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.d_model = d_model
+         self.encoder_ffn_dim = encoder_ffn_dim
+         self.encoder_layers = encoder_layers
+         self.encoder_attention_heads = encoder_attention_heads
+         self.decoder_ffn_dim = decoder_ffn_dim
+         self.decoder_layers = decoder_layers
+         self.decoder_attention_heads = decoder_attention_heads
+         self.dropout = dropout
+         self.attention_dropout = attention_dropout
+         self.activation_dropout = activation_dropout
+         self.activation_function = activation_function
+         self.init_std = init_std
+         self.encoder_layerdrop = encoder_layerdrop
+         self.decoder_layerdrop = decoder_layerdrop
+         self.classifier_dropout = classifier_dropout
+         self.use_cache = use_cache
+         self.num_hidden_layers = encoder_layers
+         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+         super().__init__(
+             num_labels=num_labels,
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             is_encoder_decoder=is_encoder_decoder,
+             decoder_start_token_id=decoder_start_token_id,
+             forced_eos_token_id=forced_eos_token_id,
+             **kwargs,
+         )
+
+         # ensure backward compatibility for BART CNN models
+         if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+             self.forced_bos_token_id = self.bos_token_id
+             warnings.warn(
+                 f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                 "The config can simply be saved and uploaded again to be fixed."
+             )
+
+
+ class Florence2Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
+     Florence-2 model according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vision_config (`Florence2VisionConfig`, *optional*):
+             Custom vision config or dict.
+         text_config (`Union[AutoConfig, dict]`, *optional*):
+             The config object of the text backbone.
+         ignore_index (`int`, *optional*, defaults to -100):
+             The ignore index for the loss function.
+         vocab_size (`int`, *optional*, defaults to 51289):
+             Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`].
+         projection_dim (`int`, *optional*, defaults to 1024):
+             Dimension of the multimodal projection space.
+
+     Example:
+
+     ```python
+     >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
+
+     >>> # Initializing a clip-like vision config
+     >>> vision_config = CLIPVisionConfig()
+
+     >>> # Initializing a Bart config
+     >>> text_config = BartConfig()
+
+     >>> # Initializing a Florence-2 configuration
+     >>> configuration = Florence2Config(vision_config, text_config)
+
+     >>> # Initializing a model from the florence-2 configuration
+     >>> model = Florence2ForConditionalGeneration(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "florence2"
+     is_composition = False
+
+     def __init__(
+         self,
+         vision_config=None,
+         text_config=None,
+         ignore_index=-100,
+         vocab_size=51289,
+         projection_dim=1024,
+         **kwargs,
+     ):
+         self.ignore_index = ignore_index
+         self.vocab_size = vocab_size
+         self.projection_dim = projection_dim
+         if vision_config is not None:
+             vision_config = Florence2VisionConfig(**vision_config)
+         self.vision_config = vision_config
+
+         self.text_config = text_config
+         if text_config is not None:
+             self.text_config = Florence2LanguageConfig(**text_config)
+
+         super().__init__(**kwargs)
+
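A local sketch of composing these classes by hand, mirroring the values in config.json above; it assumes `configuration_florence2.py` is importable from the working directory:

```python
# Hypothetical local usage; requires configuration_florence2.py on the Python path.
from configuration_florence2 import Florence2Config, Florence2LanguageConfig, Florence2VisionConfig

text_config = Florence2LanguageConfig(vocab_size=51293, d_model=1024)
vision_config = Florence2VisionConfig(window_size=12, projection_dim=1024)

config = Florence2Config(
    vision_config=vision_config.to_dict(),  # __init__ expects dicts for sub-configs
    text_config=text_config.to_dict(),
    vocab_size=51293,
)
print(config.text_config.encoder_layers)  # 12
```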
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "num_beams": 3,
+ "transformers_version": "4.45.2"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:922cf0a84284cfa3ddf7f487482040afce2d976c363070c5c13cccb4d62c6469
+ size 1665460218
modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "auto_map": {
+ "AutoProcessor": "processing_florence2.Florence2Processor"
+ },
+ "crop_size": {
+ "height": 768,
+ "width": 768
+ },
+ "do_center_crop": false,
+ "do_convert_rgb": null,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.485,
+ 0.456,
+ 0.406
+ ],
+ "image_processor_type": "CLIPImageProcessor",
+ "image_seq_length": 577,
+ "image_std": [
+ 0.229,
+ 0.224,
+ 0.225
+ ],
+ "processor_class": "Florence2Processor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "height": 768,
+ "width": 768
+ }
+ }
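The image side of preprocessing is plain CLIP-style: resize to 768×768, rescale by 1/255, normalize with ImageNet mean/std. A sketch reconstructing just that branch from the config above (the image path is a placeholder):

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Mirrors the settings in preprocessor_config.json above.
image_processor = CLIPImageProcessor(
    do_resize=True, size={"height": 768, "width": 768},
    do_center_crop=False, do_rescale=True, rescale_factor=1 / 255,
    do_normalize=True,
    image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225],
    resample=3,  # PIL bicubic
)
pixel_values = image_processor(Image.open("page_1.png").convert("RGB"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 768, 768])
```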
processing_florence2.py ADDED
@@ -0,0 +1,393 @@
+ # coding=utf-8
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Processor class for Florence-2.
+ """
+
+ import re
+ import logging
+ from typing import List, Optional, Union
+ import numpy as np
+
+ import torch
+ import PIL.Image
+
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import (
+     PaddingStrategy,
+     TextInput,
+     TruncationStrategy,
+ )
+ from transformers.utils import TensorType
+
+ logger = logging.getLogger(__name__)
+
+
+ class Florence2Processor(ProcessorMixin):
+     attributes = ["image_processor", "tokenizer"]
+     image_processor_class = "CLIPImageProcessor"
+     tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+
+     def __init__(
+         self,
+         image_processor=None,
+         tokenizer=None,
+     ):
+         if image_processor is None:
+             raise ValueError("You need to specify an `image_processor`.")
+         if tokenizer is None:
+             raise ValueError("You need to specify a `tokenizer`.")
+         if not hasattr(image_processor, "image_seq_length"):
+             raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+
+         self.image_seq_length = image_processor.image_seq_length
+
+         tokens_to_add = {
+             'additional_special_tokens':
+                 tokenizer.additional_special_tokens
+                 + ['<od>', '</od>', '<ocr>', '</ocr>']
+                 + [f'<loc_{x}>' for x in range(1000)]
+                 + ['<cap>', '</cap>', '<ncap>', '</ncap>', '<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
+                 + ['<panel>', '<text>', '<character>', '<tail>']
+         }
+         tokenizer.add_special_tokens(tokens_to_add)
+         self.decoder_start_token_id = 2
+
+         self.box_quantizer = BoxQuantizer(
+             mode='floor',
+             bins=(1000, 1000),
+         )
+
+         super().__init__(image_processor, tokenizer)
+
+     def __call__(
+         self,
+         batch_input_text: List[TextInput] = None,
+         batch_input_list_of_list_of_bboxes: List[List[List[List[float]]]] = None,
+         batch_output_text: List[TextInput] = None,
+         batch_output_list_of_list_of_bboxes: List[List[List[List[float]]]] = None,
+         batch_images: ImageInput = None,
+         batch_character_cluster_labels=None,
+         batch_text_character_association_labels=None,
+         batch_text_tail_association_labels=None,
+         batch_is_essential_text_labels=None,
+         batch_tail_character_association_labels=None,
+         padding: Union[bool, str, PaddingStrategy] = None,
+         truncation: Union[bool, str, TruncationStrategy] = None,
+         max_input_length_including_image_tokens=None,
+         max_output_length=None,
+         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+         do_resize: bool = None,
+         do_normalize: bool = None,
+         image_mean: Optional[Union[float, List[float]]] = None,
+         image_std: Optional[Union[float, List[float]]] = None,
+         data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
+         input_data_format: Optional[
+             Union[str, "ChannelDimension"]  # noqa: F821
+         ] = None,
+         resample: "PILImageResampling" = None,  # noqa: F821
+         do_convert_rgb: bool = None,
+         dtype: torch.dtype = None,
+         device: torch.device = None,
+     ) -> BatchFeature:
+
+         assert batch_images is not None, "`batch_images` is expected as an argument to a `Florence2Processor` instance."
+         assert batch_input_text is not None, "`batch_input_text` is expected as an argument to a `Florence2Processor` instance."
+         if batch_input_list_of_list_of_bboxes is None:
+             batch_input_list_of_list_of_bboxes = [[] for _ in range(len(batch_input_text))]
+         assert len(batch_input_text) == len(batch_input_list_of_list_of_bboxes) == len(batch_images), "`batch_input_text`, `batch_input_list_of_list_of_bboxes` and `batch_images` have different lengths."
+         if batch_output_text is None:
+             assert batch_output_list_of_list_of_bboxes is None, "`batch_output_text` and `batch_output_list_of_list_of_bboxes` should be provided together."
+         else:
+             if batch_output_list_of_list_of_bboxes is None:
+                 batch_output_list_of_list_of_bboxes = [[] for _ in range(len(batch_output_text))]
+             assert len(batch_output_text) == len(batch_output_list_of_list_of_bboxes) == len(batch_images), "`batch_output_text`, `batch_output_list_of_list_of_bboxes` and `batch_images` have different lengths."
+
+         max_input_length = max_input_length_including_image_tokens - self.image_seq_length if max_input_length_including_image_tokens is not None else None
+         batch_input_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(batch_input_text, batch_input_list_of_list_of_bboxes, batch_images)]
+         inputs = self.tokenizer(
+             batch_input_texts,
+             return_tensors=return_tensors,
+             padding=padding,
+             truncation=False,
+         )
+         # Truncating manually because I don't want a </s> token at the end of truncated sequences, which is the default behavior
+         if max_input_length is not None and inputs["input_ids"].shape[1] > max_input_length:
+             inputs["input_ids"] = inputs["input_ids"][:, :max_input_length]
+             inputs["attention_mask"] = inputs["attention_mask"][:, :max_input_length]
+
+         if batch_output_text is not None:
+             batch_output_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(batch_output_text, batch_output_list_of_list_of_bboxes, batch_images)]
+             decoder_inputs = self.tokenizer(
+                 batch_output_texts,
+                 return_tensors=return_tensors,
+                 padding=padding,
+                 truncation=False,
+             )
+             # Truncating manually, for the same reason as above
+             if max_output_length is not None and decoder_inputs["input_ids"].shape[1] > max_output_length:
+                 decoder_inputs["input_ids"] = decoder_inputs["input_ids"][:, :max_output_length]
+                 decoder_inputs["attention_mask"] = decoder_inputs["attention_mask"][:, :max_output_length]
+
+         pixel_values = self.image_processor(
+             batch_images,
+             do_resize=do_resize,
+             do_normalize=do_normalize,
+             return_tensors=return_tensors,
+             image_mean=image_mean,
+             image_std=image_std,
+             input_data_format=input_data_format,
+             data_format=data_format,
+             resample=resample,
+             do_convert_rgb=do_convert_rgb,
+         )["pixel_values"]
+
+         if dtype is not None:
+             pixel_values = pixel_values.to(dtype)
+
+         return_data = {**inputs, "pixel_values": pixel_values}
+
+         if batch_output_text is not None:
+             labels = decoder_inputs["input_ids"]
+             # Shift labels one position to the right to build the decoder inputs
+             decoder_input_ids = labels.new_zeros(labels.shape)
+             decoder_input_ids[:, 1:] = labels[:, :-1].clone()
+             decoder_input_ids[:, 0] = self.decoder_start_token_id
+             decoder_attention_mask = decoder_inputs["attention_mask"].new_ones(decoder_input_ids.shape)
+             decoder_attention_mask[:, 1:] = decoder_inputs["attention_mask"][:, :-1].clone()
+             # Mask fill labels to replace pad token ID with -100
+             labels.masked_fill_(labels == self.tokenizer.pad_token_id, -100)
+             return_data.update({
+                 "labels": labels,
+                 "decoder_input_ids": decoder_input_ids,
+                 "decoder_attention_mask": decoder_attention_mask,
+             })
+
+         if device is not None:
+             for key, value in return_data.items():
+                 if isinstance(value, torch.Tensor):
+                     return_data[key] = value.to(device)
+
+         if batch_character_cluster_labels is not None:
+             return_data["character_cluster_labels"] = batch_character_cluster_labels
+         if batch_text_character_association_labels is not None:
+             return_data["text_character_association_labels"] = batch_text_character_association_labels
+         if batch_text_tail_association_labels is not None:
+             return_data["text_tail_association_labels"] = batch_text_tail_association_labels
+         if batch_is_essential_text_labels is not None:
+             return_data["is_essential_text_labels"] = batch_is_essential_text_labels
+         if batch_tail_character_association_labels is not None:
+             return_data["tail_character_association_labels"] = batch_tail_character_association_labels
+
+         return_data["tokenizer"] = self.tokenizer
+         return BatchFeature(data=return_data)
+
+     def cleanup_generated_text(self, generated_text):
+         return generated_text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+
+     def postprocess_output(self, generated_ids, images):
+         generated_ids.masked_fill_(generated_ids == -100, self.tokenizer.pad_token_id)  # only for some testing purposes
+         batch_decoded_texts = self.batch_decode(generated_ids, skip_special_tokens=False)
+         batch_decoded_texts = [self.cleanup_generated_text(text) for text in batch_decoded_texts]
+         batch_list_of_list_of_bboxes = []
+         batch_indices_of_bboxes_in_new_string = []
+         batch_new_texts = []
+         for text, image in zip(batch_decoded_texts, images):
+             size_wh = self._get_image_size_wh(image)
+             parsed_text, list_of_stringified_bboxes, start_end_in_new_string = self._parse_text_with_bboxes(text)
+             list_of_list_of_bboxes = [self.box_quantizer.dequantize_from_stringified_bboxes(stringified_bbox, size_wh) for stringified_bbox in list_of_stringified_bboxes]
+             batch_list_of_list_of_bboxes.append(list_of_list_of_bboxes)
+             batch_indices_of_bboxes_in_new_string.append(start_end_in_new_string)
+             batch_new_texts.append(parsed_text)
+         return batch_new_texts, batch_list_of_list_of_bboxes, batch_indices_of_bboxes_in_new_string
+
+     def _parse_text_with_bboxes(self, text):
+         loc_pattern = r'((?:<loc_\d+>){4}(?:,(?:<loc_\d+>){4})*)'
+         grounding_pattern = r'<grounding>(.*?)</grounding>' + loc_pattern
+
+         list_of_stringified_bboxes = []
+         start_end_in_new_string = []
+         new_text = ""
+         original_pos = 0
+         new_pos = 0
+
+         for match in re.finditer(grounding_pattern + '|' + loc_pattern, text):
+             # Add text before the match
+             new_text += text[original_pos:match.start()]
+             new_pos += match.start() - original_pos
+
+             if match.group(0).startswith('<grounding>'):
+                 # Handle grounding pattern
+                 grounding_text = match.group(1)
+                 locs = match.group(2)
+                 new_text += grounding_text
+                 list_of_stringified_bboxes.append(locs)
+                 start_end_in_new_string.append((new_pos, new_pos + len(grounding_text)))
+                 new_pos += len(grounding_text)
+             else:
+                 # Handle loc pattern
+                 locs = match.group(0)
+                 replacement = ""
+                 new_text += replacement
+                 list_of_stringified_bboxes.append(locs)
+                 start_end_in_new_string.append((new_pos, new_pos + len(replacement)))
+                 new_pos += len(replacement)
+
+             original_pos = match.end()
+
+         # Add any remaining text
+         new_text += text[original_pos:]
+
+         return new_text, list_of_stringified_bboxes, start_end_in_new_string
+
+     def _format_text_with_bboxes(self, text, list_of_list_of_bboxes, image):
+         size_wh = self._get_image_size_wh(image)
+         quantized_bbox_lists = []
+         for list_of_bboxes in list_of_list_of_bboxes:
+             quantized_bboxes = self.box_quantizer.quantize(list_of_bboxes, size_wh=size_wh)
+             stringified_bboxes = [f"<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>" for x1, y1, x2, y2 in quantized_bboxes]
+             stringified_bboxes = ",".join(stringified_bboxes)
+             quantized_bbox_lists.append(stringified_bboxes)
+         return text.format(*quantized_bbox_lists)
+
+     def _get_image_size_wh(self, image):
+         # Get size_wh from image based on its type
+         if isinstance(image, torch.Tensor):
+             # For PyTorch tensor
+             if image.dim() == 3:
+                 size_wh = (image.shape[2], image.shape[1])  # (width, height)
+             elif image.dim() == 4:
+                 size_wh = (image.shape[3], image.shape[2])  # (width, height)
+             else:
+                 raise ValueError("Unsupported tensor dimensions")
+         elif isinstance(image, np.ndarray):
+             # For NumPy array
+             if image.ndim == 2:
+                 size_wh = (image.shape[1], image.shape[0])  # (width, height)
+             elif image.ndim == 3:
+                 size_wh = (image.shape[1], image.shape[0])  # (width, height)
+             else:
+                 raise ValueError("Unsupported array dimensions")
+         elif isinstance(image, PIL.Image.Image):
+             # For PIL Image
+             size_wh = image.size  # Already in (width, height) format
+         else:
+             raise TypeError("Unsupported image type")
+         return size_wh
+
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
+     def batch_decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+         refer to the docstring of this method for more information.
+         """
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
+     def decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+         the docstring of this method for more information.
+         """
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @property
+     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+ class BoxQuantizer(object):
+     def __init__(self, mode, bins):
+         self.mode = mode
+         self.bins = bins
+
+     def quantize(self, boxes, size_wh):
+         if not isinstance(boxes, torch.Tensor):
+             boxes = torch.tensor(boxes)
+         bins_w, bins_h = self.bins  # Quantization bins.
+         size_w, size_h = size_wh  # Original image size.
+         size_per_bin_w = size_w / bins_w
+         size_per_bin_h = size_h / bins_h
+         xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].
+
+         if self.mode == 'floor':
+             quantized_xmin = (xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
+             quantized_ymin = (ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
+             quantized_xmax = (xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
+             quantized_ymax = (ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
+
+         elif self.mode == 'round':
+             raise NotImplementedError()
+
+         else:
+             raise ValueError('Incorrect quantization type.')
+
+         quantized_boxes = torch.cat(
+             (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
+         ).int()
+
+         return quantized_boxes.tolist()
+
+     def dequantize_from_stringified_bboxes(self, stringified_bboxes, size_wh):
+         bboxes = stringified_bboxes.split(',')
+
+         def parse_bbox(bbox_string):
+             pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
+             match = re.match(pattern, bbox_string)
+             if match:
+                 return [int(match.group(i)) for i in range(1, 5)]
+             else:
+                 raise ValueError(f"Invalid bbox string format: {bbox_string}")
+
+         parsed_bboxes = [parse_bbox(bbox) for bbox in bboxes]
+         return self.dequantize(parsed_bboxes, size_wh).tolist()
+
+     def dequantize(self, boxes: torch.Tensor, size):
+         if not isinstance(boxes, torch.Tensor):
+             boxes = torch.tensor(boxes)
+         bins_w, bins_h = self.bins  # Quantization bins.
+         size_w, size_h = size  # Original image size.
+         size_per_bin_w = size_w / bins_w
+         size_per_bin_h = size_h / bins_h
+         xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].
+
+         if self.mode == 'floor':
+             # Add 0.5 to use the center position of the bin as the coordinate.
+             dequantized_xmin = (xmin + 0.5) * size_per_bin_w
+             dequantized_ymin = (ymin + 0.5) * size_per_bin_h
+             dequantized_xmax = (xmax + 0.5) * size_per_bin_w
+             dequantized_ymax = (ymax + 0.5) * size_per_bin_h
+
+         elif self.mode == 'round':
+             raise NotImplementedError()
+
+         else:
+             raise ValueError('Incorrect quantization type.')
+
+         dequantized_boxes = torch.cat(
+             (dequantized_xmin, dequantized_ymin,
+              dequantized_xmax, dequantized_ymax), dim=-1
+         )
+
+         return dequantized_boxes
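To make the coordinate scheme concrete, here is a small round-trip through the `BoxQuantizer` defined above (a standalone sketch; the image size and box are illustrative):

```
quantizer = BoxQuantizer(mode='floor', bins=(1000, 1000))
size_wh = (768, 1024)  # example (width, height)

# Pixel-space box -> bin indices, which become <loc_*> tokens
quantized = quantizer.quantize([[100.0, 200.0, 300.0, 400.0]], size_wh)
# quantized == [[130, 195, 390, 390]]

# <loc_*> string -> approximate pixel-space box (bin centres)
recovered = quantizer.dequantize_from_stringified_bboxes(
    "<loc_130><loc_195><loc_390><loc_390>", size_wh
)
# recovered is roughly [[100.2, 200.2, 299.9, 399.9]]
```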
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "auto_map": {
+     "AutoProcessor": "processing_florence2.Florence2Processor"
+   },
+   "processor_class": "Florence2Processor"
+ }
special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,413 @@
+ import torch
+ import numpy as np
+ import random
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from shapely.geometry import Point, box
+ import networkx as nx
+ from copy import deepcopy
+ from itertools import groupby
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
+
+ def move_to_device(inputs, device):
+     if hasattr(inputs, "keys"):
+         return {k: move_to_device(v, device) for k, v in inputs.items()}
+     elif isinstance(inputs, list):
+         return [move_to_device(v, device) for v in inputs]
+     elif isinstance(inputs, tuple):
+         return tuple([move_to_device(v, device) for v in inputs])
+     elif isinstance(inputs, np.ndarray):
+         return torch.from_numpy(inputs).to(device)
+     else:
+         return inputs.to(device)
+
+ class UnionFind:
+     def __init__(self, n):
+         self.parent = list(range(n))
+         self.size = [1] * n
+         self.num_components = n
+
+     @classmethod
+     def from_adj_matrix(cls, adj_matrix):
+         ufds = cls(adj_matrix.shape[0])
+         for i in range(adj_matrix.shape[0]):
+             for j in range(adj_matrix.shape[1]):
+                 if adj_matrix[i, j] > 0:
+                     ufds.unite(i, j)
+         return ufds
+
+     @classmethod
+     def from_adj_list(cls, adj_list):
+         ufds = cls(len(adj_list))
+         for i in range(len(adj_list)):
+             for j in adj_list[i]:
+                 ufds.unite(i, j)
+         return ufds
+
+     @classmethod
+     def from_edge_list(cls, edge_list, num_nodes):
+         ufds = cls(num_nodes)
+         for edge in edge_list:
+             ufds.unite(edge[0], edge[1])
+         return ufds
+
+     def find(self, x):
+         if self.parent[x] == x:
+             return x
+         self.parent[x] = self.find(self.parent[x])  # path compression
+         return self.parent[x]
+
+     def unite(self, x, y):
+         x = self.find(x)
+         y = self.find(y)
+         if x != y:
+             # union by size: attach the smaller tree under the larger one
+             if self.size[x] < self.size[y]:
+                 x, y = y, x
+             self.parent[y] = x
+             self.size[x] += self.size[y]
+             self.num_components -= 1
+
+     def get_components_of(self, x):
+         x = self.find(x)
+         return [i for i in range(len(self.parent)) if self.find(i) == x]
+
+     def are_connected(self, x, y):
+         return self.find(x) == self.find(y)
+
+     def get_size(self, x):
+         return self.size[self.find(x)]
+
+     def get_num_components(self):
+         return self.num_components
+
+     def get_labels_for_connected_components(self):
+         map_parent_to_label = {}
+         labels = []
+         for i in range(len(self.parent)):
+             parent = self.find(i)
+             if parent not in map_parent_to_label:
+                 map_parent_to_label[parent] = len(map_parent_to_label)
+             labels.append(map_parent_to_label[parent])
+         return labels
+
+ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
+     h, w = image_as_np_array.shape[:2]
+     if h > w:
+         figure, subplot = plt.subplots(1, 1, figsize=(10, 10 * h / w))
+     else:
+         figure, subplot = plt.subplots(1, 1, figsize=(10 * w / h, 10))
+     subplot.imshow(image_as_np_array)
+     plot_bboxes(subplot, predictions["panels"], color="green")
+     plot_bboxes(subplot, predictions["texts"], color="red", add_index=True)
+     plot_bboxes(subplot, predictions["characters"], color="blue")
+
+     COLOURS = [
+         "#b7ff51",  # green
+         "#f50a8f",  # pink
+         "#4b13b6",  # purple
+         "#ddaa34",  # orange
+         "#bea2a2",  # brown
+     ]
+     colour_index = 0
+     character_cluster_labels = predictions["character_cluster_labels"]
+     unique_label_sorted_by_frequency = sorted(list(set(character_cluster_labels)), key=lambda x: character_cluster_labels.count(x), reverse=True)
+     for label in unique_label_sorted_by_frequency:
+         root = None
+         others = []
+         for i in range(len(predictions["characters"])):
+             if character_cluster_labels[i] == label:
+                 if root is None:
+                     root = i
+                 else:
+                     others.append(i)
+         if colour_index >= len(COLOURS):
+             # preset colours exhausted; draw a fresh random colour
+             random_colour = COLOURS[0]
+             while random_colour in COLOURS:
+                 random_colour = "#" + "".join([random.choice("0123456789ABCDEF") for j in range(6)])
+         else:
+             random_colour = COLOURS[colour_index]
+             colour_index += 1
+         bbox_i = predictions["characters"][root]
+         x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
+         y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
+         subplot.plot([x1], [y1], color=random_colour, marker="o", markersize=5)
+         for j in others:
+             # draw line from centre of bbox i to centre of bbox j
+             bbox_j = predictions["characters"][j]
+             x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
+             y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
+             x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
+             y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
+             subplot.plot([x1, x2], [y1, y2], color=random_colour, linewidth=2)
+             subplot.plot([x2], [y2], color=random_colour, marker="o", markersize=5)
+
+     for (i, j) in predictions["text_character_associations"]:
+         score = predictions["dialog_confidences"][i]
+         bbox_i = predictions["texts"][i]
+         bbox_j = predictions["characters"][j]
+         x1 = bbox_i[0] + (bbox_i[2] - bbox_i[0]) / 2
+         y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
+         x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
+         y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
+         subplot.plot([x1, x2], [y1, y2], color="red", linewidth=2, linestyle="dashed", alpha=score)
+
+     subplot.axis("off")
+     if filename is not None:
+         plt.savefig(filename, bbox_inches="tight", pad_inches=0)
+
+     figure.canvas.draw()
+     image = np.array(figure.canvas.renderer._renderer)
+     plt.close()
+     return image
+
+ def plot_bboxes(subplot, bboxes, color="red", add_index=False):
+     for idx, bbox in enumerate(bboxes):
+         w = bbox[2] - bbox[0]
+         h = bbox[3] - bbox[1]
+         rect = patches.Rectangle(
+             bbox[:2], w, h, linewidth=1, edgecolor=color, facecolor="none", linestyle="solid"
+         )
+         subplot.add_patch(rect)
+         if add_index:
+             cx, cy = bbox[0] + w / 2, bbox[1] + h / 2
+             subplot.text(cx, cy, str(idx), color=color, fontsize=10, ha="center", va="center")
+
+ def sort_panels(rects):
+     before_rects = convert_to_list_of_lists(rects)
+     # slightly erode all rectangles initially to account for imperfect detections
+     rects = [erode_rectangle(rect, 0.05) for rect in before_rects]
+     G = nx.DiGraph()
+     G.add_nodes_from(range(len(rects)))
+     for i in range(len(rects)):
+         for j in range(len(rects)):
+             if i == j:
+                 continue
+             if is_there_a_directed_edge(i, j, rects):
+                 G.add_edge(i, j, weight=get_distance(rects[i], rects[j]))
+             else:
+                 G.add_edge(j, i, weight=get_distance(rects[i], rects[j]))
+     while True:
+         # cycle enumeration can blow up on pathological layouts, so run it with a timeout
+         with ThreadPoolExecutor(max_workers=1) as executor:
+             future = executor.submit(list, nx.simple_cycles(G))
+             try:
+                 cycles = future.result(timeout=60)
+             except TimeoutError:
+                 print("Cycle finding timed out after 60 seconds")
+                 return list(range(len(rects)))
+         cycles = [cycle for cycle in cycles if len(cycle) > 1]
+         if len(cycles) == 0:
+             break
+         # break the cycle by removing its heaviest (most distant) edge
+         cycle = cycles[0]
+         edges = [e for e in zip(cycle, cycle[1:] + cycle[:1])]
+         max_cyclic_edge = max(edges, key=lambda x: G.edges[x]["weight"])
+         G.remove_edge(*max_cyclic_edge)
+     return list(nx.topological_sort(G))
+
+ def is_strictly_above(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return y2A < y1B
+
+ def is_strictly_below(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return y2B < y1A
+
+ def is_strictly_left_of(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return x2A < x1B
+
+ def is_strictly_right_of(rectA, rectB):
+     x1A, y1A, x2A, y2A = rectA
+     x1B, y1B, x2B, y2B = rectB
+     return x2B < x1A
+
+ def intersects(rectA, rectB):
+     return box(*rectA).intersects(box(*rectB))
+
+ def is_there_a_directed_edge(a, b, rects):
+     rectA = rects[a]
+     rectB = rects[b]
+     centre_of_A = [rectA[0] + (rectA[2] - rectA[0]) / 2, rectA[1] + (rectA[3] - rectA[1]) / 2]
+     centre_of_B = [rectB[0] + (rectB[2] - rectB[0]) / 2, rectB[1] + (rectB[3] - rectB[1]) / 2]
+     if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
+         return box(*rectA).area > (box(*rectB)).area
+     copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
+     copy_B = [rectB[0], rectB[1], rectB[2], rectB[3]]
+     while True:
+         if is_strictly_above(copy_A, copy_B) and not is_strictly_left_of(copy_A, copy_B):
+             return 1
+         if is_strictly_above(copy_B, copy_A) and not is_strictly_left_of(copy_B, copy_A):
+             return 0
+         if is_strictly_right_of(copy_A, copy_B) and not is_strictly_below(copy_A, copy_B):
+             return 1
+         if is_strictly_right_of(copy_B, copy_A) and not is_strictly_below(copy_B, copy_A):
+             return 0
+         if is_strictly_below(copy_A, copy_B) and is_strictly_right_of(copy_A, copy_B):
+             return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
+         if is_strictly_below(copy_B, copy_A) and is_strictly_right_of(copy_B, copy_A):
+             return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
+         # otherwise they intersect; erode both and try again
+         copy_A = erode_rectangle(copy_A, 0.05)
+         copy_B = erode_rectangle(copy_B, 0.05)
+
+ def get_distance(rectA, rectB):
+     return box(rectA[0], rectA[1], rectA[2], rectA[3]).distance(box(rectB[0], rectB[1], rectB[2], rectB[3]))
+
+ def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
+     rects = deepcopy(rects)
+     while True:
+         xmin, ymin, xmax, ymax = min(rects[a][0], rects[b][0]), min(rects[a][1], rects[b][1]), max(rects[a][2], rects[b][2]), max(rects[a][3], rects[b][3])
+         rect_index = [i for i in range(len(rects)) if intersects(rects[i], [xmin, ymin, xmax, ymax])]
+         rects_copy = [rect for rect in rects if intersects(rect, [xmin, ymin, xmax, ymax])]
+
+         # try to split the panels using "horizontal" lines
+         overlapping_y_ranges = merge_overlapping_ranges([(y1, y2) for x1, y1, x2, y2 in rects_copy])
+         panel_index_to_split = {}
+         for split_index, (y1, y2) in enumerate(overlapping_y_ranges):
+             for i, index in enumerate(rect_index):
+                 if y1 <= rects_copy[i][1] <= rects_copy[i][3] <= y2:
+                     panel_index_to_split[index] = split_index
+
+         if panel_index_to_split[a] != panel_index_to_split[b]:
+             return panel_index_to_split[a] < panel_index_to_split[b]
+
+         # try to split the panels using "vertical" lines
+         overlapping_x_ranges = merge_overlapping_ranges([(x1, x2) for x1, y1, x2, y2 in rects_copy])
+         panel_index_to_split = {}
+         for split_index, (x1, x2) in enumerate(overlapping_x_ranges[::-1]):
+             for i, index in enumerate(rect_index):
+                 if x1 <= rects_copy[i][0] <= rects_copy[i][2] <= x2:
+                     panel_index_to_split[index] = split_index
+         if panel_index_to_split[a] != panel_index_to_split[b]:
+             return panel_index_to_split[a] < panel_index_to_split[b]
+
+         # otherwise, erode the rectangles and try again
+         rects = [erode_rectangle(rect, 0.05) for rect in rects]
+
+ def erode_rectangle(bbox, erosion_factor):
+     x1, y1, x2, y2 = bbox
+     w, h = x2 - x1, y2 - y1
+     cx, cy = x1 + w / 2, y1 + h / 2
+     if w < h:
+         aspect_ratio = w / h
+         erosion_factor_width = erosion_factor * aspect_ratio
+         erosion_factor_height = erosion_factor
+     else:
+         aspect_ratio = h / w
+         erosion_factor_width = erosion_factor
+         erosion_factor_height = erosion_factor * aspect_ratio
+     w = w - w * erosion_factor_width
+     h = h - h * erosion_factor_height
+     x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
+     return [x1, y1, x2, y2]
+
+ def merge_overlapping_ranges(ranges):
+     """
+     ranges: list of tuples (x1, x2)
+     """
+     if len(ranges) == 0:
+         return []
+     ranges = sorted(ranges, key=lambda x: x[0])
+     merged_ranges = []
+     for i, r in enumerate(ranges):
+         if i == 0:
+             prev_x1, prev_x2 = r
+             continue
+         x1, x2 = r
+         if x1 > prev_x2:
+             merged_ranges.append((prev_x1, prev_x2))
+             prev_x1, prev_x2 = x1, x2
+         else:
+             prev_x2 = max(prev_x2, x2)
+     merged_ranges.append((prev_x1, prev_x2))
+     return merged_ranges
+
+ def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
+     text_bboxes = convert_to_list_of_lists(text_bboxes)
+     sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)
+
+     if len(text_bboxes) == 0:
+         return []
+
+     def indices_of_same_elements(nums):
+         groups = groupby(range(len(nums)), key=lambda i: nums[i])
+         return [list(indices) for _, indices in groups]
+
+     panel_id_for_text = get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes)
+     indices_of_texts = list(range(len(text_bboxes)))
+     indices_of_texts, panel_id_for_text = zip(*sorted(zip(indices_of_texts, panel_id_for_text), key=lambda x: x[1]))
+     indices_of_texts = list(indices_of_texts)
+     grouped_indices = indices_of_same_elements(panel_id_for_text)
+     for group in grouped_indices:
+         subset_of_text_indices = [indices_of_texts[i] for i in group]
+         text_bboxes_of_subset = [text_bboxes[i] for i in subset_of_text_indices]
+         sorted_subset_indices = sort_texts_within_panel(text_bboxes_of_subset)
+         indices_of_texts[group[0] : group[-1] + 1] = [subset_of_text_indices[i] for i in sorted_subset_indices]
+     return indices_of_texts
+
+ def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
+     text_to_panel_mapping = []
+     for text_bbox in text_bboxes:
+         shapely_text_polygon = box(*text_bbox)
+         all_intersections = []
+         all_distances = []
+         if len(sorted_panel_bboxes) == 0:
+             text_to_panel_mapping.append(-1)
+             continue
+         for j, annotation in enumerate(sorted_panel_bboxes):
+             shapely_annotation_polygon = box(*annotation)
+             if shapely_text_polygon.intersects(shapely_annotation_polygon):
+                 all_intersections.append((shapely_text_polygon.intersection(shapely_annotation_polygon).area, j))
+             all_distances.append((shapely_text_polygon.distance(shapely_annotation_polygon), j))
+         if len(all_intersections) == 0:
+             text_to_panel_mapping.append(min(all_distances, key=lambda x: x[0])[1])
+         else:
+             text_to_panel_mapping.append(max(all_intersections, key=lambda x: x[0])[1])
+     return text_to_panel_mapping
+
+ def sort_texts_within_panel(rects):
+     smallest_y = float("inf")
+     greatest_x = float("-inf")
+     for i, rect in enumerate(rects):
+         x1, y1, x2, y2 = rect
+         smallest_y = min(smallest_y, y1)
+         greatest_x = max(greatest_x, x2)
+
+     reference_point = Point(greatest_x, smallest_y)
+
+     polygons_and_index = []
+     for i, rect in enumerate(rects):
+         x1, y1, x2, y2 = rect
+         polygons_and_index.append((box(x1, y1, x2, y2), i))
+     # sort boxes by distance to the reference point (top-right corner of the panel region)
+     polygons_and_index = sorted(polygons_and_index, key=lambda x: reference_point.distance(x[0]))
+     indices = [x[1] for x in polygons_and_index]
+     return indices
+
+ def force_to_be_valid_bboxes(bboxes):
+     if len(bboxes) == 0:
+         return bboxes
+     bboxes_as_xywh = [[x1, y1, x2 - x1, y2 - y1] for x1, y1, x2, y2 in bboxes]
+     bboxes_as_xywh = torch.tensor(bboxes_as_xywh)
+     bboxes_as_xywh[:, 2] = torch.clamp(bboxes_as_xywh[:, 2], min=1)
+     bboxes_as_xywh[:, 3] = torch.clamp(bboxes_as_xywh[:, 3], min=1)
+     bboxes_as_xywh = bboxes_as_xywh.tolist()
+     bboxes_as_xyxy = [[x1, y1, x1 + w, y1 + h] for x1, y1, w, h in bboxes_as_xywh]
+     return bboxes_as_xyxy
+
+ def x1y1wh_to_x1y1x2y2(bbox):
+     x1, y1, w, h = bbox
+     return [x1, y1, x1 + w, y1 + h]
+
+ def x1y1x2y2_to_xywh(bbox):
+     x1, y1, x2, y2 = bbox
+     return [x1, y1, x2 - x1, y2 - y1]
+
+ def convert_to_list_of_lists(rects):
+     if isinstance(rects, torch.Tensor):
+         return rects.tolist()
+     if isinstance(rects, np.ndarray):
+         return rects.tolist()
+     return [[a, b, c, d] for a, b, c, d in rects]
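As a quick illustration of the `UnionFind` helper above (a standalone sketch; the edges are illustrative):

```
# Edges connect nodes 0-1 and 2-3; node 4 stays on its own
ufds = UnionFind.from_edge_list([(0, 1), (2, 3)], num_nodes=5)
print(ufds.get_num_components())                   # 3
print(ufds.get_labels_for_connected_components())  # [0, 0, 1, 1, 2]
print(ufds.are_connected(0, 4))                    # False
```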
vocab.json ADDED
The diff for this file is too large to render. See raw diff