{
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 768,
    "width": 768
  },
  "do_center_crop": false,
  "do_convert_rgb": null,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_seq_length": 577,
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "processor_class": "Florence2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 768,
    "width": 768
  },
  "tasks_answer_post_processing_type": {
    "<OCR>": "pure_text",
    "<OCR_WITH_REGION>": "ocr",
    "<CAPTION>": "pure_text",
    "<DETAILED_CAPTION>": "pure_text",
    "<MORE_DETAILED_CAPTION>": "pure_text",
    "<OD>": "description_with_bboxes",
    "<DENSE_REGION_CAPTION>": "description_with_bboxes",
    "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
    "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
    "<REGION_TO_SEGMENTATION>": "polygons",
    "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
    "<REGION_TO_CATEGORY>": "pure_text",
    "<REGION_TO_DESCRIPTION>": "pure_text",
    "<REGION_TO_OCR>": "pure_text",
    "<REGION_PROPOSAL>": "bboxes"
  },
  "task_prompts_without_inputs": {
    "<OCR>": "What is the text in the image?",
    "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
    "<CAPTION>": "What does the image describe?",
    "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
    "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
    "<OD>": "Locate the objects with category name in the image.",
    "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
    "<REGION_PROPOSAL>": "Locate the region proposals in the image."
  },
  "task_prompts_with_input": {
    "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
    "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
    "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
    "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
    "<REGION_TO_CATEGORY>": "What is the region {input}?",
    "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
    "<REGION_TO_OCR>": "What text is in the region {input}?"
  }
}
|
|