Upload folder using huggingface_hub
#2
by
silveroxides - opened
- config.json +83 -83
- generation_config.json +4 -4
- processing_florence2.py +51 -48
config.json
CHANGED
|
@@ -1,84 +1,84 @@
|
|
| 1 |
-
{
|
| 2 |
-
"_name_or_path": "florence2",
|
| 3 |
-
"architectures": [
|
| 4 |
-
"Florence2ForConditionalGeneration"
|
| 5 |
-
],
|
| 6 |
-
"auto_map": {
|
| 7 |
-
"AutoConfig": "configuration_florence2.Florence2Config",
|
| 8 |
-
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
|
| 9 |
-
},
|
| 10 |
-
"bos_token_id": 0,
|
| 11 |
-
"eos_token_id": 2,
|
| 12 |
-
"ignore_index": -100,
|
| 13 |
-
"model_type": "florence2",
|
| 14 |
-
"pad_token_id": 1,
|
| 15 |
-
"projection_dim": 1024,
|
| 16 |
-
"text_config": {
|
| 17 |
-
"vocab_size": 51289,
|
| 18 |
-
"activation_dropout": 0.1,
|
| 19 |
-
"activation_function": "gelu",
|
| 20 |
-
"add_bias_logits": false,
|
| 21 |
-
"add_final_layer_norm": false,
|
| 22 |
-
"attention_dropout": 0.1,
|
| 23 |
-
"bos_token_id": 0,
|
| 24 |
-
"classif_dropout": 0.1,
|
| 25 |
-
"classifier_dropout": 0.0,
|
| 26 |
-
"d_model": 1024,
|
| 27 |
-
"decoder_attention_heads": 16,
|
| 28 |
-
"decoder_ffn_dim": 4096,
|
| 29 |
-
"decoder_layerdrop": 0.0,
|
| 30 |
-
"decoder_layers": 12,
|
| 31 |
-
"decoder_start_token_id": 2,
|
| 32 |
-
"dropout": 0.1,
|
| 33 |
-
"early_stopping": true,
|
| 34 |
-
"encoder_attention_heads": 16,
|
| 35 |
-
"encoder_ffn_dim": 4096,
|
| 36 |
-
"encoder_layerdrop": 0.0,
|
| 37 |
-
"encoder_layers": 12,
|
| 38 |
-
"eos_token_id": 2,
|
| 39 |
-
"forced_eos_token_id": 2,
|
| 40 |
-
"forced_bos_token_id": 0,
|
| 41 |
-
"gradient_checkpointing": false,
|
| 42 |
-
"init_std": 0.02,
|
| 43 |
-
"is_encoder_decoder": true,
|
| 44 |
-
"label2id": {
|
| 45 |
-
"LABEL_0": 0,
|
| 46 |
-
"LABEL_1": 1,
|
| 47 |
-
"LABEL_2": 2
|
| 48 |
-
},
|
| 49 |
-
"max_position_embeddings": 1024,
|
| 50 |
-
"no_repeat_ngram_size": 3,
|
| 51 |
-
"normalize_before": false,
|
| 52 |
-
"num_hidden_layers": 12,
|
| 53 |
-
"pad_token_id": 1,
|
| 54 |
-
"scale_embedding": false,
|
| 55 |
-
"num_beams": 3
|
| 56 |
-
},
|
| 57 |
-
"vision_config": {
|
| 58 |
-
"model_type": "davit",
|
| 59 |
-
"drop_path_rate": 0.1,
|
| 60 |
-
"patch_size": [7, 3, 3, 3],
|
| 61 |
-
"patch_stride": [4, 2, 2, 2],
|
| 62 |
-
"patch_padding": [3, 1, 1, 1],
|
| 63 |
-
"patch_prenorm": [false, true, true, true],
|
| 64 |
-
"enable_checkpoint": false,
|
| 65 |
-
"dim_embed": [256, 512, 1024, 2048],
|
| 66 |
-
"num_heads": [8, 16, 32, 64],
|
| 67 |
-
"num_groups": [8, 16, 32, 64],
|
| 68 |
-
"depths": [1, 1, 9, 1],
|
| 69 |
-
"window_size": 12,
|
| 70 |
-
"projection_dim": 1024,
|
| 71 |
-
"visual_temporal_embedding": {
|
| 72 |
-
"type": "COSINE",
|
| 73 |
-
"max_temporal_embeddings": 100
|
| 74 |
-
},
|
| 75 |
-
"image_pos_embed": {
|
| 76 |
-
"type": "learned_abs_2d",
|
| 77 |
-
"max_pos_embeddings": 50
|
| 78 |
-
},
|
| 79 |
-
"image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
|
| 80 |
-
},
|
| 81 |
-
"vocab_size": 51289,
|
| 82 |
-
"transformers_version": "4.48.3",
|
| 83 |
-
"is_encoder_decoder": true
|
| 84 |
}
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "florence2",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"Florence2ForConditionalGeneration"
|
| 5 |
+
],
|
| 6 |
+
"auto_map": {
|
| 7 |
+
"AutoConfig": "configuration_florence2.Florence2Config",
|
| 8 |
+
"AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
|
| 9 |
+
},
|
| 10 |
+
"bos_token_id": 0,
|
| 11 |
+
"eos_token_id": 2,
|
| 12 |
+
"ignore_index": -100,
|
| 13 |
+
"model_type": "florence2",
|
| 14 |
+
"pad_token_id": 1,
|
| 15 |
+
"projection_dim": 1024,
|
| 16 |
+
"text_config": {
|
| 17 |
+
"vocab_size": 51289,
|
| 18 |
+
"activation_dropout": 0.1,
|
| 19 |
+
"activation_function": "gelu",
|
| 20 |
+
"add_bias_logits": false,
|
| 21 |
+
"add_final_layer_norm": false,
|
| 22 |
+
"attention_dropout": 0.1,
|
| 23 |
+
"bos_token_id": 0,
|
| 24 |
+
"classif_dropout": 0.1,
|
| 25 |
+
"classifier_dropout": 0.0,
|
| 26 |
+
"d_model": 1024,
|
| 27 |
+
"decoder_attention_heads": 16,
|
| 28 |
+
"decoder_ffn_dim": 4096,
|
| 29 |
+
"decoder_layerdrop": 0.0,
|
| 30 |
+
"decoder_layers": 12,
|
| 31 |
+
"decoder_start_token_id": 2,
|
| 32 |
+
"dropout": 0.1,
|
| 33 |
+
"early_stopping": true,
|
| 34 |
+
"encoder_attention_heads": 16,
|
| 35 |
+
"encoder_ffn_dim": 4096,
|
| 36 |
+
"encoder_layerdrop": 0.0,
|
| 37 |
+
"encoder_layers": 12,
|
| 38 |
+
"eos_token_id": 2,
|
| 39 |
+
"forced_eos_token_id": 2,
|
| 40 |
+
"forced_bos_token_id": 0,
|
| 41 |
+
"gradient_checkpointing": false,
|
| 42 |
+
"init_std": 0.02,
|
| 43 |
+
"is_encoder_decoder": true,
|
| 44 |
+
"label2id": {
|
| 45 |
+
"LABEL_0": 0,
|
| 46 |
+
"LABEL_1": 1,
|
| 47 |
+
"LABEL_2": 2
|
| 48 |
+
},
|
| 49 |
+
"max_position_embeddings": 1024,
|
| 50 |
+
"no_repeat_ngram_size": 3,
|
| 51 |
+
"normalize_before": false,
|
| 52 |
+
"num_hidden_layers": 12,
|
| 53 |
+
"pad_token_id": 1,
|
| 54 |
+
"scale_embedding": false,
|
| 55 |
+
"num_beams": 3
|
| 56 |
+
},
|
| 57 |
+
"vision_config": {
|
| 58 |
+
"model_type": "davit",
|
| 59 |
+
"drop_path_rate": 0.1,
|
| 60 |
+
"patch_size": [7, 3, 3, 3],
|
| 61 |
+
"patch_stride": [4, 2, 2, 2],
|
| 62 |
+
"patch_padding": [3, 1, 1, 1],
|
| 63 |
+
"patch_prenorm": [false, true, true, true],
|
| 64 |
+
"enable_checkpoint": false,
|
| 65 |
+
"dim_embed": [256, 512, 1024, 2048],
|
| 66 |
+
"num_heads": [8, 16, 32, 64],
|
| 67 |
+
"num_groups": [8, 16, 32, 64],
|
| 68 |
+
"depths": [1, 1, 9, 1],
|
| 69 |
+
"window_size": 12,
|
| 70 |
+
"projection_dim": 1024,
|
| 71 |
+
"visual_temporal_embedding": {
|
| 72 |
+
"type": "COSINE",
|
| 73 |
+
"max_temporal_embeddings": 100
|
| 74 |
+
},
|
| 75 |
+
"image_pos_embed": {
|
| 76 |
+
"type": "learned_abs_2d",
|
| 77 |
+
"max_pos_embeddings": 50
|
| 78 |
+
},
|
| 79 |
+
"image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
|
| 80 |
+
},
|
| 81 |
+
"vocab_size": 51289,
|
| 82 |
+
"transformers_version": "4.48.3",
|
| 83 |
+
"is_encoder_decoder": true
|
| 84 |
}
|
generation_config.json
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
{
|
| 2 |
-
"num_beams": 3,
|
| 3 |
-
"early_stopping": false
|
| 4 |
-
}
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_beams": 3,
|
| 3 |
+
"early_stopping": false
|
| 4 |
+
}
|
processing_florence2.py
CHANGED
|
@@ -84,9 +84,12 @@ class Florence2Processor(ProcessorMixin):
|
|
| 84 |
|
| 85 |
self.image_seq_length = image_processor.image_seq_length
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
tokens_to_add = {
|
| 88 |
'additional_special_tokens': \
|
| 89 |
-
|
| 90 |
['<od>', '</od>', '<ocr>', '</ocr>'] + \
|
| 91 |
[f'<loc_{x}>' for x in range(1000)] + \
|
| 92 |
['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
|
|
@@ -138,7 +141,7 @@ class Florence2Processor(ProcessorMixin):
|
|
| 138 |
|
| 139 |
|
| 140 |
super().__init__(image_processor, tokenizer)
|
| 141 |
-
|
| 142 |
def _construct_prompts(self, text):
|
| 143 |
# replace the task tokens with the task prompts if task token is in the text
|
| 144 |
prompts = []
|
|
@@ -149,7 +152,7 @@ class Florence2Processor(ProcessorMixin):
|
|
| 149 |
assert _text == task_token, f"Task token {task_token} should be the only token in the text."
|
| 150 |
_text = task_prompt
|
| 151 |
break
|
| 152 |
-
# 2. task prompts with additional inputs
|
| 153 |
for task_token, task_prompt in self.task_prompts_with_input.items():
|
| 154 |
if task_token in _text:
|
| 155 |
_text = task_prompt.format(input=_text.replace(task_token, ''))
|
|
@@ -381,7 +384,7 @@ class Florence2Processor(ProcessorMixin):
|
|
| 381 |
|
| 382 |
final_answer = {
|
| 383 |
task: final_answer}
|
| 384 |
-
return final_answer
|
| 385 |
|
| 386 |
class BoxQuantizer(object):
|
| 387 |
def __init__(self, mode, bins):
|
|
@@ -505,8 +508,8 @@ class CoordinatesQuantizer(object):
|
|
| 505 |
|
| 506 |
|
| 507 |
class Florence2PostProcesser(object):
|
| 508 |
-
|
| 509 |
-
Florence-2 post process for converting text prediction to various tasks results.
|
| 510 |
|
| 511 |
Args:
|
| 512 |
config: A dict of configs.
|
|
@@ -588,7 +591,7 @@ class Florence2PostProcesser(object):
|
|
| 588 |
)
|
| 589 |
|
| 590 |
return black_list
|
| 591 |
-
|
| 592 |
def _create_default_config(self):
|
| 593 |
config = {
|
| 594 |
'NUM_BBOX_HEIGHT_BINS': 1000,
|
|
@@ -645,7 +648,7 @@ class Florence2PostProcesser(object):
|
|
| 645 |
box_quantization_mode,
|
| 646 |
(num_bbox_width_bins, num_bbox_height_bins),
|
| 647 |
)
|
| 648 |
-
|
| 649 |
num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
|
| 650 |
num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
|
| 651 |
box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
|
|
@@ -698,7 +701,7 @@ class Florence2PostProcesser(object):
|
|
| 698 |
instance['bbox'] = self.box_quantizer.dequantize(
|
| 699 |
boxes=torch.tensor(bbox_bins),
|
| 700 |
size=image_size
|
| 701 |
-
).tolist()
|
| 702 |
|
| 703 |
if phrase_centric:
|
| 704 |
instance['cat_name'] = parsed[i].group(1).lower().strip()
|
|
@@ -708,9 +711,9 @@ class Florence2PostProcesser(object):
|
|
| 708 |
|
| 709 |
return instances
|
| 710 |
|
| 711 |
-
def parse_ocr_from_text_and_spans(self,
|
| 712 |
-
text,
|
| 713 |
-
pattern,
|
| 714 |
image_size,
|
| 715 |
area_threshold=-1.0,
|
| 716 |
):
|
|
@@ -752,7 +755,7 @@ class Florence2PostProcesser(object):
|
|
| 752 |
def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
|
| 753 |
# ignore <s> </s> and <pad>
|
| 754 |
cur_span = 0
|
| 755 |
-
if text.startswith('<s>'):
|
| 756 |
cur_span += 3
|
| 757 |
|
| 758 |
text = text.replace('<s>', '')
|
|
@@ -761,7 +764,7 @@ class Florence2PostProcesser(object):
|
|
| 761 |
|
| 762 |
pattern = r"([^<]+(?:<loc_\d+>){4,})"
|
| 763 |
phrases = re.findall(pattern, text)
|
| 764 |
-
|
| 765 |
# pattern should be text pattern and od pattern
|
| 766 |
pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
|
| 767 |
box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
|
|
@@ -778,7 +781,7 @@ class Florence2PostProcesser(object):
|
|
| 778 |
# Prepare instance.
|
| 779 |
instance = {}
|
| 780 |
|
| 781 |
-
# parse phrase, get string
|
| 782 |
phrase = re.search(pattern, phrase_text_strip)
|
| 783 |
if phrase is None:
|
| 784 |
cur_span += len(pharse_text)
|
|
@@ -798,12 +801,12 @@ class Florence2PostProcesser(object):
|
|
| 798 |
cur_span += len(pharse_text)
|
| 799 |
continue
|
| 800 |
|
| 801 |
-
# a list of list
|
| 802 |
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
|
| 803 |
instance['bbox'] = self.box_quantizer.dequantize(
|
| 804 |
boxes=torch.tensor(bbox_bins),
|
| 805 |
size=image_size
|
| 806 |
-
).tolist()
|
| 807 |
|
| 808 |
# exclude non-ascii characters
|
| 809 |
phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
|
|
@@ -814,13 +817,13 @@ class Florence2PostProcesser(object):
|
|
| 814 |
return instances
|
| 815 |
|
| 816 |
def parse_description_with_bboxes_from_text_and_spans(
|
| 817 |
-
self,
|
| 818 |
-
text,
|
| 819 |
spans=None,
|
| 820 |
scores=None,
|
| 821 |
score_mode=None,
|
| 822 |
-
pattern=None,
|
| 823 |
-
image_size=None,
|
| 824 |
allow_empty_phrase=False
|
| 825 |
):
|
| 826 |
def find_matched_token_indices(cur_span, token_spans):
|
|
@@ -831,7 +834,7 @@ class Florence2PostProcesser(object):
|
|
| 831 |
return inds
|
| 832 |
|
| 833 |
cur_span = 0
|
| 834 |
-
if text.startswith('<s>'):
|
| 835 |
cur_span += 3
|
| 836 |
|
| 837 |
text = text.replace('<s>', '')
|
|
@@ -839,11 +842,11 @@ class Florence2PostProcesser(object):
|
|
| 839 |
text = text.replace('<pad>', '')
|
| 840 |
|
| 841 |
if allow_empty_phrase:
|
| 842 |
-
pattern =
|
| 843 |
else:
|
| 844 |
pattern = r"([^<]+(?:<loc_\d+>){4,})"
|
| 845 |
phrases = re.findall(pattern, text)
|
| 846 |
-
|
| 847 |
# pattern should be text pattern and od pattern
|
| 848 |
pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
|
| 849 |
box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
|
|
@@ -857,7 +860,7 @@ class Florence2PostProcesser(object):
|
|
| 857 |
cur_span += len(pharse_text)
|
| 858 |
continue
|
| 859 |
|
| 860 |
-
# parse phrase, get string
|
| 861 |
phrase = re.search(pattern, phrase_text_strip)
|
| 862 |
if phrase is None:
|
| 863 |
cur_span += len(pharse_text)
|
|
@@ -874,13 +877,13 @@ class Florence2PostProcesser(object):
|
|
| 874 |
cur_span += len(pharse_text)
|
| 875 |
continue
|
| 876 |
|
| 877 |
-
# a list of list
|
| 878 |
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
|
| 879 |
|
| 880 |
bboxes = self.box_quantizer.dequantize(
|
| 881 |
boxes=torch.tensor(bbox_bins),
|
| 882 |
size=image_size
|
| 883 |
-
).tolist()
|
| 884 |
|
| 885 |
if score_mode == 'avg_loc_scores':
|
| 886 |
if spans is None or scores is None:
|
|
@@ -893,7 +896,7 @@ class Florence2PostProcesser(object):
|
|
| 893 |
loc_scores = [scores[token_i] for token_i in token_inds]
|
| 894 |
score = sum(loc_scores) / len(loc_scores)
|
| 895 |
all_scores.append(score)
|
| 896 |
-
elif score_mode == 'avg_cat_name_scores':
|
| 897 |
if spans is None or scores is None:
|
| 898 |
all_scores = None
|
| 899 |
else:
|
|
@@ -916,19 +919,19 @@ class Florence2PostProcesser(object):
|
|
| 916 |
if all_scores is not None:
|
| 917 |
instance['score'] = math.exp(all_scores[_idx])
|
| 918 |
instances.append(instance)
|
| 919 |
-
|
| 920 |
cur_span += len(pharse_text)
|
| 921 |
|
| 922 |
return instances
|
| 923 |
|
| 924 |
-
def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
|
| 925 |
allow_empty_phrase=False,
|
| 926 |
polygon_sep_token='<sep>',
|
| 927 |
polygon_start_token='<poly>',
|
| 928 |
polygon_end_token='</poly>',
|
| 929 |
with_box_at_start=False,
|
| 930 |
):
|
| 931 |
-
|
| 932 |
# ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
|
| 933 |
# ignore <s> </s> and <pad>
|
| 934 |
|
|
@@ -939,7 +942,7 @@ class Florence2PostProcesser(object):
|
|
| 939 |
if allow_empty_phrase:
|
| 940 |
pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
|
| 941 |
else:
|
| 942 |
-
# [^<]+: This part matches one or more characters that are not the < symbol.
|
| 943 |
# The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
|
| 944 |
#
|
| 945 |
pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
|
|
@@ -950,7 +953,7 @@ class Florence2PostProcesser(object):
|
|
| 950 |
|
| 951 |
# one polygons instance is separated by polygon_start_token and polygon_end_token
|
| 952 |
polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
|
| 953 |
-
|
| 954 |
instances = []
|
| 955 |
for phrase_text in phrases:
|
| 956 |
|
|
@@ -965,7 +968,7 @@ class Florence2PostProcesser(object):
|
|
| 965 |
continue
|
| 966 |
|
| 967 |
|
| 968 |
-
# parse phrase, get string
|
| 969 |
phrase = re.search(phrase_string_pattern, phrase_text_strip)
|
| 970 |
if phrase is None:
|
| 971 |
continue
|
|
@@ -986,7 +989,7 @@ class Florence2PostProcesser(object):
|
|
| 986 |
instance = {}
|
| 987 |
|
| 988 |
# polygons_parsed= list(re.finditer(box_pattern, phrase_text))
|
| 989 |
-
if isinstance(_polygons_instances_parsed, str):
|
| 990 |
polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
|
| 991 |
else:
|
| 992 |
polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
|
|
@@ -1008,10 +1011,10 @@ class Florence2PostProcesser(object):
|
|
| 1008 |
_polygon = _polygon[4:]
|
| 1009 |
else:
|
| 1010 |
bbox = [0, 0, 0, 0]
|
| 1011 |
-
# abandon last element if is not paired
|
| 1012 |
if len(_polygon) % 2 == 1:
|
| 1013 |
_polygon = _polygon[:-1]
|
| 1014 |
-
|
| 1015 |
# reshape into (n, 2)
|
| 1016 |
_polygon = self.coordinates_quantizer.dequantize(
|
| 1017 |
torch.tensor(np.array(_polygon).reshape(-1, 2)),
|
|
@@ -1026,7 +1029,7 @@ class Florence2PostProcesser(object):
|
|
| 1026 |
instance['bbox'] = self.box_quantizer.dequantize(
|
| 1027 |
boxes=torch.tensor([bbox]),
|
| 1028 |
size=image_size
|
| 1029 |
-
).tolist()[0]
|
| 1030 |
|
| 1031 |
instances.append(instance)
|
| 1032 |
|
|
@@ -1052,8 +1055,8 @@ class Florence2PostProcesser(object):
|
|
| 1052 |
parse_tasks = [parse_tasks]
|
| 1053 |
for _parse_task in parse_tasks:
|
| 1054 |
assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
|
| 1055 |
-
|
| 1056 |
-
# sequence or text should be provided
|
| 1057 |
assert sequence is not None or text is not None, 'sequence or text should be provided'
|
| 1058 |
assert sequence is None or text is None, 'only one of sequence and text should be provided'
|
| 1059 |
|
|
@@ -1087,16 +1090,16 @@ class Florence2PostProcesser(object):
|
|
| 1087 |
)
|
| 1088 |
parsed_dict['ocr'] = instances
|
| 1089 |
elif task == 'phrase_grounding':
|
| 1090 |
-
instances = self.parse_phrase_grounding_from_text_and_spans(
|
| 1091 |
text,
|
| 1092 |
pattern=pattern,
|
| 1093 |
image_size=image_size,
|
| 1094 |
)
|
| 1095 |
parsed_dict['phrase_grounding'] = instances
|
| 1096 |
elif task == 'pure_text':
|
| 1097 |
-
parsed_dict['pure_text'] = text
|
| 1098 |
elif task == 'description_with_bboxes':
|
| 1099 |
-
instances = self.parse_description_with_bboxes_from_text_and_spans(
|
| 1100 |
text,
|
| 1101 |
spans=spans,
|
| 1102 |
scores=transition_beam_score,
|
|
@@ -1106,14 +1109,14 @@ class Florence2PostProcesser(object):
|
|
| 1106 |
)
|
| 1107 |
parsed_dict['description_with_bboxes'] = instances
|
| 1108 |
elif task == 'description_with_polygons':
|
| 1109 |
-
instances = self.parse_description_with_polygons_from_text_and_spans(
|
| 1110 |
text,
|
| 1111 |
pattern=pattern,
|
| 1112 |
image_size=image_size,
|
| 1113 |
)
|
| 1114 |
parsed_dict['description_with_polygons'] = instances
|
| 1115 |
elif task == 'polygons':
|
| 1116 |
-
instances = self.parse_description_with_polygons_from_text_and_spans(
|
| 1117 |
text,
|
| 1118 |
pattern=pattern,
|
| 1119 |
image_size=image_size,
|
|
@@ -1121,7 +1124,7 @@ class Florence2PostProcesser(object):
|
|
| 1121 |
)
|
| 1122 |
parsed_dict['polygons'] = instances
|
| 1123 |
elif task == 'bboxes':
|
| 1124 |
-
instances = self.parse_description_with_bboxes_from_text_and_spans(
|
| 1125 |
text,
|
| 1126 |
pattern=pattern,
|
| 1127 |
image_size=image_size,
|
|
@@ -1131,13 +1134,13 @@ class Florence2PostProcesser(object):
|
|
| 1131 |
elif task == 'description_with_bboxes_or_polygons':
|
| 1132 |
if '<poly>' in text:
|
| 1133 |
# only support either polygons or bboxes, not both at the same time
|
| 1134 |
-
instances = self.parse_description_with_polygons_from_text_and_spans(
|
| 1135 |
text,
|
| 1136 |
pattern=pattern,
|
| 1137 |
image_size=image_size,
|
| 1138 |
)
|
| 1139 |
else:
|
| 1140 |
-
instances = self.parse_description_with_bboxes_from_text_and_spans(
|
| 1141 |
text,
|
| 1142 |
pattern=pattern,
|
| 1143 |
image_size=image_size,
|
|
|
|
| 84 |
|
| 85 |
self.image_seq_length = image_processor.image_seq_length
|
| 86 |
|
| 87 |
+
# Get existing additional_special_tokens safely (works with both Roberta and BART tokenizers)
|
| 88 |
+
existing_special_tokens = list(getattr(tokenizer, 'additional_special_tokens', []) or [])
|
| 89 |
+
|
| 90 |
tokens_to_add = {
|
| 91 |
'additional_special_tokens': \
|
| 92 |
+
existing_special_tokens + \
|
| 93 |
['<od>', '</od>', '<ocr>', '</ocr>'] + \
|
| 94 |
[f'<loc_{x}>' for x in range(1000)] + \
|
| 95 |
['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
super().__init__(image_processor, tokenizer)
|
| 144 |
+
|
| 145 |
def _construct_prompts(self, text):
|
| 146 |
# replace the task tokens with the task prompts if task token is in the text
|
| 147 |
prompts = []
|
|
|
|
| 152 |
assert _text == task_token, f"Task token {task_token} should be the only token in the text."
|
| 153 |
_text = task_prompt
|
| 154 |
break
|
| 155 |
+
# 2. task prompts with additional inputs
|
| 156 |
for task_token, task_prompt in self.task_prompts_with_input.items():
|
| 157 |
if task_token in _text:
|
| 158 |
_text = task_prompt.format(input=_text.replace(task_token, ''))
|
|
|
|
| 384 |
|
| 385 |
final_answer = {
|
| 386 |
task: final_answer}
|
| 387 |
+
return final_answer
|
| 388 |
|
| 389 |
class BoxQuantizer(object):
|
| 390 |
def __init__(self, mode, bins):
|
|
|
|
| 508 |
|
| 509 |
|
| 510 |
class Florence2PostProcesser(object):
|
| 511 |
+
"""
|
| 512 |
+
Florence-2 post process for converting text prediction to various tasks results.
|
| 513 |
|
| 514 |
Args:
|
| 515 |
config: A dict of configs.
|
|
|
|
| 591 |
)
|
| 592 |
|
| 593 |
return black_list
|
| 594 |
+
|
| 595 |
def _create_default_config(self):
|
| 596 |
config = {
|
| 597 |
'NUM_BBOX_HEIGHT_BINS': 1000,
|
|
|
|
| 648 |
box_quantization_mode,
|
| 649 |
(num_bbox_width_bins, num_bbox_height_bins),
|
| 650 |
)
|
| 651 |
+
|
| 652 |
num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
|
| 653 |
num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
|
| 654 |
box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
|
|
|
|
| 701 |
instance['bbox'] = self.box_quantizer.dequantize(
|
| 702 |
boxes=torch.tensor(bbox_bins),
|
| 703 |
size=image_size
|
| 704 |
+
).tolist()
|
| 705 |
|
| 706 |
if phrase_centric:
|
| 707 |
instance['cat_name'] = parsed[i].group(1).lower().strip()
|
|
|
|
| 711 |
|
| 712 |
return instances
|
| 713 |
|
| 714 |
+
def parse_ocr_from_text_and_spans(self,
|
| 715 |
+
text,
|
| 716 |
+
pattern,
|
| 717 |
image_size,
|
| 718 |
area_threshold=-1.0,
|
| 719 |
):
|
|
|
|
| 755 |
def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
|
| 756 |
# ignore <s> </s> and <pad>
|
| 757 |
cur_span = 0
|
| 758 |
+
if text.startswith('<s>'):
|
| 759 |
cur_span += 3
|
| 760 |
|
| 761 |
text = text.replace('<s>', '')
|
|
|
|
| 764 |
|
| 765 |
pattern = r"([^<]+(?:<loc_\d+>){4,})"
|
| 766 |
phrases = re.findall(pattern, text)
|
| 767 |
+
|
| 768 |
# pattern should be text pattern and od pattern
|
| 769 |
pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
|
| 770 |
box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
|
|
|
|
| 781 |
# Prepare instance.
|
| 782 |
instance = {}
|
| 783 |
|
| 784 |
+
# parse phrase, get string
|
| 785 |
phrase = re.search(pattern, phrase_text_strip)
|
| 786 |
if phrase is None:
|
| 787 |
cur_span += len(pharse_text)
|
|
|
|
| 801 |
cur_span += len(pharse_text)
|
| 802 |
continue
|
| 803 |
|
| 804 |
+
# a list of list
|
| 805 |
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
|
| 806 |
instance['bbox'] = self.box_quantizer.dequantize(
|
| 807 |
boxes=torch.tensor(bbox_bins),
|
| 808 |
size=image_size
|
| 809 |
+
).tolist()
|
| 810 |
|
| 811 |
# exclude non-ascii characters
|
| 812 |
phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
|
|
|
|
| 817 |
return instances
|
| 818 |
|
| 819 |
def parse_description_with_bboxes_from_text_and_spans(
|
| 820 |
+
self,
|
| 821 |
+
text,
|
| 822 |
spans=None,
|
| 823 |
scores=None,
|
| 824 |
score_mode=None,
|
| 825 |
+
pattern=None,
|
| 826 |
+
image_size=None,
|
| 827 |
allow_empty_phrase=False
|
| 828 |
):
|
| 829 |
def find_matched_token_indices(cur_span, token_spans):
|
|
|
|
| 834 |
return inds
|
| 835 |
|
| 836 |
cur_span = 0
|
| 837 |
+
if text.startswith('<s>'):
|
| 838 |
cur_span += 3
|
| 839 |
|
| 840 |
text = text.replace('<s>', '')
|
|
|
|
| 842 |
text = text.replace('<pad>', '')
|
| 843 |
|
| 844 |
if allow_empty_phrase:
|
| 845 |
+
pattern = r"(?:(?:<loc_\d+>){{4,}})"
|
| 846 |
else:
|
| 847 |
pattern = r"([^<]+(?:<loc_\d+>){4,})"
|
| 848 |
phrases = re.findall(pattern, text)
|
| 849 |
+
|
| 850 |
# pattern should be text pattern and od pattern
|
| 851 |
pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
|
| 852 |
box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
|
|
|
|
| 860 |
cur_span += len(pharse_text)
|
| 861 |
continue
|
| 862 |
|
| 863 |
+
# parse phrase, get string
|
| 864 |
phrase = re.search(pattern, phrase_text_strip)
|
| 865 |
if phrase is None:
|
| 866 |
cur_span += len(pharse_text)
|
|
|
|
| 877 |
cur_span += len(pharse_text)
|
| 878 |
continue
|
| 879 |
|
| 880 |
+
# a list of list
|
| 881 |
bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
|
| 882 |
|
| 883 |
bboxes = self.box_quantizer.dequantize(
|
| 884 |
boxes=torch.tensor(bbox_bins),
|
| 885 |
size=image_size
|
| 886 |
+
).tolist()
|
| 887 |
|
| 888 |
if score_mode == 'avg_loc_scores':
|
| 889 |
if spans is None or scores is None:
|
|
|
|
| 896 |
loc_scores = [scores[token_i] for token_i in token_inds]
|
| 897 |
score = sum(loc_scores) / len(loc_scores)
|
| 898 |
all_scores.append(score)
|
| 899 |
+
elif score_mode == 'avg_cat_name_scores':
|
| 900 |
if spans is None or scores is None:
|
| 901 |
all_scores = None
|
| 902 |
else:
|
|
|
|
| 919 |
if all_scores is not None:
|
| 920 |
instance['score'] = math.exp(all_scores[_idx])
|
| 921 |
instances.append(instance)
|
| 922 |
+
|
| 923 |
cur_span += len(pharse_text)
|
| 924 |
|
| 925 |
return instances
|
| 926 |
|
| 927 |
+
def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
|
| 928 |
allow_empty_phrase=False,
|
| 929 |
polygon_sep_token='<sep>',
|
| 930 |
polygon_start_token='<poly>',
|
| 931 |
polygon_end_token='</poly>',
|
| 932 |
with_box_at_start=False,
|
| 933 |
):
|
| 934 |
+
|
| 935 |
# ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
|
| 936 |
# ignore <s> </s> and <pad>
|
| 937 |
|
|
|
|
| 942 |
if allow_empty_phrase:
|
| 943 |
pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
|
| 944 |
else:
|
| 945 |
+
# [^<]+: This part matches one or more characters that are not the < symbol.
|
| 946 |
# The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
|
| 947 |
#
|
| 948 |
pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
|
|
|
|
| 953 |
|
| 954 |
# one polygons instance is separated by polygon_start_token and polygon_end_token
|
| 955 |
polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
|
| 956 |
+
|
| 957 |
instances = []
|
| 958 |
for phrase_text in phrases:
|
| 959 |
|
|
|
|
| 968 |
continue
|
| 969 |
|
| 970 |
|
| 971 |
+
# parse phrase, get string
|
| 972 |
phrase = re.search(phrase_string_pattern, phrase_text_strip)
|
| 973 |
if phrase is None:
|
| 974 |
continue
|
|
|
|
| 989 |
instance = {}
|
| 990 |
|
| 991 |
# polygons_parsed= list(re.finditer(box_pattern, phrase_text))
|
| 992 |
+
if isinstance(_polygons_instances_parsed, str):
|
| 993 |
polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
|
| 994 |
else:
|
| 995 |
polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
|
|
|
|
| 1011 |
_polygon = _polygon[4:]
|
| 1012 |
else:
|
| 1013 |
bbox = [0, 0, 0, 0]
|
| 1014 |
+
# abandon last element if is not paired
|
| 1015 |
if len(_polygon) % 2 == 1:
|
| 1016 |
_polygon = _polygon[:-1]
|
| 1017 |
+
|
| 1018 |
# reshape into (n, 2)
|
| 1019 |
_polygon = self.coordinates_quantizer.dequantize(
|
| 1020 |
torch.tensor(np.array(_polygon).reshape(-1, 2)),
|
|
|
|
| 1029 |
instance['bbox'] = self.box_quantizer.dequantize(
|
| 1030 |
boxes=torch.tensor([bbox]),
|
| 1031 |
size=image_size
|
| 1032 |
+
).tolist()[0]
|
| 1033 |
|
| 1034 |
instances.append(instance)
|
| 1035 |
|
|
|
|
| 1055 |
parse_tasks = [parse_tasks]
|
| 1056 |
for _parse_task in parse_tasks:
|
| 1057 |
assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
|
| 1058 |
+
|
| 1059 |
+
# sequence or text should be provided
|
| 1060 |
assert sequence is not None or text is not None, 'sequence or text should be provided'
|
| 1061 |
assert sequence is None or text is None, 'only one of sequence and text should be provided'
|
| 1062 |
|
|
|
|
| 1090 |
)
|
| 1091 |
parsed_dict['ocr'] = instances
|
| 1092 |
elif task == 'phrase_grounding':
|
| 1093 |
+
instances = self.parse_phrase_grounding_from_text_and_spans(
|
| 1094 |
text,
|
| 1095 |
pattern=pattern,
|
| 1096 |
image_size=image_size,
|
| 1097 |
)
|
| 1098 |
parsed_dict['phrase_grounding'] = instances
|
| 1099 |
elif task == 'pure_text':
|
| 1100 |
+
parsed_dict['pure_text'] = text
|
| 1101 |
elif task == 'description_with_bboxes':
|
| 1102 |
+
instances = self.parse_description_with_bboxes_from_text_and_spans(
|
| 1103 |
text,
|
| 1104 |
spans=spans,
|
| 1105 |
scores=transition_beam_score,
|
|
|
|
| 1109 |
)
|
| 1110 |
parsed_dict['description_with_bboxes'] = instances
|
| 1111 |
elif task == 'description_with_polygons':
|
| 1112 |
+
instances = self.parse_description_with_polygons_from_text_and_spans(
|
| 1113 |
text,
|
| 1114 |
pattern=pattern,
|
| 1115 |
image_size=image_size,
|
| 1116 |
)
|
| 1117 |
parsed_dict['description_with_polygons'] = instances
|
| 1118 |
elif task == 'polygons':
|
| 1119 |
+
instances = self.parse_description_with_polygons_from_text_and_spans(
|
| 1120 |
text,
|
| 1121 |
pattern=pattern,
|
| 1122 |
image_size=image_size,
|
|
|
|
| 1124 |
)
|
| 1125 |
parsed_dict['polygons'] = instances
|
| 1126 |
elif task == 'bboxes':
|
| 1127 |
+
instances = self.parse_description_with_bboxes_from_text_and_spans(
|
| 1128 |
text,
|
| 1129 |
pattern=pattern,
|
| 1130 |
image_size=image_size,
|
|
|
|
| 1134 |
elif task == 'description_with_bboxes_or_polygons':
|
| 1135 |
if '<poly>' in text:
|
| 1136 |
# only support either polygons or bboxes, not both at the same time
|
| 1137 |
+
instances = self.parse_description_with_polygons_from_text_and_spans(
|
| 1138 |
text,
|
| 1139 |
pattern=pattern,
|
| 1140 |
image_size=image_size,
|
| 1141 |
)
|
| 1142 |
else:
|
| 1143 |
+
instances = self.parse_description_with_bboxes_from_text_and_spans(
|
| 1144 |
text,
|
| 1145 |
pattern=pattern,
|
| 1146 |
image_size=image_size,
|