Upload folder using huggingface_hub

#2
Files changed (3) hide show
  1. config.json +83 -83
  2. generation_config.json +4 -4
  3. processing_florence2.py +51 -48
config.json CHANGED
@@ -1,84 +1,84 @@
1
- {
2
- "_name_or_path": "florence2",
3
- "architectures": [
4
- "Florence2ForConditionalGeneration"
5
- ],
6
- "auto_map": {
7
- "AutoConfig": "configuration_florence2.Florence2Config",
8
- "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
9
- },
10
- "bos_token_id": 0,
11
- "eos_token_id": 2,
12
- "ignore_index": -100,
13
- "model_type": "florence2",
14
- "pad_token_id": 1,
15
- "projection_dim": 1024,
16
- "text_config": {
17
- "vocab_size": 51289,
18
- "activation_dropout": 0.1,
19
- "activation_function": "gelu",
20
- "add_bias_logits": false,
21
- "add_final_layer_norm": false,
22
- "attention_dropout": 0.1,
23
- "bos_token_id": 0,
24
- "classif_dropout": 0.1,
25
- "classifier_dropout": 0.0,
26
- "d_model": 1024,
27
- "decoder_attention_heads": 16,
28
- "decoder_ffn_dim": 4096,
29
- "decoder_layerdrop": 0.0,
30
- "decoder_layers": 12,
31
- "decoder_start_token_id": 2,
32
- "dropout": 0.1,
33
- "early_stopping": true,
34
- "encoder_attention_heads": 16,
35
- "encoder_ffn_dim": 4096,
36
- "encoder_layerdrop": 0.0,
37
- "encoder_layers": 12,
38
- "eos_token_id": 2,
39
- "forced_eos_token_id": 2,
40
- "forced_bos_token_id": 0,
41
- "gradient_checkpointing": false,
42
- "init_std": 0.02,
43
- "is_encoder_decoder": true,
44
- "label2id": {
45
- "LABEL_0": 0,
46
- "LABEL_1": 1,
47
- "LABEL_2": 2
48
- },
49
- "max_position_embeddings": 1024,
50
- "no_repeat_ngram_size": 3,
51
- "normalize_before": false,
52
- "num_hidden_layers": 12,
53
- "pad_token_id": 1,
54
- "scale_embedding": false,
55
- "num_beams": 3
56
- },
57
- "vision_config": {
58
- "model_type": "davit",
59
- "drop_path_rate": 0.1,
60
- "patch_size": [7, 3, 3, 3],
61
- "patch_stride": [4, 2, 2, 2],
62
- "patch_padding": [3, 1, 1, 1],
63
- "patch_prenorm": [false, true, true, true],
64
- "enable_checkpoint": false,
65
- "dim_embed": [256, 512, 1024, 2048],
66
- "num_heads": [8, 16, 32, 64],
67
- "num_groups": [8, 16, 32, 64],
68
- "depths": [1, 1, 9, 1],
69
- "window_size": 12,
70
- "projection_dim": 1024,
71
- "visual_temporal_embedding": {
72
- "type": "COSINE",
73
- "max_temporal_embeddings": 100
74
- },
75
- "image_pos_embed": {
76
- "type": "learned_abs_2d",
77
- "max_pos_embeddings": 50
78
- },
79
- "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
80
- },
81
- "vocab_size": 51289,
82
- "transformers_version": "4.48.3",
83
- "is_encoder_decoder": true
84
  }
 
1
+ {
2
+ "_name_or_path": "florence2",
3
+ "architectures": [
4
+ "Florence2ForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_florence2.Florence2Config",
8
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 0,
11
+ "eos_token_id": 2,
12
+ "ignore_index": -100,
13
+ "model_type": "florence2",
14
+ "pad_token_id": 1,
15
+ "projection_dim": 1024,
16
+ "text_config": {
17
+ "vocab_size": 51289,
18
+ "activation_dropout": 0.1,
19
+ "activation_function": "gelu",
20
+ "add_bias_logits": false,
21
+ "add_final_layer_norm": false,
22
+ "attention_dropout": 0.1,
23
+ "bos_token_id": 0,
24
+ "classif_dropout": 0.1,
25
+ "classifier_dropout": 0.0,
26
+ "d_model": 1024,
27
+ "decoder_attention_heads": 16,
28
+ "decoder_ffn_dim": 4096,
29
+ "decoder_layerdrop": 0.0,
30
+ "decoder_layers": 12,
31
+ "decoder_start_token_id": 2,
32
+ "dropout": 0.1,
33
+ "early_stopping": true,
34
+ "encoder_attention_heads": 16,
35
+ "encoder_ffn_dim": 4096,
36
+ "encoder_layerdrop": 0.0,
37
+ "encoder_layers": 12,
38
+ "eos_token_id": 2,
39
+ "forced_eos_token_id": 2,
40
+ "forced_bos_token_id": 0,
41
+ "gradient_checkpointing": false,
42
+ "init_std": 0.02,
43
+ "is_encoder_decoder": true,
44
+ "label2id": {
45
+ "LABEL_0": 0,
46
+ "LABEL_1": 1,
47
+ "LABEL_2": 2
48
+ },
49
+ "max_position_embeddings": 1024,
50
+ "no_repeat_ngram_size": 3,
51
+ "normalize_before": false,
52
+ "num_hidden_layers": 12,
53
+ "pad_token_id": 1,
54
+ "scale_embedding": false,
55
+ "num_beams": 3
56
+ },
57
+ "vision_config": {
58
+ "model_type": "davit",
59
+ "drop_path_rate": 0.1,
60
+ "patch_size": [7, 3, 3, 3],
61
+ "patch_stride": [4, 2, 2, 2],
62
+ "patch_padding": [3, 1, 1, 1],
63
+ "patch_prenorm": [false, true, true, true],
64
+ "enable_checkpoint": false,
65
+ "dim_embed": [256, 512, 1024, 2048],
66
+ "num_heads": [8, 16, 32, 64],
67
+ "num_groups": [8, 16, 32, 64],
68
+ "depths": [1, 1, 9, 1],
69
+ "window_size": 12,
70
+ "projection_dim": 1024,
71
+ "visual_temporal_embedding": {
72
+ "type": "COSINE",
73
+ "max_temporal_embeddings": 100
74
+ },
75
+ "image_pos_embed": {
76
+ "type": "learned_abs_2d",
77
+ "max_pos_embeddings": 50
78
+ },
79
+ "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
80
+ },
81
+ "vocab_size": 51289,
82
+ "transformers_version": "4.48.3",
83
+ "is_encoder_decoder": true
84
  }
generation_config.json CHANGED
@@ -1,4 +1,4 @@
1
- {
2
- "num_beams": 3,
3
- "early_stopping": false
4
- }
 
1
+ {
2
+ "num_beams": 3,
3
+ "early_stopping": false
4
+ }
processing_florence2.py CHANGED
@@ -84,9 +84,12 @@ class Florence2Processor(ProcessorMixin):
84
 
85
  self.image_seq_length = image_processor.image_seq_length
86
 
 
 
 
87
  tokens_to_add = {
88
  'additional_special_tokens': \
89
- tokenizer.additional_special_tokens + \
90
  ['<od>', '</od>', '<ocr>', '</ocr>'] + \
91
  [f'<loc_{x}>' for x in range(1000)] + \
92
  ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
@@ -138,7 +141,7 @@ class Florence2Processor(ProcessorMixin):
138
 
139
 
140
  super().__init__(image_processor, tokenizer)
141
-
142
  def _construct_prompts(self, text):
143
  # replace the task tokens with the task prompts if task token is in the text
144
  prompts = []
@@ -149,7 +152,7 @@ class Florence2Processor(ProcessorMixin):
149
  assert _text == task_token, f"Task token {task_token} should be the only token in the text."
150
  _text = task_prompt
151
  break
152
- # 2. task prompts with additional inputs
153
  for task_token, task_prompt in self.task_prompts_with_input.items():
154
  if task_token in _text:
155
  _text = task_prompt.format(input=_text.replace(task_token, ''))
@@ -381,7 +384,7 @@ class Florence2Processor(ProcessorMixin):
381
 
382
  final_answer = {
383
  task: final_answer}
384
- return final_answer
385
 
386
  class BoxQuantizer(object):
387
  def __init__(self, mode, bins):
@@ -505,8 +508,8 @@ class CoordinatesQuantizer(object):
505
 
506
 
507
  class Florence2PostProcesser(object):
508
- r"""
509
- Florence-2 post process for converting text prediction to various tasks results.
510
 
511
  Args:
512
  config: A dict of configs.
@@ -588,7 +591,7 @@ class Florence2PostProcesser(object):
588
  )
589
 
590
  return black_list
591
-
592
  def _create_default_config(self):
593
  config = {
594
  'NUM_BBOX_HEIGHT_BINS': 1000,
@@ -645,7 +648,7 @@ class Florence2PostProcesser(object):
645
  box_quantization_mode,
646
  (num_bbox_width_bins, num_bbox_height_bins),
647
  )
648
-
649
  num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
650
  num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
651
  box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
@@ -698,7 +701,7 @@ class Florence2PostProcesser(object):
698
  instance['bbox'] = self.box_quantizer.dequantize(
699
  boxes=torch.tensor(bbox_bins),
700
  size=image_size
701
- ).tolist()
702
 
703
  if phrase_centric:
704
  instance['cat_name'] = parsed[i].group(1).lower().strip()
@@ -708,9 +711,9 @@ class Florence2PostProcesser(object):
708
 
709
  return instances
710
 
711
- def parse_ocr_from_text_and_spans(self,
712
- text,
713
- pattern,
714
  image_size,
715
  area_threshold=-1.0,
716
  ):
@@ -752,7 +755,7 @@ class Florence2PostProcesser(object):
752
  def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
753
  # ignore <s> </s> and <pad>
754
  cur_span = 0
755
- if text.startswith('<s>'):
756
  cur_span += 3
757
 
758
  text = text.replace('<s>', '')
@@ -761,7 +764,7 @@ class Florence2PostProcesser(object):
761
 
762
  pattern = r"([^<]+(?:<loc_\d+>){4,})"
763
  phrases = re.findall(pattern, text)
764
-
765
  # pattern should be text pattern and od pattern
766
  pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
767
  box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
@@ -778,7 +781,7 @@ class Florence2PostProcesser(object):
778
  # Prepare instance.
779
  instance = {}
780
 
781
- # parse phrase, get string
782
  phrase = re.search(pattern, phrase_text_strip)
783
  if phrase is None:
784
  cur_span += len(pharse_text)
@@ -798,12 +801,12 @@ class Florence2PostProcesser(object):
798
  cur_span += len(pharse_text)
799
  continue
800
 
801
- # a list of list
802
  bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
803
  instance['bbox'] = self.box_quantizer.dequantize(
804
  boxes=torch.tensor(bbox_bins),
805
  size=image_size
806
- ).tolist()
807
 
808
  # exclude non-ascii characters
809
  phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
@@ -814,13 +817,13 @@ class Florence2PostProcesser(object):
814
  return instances
815
 
816
  def parse_description_with_bboxes_from_text_and_spans(
817
- self,
818
- text,
819
  spans=None,
820
  scores=None,
821
  score_mode=None,
822
- pattern=None,
823
- image_size=None,
824
  allow_empty_phrase=False
825
  ):
826
  def find_matched_token_indices(cur_span, token_spans):
@@ -831,7 +834,7 @@ class Florence2PostProcesser(object):
831
  return inds
832
 
833
  cur_span = 0
834
- if text.startswith('<s>'):
835
  cur_span += 3
836
 
837
  text = text.replace('<s>', '')
@@ -839,11 +842,11 @@ class Florence2PostProcesser(object):
839
  text = text.replace('<pad>', '')
840
 
841
  if allow_empty_phrase:
842
- pattern = rf"(?:(?:<loc_\d+>){{4,}})"
843
  else:
844
  pattern = r"([^<]+(?:<loc_\d+>){4,})"
845
  phrases = re.findall(pattern, text)
846
-
847
  # pattern should be text pattern and od pattern
848
  pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
849
  box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
@@ -857,7 +860,7 @@ class Florence2PostProcesser(object):
857
  cur_span += len(pharse_text)
858
  continue
859
 
860
- # parse phrase, get string
861
  phrase = re.search(pattern, phrase_text_strip)
862
  if phrase is None:
863
  cur_span += len(pharse_text)
@@ -874,13 +877,13 @@ class Florence2PostProcesser(object):
874
  cur_span += len(pharse_text)
875
  continue
876
 
877
- # a list of list
878
  bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
879
 
880
  bboxes = self.box_quantizer.dequantize(
881
  boxes=torch.tensor(bbox_bins),
882
  size=image_size
883
- ).tolist()
884
 
885
  if score_mode == 'avg_loc_scores':
886
  if spans is None or scores is None:
@@ -893,7 +896,7 @@ class Florence2PostProcesser(object):
893
  loc_scores = [scores[token_i] for token_i in token_inds]
894
  score = sum(loc_scores) / len(loc_scores)
895
  all_scores.append(score)
896
- elif score_mode == 'avg_cat_name_scores':
897
  if spans is None or scores is None:
898
  all_scores = None
899
  else:
@@ -916,19 +919,19 @@ class Florence2PostProcesser(object):
916
  if all_scores is not None:
917
  instance['score'] = math.exp(all_scores[_idx])
918
  instances.append(instance)
919
-
920
  cur_span += len(pharse_text)
921
 
922
  return instances
923
 
924
- def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
925
  allow_empty_phrase=False,
926
  polygon_sep_token='<sep>',
927
  polygon_start_token='<poly>',
928
  polygon_end_token='</poly>',
929
  with_box_at_start=False,
930
  ):
931
-
932
  # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
933
  # ignore <s> </s> and <pad>
934
 
@@ -939,7 +942,7 @@ class Florence2PostProcesser(object):
939
  if allow_empty_phrase:
940
  pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
941
  else:
942
- # [^<]+: This part matches one or more characters that are not the < symbol.
943
  # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
944
  #
945
  pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
@@ -950,7 +953,7 @@ class Florence2PostProcesser(object):
950
 
951
  # one polygons instance is separated by polygon_start_token and polygon_end_token
952
  polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
953
-
954
  instances = []
955
  for phrase_text in phrases:
956
 
@@ -965,7 +968,7 @@ class Florence2PostProcesser(object):
965
  continue
966
 
967
 
968
- # parse phrase, get string
969
  phrase = re.search(phrase_string_pattern, phrase_text_strip)
970
  if phrase is None:
971
  continue
@@ -986,7 +989,7 @@ class Florence2PostProcesser(object):
986
  instance = {}
987
 
988
  # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
989
- if isinstance(_polygons_instances_parsed, str):
990
  polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
991
  else:
992
  polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
@@ -1008,10 +1011,10 @@ class Florence2PostProcesser(object):
1008
  _polygon = _polygon[4:]
1009
  else:
1010
  bbox = [0, 0, 0, 0]
1011
- # abandon last element if is not paired
1012
  if len(_polygon) % 2 == 1:
1013
  _polygon = _polygon[:-1]
1014
-
1015
  # reshape into (n, 2)
1016
  _polygon = self.coordinates_quantizer.dequantize(
1017
  torch.tensor(np.array(_polygon).reshape(-1, 2)),
@@ -1026,7 +1029,7 @@ class Florence2PostProcesser(object):
1026
  instance['bbox'] = self.box_quantizer.dequantize(
1027
  boxes=torch.tensor([bbox]),
1028
  size=image_size
1029
- ).tolist()[0]
1030
 
1031
  instances.append(instance)
1032
 
@@ -1052,8 +1055,8 @@ class Florence2PostProcesser(object):
1052
  parse_tasks = [parse_tasks]
1053
  for _parse_task in parse_tasks:
1054
  assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
1055
-
1056
- # sequence or text should be provided
1057
  assert sequence is not None or text is not None, 'sequence or text should be provided'
1058
  assert sequence is None or text is None, 'only one of sequence and text should be provided'
1059
 
@@ -1087,16 +1090,16 @@ class Florence2PostProcesser(object):
1087
  )
1088
  parsed_dict['ocr'] = instances
1089
  elif task == 'phrase_grounding':
1090
- instances = self.parse_phrase_grounding_from_text_and_spans(
1091
  text,
1092
  pattern=pattern,
1093
  image_size=image_size,
1094
  )
1095
  parsed_dict['phrase_grounding'] = instances
1096
  elif task == 'pure_text':
1097
- parsed_dict['pure_text'] = text
1098
  elif task == 'description_with_bboxes':
1099
- instances = self.parse_description_with_bboxes_from_text_and_spans(
1100
  text,
1101
  spans=spans,
1102
  scores=transition_beam_score,
@@ -1106,14 +1109,14 @@ class Florence2PostProcesser(object):
1106
  )
1107
  parsed_dict['description_with_bboxes'] = instances
1108
  elif task == 'description_with_polygons':
1109
- instances = self.parse_description_with_polygons_from_text_and_spans(
1110
  text,
1111
  pattern=pattern,
1112
  image_size=image_size,
1113
  )
1114
  parsed_dict['description_with_polygons'] = instances
1115
  elif task == 'polygons':
1116
- instances = self.parse_description_with_polygons_from_text_and_spans(
1117
  text,
1118
  pattern=pattern,
1119
  image_size=image_size,
@@ -1121,7 +1124,7 @@ class Florence2PostProcesser(object):
1121
  )
1122
  parsed_dict['polygons'] = instances
1123
  elif task == 'bboxes':
1124
- instances = self.parse_description_with_bboxes_from_text_and_spans(
1125
  text,
1126
  pattern=pattern,
1127
  image_size=image_size,
@@ -1131,13 +1134,13 @@ class Florence2PostProcesser(object):
1131
  elif task == 'description_with_bboxes_or_polygons':
1132
  if '<poly>' in text:
1133
  # only support either polygons or bboxes, not both at the same time
1134
- instances = self.parse_description_with_polygons_from_text_and_spans(
1135
  text,
1136
  pattern=pattern,
1137
  image_size=image_size,
1138
  )
1139
  else:
1140
- instances = self.parse_description_with_bboxes_from_text_and_spans(
1141
  text,
1142
  pattern=pattern,
1143
  image_size=image_size,
 
84
 
85
  self.image_seq_length = image_processor.image_seq_length
86
 
87
+ # Get existing additional_special_tokens safely (works with both Roberta and BART tokenizers)
88
+ existing_special_tokens = list(getattr(tokenizer, 'additional_special_tokens', []) or [])
89
+
90
  tokens_to_add = {
91
  'additional_special_tokens': \
92
+ existing_special_tokens + \
93
  ['<od>', '</od>', '<ocr>', '</ocr>'] + \
94
  [f'<loc_{x}>' for x in range(1000)] + \
95
  ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
 
141
 
142
 
143
  super().__init__(image_processor, tokenizer)
144
+
145
  def _construct_prompts(self, text):
146
  # replace the task tokens with the task prompts if task token is in the text
147
  prompts = []
 
152
  assert _text == task_token, f"Task token {task_token} should be the only token in the text."
153
  _text = task_prompt
154
  break
155
+ # 2. task prompts with additional inputs
156
  for task_token, task_prompt in self.task_prompts_with_input.items():
157
  if task_token in _text:
158
  _text = task_prompt.format(input=_text.replace(task_token, ''))
 
384
 
385
  final_answer = {
386
  task: final_answer}
387
+ return final_answer
388
 
389
  class BoxQuantizer(object):
390
  def __init__(self, mode, bins):
 
508
 
509
 
510
  class Florence2PostProcesser(object):
511
+ """
512
+ Florence-2 post process for converting text prediction to various tasks results.
513
 
514
  Args:
515
  config: A dict of configs.
 
591
  )
592
 
593
  return black_list
594
+
595
  def _create_default_config(self):
596
  config = {
597
  'NUM_BBOX_HEIGHT_BINS': 1000,
 
648
  box_quantization_mode,
649
  (num_bbox_width_bins, num_bbox_height_bins),
650
  )
651
+
652
  num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
653
  num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
654
  box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
 
701
  instance['bbox'] = self.box_quantizer.dequantize(
702
  boxes=torch.tensor(bbox_bins),
703
  size=image_size
704
+ ).tolist()
705
 
706
  if phrase_centric:
707
  instance['cat_name'] = parsed[i].group(1).lower().strip()
 
711
 
712
  return instances
713
 
714
+ def parse_ocr_from_text_and_spans(self,
715
+ text,
716
+ pattern,
717
  image_size,
718
  area_threshold=-1.0,
719
  ):
 
755
  def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
756
  # ignore <s> </s> and <pad>
757
  cur_span = 0
758
+ if text.startswith('<s>'):
759
  cur_span += 3
760
 
761
  text = text.replace('<s>', '')
 
764
 
765
  pattern = r"([^<]+(?:<loc_\d+>){4,})"
766
  phrases = re.findall(pattern, text)
767
+
768
  # pattern should be text pattern and od pattern
769
  pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
770
  box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
 
781
  # Prepare instance.
782
  instance = {}
783
 
784
+ # parse phrase, get string
785
  phrase = re.search(pattern, phrase_text_strip)
786
  if phrase is None:
787
  cur_span += len(pharse_text)
 
801
  cur_span += len(pharse_text)
802
  continue
803
 
804
+ # a list of list
805
  bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
806
  instance['bbox'] = self.box_quantizer.dequantize(
807
  boxes=torch.tensor(bbox_bins),
808
  size=image_size
809
+ ).tolist()
810
 
811
  # exclude non-ascii characters
812
  phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
 
817
  return instances
818
 
819
  def parse_description_with_bboxes_from_text_and_spans(
820
+ self,
821
+ text,
822
  spans=None,
823
  scores=None,
824
  score_mode=None,
825
+ pattern=None,
826
+ image_size=None,
827
  allow_empty_phrase=False
828
  ):
829
  def find_matched_token_indices(cur_span, token_spans):
 
834
  return inds
835
 
836
  cur_span = 0
837
+ if text.startswith('<s>'):
838
  cur_span += 3
839
 
840
  text = text.replace('<s>', '')
 
842
  text = text.replace('<pad>', '')
843
 
844
  if allow_empty_phrase:
845
+ pattern = r"(?:(?:<loc_\d+>){4,})"
846
  else:
847
  pattern = r"([^<]+(?:<loc_\d+>){4,})"
848
  phrases = re.findall(pattern, text)
849
+
850
  # pattern should be text pattern and od pattern
851
  pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
852
  box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
 
860
  cur_span += len(pharse_text)
861
  continue
862
 
863
+ # parse phrase, get string
864
  phrase = re.search(pattern, phrase_text_strip)
865
  if phrase is None:
866
  cur_span += len(pharse_text)
 
877
  cur_span += len(pharse_text)
878
  continue
879
 
880
+ # a list of list
881
  bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
882
 
883
  bboxes = self.box_quantizer.dequantize(
884
  boxes=torch.tensor(bbox_bins),
885
  size=image_size
886
+ ).tolist()
887
 
888
  if score_mode == 'avg_loc_scores':
889
  if spans is None or scores is None:
 
896
  loc_scores = [scores[token_i] for token_i in token_inds]
897
  score = sum(loc_scores) / len(loc_scores)
898
  all_scores.append(score)
899
+ elif score_mode == 'avg_cat_name_scores':
900
  if spans is None or scores is None:
901
  all_scores = None
902
  else:
 
919
  if all_scores is not None:
920
  instance['score'] = math.exp(all_scores[_idx])
921
  instances.append(instance)
922
+
923
  cur_span += len(pharse_text)
924
 
925
  return instances
926
 
927
+ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
928
  allow_empty_phrase=False,
929
  polygon_sep_token='<sep>',
930
  polygon_start_token='<poly>',
931
  polygon_end_token='</poly>',
932
  with_box_at_start=False,
933
  ):
934
+
935
  # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
936
  # ignore <s> </s> and <pad>
937
 
 
942
  if allow_empty_phrase:
943
  pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
944
  else:
945
+ # [^<]+: This part matches one or more characters that are not the < symbol.
946
  # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
947
  #
948
  pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
 
953
 
954
  # one polygons instance is separated by polygon_start_token and polygon_end_token
955
  polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
956
+
957
  instances = []
958
  for phrase_text in phrases:
959
 
 
968
  continue
969
 
970
 
971
+ # parse phrase, get string
972
  phrase = re.search(phrase_string_pattern, phrase_text_strip)
973
  if phrase is None:
974
  continue
 
989
  instance = {}
990
 
991
  # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
992
+ if isinstance(_polygons_instances_parsed, str):
993
  polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
994
  else:
995
  polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
 
1011
  _polygon = _polygon[4:]
1012
  else:
1013
  bbox = [0, 0, 0, 0]
1014
+ # abandon last element if is not paired
1015
  if len(_polygon) % 2 == 1:
1016
  _polygon = _polygon[:-1]
1017
+
1018
  # reshape into (n, 2)
1019
  _polygon = self.coordinates_quantizer.dequantize(
1020
  torch.tensor(np.array(_polygon).reshape(-1, 2)),
 
1029
  instance['bbox'] = self.box_quantizer.dequantize(
1030
  boxes=torch.tensor([bbox]),
1031
  size=image_size
1032
+ ).tolist()[0]
1033
 
1034
  instances.append(instance)
1035
 
 
1055
  parse_tasks = [parse_tasks]
1056
  for _parse_task in parse_tasks:
1057
  assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
1058
+
1059
+ # sequence or text should be provided
1060
  assert sequence is not None or text is not None, 'sequence or text should be provided'
1061
  assert sequence is None or text is None, 'only one of sequence and text should be provided'
1062
 
 
1090
  )
1091
  parsed_dict['ocr'] = instances
1092
  elif task == 'phrase_grounding':
1093
+ instances = self.parse_phrase_grounding_from_text_and_spans(
1094
  text,
1095
  pattern=pattern,
1096
  image_size=image_size,
1097
  )
1098
  parsed_dict['phrase_grounding'] = instances
1099
  elif task == 'pure_text':
1100
+ parsed_dict['pure_text'] = text
1101
  elif task == 'description_with_bboxes':
1102
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1103
  text,
1104
  spans=spans,
1105
  scores=transition_beam_score,
 
1109
  )
1110
  parsed_dict['description_with_bboxes'] = instances
1111
  elif task == 'description_with_polygons':
1112
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1113
  text,
1114
  pattern=pattern,
1115
  image_size=image_size,
1116
  )
1117
  parsed_dict['description_with_polygons'] = instances
1118
  elif task == 'polygons':
1119
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1120
  text,
1121
  pattern=pattern,
1122
  image_size=image_size,
 
1124
  )
1125
  parsed_dict['polygons'] = instances
1126
  elif task == 'bboxes':
1127
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1128
  text,
1129
  pattern=pattern,
1130
  image_size=image_size,
 
1134
  elif task == 'description_with_bboxes_or_polygons':
1135
  if '<poly>' in text:
1136
  # only support either polygons or bboxes, not both at the same time
1137
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1138
  text,
1139
  pattern=pattern,
1140
  image_size=image_size,
1141
  )
1142
  else:
1143
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1144
  text,
1145
  pattern=pattern,
1146
  image_size=image_size,