Upload folder using huggingface_hub

by silveroxides - opened Feb 12

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+138

-135

Files changed (3) hide show

config.json +83 -83
generation_config.json +4 -4
processing_florence2.py +51 -48

config.json CHANGED Viewed

@@ -1,84 +1,84 @@
-{
-  "_name_or_path": "florence2",
-  "architectures": [
-    "Florence2ForConditionalGeneration"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_florence2.Florence2Config",
-    "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
-  },
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "ignore_index": -100,
-  "model_type": "florence2",
-  "pad_token_id": 1,
-  "projection_dim": 1024,
-  "text_config": {
-      "vocab_size": 51289,
-      "activation_dropout": 0.1,
-      "activation_function": "gelu",
-      "add_bias_logits": false,
-      "add_final_layer_norm": false,
-      "attention_dropout": 0.1,
-      "bos_token_id": 0,
-      "classif_dropout": 0.1,
-      "classifier_dropout": 0.0,
-      "d_model": 1024,
-      "decoder_attention_heads": 16,
-      "decoder_ffn_dim": 4096,
-      "decoder_layerdrop": 0.0,
-      "decoder_layers": 12,
-      "decoder_start_token_id": 2,
-      "dropout": 0.1,
-      "early_stopping": true,
-      "encoder_attention_heads": 16,
-      "encoder_ffn_dim": 4096,
-      "encoder_layerdrop": 0.0,
-      "encoder_layers": 12,
-      "eos_token_id": 2,
-      "forced_eos_token_id": 2,
-      "forced_bos_token_id": 0,
-      "gradient_checkpointing": false,
-      "init_std": 0.02,
-      "is_encoder_decoder": true,
-      "label2id": {
-        "LABEL_0": 0,
-        "LABEL_1": 1,
-        "LABEL_2": 2
-      },
-      "max_position_embeddings": 1024,
-      "no_repeat_ngram_size": 3,
-      "normalize_before": false,
-      "num_hidden_layers": 12,
-      "pad_token_id": 1,
-      "scale_embedding": false,
-      "num_beams": 3
-  },
-  "vision_config": {
-    "model_type": "davit",
-    "drop_path_rate": 0.1,
-    "patch_size": [7, 3, 3, 3],
-    "patch_stride": [4, 2, 2, 2],
-    "patch_padding": [3, 1, 1, 1],
-    "patch_prenorm": [false, true, true, true],
-    "enable_checkpoint": false,
-    "dim_embed": [256, 512, 1024, 2048],
-    "num_heads": [8, 16, 32, 64],
-    "num_groups": [8, 16, 32, 64],
-    "depths": [1, 1, 9, 1],
-    "window_size": 12,
-    "projection_dim": 1024,
-    "visual_temporal_embedding": {
-        "type": "COSINE",
-        "max_temporal_embeddings": 100
-    },
-    "image_pos_embed": {
-        "type": "learned_abs_2d",
-        "max_pos_embeddings": 50
-    },
-    "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
-  },
-  "vocab_size": 51289,
-  "transformers_version": "4.48.3",
-  "is_encoder_decoder": true
 }

+{
+  "_name_or_path": "florence2",
+  "architectures": [
+    "Florence2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_florence2.Florence2Config",
+    "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+  },
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "ignore_index": -100,
+  "model_type": "florence2",
+  "pad_token_id": 1,
+  "projection_dim": 1024,
+  "text_config": {
+      "vocab_size": 51289,
+      "activation_dropout": 0.1,
+      "activation_function": "gelu",
+      "add_bias_logits": false,
+      "add_final_layer_norm": false,
+      "attention_dropout": 0.1,
+      "bos_token_id": 0,
+      "classif_dropout": 0.1,
+      "classifier_dropout": 0.0,
+      "d_model": 1024,
+      "decoder_attention_heads": 16,
+      "decoder_ffn_dim": 4096,
+      "decoder_layerdrop": 0.0,
+      "decoder_layers": 12,
+      "decoder_start_token_id": 2,
+      "dropout": 0.1,
+      "early_stopping": true,
+      "encoder_attention_heads": 16,
+      "encoder_ffn_dim": 4096,
+      "encoder_layerdrop": 0.0,
+      "encoder_layers": 12,
+      "eos_token_id": 2,
+      "forced_eos_token_id": 2,
+      "forced_bos_token_id": 0,
+      "gradient_checkpointing": false,
+      "init_std": 0.02,
+      "is_encoder_decoder": true,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1,
+        "LABEL_2": 2
+      },
+      "max_position_embeddings": 1024,
+      "no_repeat_ngram_size": 3,
+      "normalize_before": false,
+      "num_hidden_layers": 12,
+      "pad_token_id": 1,
+      "scale_embedding": false,
+      "num_beams": 3
+  },
+  "vision_config": {
+    "model_type": "davit",
+    "drop_path_rate": 0.1,
+    "patch_size": [7, 3, 3, 3],
+    "patch_stride": [4, 2, 2, 2],
+    "patch_padding": [3, 1, 1, 1],
+    "patch_prenorm": [false, true, true, true],
+    "enable_checkpoint": false,
+    "dim_embed": [256, 512, 1024, 2048],
+    "num_heads": [8, 16, 32, 64],
+    "num_groups": [8, 16, 32, 64],
+    "depths": [1, 1, 9, 1],
+    "window_size": 12,
+    "projection_dim": 1024,
+    "visual_temporal_embedding": {
+        "type": "COSINE",
+        "max_temporal_embeddings": 100
+    },
+    "image_pos_embed": {
+        "type": "learned_abs_2d",
+        "max_pos_embeddings": 50
+    },
+    "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
+  },
+  "vocab_size": 51289,
+  "transformers_version": "4.48.3",
+  "is_encoder_decoder": true
 }

generation_config.json CHANGED Viewed

@@ -1,4 +1,4 @@
-{
-    "num_beams": 3,
-    "early_stopping": false
-}

+{
+    "num_beams": 3,
+    "early_stopping": false
+}

processing_florence2.py CHANGED Viewed

@@ -84,9 +84,12 @@ class Florence2Processor(ProcessorMixin):
         self.image_seq_length = image_processor.image_seq_length
         tokens_to_add = {
                 'additional_special_tokens': \
-                    tokenizer.additional_special_tokens + \
                     ['<od>', '</od>', '<ocr>', '</ocr>'] + \
                     [f'<loc_{x}>' for x in range(1000)] + \
                     ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
@@ -138,7 +141,7 @@ class Florence2Processor(ProcessorMixin):
         super().__init__(image_processor, tokenizer)
     def _construct_prompts(self, text):
         # replace the task tokens with the task prompts if task token is in the text
         prompts = []
@@ -149,7 +152,7 @@ class Florence2Processor(ProcessorMixin):
                     assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                     _text = task_prompt
                     break
-            # 2. task prompts with additional inputs
             for task_token, task_prompt in self.task_prompts_with_input.items():
                 if task_token in _text:
                     _text = task_prompt.format(input=_text.replace(task_token, ''))
@@ -381,7 +384,7 @@ class Florence2Processor(ProcessorMixin):
         final_answer = {
             task: final_answer}
-        return final_answer
 class BoxQuantizer(object):
     def __init__(self, mode, bins):
@@ -505,8 +508,8 @@ class CoordinatesQuantizer(object):
 class Florence2PostProcesser(object):
-    r"""
-    Florence-2 post process for converting text prediction to various tasks results.
     Args:
         config: A dict of configs.
@@ -588,7 +591,7 @@ class Florence2PostProcesser(object):
             )
         return black_list
     def _create_default_config(self):
         config = {
             'NUM_BBOX_HEIGHT_BINS': 1000,
@@ -645,7 +648,7 @@ class Florence2PostProcesser(object):
             box_quantization_mode,
             (num_bbox_width_bins, num_bbox_height_bins),
         )
         num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
         num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
         box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
@@ -698,7 +701,7 @@ class Florence2PostProcesser(object):
             instance['bbox'] = self.box_quantizer.dequantize(
                 boxes=torch.tensor(bbox_bins),
                 size=image_size
-            ).tolist()
             if phrase_centric:
                 instance['cat_name'] = parsed[i].group(1).lower().strip()
@@ -708,9 +711,9 @@ class Florence2PostProcesser(object):
         return instances
-    def parse_ocr_from_text_and_spans(self,
-                                     text,
-                                     pattern,
                                      image_size,
                                      area_threshold=-1.0,
         ):
@@ -752,7 +755,7 @@ class Florence2PostProcesser(object):
     def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
         # ignore <s> </s> and <pad>
         cur_span = 0
-        if text.startswith('<s>'):
             cur_span += 3
         text = text.replace('<s>', '')
@@ -761,7 +764,7 @@ class Florence2PostProcesser(object):
         pattern = r"([^<]+(?:<loc_\d+>){4,})"
         phrases = re.findall(pattern, text)
         # pattern should be text pattern and od pattern
         pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
         box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
@@ -778,7 +781,7 @@ class Florence2PostProcesser(object):
             # Prepare instance.
             instance = {}
-            # parse phrase, get string
             phrase = re.search(pattern, phrase_text_strip)
             if phrase is None:
                 cur_span += len(pharse_text)
@@ -798,12 +801,12 @@ class Florence2PostProcesser(object):
                 cur_span += len(pharse_text)
                 continue
-            # a list of list
             bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
             instance['bbox'] = self.box_quantizer.dequantize(
                 boxes=torch.tensor(bbox_bins),
                 size=image_size
-            ).tolist()
             # exclude non-ascii characters
             phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
@@ -814,13 +817,13 @@ class Florence2PostProcesser(object):
         return instances
     def parse_description_with_bboxes_from_text_and_spans(
-            self,
-            text,
             spans=None,
             scores=None,
             score_mode=None,
-            pattern=None,
-            image_size=None,
             allow_empty_phrase=False
         ):
         def find_matched_token_indices(cur_span, token_spans):
@@ -831,7 +834,7 @@ class Florence2PostProcesser(object):
             return inds
         cur_span = 0
-        if text.startswith('<s>'):
             cur_span += 3
         text = text.replace('<s>', '')
@@ -839,11 +842,11 @@ class Florence2PostProcesser(object):
         text = text.replace('<pad>', '')
         if allow_empty_phrase:
-            pattern = rf"(?:(?:<loc_\d+>){{4,}})"
         else:
             pattern = r"([^<]+(?:<loc_\d+>){4,})"
         phrases = re.findall(pattern, text)
         # pattern should be text pattern and od pattern
         pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
         box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
@@ -857,7 +860,7 @@ class Florence2PostProcesser(object):
                 cur_span += len(pharse_text)
                 continue
-            # parse phrase, get string
             phrase = re.search(pattern, phrase_text_strip)
             if phrase is None:
                 cur_span += len(pharse_text)
@@ -874,13 +877,13 @@ class Florence2PostProcesser(object):
                 cur_span += len(pharse_text)
                 continue
-            # a list of list
             bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
             bboxes = self.box_quantizer.dequantize(
                 boxes=torch.tensor(bbox_bins),
                 size=image_size
-            ).tolist()
             if score_mode == 'avg_loc_scores':
                 if spans is None or scores is None:
@@ -893,7 +896,7 @@ class Florence2PostProcesser(object):
                         loc_scores = [scores[token_i] for token_i in token_inds]
                         score = sum(loc_scores) / len(loc_scores)
                         all_scores.append(score)
-            elif score_mode == 'avg_cat_name_scores':
                 if spans is None or scores is None:
                     all_scores = None
                 else:
@@ -916,19 +919,19 @@ class Florence2PostProcesser(object):
                 if all_scores is not None:
                     instance['score'] = math.exp(all_scores[_idx])
                 instances.append(instance)
             cur_span += len(pharse_text)
         return instances
-    def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                             allow_empty_phrase=False,
                                                             polygon_sep_token='<sep>',
                                                             polygon_start_token='<poly>',
                                                             polygon_end_token='</poly>',
                                                             with_box_at_start=False,
                                                             ):
         # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
         # ignore <s> </s> and <pad>
@@ -939,7 +942,7 @@ class Florence2PostProcesser(object):
         if allow_empty_phrase:
             pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
         else:
-            # [^<]+: This part matches one or more characters that are not the < symbol.
             # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
             #
             pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
@@ -950,7 +953,7 @@ class Florence2PostProcesser(object):
         # one polygons instance is separated by polygon_start_token and polygon_end_token
         polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
         instances = []
         for phrase_text in phrases:
@@ -965,7 +968,7 @@ class Florence2PostProcesser(object):
                 continue
-            # parse phrase, get string
             phrase = re.search(phrase_string_pattern, phrase_text_strip)
             if phrase is None:
                 continue
@@ -986,7 +989,7 @@ class Florence2PostProcesser(object):
                 instance = {}
                 # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
-                if isinstance(_polygons_instances_parsed, str):
                     polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
                 else:
                     polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
@@ -1008,10 +1011,10 @@ class Florence2PostProcesser(object):
                             _polygon = _polygon[4:]
                         else:
                             bbox = [0, 0, 0, 0]
-                    # abandon last element if is not paired
                     if len(_polygon) % 2 == 1:
                         _polygon = _polygon[:-1]
                     # reshape into (n, 2)
                     _polygon = self.coordinates_quantizer.dequantize(
                         torch.tensor(np.array(_polygon).reshape(-1, 2)),
@@ -1026,7 +1029,7 @@ class Florence2PostProcesser(object):
                     instance['bbox'] = self.box_quantizer.dequantize(
                         boxes=torch.tensor([bbox]),
                         size=image_size
-                    ).tolist()[0]
                 instances.append(instance)
@@ -1052,8 +1055,8 @@ class Florence2PostProcesser(object):
                 parse_tasks = [parse_tasks]
             for _parse_task in parse_tasks:
                 assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
-        # sequence or text should be provided
         assert sequence is not None or text is not None, 'sequence or text should be provided'
         assert sequence is None or text is None, 'only one of sequence and text should be provided'
@@ -1087,16 +1090,16 @@ class Florence2PostProcesser(object):
                 )
                 parsed_dict['ocr'] = instances
             elif task == 'phrase_grounding':
-                instances = self.parse_phrase_grounding_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
                 )
                 parsed_dict['phrase_grounding'] = instances
             elif task == 'pure_text':
-                parsed_dict['pure_text'] = text
             elif task == 'description_with_bboxes':
-                instances = self.parse_description_with_bboxes_from_text_and_spans(
                     text,
                     spans=spans,
                     scores=transition_beam_score,
@@ -1106,14 +1109,14 @@ class Florence2PostProcesser(object):
                 )
                 parsed_dict['description_with_bboxes'] = instances
             elif task == 'description_with_polygons':
-                instances = self.parse_description_with_polygons_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
                 )
                 parsed_dict['description_with_polygons'] = instances
             elif task == 'polygons':
-                instances = self.parse_description_with_polygons_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
@@ -1121,7 +1124,7 @@ class Florence2PostProcesser(object):
                 )
                 parsed_dict['polygons'] = instances
             elif task == 'bboxes':
-                instances = self.parse_description_with_bboxes_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
@@ -1131,13 +1134,13 @@ class Florence2PostProcesser(object):
             elif task == 'description_with_bboxes_or_polygons':
                 if '<poly>' in text:
                     # only support either polygons or bboxes, not both at the same time
-                    instances = self.parse_description_with_polygons_from_text_and_spans(
                         text,
                         pattern=pattern,
                         image_size=image_size,
                     )
                 else:
-                    instances = self.parse_description_with_bboxes_from_text_and_spans(
                         text,
                         pattern=pattern,
                         image_size=image_size,

         self.image_seq_length = image_processor.image_seq_length
+        # Get existing additional_special_tokens safely (works with both Roberta and BART tokenizers)
+        existing_special_tokens = list(getattr(tokenizer, 'additional_special_tokens', []) or [])
         tokens_to_add = {
                 'additional_special_tokens': \
+                    existing_special_tokens + \
                     ['<od>', '</od>', '<ocr>', '</ocr>'] + \
                     [f'<loc_{x}>' for x in range(1000)] + \
                     ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
         super().__init__(image_processor, tokenizer)
     def _construct_prompts(self, text):
         # replace the task tokens with the task prompts if task token is in the text
         prompts = []
                     assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                     _text = task_prompt
                     break
+            # 2. task prompts with additional inputs
             for task_token, task_prompt in self.task_prompts_with_input.items():
                 if task_token in _text:
                     _text = task_prompt.format(input=_text.replace(task_token, ''))
         final_answer = {
             task: final_answer}
+        return final_answer
 class BoxQuantizer(object):
     def __init__(self, mode, bins):
 class Florence2PostProcesser(object):
+    """
+    Florence-2 post process for converting text prediction to various tasks results.
     Args:
         config: A dict of configs.
             )
         return black_list
     def _create_default_config(self):
         config = {
             'NUM_BBOX_HEIGHT_BINS': 1000,
             box_quantization_mode,
             (num_bbox_width_bins, num_bbox_height_bins),
         )
         num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
         num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
         box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
             instance['bbox'] = self.box_quantizer.dequantize(
                 boxes=torch.tensor(bbox_bins),
                 size=image_size
+            ).tolist()
             if phrase_centric:
                 instance['cat_name'] = parsed[i].group(1).lower().strip()
         return instances
+    def parse_ocr_from_text_and_spans(self,
+                                     text,
+                                     pattern,
                                      image_size,
                                      area_threshold=-1.0,
         ):
     def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
         # ignore <s> </s> and <pad>
         cur_span = 0
+        if text.startswith('<s>'):
             cur_span += 3
         text = text.replace('<s>', '')
         pattern = r"([^<]+(?:<loc_\d+>){4,})"
         phrases = re.findall(pattern, text)
         # pattern should be text pattern and od pattern
         pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
         box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
             # Prepare instance.
             instance = {}
+            # parse phrase, get string
             phrase = re.search(pattern, phrase_text_strip)
             if phrase is None:
                 cur_span += len(pharse_text)
                 cur_span += len(pharse_text)
                 continue
+            # a list of list
             bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
             instance['bbox'] = self.box_quantizer.dequantize(
                 boxes=torch.tensor(bbox_bins),
                 size=image_size
+            ).tolist()
             # exclude non-ascii characters
             phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
         return instances
     def parse_description_with_bboxes_from_text_and_spans(
+            self,
+            text,
             spans=None,
             scores=None,
             score_mode=None,
+            pattern=None,
+            image_size=None,
             allow_empty_phrase=False
         ):
         def find_matched_token_indices(cur_span, token_spans):
             return inds
         cur_span = 0
+        if text.startswith('<s>'):
             cur_span += 3
         text = text.replace('<s>', '')
         text = text.replace('<pad>', '')
         if allow_empty_phrase:
+            pattern = r"(?:(?:<loc_\d+>){4,})"
         else:
             pattern = r"([^<]+(?:<loc_\d+>){4,})"
         phrases = re.findall(pattern, text)
         # pattern should be text pattern and od pattern
         pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
         box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
                 cur_span += len(pharse_text)
                 continue
+            # parse phrase, get string
             phrase = re.search(pattern, phrase_text_strip)
             if phrase is None:
                 cur_span += len(pharse_text)
                 cur_span += len(pharse_text)
                 continue
+            # a list of list
             bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
             bboxes = self.box_quantizer.dequantize(
                 boxes=torch.tensor(bbox_bins),
                 size=image_size
+            ).tolist()
             if score_mode == 'avg_loc_scores':
                 if spans is None or scores is None:
                         loc_scores = [scores[token_i] for token_i in token_inds]
                         score = sum(loc_scores) / len(loc_scores)
                         all_scores.append(score)
+            elif score_mode == 'avg_cat_name_scores':
                 if spans is None or scores is None:
                     all_scores = None
                 else:
                 if all_scores is not None:
                     instance['score'] = math.exp(all_scores[_idx])
                 instances.append(instance)
             cur_span += len(pharse_text)
         return instances
+    def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                             allow_empty_phrase=False,
                                                             polygon_sep_token='<sep>',
                                                             polygon_start_token='<poly>',
                                                             polygon_end_token='</poly>',
                                                             with_box_at_start=False,
                                                             ):
         # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
         # ignore <s> </s> and <pad>
         if allow_empty_phrase:
             pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
         else:
+            # [^<]+: This part matches one or more characters that are not the < symbol.
             # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
             #
             pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
         # one polygons instance is separated by polygon_start_token and polygon_end_token
         polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
         instances = []
         for phrase_text in phrases:
                 continue
+            # parse phrase, get string
             phrase = re.search(phrase_string_pattern, phrase_text_strip)
             if phrase is None:
                 continue
                 instance = {}
                 # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
+                if isinstance(_polygons_instances_parsed, str):
                     polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
                 else:
                     polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
                             _polygon = _polygon[4:]
                         else:
                             bbox = [0, 0, 0, 0]
+                    # abandon last element if is not paired
                     if len(_polygon) % 2 == 1:
                         _polygon = _polygon[:-1]
                     # reshape into (n, 2)
                     _polygon = self.coordinates_quantizer.dequantize(
                         torch.tensor(np.array(_polygon).reshape(-1, 2)),
                     instance['bbox'] = self.box_quantizer.dequantize(
                         boxes=torch.tensor([bbox]),
                         size=image_size
+                    ).tolist()[0]
                 instances.append(instance)
                 parse_tasks = [parse_tasks]
             for _parse_task in parse_tasks:
                 assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
+        # sequence or text should be provided
         assert sequence is not None or text is not None, 'sequence or text should be provided'
         assert sequence is None or text is None, 'only one of sequence and text should be provided'
                 )
                 parsed_dict['ocr'] = instances
             elif task == 'phrase_grounding':
+                instances = self.parse_phrase_grounding_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
                 )
                 parsed_dict['phrase_grounding'] = instances
             elif task == 'pure_text':
+                parsed_dict['pure_text'] = text
             elif task == 'description_with_bboxes':
+                instances = self.parse_description_with_bboxes_from_text_and_spans(
                     text,
                     spans=spans,
                     scores=transition_beam_score,
                 )
                 parsed_dict['description_with_bboxes'] = instances
             elif task == 'description_with_polygons':
+                instances = self.parse_description_with_polygons_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
                 )
                 parsed_dict['description_with_polygons'] = instances
             elif task == 'polygons':
+                instances = self.parse_description_with_polygons_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
                 )
                 parsed_dict['polygons'] = instances
             elif task == 'bboxes':
+                instances = self.parse_description_with_bboxes_from_text_and_spans(
                     text,
                     pattern=pattern,
                     image_size=image_size,
             elif task == 'description_with_bboxes_or_polygons':
                 if '<poly>' in text:
                     # only support either polygons or bboxes, not both at the same time
+                    instances = self.parse_description_with_polygons_from_text_and_spans(
                         text,
                         pattern=pattern,
                         image_size=image_size,
                     )
                 else:
+                    instances = self.parse_description_with_bboxes_from_text_and_spans(
                         text,
                         pattern=pattern,
                         image_size=image_size,