support 'cpu'

modeling_GOT.py (+15 -15): device placement is no longer hardcoded to CUDA. input_ids and the image tensors are now moved with .to(self.device), so inference works on CPU as well as GPU.
--- a/modeling_GOT.py
+++ b/modeling_GOT.py
@@ -558,7 +558,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         image_tensor_1 = image_processor_high(image)
 
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(self.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
@@ -569,7 +569,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     no_repeat_ngram_size = 20,
@@ -581,7 +581,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     no_repeat_ngram_size = 20,
@@ -589,9 +589,9 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
                     )
-
+
             outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
-
+
             if outputs.endswith(stop_str):
                 outputs = outputs[:-len(stop_str)]
             outputs = outputs.strip()
@@ -616,7 +616,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
             if ocr_type == 'format' and '**kern' not in outputs:
 
-
+
                 if '\\begin{tikzpicture}' not in outputs:
                     html_path_2 = save_render_file
                     right_num = outputs.count('\\right')
@@ -631,8 +631,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     outputs_list = outputs.split('\n')
                     gt= ''
                     for out in outputs_list:
-                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
-
+                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
+
                     gt = gt[:-2]
 
 
@@ -652,7 +652,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                             out = out[:-1]
                         if out is None:
                             break
-
+
                         if out:
                             if out[-1] != ';':
                                 gt += out[:-1] + ';\n'
@@ -671,7 +671,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         return response_str
 
     def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
-
+
         def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
             best_ratio_diff = float('inf')
             best_ratio = (1, 1)
@@ -687,7 +687,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     best_ratio = ratio
             # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
             return best_ratio
-
+
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
@@ -785,7 +785,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
 
         if use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
 
@@ -812,7 +812,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         inputs = tokenizer([prompt])
 
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(self.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
@@ -823,7 +823,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_list.half().cuda()],
+                    images=[image_list.to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     # no_repeat_ngram_size = 20,
@@ -835,7 +835,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_list.half().cuda()],
+                    images=[image_list.to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     # no_repeat_ngram_size = 20,
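With device placement routed through self.device, the checkpoint can be loaded and run without a GPU. A minimal CPU-inference sketch (model id and chat() arguments as documented in the GOT-OCR2.0 README; the image path is illustrative):

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cpu',                  # keep every weight on CPU
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.eval()                   # no .cuda() call needed anymore

res = model.chat(tokenizer, 'page_image.png', ocr_type='ocr')  # illustrative path
print(res)

Note that the torch.autocast("cuda", dtype=torch.bfloat16) contexts are untouched by this commit; autocast for device_type "cuda" only affects CUDA ops, so a CPU-only run simply computes in the model's native dtype.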