Making the Code Runnable on CPU

#10 by DRXD1000 - opened

This patch removes the hard-coded CUDA placement from modeling_GOT.py so the model also runs on CPU: the `device='cuda'` defaults become `device=None`, `.cuda()` calls become `.to(self.device)`, `torch.autocast` is pointed at the model's own device instead of `"cuda"`, the tokenizer gets a pad token, and the imports are regrouped.

modeling_GOT.py (+39 -21)
```diff
@@ -1,25 +1,37 @@
-from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, StoppingCriteria, TextStreamer
-from transformers.cache_utils import Cache
+import dataclasses
+from io import BytesIO
 from typing import List, Optional, Tuple, Union
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+
 import requests
-from PIL import Image
-from io import BytesIO
 import torch
 import torch.nn as nn
+from PIL import Image
 from torch.nn import CrossEntropyLoss
-from .got_vision_b import build_GOT_vit_b
 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
-import dataclasses
+from transformers import (
+    Qwen2Config,
+    Qwen2ForCausalLM,
+    Qwen2Model,
+    StoppingCriteria,
+    TextStreamer,
+)
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
 
+from .got_vision_b import build_GOT_vit_b
 
 DEFAULT_IMAGE_TOKEN = "<image>"
 DEFAULT_IMAGE_PATCH_TOKEN = '<imgpad>'
 DEFAULT_IM_START_TOKEN = '<img>'
 DEFAULT_IM_END_TOKEN = '</img>'
 
-from enum import auto, Enum
+from enum import Enum, auto
+
+
 class SeparatorStyle(Enum):
     """Different separator style."""
     SINGLE = auto()
```
```diff
@@ -164,7 +176,7 @@ class GOTQwenModel(Qwen2Model):
         use_im_start_end=False,
         vision_select_layer=-1,
         dtype=torch.float16,
-        device='cuda'
+        device=None
     ):
 
 
```
```diff
@@ -453,7 +465,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         tokenizer,
         freeze_lm_model=False,
         pretrained_stage1_model=None,
-        device='cuda'
+        device=None
     ):
         config = self.get_model().config
 
```
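Both changed signatures now default to `device=None` instead of a hard-coded `'cuda'`, so the caller decides where the model lives. A minimal sketch of the fallback pattern this enables (the helper name `resolve_device` is illustrative, not part of the patch):

```python
import torch

def resolve_device(device=None):
    """Prefer an explicit device; otherwise use CUDA when available, else CPU."""
    if device is not None:
        return torch.device(device)
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(resolve_device())       # cuda on a GPU machine, cpu otherwise
print(resolve_device("cpu"))  # always cpu, regardless of hardware
```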
```diff
@@ -488,6 +500,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         self.disable_torch_init()
 
+        tokenizer.pad_token_id = tokenizer.eos_token_id
 
         image_processor_high = GOTImageEvalProcessor(image_size=1024)
 
```
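The added `tokenizer.pad_token_id = tokenizer.eos_token_id` gives `generate()` a pad token to work with, since Qwen2-style tokenizers may not define one; reusing EOS for padding is the usual workaround. An illustration with a guard (the checkpoint id is only an example):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")  # illustrative checkpoint
if tok.pad_token_id is None:          # only patch tokenizers that lack a pad token
    tok.pad_token_id = tok.eos_token_id
```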
```diff
@@ -558,7 +571,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         image_tensor_1 = image_processor_high(image)
 
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(self.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
```
```diff
@@ -566,10 +579,10 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
         if stream_flag:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with torch.autocast(str(self.device), dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).half().to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     no_repeat_ngram_size = 20,
```
```diff
@@ -578,10 +591,10 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     stopping_criteria=[stopping_criteria]
                     )
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with torch.autocast(str(self.device), dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).half().to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     no_repeat_ngram_size = 20,
```
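`torch.autocast` takes a device-type string as its first argument, so the hard-coded `"cuda"` becomes a string derived from the model's placement; on CPU, `str(self.device)` is exactly `"cpu"`, and bfloat16 autocast is supported there. One caveat worth hedging: on a GPU, `str(self.device)` can carry an index (`"cuda:0"`), which some PyTorch versions reject as a `device_type`, so `self.device.type` is a slightly safer spelling. A sketch of the device-agnostic form:

```python
import torch

model_device = torch.device("cpu")  # stand-in for self.device on the patched model

# .type is always a bare "cpu"/"cuda", while str() keeps any index ("cuda:0")
with torch.autocast(model_device.type, dtype=torch.bfloat16):
    y = torch.randn(4, 4) @ torch.randn(4, 4)  # matmul runs under bf16 autocast
```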
```diff
@@ -599,7 +612,12 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         if render:
             print('==============rendering===============')
-            from .render_tools import svg_to_html, content_mmd_to_html, tik_html, translation_table
+            from .render_tools import (
+                content_mmd_to_html,
+                svg_to_html,
+                tik_html,
+                translation_table,
+            )
 
             if '**kern' in outputs:
                 import verovio
```
```diff
@@ -812,7 +830,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         inputs = tokenizer([prompt])
 
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(self.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
```
```diff
@@ -820,10 +838,10 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
         if stream_flag:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with torch.autocast(str(self.device), dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_list.half().cuda()],
+                    images=[image_list.half().to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     # no_repeat_ngram_size = 20,
```
```diff
@@ -832,10 +850,10 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     stopping_criteria=[stopping_criteria]
                     )
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with torch.autocast(str(self.device), dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_list.half().cuda()],
+                    images=[image_list.half().to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     # no_repeat_ngram_size = 20,
```
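With the hard-coded `.cuda()` calls gone, the model loads and runs without a GPU. A usage sketch, assuming the `chat()` API from the upstream GOT-OCR2.0 README (the repo id and image path are placeholders for the patched checkpoint):

```python
from transformers import AutoModel, AutoTokenizer

repo_id, image_file = "ucaslcl/GOT-OCR2_0", "sample.png"  # placeholders

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    repo_id,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.eval()  # note: no .cuda() call; weights stay on the CPU

result = model.chat(tokenizer, image_file, ocr_type="ocr")
print(result)
```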