support 'cpu'

modeling_GOT.py (+15 -15): device placement is no longer hardcoded to CUDA. input_ids and the image tensors are now moved with .to(self.device), so inference works on CPU as well as GPU.
--- a/modeling_GOT.py
+++ b/modeling_GOT.py
@@ -558,7 +558,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         image_tensor_1 = image_processor_high(image)
 
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(self.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
@@ -569,7 +569,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     no_repeat_ngram_size = 20,
@@ -581,7 +581,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    images=[image_tensor_1.unsqueeze(0).to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     no_repeat_ngram_size = 20,
@@ -589,9 +589,9 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     max_new_tokens=4096,
                     stopping_criteria=[stopping_criteria]
                     )
-
+
             outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
-
+
             if outputs.endswith(stop_str):
                 outputs = outputs[:-len(stop_str)]
             outputs = outputs.strip()
@@ -616,7 +616,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
             if ocr_type == 'format' and '**kern' not in outputs:
 
-
+
                 if '\\begin{tikzpicture}' not in outputs:
                     html_path_2 = save_render_file
                     right_num = outputs.count('\\right')
@@ -631,8 +631,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     outputs_list = outputs.split('\n')
                     gt= ''
                     for out in outputs_list:
-                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
-
+                        gt += '"' + out.replace('\\', '\\\\') + r'\n' + '"' + '+' + '\n'
+
                     gt = gt[:-2]
 
 
@@ -652,7 +652,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                             out = out[:-1]
                         if out is None:
                             break
-
+
                         if out:
                             if out[-1] != ';':
                                 gt += out[:-1] + ';\n'
@@ -671,7 +671,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         return response_str
 
     def dynamic_preprocess(self, image, min_num=1, max_num=6, image_size=1024, use_thumbnail=True):
-
+
         def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
             best_ratio_diff = float('inf')
             best_ratio = (1, 1)
@@ -687,7 +687,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                     best_ratio = ratio
             # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
             return best_ratio
-
+
         orig_width, orig_height = image.size
         aspect_ratio = orig_width / orig_height
 
@@ -785,7 +785,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
 
         if use_im_start_end:
-            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_PATCH_TOKEN*image_token_len*ll + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
             qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
 
@@ -812,7 +812,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
 
         inputs = tokenizer([prompt])
 
-        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        input_ids = torch.as_tensor(inputs.input_ids).to(self.device)
 
         stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
         keywords = [stop_str]
@@ -823,7 +823,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_list.half().cuda()],
+                    images=[image_list.to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     # no_repeat_ngram_size = 20,
@@ -835,7 +835,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 output_ids = self.generate(
                     input_ids,
-                    images=[image_list.half().cuda()],
+                    images=[image_list.to(self.device)],
                     do_sample=False,
                     num_beams = 1,
                     # no_repeat_ngram_size = 20,
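With device placement routed through self.device, the checkpoint can be loaded and run without a GPU. A minimal CPU-inference sketch (model id and chat() arguments as documented in the GOT-OCR2.0 README; the image path is illustrative):

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cpu',                  # keep every weight on CPU
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.eval()                   # no .cuda() call needed anymore

res = model.chat(tokenizer, 'page_image.png', ocr_type='ocr')  # illustrative path
print(res)

Note that the torch.autocast("cuda", dtype=torch.bfloat16) contexts are untouched by this commit; autocast for device_type "cuda" only affects CUDA ops, so a CPU-only run simply computes in the model's native dtype.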