Commit 2915058 (parent: 7883098): debug

Files changed:
- app.py (+9 -9)
- data/mm_data/ocr_dataset.py (+10 -4)
app.py
CHANGED
@@ -70,7 +70,7 @@ def get_images(img: str, reader: ReaderLite, **kwargs):
     return results
 
 
-def draw_boxes(image, bounds, color='red', width=…
+def draw_boxes(image, bounds, color='red', width=10):
     draw = ImageDraw.Draw(image)
     for i, bound in enumerate(bounds):
         p0, p1, p2, p3 = bound
@@ -102,7 +102,7 @@ def patch_resize_transform(patch_image_size=480, is_document=False):
     _patch_resize_transform = transforms.Compose(
         [
             lambda image: ocr_resize(
-                image, patch_image_size, is_document=is_document
+                image, patch_image_size, is_document=is_document, split='test',
             ),
             transforms.ToTensor(),
             transforms.Normalize(mean=mean, std=std),
@@ -113,7 +113,7 @@ def patch_resize_transform(patch_image_size=480, is_document=False):
 
 
 reader = ReaderLite()
-overrides={"eval_cider": False, "beam": …
+overrides={"eval_cider": False, "beam": 4, "max_len_b": 32, "patch_image_size": 480,
            "orig_patch_image_size": 224, "no_repeat_ngram_size": 0, "seed": 7}
 models, cfg, task = checkpoint_utils.load_model_ensemble_and_task(
     utils.split_paths('checkpoints/ocr_general_clean.pt'),
@@ -163,9 +163,9 @@ def apply_half(t):
     return t
 
 
-def ocr(…
-    out_img = Image.open(…
-    results = get_images(…
+def ocr(Image):
+    out_img = Image.open(Image)
+    results = get_images(Image, reader, link_threshold=0.2)
     box_list, image_list = zip(*results)
     draw_boxes(out_img, box_list)
 
@@ -191,9 +191,9 @@ description = "Gradio Demo for OFA-OCR. Upload your own image or click any one o
 article = "<p style='text-align: center'><a href='https://github.com/OFA-Sys/OFA' target='_blank'>OFA Github " \
           "Repo</a></p> "
 examples = [['lihe.png']]
-io = gr.Interface(fn=ocr, inputs=gr.inputs.Image(type='filepath'),
-                  outputs=[gr.outputs.Image(type='pil'), gr.outputs.Textbox(label="OCR result")],
+io = gr.Interface(fn=ocr, inputs=gr.inputs.Image(type='filepath', label='Image'),
+                  outputs=[gr.outputs.Image(type='pil', label='Image'), gr.outputs.Textbox(label="OCR result")],
                   title=title, description=description, article=article, examples=examples,
-                  allow_flagging=…
+                  allow_flagging='never', allow_screenshot=False)
 io.launch(cache_examples=True)
 
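A note on the rewritten entry point above: the Gradio callback is now declared as def ocr(Image), so inside the function the name Image is bound to the uploaded file path (a plain str, because the input component uses type='filepath') and it shadows PIL's Image module, which the handler still needs for Image.open. The call Image.open(Image) therefore resolves against the string and raises AttributeError the moment the demo is invoked. A minimal sketch of the same handler with the shadowing removed is below; get_images, draw_boxes and reader are the names defined elsewhere in app.py, while the parameter name img_path and the trailing recognition step are placeholders for code outside this diff.

from PIL import Image

def ocr(img_path):
    # The PIL module is no longer shadowed, so Image.open works on the path string.
    out_img = Image.open(img_path)
    # Detect text regions with the ReaderLite instance created earlier in app.py.
    results = get_images(img_path, reader, link_threshold=0.2)
    box_list, image_list = zip(*results)
    # Annotate the uploaded image with the detected boxes.
    draw_boxes(out_img, box_list)
    # ...recognition over image_list continues in the unchanged part of app.py...
    return out_img
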
data/mm_data/ocr_dataset.py
CHANGED
@@ -82,7 +82,7 @@ def collate(samples, pad_idx, eos_idx):
     return batch
 
 
-def ocr_resize(img, patch_image_size, is_document=False):
+def ocr_resize(img, patch_image_size, is_document=False, split='train'):
     img = img.convert("RGB")
     width, height = img.size
 
@@ -92,13 +92,19 @@ def ocr_resize(img, patch_image_size, is_document=False):
     if width >= height:
         new_width = max(64, patch_image_size)
         new_height = max(64, int(patch_image_size * (height / width)))
-        …
+        if split != 'train':
+            top = int((patch_image_size - new_height) // 2)
+        else:
+            top = random.randint(0, patch_image_size - new_height)
         bottom = patch_image_size - new_height - top
         left, right = 0, 0
     else:
         new_height = max(64, patch_image_size)
         new_width = max(64, int(patch_image_size * (width / height)))
-        …
+        if split != 'train':
+            left = int((patch_image_size - new_width) // 2)
+        else:
+            left = random.randint(0, patch_image_size - new_width)
         right = patch_image_size - new_width - left
         top, bottom = 0, 0
 
@@ -151,7 +157,7 @@ class OcrDataset(OFADataset):
         self.patch_resize_transform = transforms.Compose(
             [
                 lambda image: ocr_resize(
-                    image, patch_image_size, is_document=is_document
+                    image, patch_image_size, is_document=is_document, split=split,
                 ),
                 transforms.ToTensor(),
                 transforms.Normalize(mean=mean, std=std),