ctranslate2-4you
/

GOT-OCR2_0-Customized

Image-Text-to-Text

vision-language

Model card Files Files and versions

ctranslate2-4you committed on Feb 15, 2025

Commit

cf5898c

·

verified ·

1 Parent(s): 119d30c

Update README.md

Files changed (1) hide show

README.md +9 -5

README.md CHANGED Viewed

@@ -127,8 +127,12 @@ from PIL import Image
 from transformers import AutoModel, AutoTokenizer
 import torch
-# Initialize model and tokenizer
-MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"  # Replace with local path if needed
 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 model = AutoModel.from_pretrained(
     MODEL_PATH,
@@ -142,9 +146,8 @@ model = model.eval().cuda()
 def clean_repetitive_lines(text):
     """
-    Cleans up repetitive lines from the OCR output. This is necessary because the model
-    sometimes produces duplicate lines as artifacts in the OCR process. This function
-    identifies sequences of repeated lines and removes the duplicates above 2 instances.
     """
     lines = text.split('\n')
     cleaned_lines = []
@@ -175,6 +178,7 @@ def process_pdf_for_ocr(tokenizer, model, pdf_path):
         matrix = fitz.Matrix(zoom, zoom)
         pix = page.get_pixmap(matrix=matrix)
         img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
         res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
         if res.strip():

 from transformers import AutoModel, AutoTokenizer
 import torch
+# The following three lines are optional; they remove the last remaining logging message from Transformers.
+# import warnings
+# from transformers import logging as transformers_logging
+# transformers_logging.set_verbosity_error()
+MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"  # Replace with local path if desired
 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 model = AutoModel.from_pretrained(
     MODEL_PATH,
 def clean_repetitive_lines(text):
     """
+    Removes repetitive lines from the OCR output before saving the .txt file. This is necessary because
+    the model sometimes produces duplicate lines as OCR artifacts. All duplicates above 2 instances are removed.
     """
     lines = text.split('\n')
     cleaned_lines = []
         matrix = fitz.Matrix(zoom, zoom)
         pix = page.get_pixmap(matrix=matrix)
         img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        # gradio_input=True is used because we're creating images for each page of a .pdf using PyMuPDF and Pillow instead of relying on the model's internal code
         res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
         if res.strip():