Update README.md
Browse files
README.md
CHANGED
|
@@ -127,8 +127,12 @@ from PIL import Image
|
|
| 127 |
from transformers import AutoModel, AutoTokenizer
|
| 128 |
import torch
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
|
| 133 |
model = AutoModel.from_pretrained(
|
| 134 |
MODEL_PATH,
|
|
@@ -142,9 +146,8 @@ model = model.eval().cuda()
|
|
| 142 |
|
| 143 |
def clean_repetitive_lines(text):
|
| 144 |
"""
|
| 145 |
-
|
| 146 |
-
sometimes produces
|
| 147 |
-
identifies sequences of repeated lines and removes the duplicates above 2 instances.
|
| 148 |
"""
|
| 149 |
lines = text.split('\n')
|
| 150 |
cleaned_lines = []
|
|
@@ -175,6 +178,7 @@ def process_pdf_for_ocr(tokenizer, model, pdf_path):
|
|
| 175 |
matrix = fitz.Matrix(zoom, zoom)
|
| 176 |
pix = page.get_pixmap(matrix=matrix)
|
| 177 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
|
|
|
| 178 |
res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
|
| 179 |
|
| 180 |
if res.strip():
|
|
|
|
| 127 |
from transformers import AutoModel, AutoTokenizer
|
| 128 |
import torch
|
| 129 |
|
| 130 |
+
# The following three lines are optional; they remove the last remaining logging message from Transformers.
|
| 131 |
+
# import warnings
|
| 132 |
+
# from transformers import logging as transformers_logging
|
| 133 |
+
# transformers_logging.set_verbosity_error()
|
| 134 |
+
|
| 135 |
+
MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized" # Replace with local path if desired
|
| 136 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
|
| 137 |
model = AutoModel.from_pretrained(
|
| 138 |
MODEL_PATH,
|
|
|
|
| 146 |
|
| 147 |
def clean_repetitive_lines(text):
|
| 148 |
"""
|
| 149 |
+
Removes repetitive lines from the OCR output before saving the .txt file. This is necessary because
|
| 150 |
+
the model sometimes produces OCR artifacts. Duplicates beyond the first 2 instances are removed.
|
|
|
|
| 151 |
"""
|
| 152 |
lines = text.split('\n')
|
| 153 |
cleaned_lines = []
|
|
|
|
| 178 |
matrix = fitz.Matrix(zoom, zoom)
|
| 179 |
pix = page.get_pixmap(matrix=matrix)
|
| 180 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
| 181 |
+
# gradio_input=True is used because each page of the .pdf is converted to an image here with PyMuPDF and Pillow, instead of relying on the model's internal code.
|
| 182 |
res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
|
| 183 |
|
| 184 |
if res.strip():
|