ctranslate2-4you committed on
Commit
cf5898c
·
verified ·
1 Parent(s): 119d30c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +9 -5
README.md CHANGED
@@ -127,8 +127,12 @@ from PIL import Image
127
  from transformers import AutoModel, AutoTokenizer
128
  import torch
129
 
130
- # Initialize model and tokenizer
131
- MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized" # Replace with local path if needed
 
 
 
 
132
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
133
  model = AutoModel.from_pretrained(
134
  MODEL_PATH,
@@ -142,9 +146,8 @@ model = model.eval().cuda()
142
 
143
  def clean_repetitive_lines(text):
144
  """
145
- Cleans up repetitive lines from the OCR output. This is necessary because the model
146
- sometimes produces duplicate lines as artifacts in the OCR process. This function
147
- identifies sequences of repeated lines and removes the duplicates above 2 instances.
148
  """
149
  lines = text.split('\n')
150
  cleaned_lines = []
@@ -175,6 +178,7 @@ def process_pdf_for_ocr(tokenizer, model, pdf_path):
175
  matrix = fitz.Matrix(zoom, zoom)
176
  pix = page.get_pixmap(matrix=matrix)
177
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
178
  res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
179
 
180
  if res.strip():
 
127
  from transformers import AutoModel, AutoTokenizer
128
  import torch
129
 
130
+ # The following three lines are optional - they remove the last remaining logging message from Transformers.
131
+ # import warnings
132
+ # from transformers import logging as transformers_logging
133
+ # transformers_logging.set_verbosity_error()
134
+
135
+ MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized" # Replace with local path if desired
136
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
137
  model = AutoModel.from_pretrained(
138
  MODEL_PATH,
 
146
 
147
  def clean_repetitive_lines(text):
148
  """
149
+ Removes repetitive lines from the OCR output before saving the .txt file. This is necessary because
150
+ the model sometimes produces OCR artifacts. All duplicates above 2 instances are removed.
 
151
  """
152
  lines = text.split('\n')
153
  cleaned_lines = []
 
178
  matrix = fitz.Matrix(zoom, zoom)
179
  pix = page.get_pixmap(matrix=matrix)
180
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
181
+ # gradio_input=True is used because we're creating images for each page of a .pdf using PyMuPDF and Pillow instead of relying on the model's internal code
182
  res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
183
 
184
  if res.strip():