ctranslate2-4you committed on
Commit
119d30c
·
verified ·
1 Parent(s): 3a00d0c

update readme

Browse files
Files changed (1) hide show
  1. README.md +77 -1
README.md CHANGED
@@ -116,4 +116,80 @@ If you find our work helpful, please consider citing our papers 📝 and liking
116
  }
117
  ```
118
 
119
- </details>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  }
117
  ```
118
 
119
+ </details>
120
+ <br>
121
+
122
+ # Example Usage
123
+
124
+ ```python
125
+ import fitz
126
+ from PIL import Image
127
+ from transformers import AutoModel, AutoTokenizer
128
+ import torch
129
+
130
+ # Initialize model and tokenizer
131
+ MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized" # Replace with local path if needed
132
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
133
+ model = AutoModel.from_pretrained(
134
+ MODEL_PATH,
135
+ trust_remote_code=True,
136
+ low_cpu_mem_usage=True,
137
+ device_map='cuda',
138
+ use_safetensors=True,
139
+ pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>")
140
+ )
141
+ model = model.eval().cuda()
142
+
143
def clean_repetitive_lines(text):
    """
    Collapse runs of identical consecutive lines in OCR output.

    The OCR model occasionally emits the same line many times in a row as an
    artifact. Any run of identical adjacent lines longer than two is cut down
    to exactly two copies; runs of one or two lines are left untouched.
    Only adjacent duplicates count -- a line that reappears later, after
    different text, starts a new run.

    Args:
        text: The raw OCR text, with lines separated by '\n'.

    Returns:
        The text with over-long runs of repeated lines trimmed, joined
        with '\n'.
    """
    kept = []
    previous = None  # no real line is ever None, so the first line starts a run
    run_length = 0

    for line in text.split('\n'):
        if line == previous:
            run_length += 1
        else:
            previous = line
            run_length = 1

        # Keep only the first two lines of each run.
        if run_length <= 2:
            kept.append(line)

    return '\n'.join(kept)
166
+
167
@torch.inference_mode()
def process_pdf_for_ocr(tokenizer, model, pdf_path,
                        output_path="extracted_text_got_ocr.txt", zoom=2):
    """
    Run OCR over every page of a PDF and write the combined text to a file.

    Each page is rendered to an RGB image with PyMuPDF, passed to
    ``model.chat_crop`` in plain 'ocr' mode, and the non-blank page results
    are joined with newlines. The joined text is passed through
    ``clean_repetitive_lines`` before being written out.

    Args:
        tokenizer: Tokenizer passed through to ``model.chat_crop``.
        model: Model object exposing ``chat_crop``.
        pdf_path: Path of the PDF file to read.
        output_path: Path of the UTF-8 text file to write. Defaults to
            "extracted_text_got_ocr.txt" in the current working directory.
        zoom: Scale factor applied to both axes when rendering each page.
            Defaults to 2.

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        message is printed.
    """
    page_texts = []
    # The rendering matrix is identical for every page, so build it once.
    matrix = fitz.Matrix(zoom, zoom)

    # The context manager closes the document even if rendering or OCR raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            # alpha=False (the PyMuPDF default) keeps the samples at 3 bytes
            # per pixel, matching the "RGB" mode given to Pillow below.
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)

            # Skip pages for which the OCR result is empty or only whitespace.
            if res.strip():
                page_texts.append(res)

    cleaned_text = clean_repetitive_lines('\n'.join(page_texts))

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print(f"Results have been saved to {output_path}")
191
+
192
+ # Example usage
193
+ pdf_path = "path/to/your/pdf"
194
+ process_pdf_for_ocr(tokenizer, model, pdf_path)
195
+ ```