update readme
Browse files
README.md
CHANGED
|
@@ -116,4 +116,80 @@ If you find our work helpful, please consider citing our papers 📝 and liking
|
|
| 116 |
}
|
| 117 |
```
|
| 118 |
|
| 119 |
-
</details>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
}
|
| 117 |
```
|
| 118 |
|
| 119 |
+
</details>
|
| 120 |
+
<br>
|
| 121 |
+
|
| 122 |
+
# Example Usage
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
import fitz
|
| 126 |
+
from PIL import Image
|
| 127 |
+
from transformers import AutoModel, AutoTokenizer
|
| 128 |
+
import torch
|
| 129 |
+
|
| 130 |
+
# --- Model / tokenizer initialization ---
# Swap in a local directory path here if the weights are already downloaded.
MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Use the end-of-text token as the pad token id, as the model config expects.
eot_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=eot_token_id,
)

# Inference only: disable dropout/grad bookkeeping and move to GPU.
model = model.eval().cuda()
|
| 142 |
+
|
| 143 |
+
def clean_repetitive_lines(text):
    """Collapse runs of identical consecutive lines down to at most two.

    The OCR model occasionally emits the same line many times in a row as an
    artifact; this keeps up to two copies of any repeated line and drops the
    rest, leaving non-repeated content untouched.
    """
    lines = text.split('\n')
    cleaned_lines = []
    idx = 0
    total = len(lines)
    while idx < total:
        # Find the end of the run of lines identical to lines[idx].
        run_end = idx + 1
        while run_end < total and lines[run_end] == lines[idx]:
            run_end += 1
        run_length = run_end - idx
        # Keep at most two copies of the run.
        cleaned_lines.extend([lines[idx]] * min(run_length, 2))
        idx = run_end
    return '\n'.join(cleaned_lines)
|
| 166 |
+
|
| 167 |
+
@torch.inference_mode()
def process_pdf_for_ocr(tokenizer, model, pdf_path):
    """OCR every page of a PDF and save the cleaned text to a file.

    Each page is rasterized at 2x zoom with PyMuPDF, converted to a PIL
    image, and passed through the model's ``chat_crop`` OCR entry point.
    Non-empty page results are joined, de-duplicated with
    ``clean_repetitive_lines``, and written to
    ``extracted_text_got_ocr.txt`` (UTF-8) in the working directory.

    Args:
        tokenizer: the model's tokenizer (from AutoTokenizer).
        model: the loaded GOT-OCR model (from AutoModel).
        pdf_path: path to the PDF file to process.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        full_text = []
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            # Render at 2x resolution; higher DPI improves OCR accuracy.
            zoom = 2
            matrix = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=matrix)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
            # Skip pages that produced only whitespace.
            if res.strip():
                full_text.append(res)
    finally:
        # Close the document even if rendering/OCR raises, so the file
        # handle is never leaked (the original only closed on success).
        pdf_document.close()

    complete_text = '\n'.join(full_text)
    cleaned_text = clean_repetitive_lines(complete_text)

    with open("extracted_text_got_ocr.txt", "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print("Results have been saved to extracted_text_got_ocr.txt")
|
| 191 |
+
|
| 192 |
+
# Example usage: point pdf_path at the document you want to OCR.
pdf_path = "path/to/your/pdf"
process_pdf_for_ocr(tokenizer, model, pdf_path)
|
| 195 |
+
```
|