update readme
Browse files
README.md
CHANGED
|
@@ -116,4 +116,80 @@ If you find our work helpful, please consider citing our papers 📝 and liking
|
|
| 116 |
}
|
| 117 |
```
|
| 118 |
|
| 119 |
-
</details>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
}
|
| 117 |
```
|
| 118 |
|
| 119 |
+
</details>
|
| 120 |
+
<br>
|
| 121 |
+
|
| 122 |
+
# Example Usage
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
import fitz
|
| 126 |
+
from PIL import Image
|
| 127 |
+
from transformers import AutoModel, AutoTokenizer
|
| 128 |
+
import torch
|
| 129 |
+
|
| 130 |
+
# --- Model / tokenizer initialization ---
# Swap in a local directory path here if the weights are already downloaded.
MODEL_PATH = "ctranslate2-4you/GOT-OCR2_0-Customized"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Use the end-of-text token as the pad token id, as the model config expects.
eot_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map='cuda',
    use_safetensors=True,
    pad_token_id=eot_token_id,
)

# Inference only: disable dropout/grad bookkeeping and move to GPU.
model = model.eval().cuda()
|
| 142 |
+
|
| 143 |
+
def clean_repetitive_lines(text):
    """Collapse runs of identical consecutive lines down to at most two.

    The OCR model occasionally emits the same line many times in a row as an
    artifact; this keeps up to two copies of any repeated line and drops the
    rest, leaving non-repeated content untouched.
    """
    lines = text.split('\n')
    cleaned_lines = []
    idx = 0
    total = len(lines)
    while idx < total:
        # Find the end of the run of lines identical to lines[idx].
        run_end = idx + 1
        while run_end < total and lines[run_end] == lines[idx]:
            run_end += 1
        run_length = run_end - idx
        # Keep at most two copies of the run.
        cleaned_lines.extend([lines[idx]] * min(run_length, 2))
        idx = run_end
    return '\n'.join(cleaned_lines)
|
| 166 |
+
|
| 167 |
+
@torch.inference_mode()
def process_pdf_for_ocr(tokenizer, model, pdf_path):
    """OCR every page of a PDF and save the cleaned text to a file.

    Each page is rasterized at 2x zoom with PyMuPDF, converted to a PIL
    image, and passed through the model's ``chat_crop`` OCR entry point.
    Non-empty page results are joined, de-duplicated with
    ``clean_repetitive_lines``, and written to
    ``extracted_text_got_ocr.txt`` (UTF-8) in the working directory.

    Args:
        tokenizer: the model's tokenizer (from AutoTokenizer).
        model: the loaded GOT-OCR model (from AutoModel).
        pdf_path: path to the PDF file to process.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        full_text = []
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            # Render at 2x resolution; higher DPI improves OCR accuracy.
            zoom = 2
            matrix = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=matrix)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            res = model.chat_crop(tokenizer, img, ocr_type='ocr', gradio_input=True)
            # Skip pages that produced only whitespace.
            if res.strip():
                full_text.append(res)
    finally:
        # Close the document even if rendering/OCR raises, so the file
        # handle is never leaked (the original only closed on success).
        pdf_document.close()

    complete_text = '\n'.join(full_text)
    cleaned_text = clean_repetitive_lines(complete_text)

    with open("extracted_text_got_ocr.txt", "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print("Results have been saved to extracted_text_got_ocr.txt")
|
| 191 |
+
|
| 192 |
+
# Example usage: point pdf_path at the document you want to OCR.
pdf_path = "path/to/your/pdf"
process_pdf_for_ocr(tokenizer, model, pdf_path)
|
| 195 |
+
```
|