WeightedAI
/

Persian_OCR

@@ -30,32 +30,41 @@ trained with CTC loss to extract text from images.
 ## Usage Example
-import json
 import torch
-import torch.nn as nn
 from PIL import Image
 import torchvision.transforms.functional as TF
 import cv2
 import matplotlib.pyplot as plt
 from huggingface_hub import hf_hub_download
-# -----------------------------
-# 1️⃣ Device
-# -----------------------------
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# -----------------------------
-# 2️⃣ Load vocab
-# -----------------------------
 vocab_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="vocab.json")
 with open(vocab_path, "r", encoding="utf-8") as f:
-    vocab = json.load(f)
-char_to_idx = vocab["char_to_idx"]
-idx_to_char = {int(k): v for k, v in vocab["idx_to_char"].items()}
-# -----------------------------
-# 3️⃣ Model definition
-# -----------------------------
 def GN(c, groups=16): return nn.GroupNorm(min(groups, c), c)
 class LightResNetCNN(nn.Module):
@@ -105,33 +114,13 @@ class CNN_Transformer_OCR(nn.Module):
         out = self.fc(out)
         return out.log_softmax(2)
-# -----------------------------
-# 4️⃣ Load model weights
-# -----------------------------
 model_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="pytorch_model.bin")
 model = CNN_Transformer_OCR(num_classes=len(idx_to_char)+1).to(device)
 model.load_state_dict(torch.load(model_path, map_location=device))
 model.eval()
-# -----------------------------
-# 5️⃣ Greedy decoder
-# -----------------------------
-def greedy_decode(output, idx_to_char):
-    output = output.argmax(2)
-    texts = []
-    for seq in output:
-        prev = -1
-        chars = []
-        for idx in seq.cpu().numpy():
-            if idx != prev and idx != 0:
-                chars.append(idx_to_char.get(idx, ""))
-            prev = idx
-        texts.append("".join(chars))
-    return texts
-# -----------------------------
-# 6️⃣ Transforms
-# -----------------------------
 class OCRTestTransform:
     def __init__(self, img_height=64, max_width=1600):
         self.img_height = img_height
@@ -149,9 +138,7 @@ class OCRTestTransform:
 transform_test = OCRTestTransform()
-# -----------------------------
-# 7️⃣ Line segmentation
-# -----------------------------
 def segment_lines_precise(image_path, min_line_height=12, margin=6, visualize=False):
     img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
     _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
@@ -176,9 +163,7 @@ def segment_lines_precise(image_path, min_line_height=12, margin=6, visualize=Fa
             plt.show()
     return lines
-# -----------------------------
-# 8️⃣ OCR function
-# -----------------------------
 def ocr_page(image_path, visualize=False):
     lines = segment_lines_precise(image_path, visualize=visualize)
     all_texts = []
@@ -191,10 +176,7 @@ def ocr_page(image_path, visualize=False):
         print(f"Line {idx}: {pred_text}")
     return "\n".join(all_texts)
-# -----------------------------
-# 9️⃣ Example usage
-# -----------------------------
-img_path = "example.png"  # put your own image path here
 final_text = ocr_page(img_path, visualize=True)
 print("\n=== Final OCR Page ===\n", final_text)

 ## Usage Example
+```python
 import torch
 from PIL import Image
 import torchvision.transforms.functional as TF
 import cv2
 import matplotlib.pyplot as plt
 from huggingface_hub import hf_hub_download
+import torch.nn as nn
+import json
+# --- Device ---
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# --- Load vocab ---
 vocab_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="vocab.json")  # fetch vocab.json from the model repo; returns a local file path
 with open(vocab_path, "r", encoding="utf-8") as f:
+    idx_to_char = json.load(f)  # raw mapping as stored in vocab.json (JSON object keys are always strings)
+idx_to_char = {int(k): v for k, v in idx_to_char.items()}  # convert keys to int so they can be compared with predicted class ids
+char_to_idx = {v: k for k, v in idx_to_char.items()}  # inverse mapping: character -> int id
+# --- Greedy decoder ---
+def greedy_decode(logits, idx_to_char):  # greedy CTC decoding: returns one decoded string per sequence in `logits`
+    pred = logits.argmax(-1).cpu().numpy()  # highest-scoring class id along the last axis, moved to host memory
+    texts = []
+    for seq in pred:
+        prev = -1  # sentinel: argmax ids are non-negative, so the first id never counts as a repeat
+        text = ""
+        for p in seq:
+            if p != prev and p != len(idx_to_char):  # skip consecutive repeats; id == len(idx_to_char) is treated as the CTC blank
+                text += idx_to_char[p]  # plain lookup, no fallback for ids missing from idx_to_char
+            prev = p
+        texts.append(text)
+    return texts  # decoded strings, in the same order as the input sequences
+# --- Define CNN + Transformer Model ---
 def GN(c, groups=16): return nn.GroupNorm(min(groups, c), c)
 class LightResNetCNN(nn.Module):
         out = self.fc(out)
         return out.log_softmax(2)
+# --- Load pretrained model ---
 model_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="pytorch_model.bin")
 model = CNN_Transformer_OCR(num_classes=len(idx_to_char)+1).to(device)
 model.load_state_dict(torch.load(model_path, map_location=device))
 model.eval()
+# --- Define transforms ---
 class OCRTestTransform:
     def __init__(self, img_height=64, max_width=1600):
         self.img_height = img_height
 transform_test = OCRTestTransform()
+# --- Line segmentation ---
 def segment_lines_precise(image_path, min_line_height=12, margin=6, visualize=False):
     img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
     _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
             plt.show()
     return lines
+# --- OCR function ---
 def ocr_page(image_path, visualize=False):
     lines = segment_lines_precise(image_path, visualize=visualize)
     all_texts = []
         print(f"Line {idx}: {pred_text}")
     return "\n".join(all_texts)
+# --- Example ---
+img_path = "example.png"
 final_text = ocr_page(img_path, visualize=True)
 print("\n=== Final OCR Page ===\n", final_text)