WeightedAI
/

Persian_OCR

@@ -30,41 +30,33 @@ trained with CTC loss to extract text from images.
 ## Usage Example
-```python
 import torch
 from PIL import Image
 import torchvision.transforms.functional as TF
 import cv2
 import matplotlib.pyplot as plt
 from huggingface_hub import hf_hub_download
-import torch.nn as nn
-import json
-# --- Device ---
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# --- Load vocab ---
 vocab_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="vocab.json")
 with open(vocab_path, "r", encoding="utf-8") as f:
-    idx_to_char = json.load(f)
-idx_to_char = {int(k): v for k, v in idx_to_char.items()}
-char_to_idx = {v: k for k, v in idx_to_char.items()}
-# --- Greedy decoder ---
-def greedy_decode(logits, idx_to_char):
-    pred = logits.argmax(-1).cpu().numpy()
-    texts = []
-    for seq in pred:
-        prev = -1
-        text = ""
-        for p in seq:
-            if p != prev and p != len(idx_to_char):  # CTC blank
-                text += idx_to_char[p]
-            prev = p
-        texts.append(text)
-    return texts
-# --- Define CNN + Transformer Model ---
 def GN(c, groups=16):
     """Build a GroupNorm layer over ``c`` channels.

     The number of groups is ``groups`` capped at ``c``, so a layer with
     fewer channels than ``groups`` never asks for more groups than it
     has channels.
     """
     num_groups = min(groups, c)
     return nn.GroupNorm(num_groups, c)
 class LightResNetCNN(nn.Module):
@@ -114,13 +106,33 @@ class CNN_Transformer_OCR(nn.Module):
         out = self.fc(out)
         return out.log_softmax(2)
-# --- Load pretrained model ---
 # Resolve the checkpoint file from the Hub repo; the returned path is fed to torch.load below.
 model_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="pytorch_model.bin")
 # One output class per vocabulary entry plus one extra
 # (presumably the CTC blank -- confirm against the blank index used by greedy_decode).
 model = CNN_Transformer_OCR(num_classes=len(idx_to_char)+1).to(device)
 # NOTE(review): torch.load unpickles the downloaded file, so only load checkpoints
 # from a source you trust (consider weights_only=True where the installed torch supports it).
 model.load_state_dict(torch.load(model_path, map_location=device))
 # Switch the module to evaluation mode before running inference.
 model.eval()
-# --- Define transforms ---
 class OCRTestTransform:
     def __init__(self, img_height=64, max_width=1600):
         self.img_height = img_height
@@ -138,7 +150,9 @@ class OCRTestTransform:
 transform_test = OCRTestTransform()
-# --- Line segmentation ---
 def segment_lines_precise(image_path, min_line_height=12, margin=6, visualize=False):
     img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
     _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
@@ -163,7 +177,9 @@ def segment_lines_precise(image_path, min_line_height=12, margin=6, visualize=Fa
             plt.show()
     return lines
-# --- OCR function ---
 def ocr_page(image_path, visualize=False):
     lines = segment_lines_precise(image_path, visualize=visualize)
     all_texts = []
@@ -176,7 +192,9 @@ def ocr_page(image_path, visualize=False):
         print(f"Line {idx}: {pred_text}")
     return "\n".join(all_texts)
-# --- Example ---
-img_path = "example.png"
 final_text = ocr_page(img_path, visualize=True)
 print("\n=== Final OCR Page ===\n", final_text)

 ## Usage Example
 ```python
+import json
 import torch
+import torch.nn as nn
 from PIL import Image
 import torchvision.transforms.functional as TF
 import cv2
 import matplotlib.pyplot as plt
 from huggingface_hub import hf_hub_download
+# -----------------------------
+# 1️⃣ Device
+# -----------------------------
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# -----------------------------
+# 2️⃣ Load vocab
+# -----------------------------
 vocab_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="vocab.json")
 with open(vocab_path, "r", encoding="utf-8") as f:
+    vocab = json.load(f)
+char_to_idx = vocab["char_to_idx"]
+idx_to_char = {int(k): v for k, v in vocab["idx_to_char"].items()}
+# -----------------------------
+# 3️⃣ Model definition
+# -----------------------------
 def GN(c, groups=16):
     """Build a GroupNorm layer over ``c`` channels.

     The number of groups is ``groups`` capped at ``c``, so a layer with
     fewer channels than ``groups`` never asks for more groups than it
     has channels.
     """
     num_groups = min(groups, c)
     return nn.GroupNorm(num_groups, c)
 class LightResNetCNN(nn.Module):
         out = self.fc(out)
         return out.log_softmax(2)
+# -----------------------------
+# 4️⃣ Load model weights
+# -----------------------------
 # Resolve the checkpoint file from the Hub repo; the returned path is fed to torch.load below.
 model_path = hf_hub_download(repo_id="farbodpya/Persian-OCR", filename="pytorch_model.bin")
 # One output class per vocabulary entry plus one extra
 # (presumably the CTC blank -- confirm against the blank index used by greedy_decode).
 model = CNN_Transformer_OCR(num_classes=len(idx_to_char)+1).to(device)
 # NOTE(review): torch.load unpickles the downloaded file, so only load checkpoints
 # from a source you trust (consider weights_only=True where the installed torch supports it).
 model.load_state_dict(torch.load(model_path, map_location=device))
 # Switch the module to evaluation mode before running inference.
 model.eval()
+# -----------------------------
+# 5️⃣ Greedy decoder
+# -----------------------------
+def greedy_decode(output, idx_to_char):
+    output = output.argmax(2)
+    texts = []
+    for seq in output:
+        prev = -1
+        chars = []
+        for idx in seq.cpu().numpy():
+            if idx != prev and idx != 0:
+                chars.append(idx_to_char.get(idx, ""))
+            prev = idx
+        texts.append("".join(chars))
+    return texts
+# -----------------------------
+# 6️⃣ Transforms
+# -----------------------------
 class OCRTestTransform:
     def __init__(self, img_height=64, max_width=1600):
         self.img_height = img_height
 transform_test = OCRTestTransform()
+# -----------------------------
+# 7️⃣ Line segmentation
+# -----------------------------
 def segment_lines_precise(image_path, min_line_height=12, margin=6, visualize=False):
     img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
     _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
             plt.show()
     return lines
+# -----------------------------
+# 8️⃣ OCR function
+# -----------------------------
 def ocr_page(image_path, visualize=False):
     lines = segment_lines_precise(image_path, visualize=visualize)
     all_texts = []
         print(f"Line {idx}: {pred_text}")
     return "\n".join(all_texts)
+# -----------------------------
+# 9️⃣ Example usage
+# -----------------------------
+img_path = "/content/farsi_line.png"  # put your own image path here
 final_text = ocr_page(img_path, visualize=True)
 print("\n=== Final OCR Page ===\n", final_text)