gujarati-ocr

Sleeping

App Files Files Community

umangchaudhari committed on Mar 17

Commit

3f44ee0

verified ·

1 Parent(s): c73a65b

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +143 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os
+import torch
+import numpy as np
+import gradio as gr
+from PIL import Image
+from pdf2image import convert_from_path
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from doctr.models import detection_predictor
+import tempfile
+# Hugging Face model id used below for both the TrOCR processor and the recognition model.
+MODEL_ID = "umangchaudhari/gujarati-ocr"
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
+# Margin, in pixels, added on every side of a detected word box before cropping (see ocr_image).
+PAD      = 4
+# Detections whose score is below this threshold are discarded (see ocr_image).
+MIN_SCORE = 0.4
+# Models are loaded once at import time; the functions below reuse these module-level objects.
+print("Loading models...")
+processor = TrOCRProcessor.from_pretrained(MODEL_ID)
+model     = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(DEVICE)
+# Inference only: switch the recognition model to evaluation mode.
+model.eval()
+# docTR text detector (db_resnet50 architecture, pretrained weights) used to locate words on a page.
+detector = detection_predictor(
+    arch='db_resnet50',
+    pretrained=True,
+    assume_straight_pages=True,
+    preserve_aspect_ratio=True,
+    symmetric_pad=True,
+).to(DEVICE)
+print("Models ready.")
+def recognize_batch(crops):
+    if not crops:
+        return []
+    pixel_values = processor(
+        images=[c.convert("RGB") for c in crops],
+        return_tensors="pt"
+    ).pixel_values.to(DEVICE)
+    with torch.no_grad():
+        generated = model.generate(pixel_values, max_new_tokens=64)
+    return [t.strip() for t in processor.batch_decode(generated, skip_special_tokens=True)]
+def ocr_image(page_image):
+    W, H    = page_image.width, page_image.height
+    page_np = np.array(page_image.convert("RGB"))
+    with torch.no_grad():
+        result = detector([page_np])
+    raw = result[0].get("words", np.zeros((0, 5)))
+    if raw.shape[0] == 0:
+        return ""
+    raw = raw[raw[:, 4] >= MIN_SCORE]
+    boxes_abs = []
+    for det in raw:
+        xmin, ymin, xmax, ymax, score = det
+        x0 = max(0, int(xmin * W) - PAD)
+        y0 = max(0, int(ymin * H) - PAD)
+        x1 = min(W, int(xmax * W) + PAD)
+        y1 = min(H, int(ymax * H) + PAD)
+        if x1 - x0 < 5 or y1 - y0 < 5:
+            continue
+        boxes_abs.append((y0, x0, x1, y1))
+    boxes_abs.sort(key=lambda b: (b[0] // 15, b[1]))
+    crops     = []
+    valid_pos = []
+    for (y0, x0, x1, y1) in boxes_abs:
+        crops.append(page_image.crop((x0, y0, x1, y1)))
+        valid_pos.append((y0, x0))
+    all_texts = []
+    for i in range(0, len(crops), 64):
+        all_texts.extend(recognize_batch(crops[i:i+64]))
+    lines = {}
+    for (y, x), text in zip(valid_pos, all_texts):
+        if not text:
+            continue
+        row_key = y // 15
+        if row_key not in lines:
+            lines[row_key] = []
+        lines[row_key].append((x, text))
+    result_lines = []
+    for key in sorted(lines.keys()):
+        words = [t for _, t in sorted(lines[key], key=lambda z: z[0])]
+        result_lines.append(" ".join(words))
+    return "\n".join(result_lines)
+def process_pdf(pdf_file):
+    if pdf_file is None:
+        return "Please upload a PDF or image."
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp.write(open(pdf_file.name, "rb").read())
+            tmp_path = tmp.name
+        pages = convert_from_path(tmp_path, dpi=200)
+        all_text = []
+        for i, page in enumerate(pages):
+            text = ocr_image(page)
+            all_text.append(f"--- Page {i+1} ---\n{text}")
+        return "\n\n".join(all_text)
+    except Exception as e:
+        return f"Error: {str(e)}"
+def process_image(image):
+    if image is None:
+        return "Please upload an image."
+    try:
+        return ocr_image(image)
+    except Exception as e:
+        return f"Error: {str(e)}"
+# Gradio UI: a header, one tab per input type (PDF / image), and a footer with model credits.
+with gr.Blocks(title="Gujarati OCR") as demo:
+    gr.Markdown("""
+    # 🔤 Gujarati OCR
+    Extract text from Gujarati documents and images.
+    Fine-tuned TrOCR model trained on 80,000+ Gujarati word samples — **96.2% accuracy**.
+    """)
+    # PDF tab: the uploaded file is passed to process_pdf and its result shown in the textbox.
+    with gr.Tab("PDF"):
+        pdf_input  = gr.File(label="Upload PDF", file_types=[".pdf"])
+        pdf_button = gr.Button("Extract Text", variant="primary")
+        pdf_output = gr.Textbox(label="Extracted Text", lines=20)
+        pdf_button.click(process_pdf, inputs=pdf_input, outputs=pdf_output)
+    # Image tab: type="pil" makes Gradio hand process_image a PIL image.
+    with gr.Tab("Image"):
+        img_input  = gr.Image(label="Upload Image", type="pil")
+        img_button = gr.Button("Extract Text", variant="primary")
+        img_output = gr.Textbox(label="Extracted Text", lines=20)
+        img_button.click(process_image, inputs=img_input, outputs=img_output)
+    gr.Markdown("""
+    **Model:** [umangchaudhari/gujarati-ocr](https://huggingface.co/umangchaudhari/gujarati-ocr)
+    **Detection:** docTR db_resnet50
+    **Recognition:** Fine-tuned Microsoft TrOCR
+    """)
+# Start the app; this runs at import time because the script has no __main__ guard.
+demo.launch()