Spaces:

manishw10
/

devgen-devanagari-ocr

Sleeping

App Files Files Community

manishw7 committed 8 days ago

Commit

ae3fa31

1 Parent(s): adb49d0

Feature: Full Smart Pipeline with Auto-Routing and Preprocessing

Browse files

Files changed (1) hide show

app.py +112 -63

app.py CHANGED Viewed

@@ -2,23 +2,23 @@ import os
 import gradio as gr
 import torch
 import numpy as np
 from PIL import Image
 from peft import PeftModel
 from transformers import AutoTokenizer, TrOCRProcessor, ViTImageProcessor, VisionEncoderDecoderModel
-from cnn_model import CharacterClassifier  # Importing your CNN logic
 # --- CONFIGURATION ---
 BASE_MODEL_ID = "paudelanil/trocr-devanagari-2"
 ADAPTER_ID = "manishw10/devgen-trocr-devanagari-lora"
 CNN_MODEL_PATH = "devanagari-cnn-classifier.pt"
-# Detect environment
 IS_SPACE = "SPACE_ID" in os.environ
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"System: Initializing Models (Env: {'Hugging Face Space' if IS_SPACE else 'Local'})")
-# 1. Load TrOCR Model & Processor
 try:
     processor = TrOCRProcessor.from_pretrained(BASE_MODEL_ID)
 except Exception:
@@ -31,77 +31,126 @@ model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
 model.to(device)
 model.eval()
-# 2. Load CNN Classifier
 cnn_engine = CharacterClassifier(model_path=CNN_MODEL_PATH, device=device)
 print(f"System: Models loaded successfully on {device}")
-def predict_trocr(image):
     if image is None:
-        return "Error: No image uploaded"
     try:
-        image = image.convert("RGB")
-        pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
-        # --- HIGH-QUALITY GENERATION ---
-        # Added num_beams and length_penalty to fix the "rubbish" output.
-        # This makes TrOCR use Beam Search instead of Greedy Search.
-        with torch.no_grad():
-            generated_ids = model.base_model.generate(
-                pixel_values=pixel_values,
-                num_beams=4,
-                length_penalty=1.0,
-                max_new_tokens=64,
-                early_stopping=True,
-                decoder_start_token_id=model.config.decoder_start_token_id
-            )
-            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return generated_text
     except Exception as e:
         import traceback
         print(traceback.format_exc())
-        return f"TrOCR Error: {str(e)}"
-def predict_cnn(image):
-    if image is None:
-        return "Error: No image uploaded"
-    try:
-        image = image.convert("RGB")
-        result = cnn_engine.predict(image)
-        if "error" in result:
-            return result["error"]
-        return f"Character: {result['text']} (Confidence: {result['confidence']:.2%})"
-    except Exception as e:
-        return f"CNN Error: {str(e)}"
-# --- CUSTOM GRADIO INTERFACE ---
-with gr.Blocks(title="DevGen OCR Suite") as demo:
-    gr.Markdown("# 🕉️ DevGen Devanagari OCR Suite")
-    gr.Markdown("Switch between TrOCR (for words/sentences) and CNN (for single characters).")
-    with gr.Tabs():
-        with gr.TabItem("TrOCR (Word/Sentence Recognition)"):
-            with gr.Row():
-                with gr.Column():
-                    img_input = gr.Image(type="pil", label="Upload Handwritten Word")
-                    btn_trocr = gr.Button("Recognize Word", variant="primary")
-                with gr.Column():
-                    text_output = gr.Textbox(label="Recognized Text")
-            btn_trocr.click(fn=predict_trocr, inputs=img_input, outputs=text_output)
-        with gr.TabItem("CNN (Single Character Recognition)"):
-            with gr.Row():
-                with gr.Column():
-                    char_input = gr.Image(type="pil", label="Upload Single Character")
-                    btn_cnn = gr.Button("Classify Character", variant="primary")
-                with gr.Column():
-                    char_output = gr.Textbox(label="Classification Result")
-            btn_cnn.click(fn=predict_cnn, inputs=char_input, outputs=char_output)
-    gr.Markdown("---")
-    gr.Markdown("Built with ❤️ by DevGen Team. Using TrOCR + LoRA and custom 3-layer CNN.")
 if __name__ == "__main__":
-    server_name = "0.0.0.0" if IS_SPACE else "127.0.0.1"
-    # Note: We don't use monkey-patching here, the base_model.generate handles it.
-    demo.launch(server_name=server_name)

 import gradio as gr
 import torch
 import numpy as np
+import cv2
 from PIL import Image
 from peft import PeftModel
 from transformers import AutoTokenizer, TrOCRProcessor, ViTImageProcessor, VisionEncoderDecoderModel
+from cnn_model import CharacterClassifier
 # --- CONFIGURATION ---
 BASE_MODEL_ID = "paudelanil/trocr-devanagari-2"
 ADAPTER_ID = "manishw10/devgen-trocr-devanagari-lora"
 CNN_MODEL_PATH = "devanagari-cnn-classifier.pt"
 IS_SPACE = "SPACE_ID" in os.environ
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- MODEL INITIALIZATION ---
+print(f"System: Initializing Smart Engine (Env: {'Hugging Face Space' if IS_SPACE else 'Local'})")
 try:
     processor = TrOCRProcessor.from_pretrained(BASE_MODEL_ID)
 except Exception:
 model.to(device)
 model.eval()
 cnn_engine = CharacterClassifier(model_path=CNN_MODEL_PATH, device=device)
 print(f"System: Models loaded successfully on {device}")
+# --- SMART ROUTING LOGIC ---
+def _count_blobs(binary, min_size=10):
+    h, w = binary.shape
+    visited = np.zeros_like(binary, dtype=bool)
+    count = 0
+    for y in range(h):
+        for x in range(w):
+            if binary[y, x] and not visited[y, x]:
+                # Simple iterative flood fill
+                stack = [(y, x)]
+                size = 0
+                while stack:
+                    py, px = stack.pop()
+                    if py<0 or py>=h or px<0 or px>=w or visited[py, px] or not binary[py, px]:
+                        continue
+                    visited[py, px] = True
+                    size += 1
+                    stack.extend([(py+1, px), (py-1, px), (py, px+1), (py, px-1)])
+                if size >= min_size:
+                    count += 1
+    return count
def classify_input(image):
    """Guess whether an image shows a single character or a whole word.

    The image is converted to grayscale and binarised; the decision is based
    on the aspect ratio of the ink bounding box and on the number of ink
    blobs reported by ``_count_blobs``.

    Args:
        image: Object exposing ``convert("L")`` whose result ``np.array``
            can turn into a 2-D array (a PIL image in this app).

    Returns:
        A ``(kind, confidence, reason)`` tuple where ``kind`` is
        ``"character"`` or ``"word"``.
    """
    pixels = np.array(image.convert("L"))

    # Ink is anything darker than 75% of the mean brightness; the cut-off
    # itself is capped at 200.
    cutoff = min(pixels.mean() * 0.75, 200)
    ink = (pixels < cutoff).astype(np.uint8)

    ink_rows = np.any(ink, axis=1)
    ink_cols = np.any(ink, axis=0)
    if not (ink_rows.any() and ink_cols.any()):
        return "character", 0.5, "no_ink"

    # First and last row / column that contain ink.
    top, bottom = np.where(ink_rows)[0][[0, -1]]
    left, right = np.where(ink_cols)[0][[0, -1]]
    box_width = right - left + 1
    box_height = bottom - top + 1
    aspect_ratio = box_width / max(box_height, 1)

    # Blobs smaller than 0.1% of the image (and never fewer than 10 pixels)
    # are ignored.
    smallest_blob = max(ink.size * 0.001, 10)
    blob_count = _count_blobs(ink, min_size=smallest_blob)

    looks_like_word = aspect_ratio > 1.8 or blob_count >= 3
    if looks_like_word:
        return "word", 0.9, "wide_or_multiple_blobs"
    return "character", 0.8, "square_compact"
+# --- PREPROCESSING ---
def preprocess_for_trocr(image, padding=10):
    """Convert an image to RGB and crop it tightly around the ink.

    Ink is located with an inverted Otsu threshold on the grayscale image.
    If no ink is found the RGB image is returned uncropped.

    Args:
        image: PIL image to clean up.
        padding: Margin, in pixels, kept around the ink bounding box on every
            side (clamped to the image borders).

    Returns:
        The RGB image, cropped to the padded ink bounding box when ink exists.
    """
    image = image.convert("RGB")
    gray = np.array(image.convert("L"))
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Row / column projections are enough to find the bounding box and avoid
    # materialising the coordinates of every ink pixel.
    ink_rows = np.flatnonzero(binary.any(axis=1))
    ink_cols = np.flatnonzero(binary.any(axis=0))
    if ink_rows.size == 0:
        return image

    top, bottom = int(ink_rows[0]), int(ink_rows[-1])
    left, right = int(ink_cols[0]), int(ink_cols[-1])

    # The right / lower edges of a crop box are exclusive, so add 1 to the
    # last ink index before padding; otherwise the right and bottom margins
    # end up one pixel narrower than the left and top ones.
    box = (
        max(0, left - padding),
        max(0, top - padding),
        min(image.width, right + 1 + padding),
        min(image.height, bottom + 1 + padding),
    )
    return image.crop(box)
+# --- MAIN INFERENCE PIPELINE ---
def smart_predict(image):
    """Route an uploaded image to the CNN or TrOCR engine and recognise it.

    Args:
        image: Uploaded PIL image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(text, status, engine)`` tuple of strings: the recognised text
        (or an error message), a description of the routing decision, and
        the name of the engine that produced the result.
    """
    if image is None:
        return "Please upload an image.", "Waiting...", "None"
    try:
        # 1. Smart routing: decide between single character and word.
        input_type, confidence, reason = classify_input(image)
        system_status = f"Mode: {input_type.upper()} | Reason: {reason} (Conf: {confidence:.0%})"

        if input_type == "character" and cnn_engine.available:
            # 2. CNN pipeline.
            result = cnn_engine.predict(image)
            # NOTE(review): predict() appears to report failures through an
            # "error" key instead of "text" -- confirm against
            # CharacterClassifier. Surface that message rather than failing
            # with a KeyError on "text".
            if "error" in result:
                return result["error"], system_status, "CNN Classifier"
            return result["text"], system_status, "CNN Classifier"

        # 3. TrOCR pipeline (words, or characters when the CNN is unavailable).
        image_cleaned = preprocess_for_trocr(image)
        pixel_values = processor(image_cleaned, return_tensors="pt").pixel_values.to(device)
        with torch.no_grad():
            # Beam search on the wrapped base model.
            generated_ids = model.base_model.generate(
                pixel_values=pixel_values,
                num_beams=4,
                length_penalty=1.0,
                max_new_tokens=64,
                early_stopping=True,
                decoder_start_token_id=model.config.decoder_start_token_id
            )
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return text, system_status, "TrOCR + LoRA"
    except Exception as e:
        # UI boundary: log the full traceback and show a short message
        # instead of letting the request crash.
        import traceback
        print(traceback.format_exc())
        return f"Error: {str(e)}", "System Failure", "Error"
+# --- INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft(), title="DevGen Smart OCR") as demo:
    # Page heading and short description.
    gr.Markdown("# 🕉️ DevGen Smart Devanagari OCR")
    gr.Markdown("Automatic detection and recognition for both single characters and full words.")

    with gr.Row():
        # Left column: image upload and the trigger button.
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Upload Handwritten Input")
            submit_btn = gr.Button("Recognize", variant="primary")

        # Right column: the three values returned by smart_predict.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="Recognized Text",
                placeholder="Result will appear here...",
                interactive=False,
            )
            status_text = gr.Label(label="Engine Status")
            model_used = gr.Textbox(label="Model Used", interactive=False)

    # Output order matches smart_predict's (text, status, engine) tuple.
    submit_btn.click(
        fn=smart_predict,
        inputs=input_img,
        outputs=[output_text, status_text, model_used],
    )

    # You can add local test images here
    gr.Examples(examples=[], inputs=input_img)

if __name__ == "__main__":
    demo.launch()