Spaces:

manishw10
/

devgen-devanagari-ocr

Sleeping

App Files Files Community

manishw7 committed 4 days ago

Commit

9ebb598

1 Parent(s): ecce7a8

Design: Final Premium Suite with Pro Mode Toggle

Browse files

Files changed (1) hide show

app.py +46 -45

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import io
 import gradio as gr
 import torch
 import numpy as np
@@ -17,8 +18,8 @@ CNN_MODEL_PATH = "devanagari-cnn-classifier.pt"
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# --- MODEL LOADING ---
-print("System: Loading Engine with Visual Debug...")
 processor = TrOCRProcessor.from_pretrained(BASE_MODEL_ID)
 base_model = VisionEncoderDecoderModel.from_pretrained(BASE_MODEL_ID)
@@ -32,16 +33,14 @@ base_model.config.vocab_size = base_model.config.decoder.vocab_size
 peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
 try:
     model = peft_model.merge_and_unload()
-    print("System: LoRA weights merged.")
 except Exception:
     model = peft_model
 model.to(device)
 model.eval()
-# Load CNN
 cnn_engine = CharacterClassifier(model_path=CNN_MODEL_PATH, device=device)
-# --- ORIGINAL ROUTING LOGIC ---
 def _flood_fill(binary, visited, start_y, start_x, h, w):
     stack = [(start_y, start_x)]
     size = 0
@@ -75,31 +74,32 @@ def original_classify_input(image):
     cmin, cmax = np.where(cols)[0][[0, -1]]
     w, h = cmax - cmin + 1, rmax - rmin + 1
     ar, bc = w/h, count_blobs(binary, min_size=max(binary.size * 0.001, 10))
     is_char = True
     if ar > 2.5: is_char = False
     elif ar > 1.8 and bc >= 3: is_char = False
     elif bc >= 4: is_char = False
     elif ar < 1.3 and bc <= 2: is_char = True
     elif bc == 1 and ar < 1.5: is_char = True
-    elif ar < 1.75 and bc <= 2: is_char = True  # <--- RESTORED THIS LINE
     elif ar > 1.6: is_char = False
     return ("character" if is_char else "word"), ar, bc
-# --- PREDICT ---
-def predict(image):
     if image is None: return None, None, "Upload image.", ""
-    # 1. PREPROCESS (Critical!)
     buf = io.BytesIO()
     image.save(buf, format="PNG")
-    image_bytes = buf.getvalue()
-    preprocessed_pil = preprocess_for_ocr(image_bytes)
-    if preprocessed_pil is None: return None, None, "Preprocessing Failed", ""
-    # 2. ROUTE
-    mode, ar, bc = original_classify_input(preprocessed_pil)
-    status = f"Mode: {mode.upper()} | AR: {ar:.2f} | Blobs: {bc}"
     try:
         if mode == "character" and cnn_engine.available:
@@ -108,43 +108,44 @@ def predict(image):
         else:
             pixel_values = processor(preprocessed_pil, return_tensors="pt").pixel_values.to(device)
             with torch.no_grad():
-                outputs = model.generate(
-                    pixel_values,
-                    num_beams=4,
-                    max_length=128,
-                    early_stopping=True,
-                    decoder_start_token_id=model.config.decoder_start_token_id
-                )
-            text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
             return preprocessed_pil, text, status, "TrOCR + LoRA"
     except Exception as e:
-        return preprocessed_pil, f"Error: {str(e)}", "Inference Failed", "None"
-# --- UI ---
 CSS = """
-.gradio-container { background: #0f172a; color: white; font-family: 'Inter', sans-serif; }
-.panel { background: rgba(30, 41, 59, 0.8); border-radius: 20px; padding: 20px; border: 1px solid #334155; }
-.result-text { font-size: 2.2rem !important; font-weight: bold; color: #818cf8; text-align: center; background: rgba(0,0,0,0.3) !important; border-radius: 12px; }
 """
-with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🕉️ DevGen OCR — Diagnostic Suite")
-    gr.Markdown("See exactly what the engine sees to debug quality issues.")
-    with gr.Row(elem_classes="panel"):
-        with gr.Column():
-            input_img = gr.Image(type="pil", label="1. Original Upload")
-            run_btn = gr.Button("🔍 Run Diagnostic Recognition", variant="primary")
-        with gr.Column():
-            processed_img = gr.Image(type="pil", label="2. What the Model Sees", interactive=False)
-            output_text = gr.Textbox(label="3. Recognition Result", elem_classes="result-text")
-    with gr.Row(elem_classes="panel"):
-        status_lbl = gr.Markdown("## System Insights\nReady to analyze.")
-        engine_lbl = gr.Textbox(label="Model Used", interactive=False)
-    run_btn.click(predict, [input_img], [processed_img, output_text, status_lbl, engine_lbl])
 if __name__ == "__main__":
     demo.launch()

 import os
 import io
+import time
 import gradio as gr
 import torch
 import numpy as np
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- ENGINE CORE ---
+print("System: Initializing DevGen Premium Engine...")
 processor = TrOCRProcessor.from_pretrained(BASE_MODEL_ID)
 base_model = VisionEncoderDecoderModel.from_pretrained(BASE_MODEL_ID)
 peft_model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
 try:
     model = peft_model.merge_and_unload()
 except Exception:
     model = peft_model
 model.to(device)
 model.eval()
 cnn_engine = CharacterClassifier(model_path=CNN_MODEL_PATH, device=device)
+# --- ORIGINAL ROUTING ---
 def _flood_fill(binary, visited, start_y, start_x, h, w):
     stack = [(start_y, start_x)]
     size = 0
     cmin, cmax = np.where(cols)[0][[0, -1]]
     w, h = cmax - cmin + 1, rmax - rmin + 1
     ar, bc = w/h, count_blobs(binary, min_size=max(binary.size * 0.001, 10))
     is_char = True
     if ar > 2.5: is_char = False
     elif ar > 1.8 and bc >= 3: is_char = False
     elif bc >= 4: is_char = False
     elif ar < 1.3 and bc <= 2: is_char = True
     elif bc == 1 and ar < 1.5: is_char = True
+    elif ar < 1.75 and bc <= 2: is_char = True
     elif ar > 1.6: is_char = False
     return ("character" if is_char else "word"), ar, bc
+# --- PIPELINE ---
+def predict(image, manual_mode):
     if image is None: return None, None, "Upload image.", ""
     buf = io.BytesIO()
     image.save(buf, format="PNG")
+    preprocessed_pil = preprocess_for_ocr(buf.getvalue())
+    if manual_mode == "Automatic":
+        mode, ar, bc = original_classify_input(preprocessed_pil)
+        status = f"**System Insight**: Auto-detected **{mode.upper()}** (AR: {ar:.2f}, Blobs: {bc})"
+    else:
+        mode = manual_mode.lower()
+        status = f"**System Insight**: Manual Override set to **{mode.upper()}**"
     try:
         if mode == "character" and cnn_engine.available:
         else:
             pixel_values = processor(preprocessed_pil, return_tensors="pt").pixel_values.to(device)
             with torch.no_grad():
+                gen = model.generate(pixel_values, num_beams=4, max_length=128, early_stopping=True, decoder_start_token_id=model.config.decoder_start_token_id)
+            text = processor.batch_decode(gen, skip_special_tokens=True)[0]
             return preprocessed_pil, text, status, "TrOCR + LoRA"
     except Exception as e:
+        return preprocessed_pil, f"Inference Error: {str(e)}", "Process Failed", "None"
+# --- PREMIUM CSS ---
 CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;600&family=Inter:wght@400;500&display=swap');
+.gradio-container { background: linear-gradient(135deg, #0f172a 0%, #1e1b4b 100%) !important; color: white !important; font-family: 'Inter', sans-serif !important; }
+.premium-card { background: rgba(30, 41, 59, 0.7) !important; backdrop-filter: blur(12px); border: 1px solid rgba(255,255,255,0.1); border-radius: 24px; padding: 2rem; box-shadow: 0 25px 50px -12px rgba(0,0,0,0.5); margin-bottom: 20px; }
+h1 { font-family: 'Outfit', sans-serif; font-size: 3rem !important; font-weight: 600; background: linear-gradient(90deg, #818cf8, #c084fc); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin-bottom: 1rem; }
+.result-box { font-size: 2.5rem !important; font-weight: 600; text-align: center; color: #818cf8; background: rgba(0,0,0,0.2) !important; border: 1px solid rgba(129, 140, 248, 0.3) !important; border-radius: 16px !important; }
+.btn-primary { background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%) !important; border: none !important; border-radius: 12px !important; font-family: 'Outfit', sans-serif !important; font-weight: 600 !important; }
 """
+with gr.Blocks(css=CSS, theme=gr.themes.Default()) as demo:
+    with gr.Column(elem_classes="premium-card"):
+        gr.Markdown("# 🕉️ DevGen OCR")
+        gr.Markdown("A high-fidelity neuro-generative OCR suite for Devanagari.")
+        with gr.Row():
+            with gr.Column(scale=1):
+                img_in = gr.Image(type="pil", label="Input Document", mirror_webcam=False)
+                mode_ctrl = gr.Radio(["Automatic", "Word", "Character"], value="Automatic", label="Recognition Logic")
+                sub_btn = gr.Button("Recognize Handwriting", variant="primary", elem_classes="btn-primary")
+            with gr.Column(scale=1):
+                text_out = gr.Textbox(label="Recognition Result", elem_classes="result-box", interactive=False)
+                status_md = gr.Markdown("Engine is ready.")
+                engine_txt = gr.Textbox(label="Active Model", interactive=False)
+    with gr.Accordion("🛠️ Technical Diagnostics", open=False):
+        with gr.Row(elem_classes="premium-card"):
+            img_proc = gr.Image(type="pil", label="Preprocessed Input (What the model sees)", interactive=False)
+            gr.Markdown("### Processing Notes\nThis view shows the image after binarization and aspect-ratio normalization. If the image here is blurry or cut off, it may affect accuracy.")
+    sub_btn.click(predict, [img_in, mode_ctrl], [img_proc, text_out, status_md, engine_txt])
 if __name__ == "__main__":
     demo.launch()