mrrtmob committed on
Commit
2f36e07
·
verified Β·
1 Parent(s): 5789876

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -50
app.py CHANGED
@@ -1,67 +1,357 @@
 
 
 
 
 
1
  import gradio as gr
2
- from kiri_ocr import OCR
3
- from PIL import Image, ImageDraw
4
  import numpy as np
5
- import os
 
 
6
 
7
  # Initialize OCR
8
- try:
9
- print("Loading Kiri OCR model...")
10
- # Use verbose=True to see what's happening
11
- ocr = OCR(verbose=True)
12
- print("Model loaded successfully")
13
- except Exception as e:
14
- print(f"Error loading model: {e}")
15
- ocr = None
16
-
17
- def process_image(image_path):
 
 
 
 
 
 
 
 
18
  if ocr is None:
19
- return None, "Error: OCR model failed to load."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- if image_path is None:
22
- return None, "Please upload an image."
23
-
24
  try:
25
- print(f"Processing image: {image_path}")
26
- # extract_text returns (text, results)
27
- text, results = ocr.extract_text(image_path, verbose=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- print(f"Extracted {len(results)} regions.")
30
 
31
- # Open image for drawing
32
- img = Image.open(image_path)
33
- if img.mode != 'RGB':
34
- img = img.convert('RGB')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- draw = ImageDraw.Draw(img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Draw boxes
39
- for item in results:
40
- if 'box' in item:
41
- x, y, w, h = item['box']
42
- # Ensure coordinates are ints
43
- x, y, w, h = int(x), int(y), int(w), int(h)
44
- draw.rectangle([x, y, x + w, y + h], outline="red", width=3)
 
 
45
 
46
- return np.array(img), text
47
 
48
  except Exception as e:
49
  import traceback
50
- traceback.print_exc()
51
- return None, f"Error during extraction: {str(e)}"
52
-
53
- # Build the interface
54
- demo = gr.Interface(
55
- fn=process_image,
56
- inputs=gr.Image(type="filepath", label="Upload Image"),
57
- outputs=[
58
- gr.Image(label="Detected Text Regions"),
59
- gr.Textbox(label="Extracted Text", lines=10)
60
- ],
61
- title="Kiri OCR Demo",
62
- description="Upload an image to extract English and Khmer text. Detected regions are highlighted in red.",
63
- examples=[]
64
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
 
66
  if __name__ == "__main__":
67
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
1
+ """
2
+ Kiri OCR - Gradio Demo for Hugging Face Spaces
3
+
4
+ A lightweight OCR library for English and Khmer documents.
5
+ """
6
  import gradio as gr
 
 
7
  import numpy as np
8
+ from PIL import Image
9
+ import cv2
10
+
11
 
12
  # Initialize OCR
13
def load_ocr():
    """Create and return the Kiri OCR engine (CPU-only, DB-based text detection)."""
    from kiri_ocr import OCR  # deferred so importing this module stays cheap

    engine_options = {
        "model_path": "mrrtmob/kiri-ocr",
        "det_method": "db",
        "device": "cpu",
        "verbose": False,
    }
    return OCR(**engine_options)
22
+
23
+
24
# Shared OCR engine; created lazily on first request so startup stays fast
ocr = None


def get_ocr():
    """Return the module-wide OCR engine, loading it on first use."""
    global ocr
    if ocr is not None:
        return ocr
    ocr = load_ocr()
    return ocr
34
+
35
+
36
def process_image(image, mode="lines", show_boxes=True):
    """
    Process an image and extract text.

    Args:
        image: Input image (PIL Image or numpy array), or None.
        mode: Detection mode ('lines' or 'words').
        show_boxes: Whether to draw bounding boxes on the image.

    Returns:
        Tuple of (annotated_image, extracted_text, detailed_results).
        On error the original image is returned with the traceback as text.
    """
    if image is None:
        return None, "Please upload an image.", ""

    try:
        ocr_engine = get_ocr()

        # Convert to numpy array if needed
        if isinstance(image, Image.Image):
            img_array = np.array(image)
        else:
            img_array = image

        # Normalize to 3-channel BGR so cv2 can draw on and save the image
        if len(img_array.shape) == 2:
            # Grayscale -> BGR
            img_display = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
        elif img_array.shape[2] == 4:
            # RGBA -> BGR
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
        else:
            # RGB -> BGR
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        # The OCR engine takes a file path, so round-trip through a temp file.
        import os
        import tempfile
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            temp_path = f.name
        try:
            cv2.imwrite(temp_path, img_display)
            results = ocr_engine.process_document(temp_path, mode=mode, verbose=False)
        finally:
            # Always remove the temp file — the previous version leaked it
            # whenever process_document raised.
            os.unlink(temp_path)

        if not results:
            return image, "No text detected in the image.", ""

        # Sort results into reading order: top-to-bottom, then left-to-right
        results.sort(key=lambda r: (r["box"][1], r["box"][0]))

        # Draw boxes on a copy of the display image if requested
        annotated = img_display.copy()
        if show_boxes:
            for i, r in enumerate(results):
                # Coerce to plain ints: cv2 drawing rejects float coordinates
                x, y, w, h = map(int, r["box"])
                cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.putText(
                    annotated, f"{i+1}", (x, y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1
                )

        # Convert back to RGB for Gradio display
        annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

        # Merge regions whose vertical centers overlap into single text lines
        lines = []
        current_line = []
        prev_y = None
        prev_h = None

        for res in results:
            y, h = res["box"][1], res["box"][3]
            center_y = y + h / 2

            if prev_y is not None:
                prev_center = prev_y + prev_h / 2
                # Same line when the centers are closer than half the taller box
                if abs(center_y - prev_center) < max(h, prev_h) / 2:
                    current_line.append(res["text"])
                else:
                    lines.append(" ".join(current_line))
                    current_line = [res["text"]]
            else:
                current_line = [res["text"]]

            prev_y, prev_h = y, h

        if current_line:
            lines.append(" ".join(current_line))

        full_text = "\n".join(lines)

        # Per-region details as a markdown table
        detailed = "### Detailed Results\n\n"
        detailed += "| # | Text | Confidence | Box (x,y,w,h) |\n"
        detailed += "|---|------|------------|---------------|\n"
        for i, r in enumerate(results, 1):
            text = r["text"][:50] + "..." if len(r["text"]) > 50 else r["text"]
            conf = f"{r['confidence']*100:.1f}%"
            box = f"({r['box'][0]}, {r['box'][1]}, {r['box'][2]}, {r['box'][3]})"
            detailed += f"| {i} | {text} | {conf} | {box} |\n"

        return annotated_rgb, full_text, detailed

    except Exception as e:
        import traceback
        error_msg = f"Error processing image: {str(e)}\n\n{traceback.format_exc()}"
        return image, error_msg, ""
150
+
151
+
152
def recognize_single_line(image):
    """
    Recognize text from a single-line image (no detection).

    Args:
        image: Input image containing a single line of text, or None.

    Returns:
        Tuple of (text, confidence_string). On failure returns
        an "Error: ..." string and an empty confidence string.
    """
    if image is None:
        return "Please upload an image.", ""

    try:
        ocr_engine = get_ocr()

        # Convert to numpy array
        if isinstance(image, Image.Image):
            img_array = np.array(image)
        else:
            img_array = image

        # Convert to grayscale. Handle 4-channel RGBA uploads explicitly:
        # COLOR_RGB2GRAY requires exactly 3 channels and raised on RGBA before.
        if len(img_array.shape) == 3:
            if img_array.shape[2] == 4:
                img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGBA2GRAY)
            else:
                img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            img_gray = img_array

        # Invert predominantly dark images — presumably the recognizer expects
        # dark text on a light background (TODO confirm against kiri_ocr docs)
        if np.mean(img_gray) < 127:
            img_gray = 255 - img_gray

        # Preprocess to the model's input tensor and recognize
        from kiri_ocr.model import preprocess_pil
        img_pil = Image.fromarray(img_gray)
        img_tensor = preprocess_pil(ocr_engine.cfg, img_pil)

        text, confidence = ocr_engine.recognize_region(img_tensor)

        return text, f"Confidence: {confidence*100:.1f}%"

    except Exception as e:
        return f"Error: {str(e)}", ""
195
+
196
+
197
# Custom CSS
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.output-text {
    font-size: 16px;
    line-height: 1.6;
}
footer {
    visibility: hidden;
}
"""

# Create Gradio interface
with gr.Blocks(css=css, title="Kiri OCR - Khmer & English OCR") as demo:
    gr.Markdown(
        """
        # 🔤 Kiri OCR

        **Lightweight OCR for English and Khmer documents**

        Upload an image containing text and get the extracted text. Supports both English and Khmer languages.
        """
    )

    with gr.Tabs():
        # Document OCR Tab
        with gr.TabItem("📄 Document OCR"):
            gr.Markdown("Upload a document image to extract text with automatic text line detection.")

            with gr.Row():
                with gr.Column(scale=1):
                    doc_input = gr.Image(
                        label="Upload Document",
                        type="pil",
                        sources=["upload", "clipboard"]
                    )

                    with gr.Row():
                        mode_select = gr.Radio(
                            choices=["lines", "words"],
                            value="lines",
                            label="Detection Mode"
                        )
                        show_boxes = gr.Checkbox(
                            value=True,
                            label="Show Bounding Boxes"
                        )

                    doc_btn = gr.Button("Extract Text", variant="primary")

                with gr.Column(scale=1):
                    doc_output_img = gr.Image(label="Detected Regions")
                    doc_output_text = gr.Textbox(
                        label="Extracted Text",
                        lines=10,
                        show_copy_button=True
                    )

                    with gr.Accordion("Detailed Results", open=False):
                        doc_details = gr.Markdown()

            doc_btn.click(
                fn=process_image,
                inputs=[doc_input, mode_select, show_boxes],
                outputs=[doc_output_img, doc_output_text, doc_details]
            )

            # NOTE(review): the previous version built a gr.Examples(...) block
            # guarded by `... if False else None`, which never executes (the
            # conditional short-circuits before gr.Examples is called). Removed
            # as dead code — re-add a plain gr.Examples(...) call here once
            # real sample images exist under assets/.

        # Single Line OCR Tab
        with gr.TabItem("✏️ Single Line OCR"):
            gr.Markdown("For single-line text images (cropped text lines). No detection needed.")

            with gr.Row():
                with gr.Column(scale=1):
                    line_input = gr.Image(
                        label="Upload Text Line",
                        type="pil",
                        sources=["upload", "clipboard"]
                    )
                    line_btn = gr.Button("Recognize Text", variant="primary")

                with gr.Column(scale=1):
                    line_output_text = gr.Textbox(
                        label="Recognized Text",
                        lines=3,
                        show_copy_button=True
                    )
                    line_confidence = gr.Textbox(label="Confidence")

            line_btn.click(
                fn=recognize_single_line,
                inputs=line_input,
                outputs=[line_output_text, line_confidence]
            )

        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(
                """
                ## About Kiri OCR

                Kiri OCR is a lightweight OCR library designed for **English** and **Khmer** documents.

                ### Features
                - 🚀 **Fast**: Optimized for quick text extraction
                - 🎯 **Accurate**: Transformer-based architecture with CTC + Attention decoder
                - 🌏 **Multilingual**: Supports English and Khmer text
                - 📦 **Lightweight**: Easy to deploy and use

                ### Technical Details
                - **Model Architecture**: CNN backbone + Transformer encoder + CTC/Attention decoder
                - **Text Detection**: DB (Differentiable Binarization) based detector
                - **Input Size**: 48×640 pixels (images are automatically resized)

                ### Links
                - 📚 [GitHub Repository](https://github.com/mrrtmob/kiri-ocr)
                - 🤗 [Model on Hugging Face](https://huggingface.co/mrrtmob/kiri-ocr)
                - 📖 [Documentation](https://github.com/mrrtmob/kiri-ocr#readme)

                ### Usage

                ```python
                from kiri_ocr import OCR

                # Initialize OCR
                ocr = OCR(model_path="mrrtmob/kiri-ocr")

                # Extract text from document
                text, results = ocr.extract_text("document.png")
                print(text)
                ```

                ### License
                Apache 2.0
                """
            )

    gr.Markdown(
        """
        ---
        Made with ❤️ by [Kiri OCR Team](https://github.com/mrrtmob/kiri-ocr)
        """
    )


# Launch
if __name__ == "__main__":
    demo.launch()