mrrtmob commited on
Commit
b16ee4a
·
verified ·
1 Parent(s): 5543d33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -202
app.py CHANGED
@@ -1,18 +1,20 @@
1
  """
2
  Kiri OCR - Gradio Demo for Hugging Face Spaces
3
 
4
- A lightweight OCR library for English and Khmer documents.
5
  """
6
  import gradio as gr
7
  import numpy as np
8
  from PIL import Image
9
  import cv2
 
 
10
 
11
-
12
- # Initialize OCR
13
  def load_ocr():
14
  """Load the OCR model."""
15
  from kiri_ocr import OCR
 
16
  return OCR(
17
  model_path="mrrtmob/kiri-ocr",
18
  det_method="db",
@@ -20,11 +22,9 @@ def load_ocr():
20
  verbose=False
21
  )
22
 
23
-
24
- # Global OCR instance (loaded once)
25
  ocr = None
26
 
27
-
28
  def get_ocr():
29
  """Get or create OCR instance."""
30
  global ocr
@@ -32,225 +32,184 @@ def get_ocr():
32
  ocr = load_ocr()
33
  return ocr
34
 
35
-
36
- def process_image(image, mode="lines", show_boxes=True):
37
  """
38
- Process an image and extract text.
39
 
40
  Args:
41
  image: Input image (PIL Image or numpy array)
42
  mode: Detection mode ('lines' or 'words')
43
- show_boxes: Whether to draw bounding boxes on the image
44
 
45
- Returns:
46
- Tuple of (annotated_image, extracted_text, detailed_results)
47
  """
48
  if image is None:
49
- return None, "Please upload an image.", ""
50
-
 
51
  try:
52
  ocr_engine = get_ocr()
53
 
54
- # Convert to numpy array if needed
 
55
  if isinstance(image, Image.Image):
56
  img_array = np.array(image)
57
  else:
58
  img_array = image
59
-
60
- # Ensure image is in correct format
61
  if len(img_array.shape) == 2:
62
- # Grayscale - convert to BGR for cv2
63
  img_display = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
64
  elif img_array.shape[2] == 4:
65
- # RGBA - convert to BGR
66
  img_display = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
67
  else:
68
- # RGB - convert to BGR
69
  img_display = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
70
-
71
- # Save temp file for processing
72
- import tempfile
73
  with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
74
  temp_path = f.name
75
-
76
  cv2.imwrite(temp_path, img_display)
77
 
78
- # Process document
79
- results = ocr_engine.process_document(temp_path, mode=mode, verbose=False)
80
-
81
- # Clean up temp file
82
- import os
83
- os.unlink(temp_path)
84
-
85
- if not results:
86
- return image, "No text detected in the image.", ""
87
-
88
- # Sort results by Y then X for reading order
89
- results.sort(key=lambda r: (r["box"][1], r["box"][0]))
90
-
91
- # Draw boxes on image if requested
92
  annotated = img_display.copy()
93
- if show_boxes:
94
- for i, r in enumerate(results):
95
- x, y, w, h = r["box"]
96
- # Draw box
97
- cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
98
- # Draw line number
99
- cv2.putText(
100
- annotated, f"{i+1}", (x, y - 5),
101
- cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1
102
- )
103
-
104
- # Convert back to RGB for display
105
- annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
106
-
107
- # Extract full text
108
- lines = []
109
- current_line = []
110
- prev_y = None
111
- prev_h = None
112
 
113
- for res in results:
114
- y, h = res["box"][1], res["box"][3]
115
- center_y = y + h / 2
116
 
117
- if prev_y is not None:
118
- prev_center = prev_y + prev_h / 2
119
- if abs(center_y - prev_center) < max(h, prev_h) / 2:
120
- current_line.append(res["text"])
121
- else:
122
- lines.append(" ".join(current_line))
123
- current_line = [res["text"]]
124
- else:
125
- current_line = [res["text"]]
 
 
 
 
 
 
 
126
 
127
- prev_y, prev_h = y, h
128
-
129
- if current_line:
130
- lines.append(" ".join(current_line))
131
-
132
- full_text = "\n".join(lines)
133
-
134
- # Format detailed results
135
- detailed = "### Detailed Results\n\n"
136
- detailed += "| # | Text | Confidence | Box (x,y,w,h) |\n"
137
- detailed += "|---|------|------------|---------------|\n"
138
- for i, r in enumerate(results, 1):
139
- text = r["text"][:50] + "..." if len(r["text"]) > 50 else r["text"]
140
- conf = f"{r['confidence']*100:.1f}%"
141
- box = f"({r['box'][0]}, {r['box'][1]}, {r['box'][2]}, {r['box'][3]})"
142
- detailed += f"| {i} | {text} | {conf} | {box} |\n"
143
 
144
- return annotated_rgb, full_text, detailed
 
145
 
146
  except Exception as e:
147
  import traceback
148
- error_msg = f"Error processing image: {str(e)}\n\n{traceback.format_exc()}"
149
- return image, error_msg, ""
150
 
151
 
152
- def recognize_single_line(image):
153
  """
154
- Recognize text from a single-line image (no detection).
155
-
156
- Args:
157
- image: Input image containing a single line of text
158
-
159
- Returns:
160
- Tuple of (text, confidence)
161
  """
162
  if image is None:
163
- return "Please upload an image.", ""
164
-
 
165
  try:
166
  ocr_engine = get_ocr()
167
 
168
- # Convert to numpy array
169
  if isinstance(image, Image.Image):
170
- img_array = np.array(image)
171
- else:
172
- img_array = image
173
-
174
- # Convert to grayscale
175
- if len(img_array.shape) == 3:
176
- img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
177
  else:
178
- img_gray = img_array
179
-
180
- # Invert if dark background
181
- if np.mean(img_gray) < 127:
182
- img_gray = 255 - img_gray
183
-
184
- # Preprocess and recognize
185
- from kiri_ocr.model import preprocess_pil
186
- img_pil = Image.fromarray(img_gray)
187
- img_tensor = preprocess_pil(ocr_engine.cfg, img_pil)
188
-
189
- text, confidence = ocr_engine.recognize_region(img_tensor)
190
-
191
- return text, f"Confidence: {confidence*100:.1f}%"
192
 
 
 
 
 
 
 
 
 
 
193
  except Exception as e:
194
- return f"Error: {str(e)}", ""
195
 
 
 
 
 
 
196
 
197
  # Create Gradio interface
198
- with gr.Blocks(title="Kiri OCR - Khmer & English OCR") as demo:
199
  gr.Markdown(
200
  """
201
- # 🔤 Kiri OCR
202
 
203
- **Lightweight OCR for English and Khmer documents**
204
 
205
- Upload an image containing text and get the extracted text. Supports both English and Khmer languages.
206
  """
207
  )
208
 
209
  with gr.Tabs():
210
  # Document OCR Tab
211
- with gr.TabItem("📄 Document OCR"):
212
- gr.Markdown("Upload a document image to extract text with automatic text line detection.")
213
 
214
  with gr.Row():
215
  with gr.Column(scale=1):
216
  doc_input = gr.Image(
217
  label="Upload Document",
218
  type="pil",
219
- sources=["upload", "clipboard"]
220
  )
221
 
222
- with gr.Row():
223
- mode_select = gr.Radio(
224
- choices=["lines", "words"],
225
- value="lines",
226
- label="Detection Mode"
227
- )
228
- show_boxes = gr.Checkbox(
229
- value=True,
230
- label="Show Bounding Boxes"
231
- )
232
 
233
- doc_btn = gr.Button("Extract Text", variant="primary")
234
 
235
  with gr.Column(scale=1):
236
- doc_output_img = gr.Image(label="Detected Regions")
 
 
237
  doc_output_text = gr.Textbox(
238
- label="Extracted Text",
239
- lines=10
 
 
240
  )
241
 
242
- with gr.Accordion("Detailed Results", open=False):
243
- doc_details = gr.Markdown()
244
-
245
  doc_btn.click(
246
- fn=process_image,
247
- inputs=[doc_input, mode_select, show_boxes],
248
- outputs=[doc_output_img, doc_output_text, doc_details]
249
  )
250
 
251
  # Single Line OCR Tab
252
- with gr.TabItem("✏️ Single Line OCR"):
253
- gr.Markdown("For single-line text images (cropped text lines). No detection needed.")
254
 
255
  with gr.Row():
256
  with gr.Column(scale=1):
@@ -259,71 +218,32 @@ with gr.Blocks(title="Kiri OCR - Khmer & English OCR") as demo:
259
  type="pil",
260
  sources=["upload", "clipboard"]
261
  )
262
- line_btn = gr.Button("Recognize Text", variant="primary")
263
 
264
  with gr.Column(scale=1):
265
  line_output_text = gr.Textbox(
266
- label="Recognized Text",
267
- lines=3
 
268
  )
269
- line_confidence = gr.Textbox(label="Confidence")
270
 
271
  line_btn.click(
272
- fn=recognize_single_line,
273
  inputs=line_input,
274
- outputs=[line_output_text, line_confidence]
275
- )
276
-
277
- # About Tab
278
- with gr.TabItem("ℹ️ About"):
279
- gr.Markdown(
280
- """
281
- ## About Kiri OCR
282
-
283
- Kiri OCR is a lightweight OCR library designed for **English** and **Khmer** documents.
284
-
285
- ### Features
286
- - 🚀 **Fast**: Optimized for quick text extraction
287
- - 🎯 **Accurate**: Transformer-based architecture with CTC + Attention decoder
288
- - 🌏 **Multilingual**: Supports English and Khmer text
289
- - 📦 **Lightweight**: Easy to deploy and use
290
-
291
- ### Technical Details
292
- - **Model Architecture**: CNN backbone + Transformer encoder + CTC/Attention decoder
293
- - **Text Detection**: DB (Differentiable Binarization) based detector
294
- - **Input Size**: 48×640 pixels (images are automatically resized)
295
-
296
- ### Links
297
- - 📚 [GitHub Repository](https://github.com/mrrtmob/kiri-ocr)
298
- - 🤗 [Model on Hugging Face](https://huggingface.co/mrrtmob/kiri-ocr)
299
- - 📖 [Documentation](https://github.com/mrrtmob/kiri-ocr#readme)
300
-
301
- ### Usage
302
-
303
- ```python
304
- from kiri_ocr import OCR
305
-
306
- # Initialize OCR
307
- ocr = OCR(model_path="mrrtmob/kiri-ocr")
308
-
309
- # Extract text from document
310
- text, results = ocr.extract_text("document.png")
311
- print(text)
312
- ```
313
-
314
- ### License
315
- Apache 2.0
316
- """
317
  )
318
 
319
  gr.Markdown(
320
  """
321
- ---
322
- Made with ❤️ by [Kiri OCR Team](https://github.com/mrrtmob/kiri-ocr)
 
 
 
 
323
  """
324
  )
325
 
326
-
327
  # Launch
328
  if __name__ == "__main__":
329
- demo.launch()
 
1
  """
2
  Kiri OCR - Gradio Demo for Hugging Face Spaces
3
 
4
+ A lightweight OCR library for English and Khmer documents with streaming output support.
5
  """
6
  import gradio as gr
7
  import numpy as np
8
  from PIL import Image
9
  import cv2
10
+ import tempfile
11
+ import os
12
 
13
+ # Initialize OCR (lazy load)
 
14
  def load_ocr():
15
  """Load the OCR model."""
16
  from kiri_ocr import OCR
17
+ print("Loading OCR model...")
18
  return OCR(
19
  model_path="mrrtmob/kiri-ocr",
20
  det_method="db",
 
22
  verbose=False
23
  )
24
 
25
+ # Global OCR instance
 
26
  ocr = None
27
 
 
28
  def get_ocr():
29
  """Get or create OCR instance."""
30
  global ocr
 
32
  ocr = load_ocr()
33
  return ocr
34
 
35
def process_document_stream(image, mode="lines"):
    """
    Process a document image with real-time character streaming.

    Args:
        image: Input image (PIL Image or numpy array)
        mode: Detection mode ('lines' or 'words')

    Yields:
        Tuple of (annotated_image, extracted_text) — the annotated image is
        RGB for Gradio display; text grows as tokens stream in.
    """
    if image is None:
        yield None, "Please upload an image."
        return

    try:
        ocr_engine = get_ocr()

        # Convert PIL to a numpy array if needed; the OCR API takes a file
        # path, so the image is written to a temp file below.
        if isinstance(image, Image.Image):
            img_array = np.array(image)
        else:
            img_array = image

        # Normalize channel layout to BGR for OpenCV.
        if len(img_array.shape) == 2:
            img_display = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
        elif img_array.shape[2] == 4:
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
        else:
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            temp_path = f.name
        cv2.imwrite(temp_path, img_display)

        try:
            # State accumulated across streamed chunks.
            annotated = img_display.copy()
            extracted_text = ""
            current_region_text = ""

            # Stream character chunks from the OCR engine.
            for chunk in ocr_engine.extract_text_stream_chars(temp_path, mode=mode):

                # Handle region boundaries.
                if chunk.get("region_start"):
                    # Draw the bounding box for the newly started region.
                    if "box" in chunk:
                        x, y, w, h = chunk["box"]
                        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
                        cv2.putText(
                            annotated, str(chunk.get("region_number", "")), (x, y - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1
                        )

                    # Separate regions with a newline (none before the first).
                    # .get with default 1 avoids a KeyError if the engine
                    # omits "region_number" on a region_start chunk.
                    if chunk.get("region_number", 1) > 1:
                        extracted_text += "\n"

                # Append the newly decoded token, if any.
                token = chunk.get("token", "")
                if token:
                    extracted_text += token
                    current_region_text += token

                # Update the display on region boundaries and every few
                # characters to keep the UI responsive without flooding Gradio.
                if chunk.get("region_start") or chunk.get("region_finished") or len(current_region_text) % 3 == 0:
                    # Convert BGR back to RGB for Gradio.
                    yield cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB), extracted_text

            # Final update with the complete text.
            yield cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB), extracted_text
        finally:
            # Always remove the temp file, even if streaming raised midway —
            # the original cleanup ran only on the success path and leaked
            # a file per failed request.
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    except Exception as e:
        import traceback
        yield image, f"Error: {str(e)}\n{traceback.format_exc()}"
 
119
 
120
 
121
def recognize_line_stream(image):
    """
    Stream recognized text from a single-line text image.

    Args:
        image: Input image (PIL Image or numpy array) containing one line
            of text.

    Yields:
        str: The accumulated recognized text after each new token, or an
        error/usage message.
    """
    if image is None:
        yield "Please upload an image."
        return

    try:
        ocr_engine = get_ocr()

        # Write to a unique temp file: the original fixed name
        # "temp_line.png" in the working directory collides between
        # concurrent Gradio requests.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            path = f.name
        if isinstance(image, Image.Image):
            image.save(path)
        else:
            # numpy input arrives as RGB; OpenCV writes BGR.
            cv2.imwrite(path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

        try:
            extracted_text = ""
            for chunk in ocr_engine.recognize_streaming(path):
                token = chunk.get("token", "")
                if token:
                    extracted_text += token
                    yield extracted_text
        finally:
            # Remove the temp file even if recognition raised midway —
            # the original cleanup ran only on the success path.
            if os.path.exists(path):
                os.unlink(path)

    except Exception as e:
        yield f"Error: {str(e)}"
153
 
154
+ # Custom CSS
155
+ css = """
156
+ .container { max-width: 1200px; margin: auto; }
157
+ .output-text { font-family: monospace; }
158
+ """
159
 
160
  # Create Gradio interface
161
+ with gr.Blocks(title="Kiri OCR - Streaming Demo", css=css, theme=gr.themes.Soft()) as demo:
162
  gr.Markdown(
163
  """
164
+ # Kiri OCR Streaming Demo
165
 
166
+ **Real-time OCR for English and Khmer documents**
167
 
168
+ This demo showcases the **character-by-character streaming** capability of Kiri OCR, similar to how LLMs generate text.
169
  """
170
  )
171
 
172
  with gr.Tabs():
173
  # Document OCR Tab
174
+ with gr.TabItem("📄 Document Stream"):
175
+ gr.Markdown("Upload a document to see text appear in real-time as it's recognized.")
176
 
177
  with gr.Row():
178
  with gr.Column(scale=1):
179
  doc_input = gr.Image(
180
  label="Upload Document",
181
  type="pil",
182
+ sources=["upload", "clipboard", "webcam"]
183
  )
184
 
185
+ mode_select = gr.Radio(
186
+ choices=["lines", "words"],
187
+ value="lines",
188
+ label="Detection Mode"
189
+ )
 
 
 
 
 
190
 
191
+ doc_btn = gr.Button(" Stream Text", variant="primary")
192
 
193
  with gr.Column(scale=1):
194
+ # Annotated image updates in real-time
195
+ doc_output_img = gr.Image(label="Live Detection")
196
+ # Text updates character-by-character
197
  doc_output_text = gr.Textbox(
198
+ label="Streaming Text",
199
+ lines=15,
200
+ autoscroll=True,
201
+ elem_classes=["output-text"]
202
  )
203
 
 
 
 
204
  doc_btn.click(
205
+ fn=process_document_stream,
206
+ inputs=[doc_input, mode_select],
207
+ outputs=[doc_output_img, doc_output_text]
208
  )
209
 
210
  # Single Line OCR Tab
211
+ with gr.TabItem("✏️ Single Line Stream"):
212
+ gr.Markdown("Stream text recognition for a single cropped text line.")
213
 
214
  with gr.Row():
215
  with gr.Column(scale=1):
 
218
  type="pil",
219
  sources=["upload", "clipboard"]
220
  )
221
+ line_btn = gr.Button("⚡ Stream Recognize", variant="primary")
222
 
223
  with gr.Column(scale=1):
224
  line_output_text = gr.Textbox(
225
+ label="Streaming Output",
226
+ lines=3,
227
+ elem_classes=["output-text"]
228
  )
 
229
 
230
  line_btn.click(
231
+ fn=recognize_line_stream,
232
  inputs=line_input,
233
+ outputs=line_output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  )
235
 
236
  gr.Markdown(
237
  """
238
+ ### 🚀 Features
239
+ - **Real-time Feedback**: See boxes drawn and text generated instantly
240
+ - **LLM-style Streaming**: Characters appear one by one during decoding
241
+ - **Hybrid Architecture**: Uses Transformer + CTC + Attention for high accuracy
242
+
243
+ [GitHub Repository](https://github.com/mrrtmob/kiri-ocr) | [Hugging Face Model](https://huggingface.co/mrrtmob/kiri-ocr)
244
  """
245
  )
246
 
 
247
  # Launch
248
  if __name__ == "__main__":
249
+ demo.queue().launch()