mrrtmob commited on
Commit
b16ee4a
·
verified ·
1 Parent(s): 5543d33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -202
app.py CHANGED
@@ -1,18 +1,20 @@
1
  """
2
  Kiri OCR - Gradio Demo for Hugging Face Spaces
3
 
4
- A lightweight OCR library for English and Khmer documents.
5
  """
6
  import gradio as gr
7
  import numpy as np
8
  from PIL import Image
9
  import cv2
 
 
10
 
11
-
12
- # Initialize OCR
13
  def load_ocr():
14
  """Load the OCR model."""
15
  from kiri_ocr import OCR
 
16
  return OCR(
17
  model_path="mrrtmob/kiri-ocr",
18
  det_method="db",
@@ -20,11 +22,9 @@ def load_ocr():
20
  verbose=False
21
  )
22
 
23
-
24
- # Global OCR instance (loaded once)
25
  ocr = None
26
 
27
-
28
  def get_ocr():
29
  """Get or create OCR instance."""
30
  global ocr
@@ -32,225 +32,184 @@ def get_ocr():
32
  ocr = load_ocr()
33
  return ocr
34
 
35
-
36
- def process_image(image, mode="lines", show_boxes=True):
37
  """
38
- Process an image and extract text.
39
 
40
  Args:
41
  image: Input image (PIL Image or numpy array)
42
  mode: Detection mode ('lines' or 'words')
43
- show_boxes: Whether to draw bounding boxes on the image
44
 
45
- Returns:
46
- Tuple of (annotated_image, extracted_text, detailed_results)
47
  """
48
  if image is None:
49
- return None, "Please upload an image.", ""
50
-
 
51
  try:
52
  ocr_engine = get_ocr()
53
 
54
- # Convert to numpy array if needed
 
55
  if isinstance(image, Image.Image):
56
  img_array = np.array(image)
57
  else:
58
  img_array = image
59
-
60
- # Ensure image is in correct format
61
  if len(img_array.shape) == 2:
62
- # Grayscale - convert to BGR for cv2
63
  img_display = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
64
  elif img_array.shape[2] == 4:
65
- # RGBA - convert to BGR
66
  img_display = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
67
  else:
68
- # RGB - convert to BGR
69
  img_display = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
70
-
71
- # Save temp file for processing
72
- import tempfile
73
  with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
74
  temp_path = f.name
75
-
76
  cv2.imwrite(temp_path, img_display)
77
 
78
- # Process document
79
- results = ocr_engine.process_document(temp_path, mode=mode, verbose=False)
80
-
81
- # Clean up temp file
82
- import os
83
- os.unlink(temp_path)
84
-
85
- if not results:
86
- return image, "No text detected in the image.", ""
87
-
88
- # Sort results by Y then X for reading order
89
- results.sort(key=lambda r: (r["box"][1], r["box"][0]))
90
-
91
- # Draw boxes on image if requested
92
  annotated = img_display.copy()
93
- if show_boxes:
94
- for i, r in enumerate(results):
95
- x, y, w, h = r["box"]
96
- # Draw box
97
- cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
98
- # Draw line number
99
- cv2.putText(
100
- annotated, f"{i+1}", (x, y - 5),
101
- cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1
102
- )
103
-
104
- # Convert back to RGB for display
105
- annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
106
-
107
- # Extract full text
108
- lines = []
109
- current_line = []
110
- prev_y = None
111
- prev_h = None
112
 
113
- for res in results:
114
- y, h = res["box"][1], res["box"][3]
115
- center_y = y + h / 2
116
 
117
- if prev_y is not None:
118
- prev_center = prev_y + prev_h / 2
119
- if abs(center_y - prev_center) < max(h, prev_h) / 2:
120
- current_line.append(res["text"])
121
- else:
122
- lines.append(" ".join(current_line))
123
- current_line = [res["text"]]
124
- else:
125
- current_line = [res["text"]]
 
 
 
 
 
 
 
126
 
127
- prev_y, prev_h = y, h
128
-
129
- if current_line:
130
- lines.append(" ".join(current_line))
131
-
132
- full_text = "\n".join(lines)
133
-
134
- # Format detailed results
135
- detailed = "### Detailed Results\n\n"
136
- detailed += "| # | Text | Confidence | Box (x,y,w,h) |\n"
137
- detailed += "|---|------|------------|---------------|\n"
138
- for i, r in enumerate(results, 1):
139
- text = r["text"][:50] + "..." if len(r["text"]) > 50 else r["text"]
140
- conf = f"{r['confidence']*100:.1f}%"
141
- box = f"({r['box'][0]}, {r['box'][1]}, {r['box'][2]}, {r['box'][3]})"
142
- detailed += f"| {i} | {text} | {conf} | {box} |\n"
143
 
144
- return annotated_rgb, full_text, detailed
 
145
 
146
  except Exception as e:
147
  import traceback
148
- error_msg = f"Error processing image: {str(e)}\n\n{traceback.format_exc()}"
149
- return image, error_msg, ""
150
 
151
 
152
- def recognize_single_line(image):
153
  """
154
- Recognize text from a single-line image (no detection).
155
-
156
- Args:
157
- image: Input image containing a single line of text
158
-
159
- Returns:
160
- Tuple of (text, confidence)
161
  """
162
  if image is None:
163
- return "Please upload an image.", ""
164
-
 
165
  try:
166
  ocr_engine = get_ocr()
167
 
168
- # Convert to numpy array
169
  if isinstance(image, Image.Image):
170
- img_array = np.array(image)
171
- else:
172
- img_array = image
173
-
174
- # Convert to grayscale
175
- if len(img_array.shape) == 3:
176
- img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
177
  else:
178
- img_gray = img_array
179
-
180
- # Invert if dark background
181
- if np.mean(img_gray) < 127:
182
- img_gray = 255 - img_gray
183
-
184
- # Preprocess and recognize
185
- from kiri_ocr.model import preprocess_pil
186
- img_pil = Image.fromarray(img_gray)
187
- img_tensor = preprocess_pil(ocr_engine.cfg, img_pil)
188
-
189
- text, confidence = ocr_engine.recognize_region(img_tensor)
190
-
191
- return text, f"Confidence: {confidence*100:.1f}%"
192
 
 
 
 
 
 
 
 
 
 
193
  except Exception as e:
194
- return f"Error: {str(e)}", ""
195
 
 
 
 
 
 
196
 
197
  # Create Gradio interface
198
- with gr.Blocks(title="Kiri OCR - Khmer & English OCR") as demo:
199
  gr.Markdown(
200
  """
201
- # 🔤 Kiri OCR
202
 
203
- **Lightweight OCR for English and Khmer documents**
204
 
205
- Upload an image containing text and get the extracted text. Supports both English and Khmer languages.
206
  """
207
  )
208
 
209
  with gr.Tabs():
210
  # Document OCR Tab
211
- with gr.TabItem("📄 Document OCR"):
212
- gr.Markdown("Upload a document image to extract text with automatic text line detection.")
213
 
214
  with gr.Row():
215
  with gr.Column(scale=1):
216
  doc_input = gr.Image(
217
  label="Upload Document",
218
  type="pil",
219
- sources=["upload", "clipboard"]
220
  )
221
 
222
- with gr.Row():
223
- mode_select = gr.Radio(
224
- choices=["lines", "words"],
225
- value="lines",
226
- label="Detection Mode"
227
- )
228
- show_boxes = gr.Checkbox(
229
- value=True,
230
- label="Show Bounding Boxes"
231
- )
232
 
233
- doc_btn = gr.Button("Extract Text", variant="primary")
234
 
235
  with gr.Column(scale=1):
236
- doc_output_img = gr.Image(label="Detected Regions")
 
 
237
  doc_output_text = gr.Textbox(
238
- label="Extracted Text",
239
- lines=10
 
 
240
  )
241
 
242
- with gr.Accordion("Detailed Results", open=False):
243
- doc_details = gr.Markdown()
244
-
245
  doc_btn.click(
246
- fn=process_image,
247
- inputs=[doc_input, mode_select, show_boxes],
248
- outputs=[doc_output_img, doc_output_text, doc_details]
249
  )
250
 
251
  # Single Line OCR Tab
252
- with gr.TabItem("✏️ Single Line OCR"):
253
- gr.Markdown("For single-line text images (cropped text lines). No detection needed.")
254
 
255
  with gr.Row():
256
  with gr.Column(scale=1):
@@ -259,71 +218,32 @@ with gr.Blocks(title="Kiri OCR - Khmer & English OCR") as demo:
259
  type="pil",
260
  sources=["upload", "clipboard"]
261
  )
262
- line_btn = gr.Button("Recognize Text", variant="primary")
263
 
264
  with gr.Column(scale=1):
265
  line_output_text = gr.Textbox(
266
- label="Recognized Text",
267
- lines=3
 
268
  )
269
- line_confidence = gr.Textbox(label="Confidence")
270
 
271
  line_btn.click(
272
- fn=recognize_single_line,
273
  inputs=line_input,
274
- outputs=[line_output_text, line_confidence]
275
- )
276
-
277
- # About Tab
278
- with gr.TabItem("ℹ️ About"):
279
- gr.Markdown(
280
- """
281
- ## About Kiri OCR
282
-
283
- Kiri OCR is a lightweight OCR library designed for **English** and **Khmer** documents.
284
-
285
- ### Features
286
- - 🚀 **Fast**: Optimized for quick text extraction
287
- - 🎯 **Accurate**: Transformer-based architecture with CTC + Attention decoder
288
- - 🌏 **Multilingual**: Supports English and Khmer text
289
- - 📦 **Lightweight**: Easy to deploy and use
290
-
291
- ### Technical Details
292
- - **Model Architecture**: CNN backbone + Transformer encoder + CTC/Attention decoder
293
- - **Text Detection**: DB (Differentiable Binarization) based detector
294
- - **Input Size**: 48×640 pixels (images are automatically resized)
295
-
296
- ### Links
297
- - 📚 [GitHub Repository](https://github.com/mrrtmob/kiri-ocr)
298
- - 🤗 [Model on Hugging Face](https://huggingface.co/mrrtmob/kiri-ocr)
299
- - 📖 [Documentation](https://github.com/mrrtmob/kiri-ocr#readme)
300
-
301
- ### Usage
302
-
303
- ```python
304
- from kiri_ocr import OCR
305
-
306
- # Initialize OCR
307
- ocr = OCR(model_path="mrrtmob/kiri-ocr")
308
-
309
- # Extract text from document
310
- text, results = ocr.extract_text("document.png")
311
- print(text)
312
- ```
313
-
314
- ### License
315
- Apache 2.0
316
- """
317
  )
318
 
319
  gr.Markdown(
320
  """
321
- ---
322
- Made with ❤️ by [Kiri OCR Team](https://github.com/mrrtmob/kiri-ocr)
 
 
 
 
323
  """
324
  )
325
 
326
-
327
  # Launch
328
  if __name__ == "__main__":
329
- demo.launch()
 
1
  """
2
  Kiri OCR - Gradio Demo for Hugging Face Spaces
3
 
4
+ A lightweight OCR library for English and Khmer documents with streaming output support.
5
  """
6
  import gradio as gr
7
  import numpy as np
8
  from PIL import Image
9
  import cv2
10
+ import tempfile
11
+ import os
12
 
13
+ # Initialize OCR (lazy load)
 
14
  def load_ocr():
15
  """Load the OCR model."""
16
  from kiri_ocr import OCR
17
+ print("Loading OCR model...")
18
  return OCR(
19
  model_path="mrrtmob/kiri-ocr",
20
  det_method="db",
 
22
  verbose=False
23
  )
24
 
25
+ # Global OCR instance
 
26
  ocr = None
27
 
 
28
  def get_ocr():
29
  """Get or create OCR instance."""
30
  global ocr
 
32
  ocr = load_ocr()
33
  return ocr
34
 
35
def process_document_stream(image, mode="lines"):
    """
    Process a document image with real-time character streaming.

    Args:
        image: Input image (PIL Image or numpy array)
        mode: Detection mode ('lines' or 'words')

    Yields:
        Tuple of (annotated_image, extracted_text) — the annotated image is
        RGB for Gradio display; text grows as tokens stream in.
    """
    if image is None:
        yield None, "Please upload an image."
        return

    try:
        ocr_engine = get_ocr()

        # Convert PIL to a numpy array if needed; the OCR API takes a file
        # path, so the image is written to a temp file below.
        if isinstance(image, Image.Image):
            img_array = np.array(image)
        else:
            img_array = image

        # Normalize channel layout to BGR for OpenCV.
        if len(img_array.shape) == 2:
            img_display = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
        elif img_array.shape[2] == 4:
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
        else:
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            temp_path = f.name
        cv2.imwrite(temp_path, img_display)

        try:
            # State accumulated across streamed chunks.
            annotated = img_display.copy()
            extracted_text = ""
            current_region_text = ""

            # Stream character chunks from the OCR engine.
            for chunk in ocr_engine.extract_text_stream_chars(temp_path, mode=mode):

                # Handle region boundaries.
                if chunk.get("region_start"):
                    # Draw the bounding box for the newly started region.
                    if "box" in chunk:
                        x, y, w, h = chunk["box"]
                        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
                        cv2.putText(
                            annotated, str(chunk.get("region_number", "")), (x, y - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1
                        )

                    # Separate regions with a newline (none before the first).
                    # .get with default 1 avoids a KeyError if the engine
                    # omits "region_number" on a region_start chunk.
                    if chunk.get("region_number", 1) > 1:
                        extracted_text += "\n"

                # Append the newly decoded token, if any.
                token = chunk.get("token", "")
                if token:
                    extracted_text += token
                    current_region_text += token

                # Update the display on region boundaries and every few
                # characters to keep the UI responsive without flooding Gradio.
                if chunk.get("region_start") or chunk.get("region_finished") or len(current_region_text) % 3 == 0:
                    # Convert BGR back to RGB for Gradio.
                    yield cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB), extracted_text

            # Final update with the complete text.
            yield cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB), extracted_text
        finally:
            # Always remove the temp file, even if streaming raised midway —
            # the original cleanup ran only on the success path and leaked
            # a file per failed request.
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    except Exception as e:
        import traceback
        yield image, f"Error: {str(e)}\n{traceback.format_exc()}"
 
119
 
120
 
121
def recognize_line_stream(image):
    """
    Stream recognized text from a single-line text image.

    Args:
        image: Input image (PIL Image or numpy array) containing one line
            of text.

    Yields:
        str: The accumulated recognized text after each new token, or an
        error/usage message.
    """
    if image is None:
        yield "Please upload an image."
        return

    try:
        ocr_engine = get_ocr()

        # Write to a unique temp file: the original fixed name
        # "temp_line.png" in the working directory collides between
        # concurrent Gradio requests.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            path = f.name
        if isinstance(image, Image.Image):
            image.save(path)
        else:
            # numpy input arrives as RGB; OpenCV writes BGR.
            cv2.imwrite(path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

        try:
            extracted_text = ""
            for chunk in ocr_engine.recognize_streaming(path):
                token = chunk.get("token", "")
                if token:
                    extracted_text += token
                    yield extracted_text
        finally:
            # Remove the temp file even if recognition raised midway —
            # the original cleanup ran only on the success path.
            if os.path.exists(path):
                os.unlink(path)

    except Exception as e:
        yield f"Error: {str(e)}"
153
 
154
+ # Custom CSS
155
+ css = """
156
+ .container { max-width: 1200px; margin: auto; }
157
+ .output-text { font-family: monospace; }
158
+ """
159
 
160
  # Create Gradio interface
161
+ with gr.Blocks(title="Kiri OCR - Streaming Demo", css=css, theme=gr.themes.Soft()) as demo:
162
  gr.Markdown(
163
  """
164
+ # Kiri OCR Streaming Demo
165
 
166
+ **Real-time OCR for English and Khmer documents**
167
 
168
+ This demo showcases the **character-by-character streaming** capability of Kiri OCR, similar to how LLMs generate text.
169
  """
170
  )
171
 
172
  with gr.Tabs():
173
  # Document OCR Tab
174
+ with gr.TabItem("📄 Document Stream"):
175
+ gr.Markdown("Upload a document to see text appear in real-time as it's recognized.")
176
 
177
  with gr.Row():
178
  with gr.Column(scale=1):
179
  doc_input = gr.Image(
180
  label="Upload Document",
181
  type="pil",
182
+ sources=["upload", "clipboard", "webcam"]
183
  )
184
 
185
+ mode_select = gr.Radio(
186
+ choices=["lines", "words"],
187
+ value="lines",
188
+ label="Detection Mode"
189
+ )
 
 
 
 
 
190
 
191
+ doc_btn = gr.Button(" Stream Text", variant="primary")
192
 
193
  with gr.Column(scale=1):
194
+ # Annotated image updates in real-time
195
+ doc_output_img = gr.Image(label="Live Detection")
196
+ # Text updates character-by-character
197
  doc_output_text = gr.Textbox(
198
+ label="Streaming Text",
199
+ lines=15,
200
+ autoscroll=True,
201
+ elem_classes=["output-text"]
202
  )
203
 
 
 
 
204
  doc_btn.click(
205
+ fn=process_document_stream,
206
+ inputs=[doc_input, mode_select],
207
+ outputs=[doc_output_img, doc_output_text]
208
  )
209
 
210
  # Single Line OCR Tab
211
+ with gr.TabItem("✏️ Single Line Stream"):
212
+ gr.Markdown("Stream text recognition for a single cropped text line.")
213
 
214
  with gr.Row():
215
  with gr.Column(scale=1):
 
218
  type="pil",
219
  sources=["upload", "clipboard"]
220
  )
221
+ line_btn = gr.Button("⚡ Stream Recognize", variant="primary")
222
 
223
  with gr.Column(scale=1):
224
  line_output_text = gr.Textbox(
225
+ label="Streaming Output",
226
+ lines=3,
227
+ elem_classes=["output-text"]
228
  )
 
229
 
230
  line_btn.click(
231
+ fn=recognize_line_stream,
232
  inputs=line_input,
233
+ outputs=line_output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  )
235
 
236
  gr.Markdown(
237
  """
238
+ ### 🚀 Features
239
+ - **Real-time Feedback**: See boxes drawn and text generated instantly
240
+ - **LLM-style Streaming**: Characters appear one by one during decoding
241
+ - **Hybrid Architecture**: Uses Transformer + CTC + Attention for high accuracy
242
+
243
+ [GitHub Repository](https://github.com/mrrtmob/kiri-ocr) | [Hugging Face Model](https://huggingface.co/mrrtmob/kiri-ocr)
244
  """
245
  )
246
 
 
247
  # Launch
248
  if __name__ == "__main__":
249
+ demo.queue().launch()