mrrtmob committed on
Commit
2f36e07
·
verified Β·
1 Parent(s): 5789876

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -50
app.py CHANGED
@@ -1,67 +1,357 @@
 
 
 
 
 
1
  import gradio as gr
2
- from kiri_ocr import OCR
3
- from PIL import Image, ImageDraw
4
  import numpy as np
5
- import os
 
 
6
 
7
  # Initialize OCR
8
- try:
9
- print("Loading Kiri OCR model...")
10
- # Use verbose=True to see what's happening
11
- ocr = OCR(verbose=True)
12
- print("Model loaded successfully")
13
- except Exception as e:
14
- print(f"Error loading model: {e}")
15
- ocr = None
16
-
17
- def process_image(image_path):
 
 
 
 
 
 
 
 
18
  if ocr is None:
19
- return None, "Error: OCR model failed to load."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- if image_path is None:
22
- return None, "Please upload an image."
23
-
24
  try:
25
- print(f"Processing image: {image_path}")
26
- # extract_text returns (text, results)
27
- text, results = ocr.extract_text(image_path, verbose=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- print(f"Extracted {len(results)} regions.")
30
 
31
- # Open image for drawing
32
- img = Image.open(image_path)
33
- if img.mode != 'RGB':
34
- img = img.convert('RGB')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- draw = ImageDraw.Draw(img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Draw boxes
39
- for item in results:
40
- if 'box' in item:
41
- x, y, w, h = item['box']
42
- # Ensure coordinates are ints
43
- x, y, w, h = int(x), int(y), int(w), int(h)
44
- draw.rectangle([x, y, x + w, y + h], outline="red", width=3)
 
 
45
 
46
- return np.array(img), text
47
 
48
  except Exception as e:
49
  import traceback
50
- traceback.print_exc()
51
- return None, f"Error during extraction: {str(e)}"
52
-
53
- # Build the interface
54
- demo = gr.Interface(
55
- fn=process_image,
56
- inputs=gr.Image(type="filepath", label="Upload Image"),
57
- outputs=[
58
- gr.Image(label="Detected Text Regions"),
59
- gr.Textbox(label="Extracted Text", lines=10)
60
- ],
61
- title="Kiri OCR Demo",
62
- description="Upload an image to extract English and Khmer text. Detected regions are highlighted in red.",
63
- examples=[]
64
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
 
66
  if __name__ == "__main__":
67
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
1
+ """
2
+ Kiri OCR - Gradio Demo for Hugging Face Spaces
3
+
4
+ A lightweight OCR library for English and Khmer documents.
5
+ """
6
  import gradio as gr
 
 
7
  import numpy as np
8
+ from PIL import Image
9
+ import cv2
10
+
11
 
12
  # Initialize OCR
13
def load_ocr():
    """Create and return the Kiri OCR engine (CPU-only, DB-based text detection)."""
    from kiri_ocr import OCR  # deferred so importing this module stays cheap

    engine_options = {
        "model_path": "mrrtmob/kiri-ocr",
        "det_method": "db",
        "device": "cpu",
        "verbose": False,
    }
    return OCR(**engine_options)
22
+
23
+
24
# Shared OCR engine; created lazily on first request so startup stays fast
ocr = None


def get_ocr():
    """Return the module-wide OCR engine, loading it on first use."""
    global ocr
    if ocr is not None:
        return ocr
    ocr = load_ocr()
    return ocr
34
+
35
+
36
def process_image(image, mode="lines", show_boxes=True):
    """
    Process an image and extract text.

    Args:
        image: Input image (PIL Image or numpy array), or None.
        mode: Detection mode ('lines' or 'words').
        show_boxes: Whether to draw bounding boxes on the image.

    Returns:
        Tuple of (annotated_image, extracted_text, detailed_results).
        On error the original image is returned with the traceback as text.
    """
    if image is None:
        return None, "Please upload an image.", ""

    try:
        ocr_engine = get_ocr()

        # Convert to numpy array if needed
        if isinstance(image, Image.Image):
            img_array = np.array(image)
        else:
            img_array = image

        # Normalize to 3-channel BGR so cv2 can draw on and save the image
        if len(img_array.shape) == 2:
            # Grayscale -> BGR
            img_display = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR)
        elif img_array.shape[2] == 4:
            # RGBA -> BGR
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGBA2BGR)
        else:
            # RGB -> BGR
            img_display = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

        # The OCR engine takes a file path, so round-trip through a temp file.
        import os
        import tempfile
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
            temp_path = f.name
        try:
            cv2.imwrite(temp_path, img_display)
            results = ocr_engine.process_document(temp_path, mode=mode, verbose=False)
        finally:
            # Always remove the temp file — the previous version leaked it
            # whenever process_document raised.
            os.unlink(temp_path)

        if not results:
            return image, "No text detected in the image.", ""

        # Sort results into reading order: top-to-bottom, then left-to-right
        results.sort(key=lambda r: (r["box"][1], r["box"][0]))

        # Draw boxes on a copy of the display image if requested
        annotated = img_display.copy()
        if show_boxes:
            for i, r in enumerate(results):
                # Coerce to plain ints: cv2 drawing rejects float coordinates
                x, y, w, h = map(int, r["box"])
                cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.putText(
                    annotated, f"{i+1}", (x, y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1
                )

        # Convert back to RGB for Gradio display
        annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

        # Merge regions whose vertical centers overlap into single text lines
        lines = []
        current_line = []
        prev_y = None
        prev_h = None

        for res in results:
            y, h = res["box"][1], res["box"][3]
            center_y = y + h / 2

            if prev_y is not None:
                prev_center = prev_y + prev_h / 2
                # Same line when the centers are closer than half the taller box
                if abs(center_y - prev_center) < max(h, prev_h) / 2:
                    current_line.append(res["text"])
                else:
                    lines.append(" ".join(current_line))
                    current_line = [res["text"]]
            else:
                current_line = [res["text"]]

            prev_y, prev_h = y, h

        if current_line:
            lines.append(" ".join(current_line))

        full_text = "\n".join(lines)

        # Per-region details as a markdown table
        detailed = "### Detailed Results\n\n"
        detailed += "| # | Text | Confidence | Box (x,y,w,h) |\n"
        detailed += "|---|------|------------|---------------|\n"
        for i, r in enumerate(results, 1):
            text = r["text"][:50] + "..." if len(r["text"]) > 50 else r["text"]
            conf = f"{r['confidence']*100:.1f}%"
            box = f"({r['box'][0]}, {r['box'][1]}, {r['box'][2]}, {r['box'][3]})"
            detailed += f"| {i} | {text} | {conf} | {box} |\n"

        return annotated_rgb, full_text, detailed

    except Exception as e:
        import traceback
        error_msg = f"Error processing image: {str(e)}\n\n{traceback.format_exc()}"
        return image, error_msg, ""
150
+
151
+
152
def recognize_single_line(image):
    """
    Recognize text from a single-line image (no detection).

    Args:
        image: Input image containing a single line of text, or None.

    Returns:
        Tuple of (text, confidence_string). On failure returns
        an "Error: ..." string and an empty confidence string.
    """
    if image is None:
        return "Please upload an image.", ""

    try:
        ocr_engine = get_ocr()

        # Convert to numpy array
        if isinstance(image, Image.Image):
            img_array = np.array(image)
        else:
            img_array = image

        # Convert to grayscale. Handle 4-channel RGBA uploads explicitly:
        # COLOR_RGB2GRAY requires exactly 3 channels and raised on RGBA before.
        if len(img_array.shape) == 3:
            if img_array.shape[2] == 4:
                img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGBA2GRAY)
            else:
                img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            img_gray = img_array

        # Invert predominantly dark images — presumably the recognizer expects
        # dark text on a light background (TODO confirm against kiri_ocr docs)
        if np.mean(img_gray) < 127:
            img_gray = 255 - img_gray

        # Preprocess to the model's input tensor and recognize
        from kiri_ocr.model import preprocess_pil
        img_pil = Image.fromarray(img_gray)
        img_tensor = preprocess_pil(ocr_engine.cfg, img_pil)

        text, confidence = ocr_engine.recognize_region(img_tensor)

        return text, f"Confidence: {confidence*100:.1f}%"

    except Exception as e:
        return f"Error: {str(e)}", ""
195
+
196
+
197
# Custom CSS
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.output-text {
    font-size: 16px;
    line-height: 1.6;
}
footer {
    visibility: hidden;
}
"""

# Create Gradio interface
with gr.Blocks(css=css, title="Kiri OCR - Khmer & English OCR") as demo:
    gr.Markdown(
        """
        # 🔤 Kiri OCR

        **Lightweight OCR for English and Khmer documents**

        Upload an image containing text and get the extracted text. Supports both English and Khmer languages.
        """
    )

    with gr.Tabs():
        # Document OCR Tab
        with gr.TabItem("📄 Document OCR"):
            gr.Markdown("Upload a document image to extract text with automatic text line detection.")

            with gr.Row():
                with gr.Column(scale=1):
                    doc_input = gr.Image(
                        label="Upload Document",
                        type="pil",
                        sources=["upload", "clipboard"]
                    )

                    with gr.Row():
                        mode_select = gr.Radio(
                            choices=["lines", "words"],
                            value="lines",
                            label="Detection Mode"
                        )
                        show_boxes = gr.Checkbox(
                            value=True,
                            label="Show Bounding Boxes"
                        )

                    doc_btn = gr.Button("Extract Text", variant="primary")

                with gr.Column(scale=1):
                    doc_output_img = gr.Image(label="Detected Regions")
                    doc_output_text = gr.Textbox(
                        label="Extracted Text",
                        lines=10,
                        show_copy_button=True
                    )

                    with gr.Accordion("Detailed Results", open=False):
                        doc_details = gr.Markdown()

            doc_btn.click(
                fn=process_image,
                inputs=[doc_input, mode_select, show_boxes],
                outputs=[doc_output_img, doc_output_text, doc_details]
            )

            # NOTE(review): the previous version built a gr.Examples(...) block
            # guarded by `... if False else None`, which never executes (the
            # conditional short-circuits before gr.Examples is called). Removed
            # as dead code — re-add a plain gr.Examples(...) call here once
            # real sample images exist under assets/.

        # Single Line OCR Tab
        with gr.TabItem("✏️ Single Line OCR"):
            gr.Markdown("For single-line text images (cropped text lines). No detection needed.")

            with gr.Row():
                with gr.Column(scale=1):
                    line_input = gr.Image(
                        label="Upload Text Line",
                        type="pil",
                        sources=["upload", "clipboard"]
                    )
                    line_btn = gr.Button("Recognize Text", variant="primary")

                with gr.Column(scale=1):
                    line_output_text = gr.Textbox(
                        label="Recognized Text",
                        lines=3,
                        show_copy_button=True
                    )
                    line_confidence = gr.Textbox(label="Confidence")

            line_btn.click(
                fn=recognize_single_line,
                inputs=line_input,
                outputs=[line_output_text, line_confidence]
            )

        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(
                """
                ## About Kiri OCR

                Kiri OCR is a lightweight OCR library designed for **English** and **Khmer** documents.

                ### Features
                - 🚀 **Fast**: Optimized for quick text extraction
                - 🎯 **Accurate**: Transformer-based architecture with CTC + Attention decoder
                - 🌏 **Multilingual**: Supports English and Khmer text
                - 📦 **Lightweight**: Easy to deploy and use

                ### Technical Details
                - **Model Architecture**: CNN backbone + Transformer encoder + CTC/Attention decoder
                - **Text Detection**: DB (Differentiable Binarization) based detector
                - **Input Size**: 48×640 pixels (images are automatically resized)

                ### Links
                - 📚 [GitHub Repository](https://github.com/mrrtmob/kiri-ocr)
                - 🤗 [Model on Hugging Face](https://huggingface.co/mrrtmob/kiri-ocr)
                - 📖 [Documentation](https://github.com/mrrtmob/kiri-ocr#readme)

                ### Usage

                ```python
                from kiri_ocr import OCR

                # Initialize OCR
                ocr = OCR(model_path="mrrtmob/kiri-ocr")

                # Extract text from document
                text, results = ocr.extract_text("document.png")
                print(text)
                ```

                ### License
                Apache 2.0
                """
            )

    gr.Markdown(
        """
        ---
        Made with ❤️ by [Kiri OCR Team](https://github.com/mrrtmob/kiri-ocr)
        """
    )


# Launch
if __name__ == "__main__":
    demo.launch()