Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
| 1 |
import atexit
|
| 2 |
import functools
|
| 3 |
-
import re
|
| 4 |
-
import cv2
|
| 5 |
-
import numpy as np
|
| 6 |
from queue import Queue
|
| 7 |
from threading import Event, Thread
|
|
|
|
| 8 |
from paddleocr import PaddleOCR, draw_ocr
|
| 9 |
from PIL import Image
|
| 10 |
import gradio as gr
|
| 11 |
|
| 12 |
-
|
| 13 |
LANG_CONFIG = {
|
| 14 |
"ch": {"num_workers": 2},
|
| 15 |
"en": {"num_workers": 2},
|
|
@@ -19,92 +17,8 @@ LANG_CONFIG = {
|
|
| 19 |
"japan": {"num_workers": 1},
|
| 20 |
}
|
| 21 |
CONCURRENCY_LIMIT = 8
|
| 22 |
-
MIN_CONFIDENCE = 0.6 # Threshold confidence
|
| 23 |
-
# ========================================================
|
| 24 |
-
|
| 25 |
-
# ****************** PREPROCESSING ********************
|
| 26 |
-
def preprocess_image(img_path):
    """Clean up an image before OCR and return the path of the processed copy.

    The pipeline is: grayscale -> non-local-means denoising -> 3x3 sharpening
    -> Gaussian adaptive thresholding. The result is written to a fresh
    temporary JPEG so that concurrent requests never overwrite each other's
    output (the previous fixed name ``temp_processed.jpg`` was shared by every
    caller).

    Args:
        img_path: Path of the image file to read.

    Returns:
        Path of the processed temporary image, or ``img_path`` unchanged when
        any step fails (best-effort fallback to the original image). The
        caller owns the temporary file and should delete it when done.
    """
    import contextlib
    import os
    import tempfile

    out_path = None
    try:
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise ValueError(f"cannot read image: {img_path}")

        # Convert to grayscale.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Non-local-means denoising.
        denoised = cv2.fastNlMeansDenoising(
            gray,
            h=15,  # denoising strength (tune as needed)
            templateWindowSize=7,
            searchWindowSize=21,
        )

        # Sharpen with a custom kernel.
        kernel = np.array([[-1, -1, -1],
                           [-1, 9, -1],
                           [-1, -1, -1]])
        sharpened = cv2.filter2D(denoised, -1, kernel)

        # Adaptive thresholding.
        thresholded = cv2.adaptiveThreshold(
            sharpened,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,  # block size
            2,  # constant subtracted from the weighted mean
        )

        # Save the processed image under a unique name.
        fd, out_path = tempfile.mkstemp(prefix="ocr_processed_", suffix=".jpg")
        os.close(fd)
        # cv2.imwrite reports failure through its boolean return value.
        if not cv2.imwrite(out_path, thresholded):
            raise OSError(f"cannot write image: {out_path}")
        return out_path

    except Exception as e:
        # Deliberate best-effort boundary: OCR proceeds on the original image.
        print(f"Error preprocessing: {e}")
        if out_path is not None:
            with contextlib.suppress(OSError):
                os.remove(out_path)
        return img_path  # fall back to the original image
|
| 65 |
-
|
| 66 |
-
# ****************** POST-PROCESSING ********************
|
| 67 |
-
# Regex pattern -> replacement, applied in insertion order by
# apply_post_ocr_corrections().
CORRECTION_RULES = {
    r"\bKEMENTERAN\b": "KEMENTERIAN",
    # No trailing \b: a word boundary never exists between "." and a space,
    # so the old pattern missed the common "KAR. <name>" form.
    r"\bKAR\.": "KAB.",
    # Same reasoning for the comma in "THN, <year>".
    r"\bTHN,": "TAHUN",
    r"RP\s*,\s*": "Rp",
    r"(\d+)\s*-\s*\(": r"\1 (",
    r"CV\.(\w)": r"CV. \1",  # space after "CV."
}

# "Rp", optional stray separators/whitespace, the amount, then a closing "-".
_CURRENCY_PATTERN = re.compile(r"Rp[\s.,]*(\d[\d.,]*)\s*-", re.IGNORECASE)


def format_currency(match):
    """Format a matched amount as ``Rp<dot-grouped thousands>,<2 decimals>``.

    Args:
        match: Regex match whose group 1 holds the raw amount, made of digits
            and ``.``/``,`` separators (for example ``87,640,000,``).

    Returns:
        The normalised amount, e.g. ``Rp87.640.000,00``. If group 1 contains
        no digits, the matched text is returned unchanged.
    """
    raw = match.group(1).strip(".,")
    decimals = "00"
    # A separator followed by exactly two trailing digits is a decimal part;
    # three trailing digits would be a thousands group.
    if len(raw) > 3 and raw[-3] in ".," and raw[-2:].isdigit():
        raw, decimals = raw[:-3], raw[-2:]
    digits = "".join(ch for ch in raw if ch.isdigit())
    if not digits:
        return match.group(0)
    grouped = f"{int(digits):,}".replace(",", ".")
    return f"Rp{grouped},{decimals}"


def apply_post_ocr_corrections(text):
    """Fix common OCR mistakes found in contract documents.

    Steps, in order:
      1. Normalise currency amounts, e.g. ``Rp, 87,640,000,-`` becomes
         ``Rp87.640.000,00``.
      2. Apply every substitution in ``CORRECTION_RULES``.
      3. Strip each line and put a blank line before numbered items
         (lines starting with digits followed by ``)`` or ``.``).

    Args:
        text: Raw OCR text, lines separated by ``\\n``.

    Returns:
        The corrected text.
    """
    text = _CURRENCY_PATTERN.sub(format_currency, text)

    # Apply the regex corrections.
    for pattern, replacement in CORRECTION_RULES.items():
        text = re.sub(pattern, replacement, text)

    # Format numbered lines.
    formatted_lines = []
    for line in text.split("\n"):
        stripped = line.strip()
        if re.match(r"^\d+[).]", stripped):
            formatted_lines.append(f"\n{stripped}")
        else:
            formatted_lines.append(stripped)

    return "\n".join(formatted_lines)
|
| 106 |
-
|
| 107 |
-
# ****************** MODEL & INFERENCE ********************
|
| 108 |
class PaddleOCRModelManager(object):
|
| 109 |
def __init__(self,
|
| 110 |
num_workers,
|
|
@@ -153,59 +67,42 @@ class PaddleOCRModelManager(object):
|
|
| 153 |
finally:
|
| 154 |
self._queue.task_done()
|
| 155 |
|
|
|
|
| 156 |
def create_model(lang):
    """Create a PaddleOCR engine for ``lang`` with the angle classifier on and GPU off.

    Args:
        lang: PaddleOCR language code (the keys of ``LANG_CONFIG`` are used).

    Returns:
        A new ``PaddleOCR`` instance.
    """
    return PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False)
|
|
|
|
| 158 |
|
| 159 |
model_managers = {}
|
| 160 |
for lang, config in LANG_CONFIG.items():
|
| 161 |
model_manager = PaddleOCRModelManager(config["num_workers"], functools.partial(create_model, lang=lang))
|
| 162 |
model_managers[lang] = model_manager
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
def inference(img, lang):
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
# 3. PROCESS RESULTS
|
| 174 |
-
image = Image.open(img).convert("RGB")
|
| 175 |
-
boxes = [line[0] for line in result]
|
| 176 |
-
txts = []
|
| 177 |
-
confidences = []
|
| 178 |
-
|
| 179 |
-
for line in result:
|
| 180 |
-
text = line[1][0]
|
| 181 |
-
confidence = line[1][1]
|
| 182 |
-
|
| 183 |
-
# Flag teks dengan confidence rendah
|
| 184 |
-
if confidence < MIN_CONFIDENCE:
|
| 185 |
-
txts.append(f"[? {text} ?]")
|
| 186 |
-
else:
|
| 187 |
-
txts.append(text)
|
| 188 |
-
confidences.append(confidence)
|
| 189 |
-
|
| 190 |
-
# 4. POST-PROCESSING
|
| 191 |
-
raw_text = "\n".join(txts)
|
| 192 |
-
cleaned_text = apply_post_ocr_corrections(raw_text)
|
| 193 |
-
|
| 194 |
-
# 5. DRAW OUTPUT
|
| 195 |
-
im_show = draw_ocr(
|
| 196 |
-
image,
|
| 197 |
-
boxes,
|
| 198 |
-
txts,
|
| 199 |
-
confidences,
|
| 200 |
-
font_path="./simfang.ttf"
|
| 201 |
-
)
|
| 202 |
-
|
| 203 |
-
return im_show, cleaned_text, f"Confidence Scores: {confidences}"
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
-
# ****************** GRADIO UI ********************
|
| 209 |
title = 'PaddleOCR'
|
| 210 |
description = '''
|
| 211 |
- Gradio demo for PaddleOCR. PaddleOCR demo supports Chinese, English, French, German, Korean and Japanese.
|
|
@@ -219,43 +116,21 @@ examples = [
|
|
| 219 |
['jp_example.jpg','japan'],
|
| 220 |
]
|
| 221 |
|
| 222 |
-
css = ""
|
| 223 |
-
.
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
with gr.Column():
|
| 241 |
-
img_output = gr.Image(type='pil', label='Hasil Anotasi')
|
| 242 |
-
text_output = gr.Textbox(
|
| 243 |
-
label="Teks Hasil OCR",
|
| 244 |
-
lines=15,
|
| 245 |
-
show_copy_button=True,
|
| 246 |
-
placeholder="Teks hasil OCR akan muncul di sini..."
|
| 247 |
-
)
|
| 248 |
-
debug_output = gr.Textbox(
|
| 249 |
-
label="Debug Info",
|
| 250 |
-
visible=False # Ubah ke True jika perlu debug
|
| 251 |
-
)
|
| 252 |
-
|
| 253 |
-
# Contoh dan handler
|
| 254 |
-
gr.Examples(examples, inputs=[img_input, lang_dropdown])
|
| 255 |
-
submit_btn.click(
|
| 256 |
-
fn=inference,
|
| 257 |
-
inputs=[img_input, lang_dropdown],
|
| 258 |
-
outputs=[img_output, text_output, debug_output]
|
| 259 |
-
)
|
| 260 |
-
|
| 261 |
-
demo.launch(debug=False)
|
|
|
|
| 1 |
import atexit
|
| 2 |
import functools
|
|
|
|
|
|
|
|
|
|
| 3 |
from queue import Queue
|
| 4 |
from threading import Event, Thread
|
| 5 |
+
|
| 6 |
from paddleocr import PaddleOCR, draw_ocr
|
| 7 |
from PIL import Image
|
| 8 |
import gradio as gr
|
| 9 |
|
| 10 |
+
|
| 11 |
LANG_CONFIG = {
|
| 12 |
"ch": {"num_workers": 2},
|
| 13 |
"en": {"num_workers": 2},
|
|
|
|
| 17 |
"japan": {"num_workers": 1},
|
| 18 |
}
|
| 19 |
CONCURRENCY_LIMIT = 8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
class PaddleOCRModelManager(object):
|
| 23 |
def __init__(self,
|
| 24 |
num_workers,
|
|
|
|
| 67 |
finally:
|
| 68 |
self._queue.task_done()
|
| 69 |
|
| 70 |
+
|
| 71 |
def create_model(lang):
    """Return a fresh PaddleOCR engine for ``lang`` (angle classifier on, GPU off)."""
    engine = PaddleOCR(use_gpu=False, use_angle_cls=True, lang=lang)
    return engine
|
| 73 |
+
|
| 74 |
|
| 75 |
model_managers = {}
|
| 76 |
for lang, config in LANG_CONFIG.items():
|
| 77 |
model_manager = PaddleOCRModelManager(config["num_workers"], functools.partial(create_model, lang=lang))
|
| 78 |
model_managers[lang] = model_manager
|
| 79 |
|
| 80 |
+
|
| 81 |
+
def close_model_managers():
    """Call ``close()`` on every manager registered in ``model_managers``."""
    for lang_code in model_managers:
        model_managers[lang_code].close()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# XXX: Not sure if gradio allows adding custom teardown logic
|
| 87 |
+
atexit.register(close_model_managers)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
def inference(img, lang):
    """Run OCR on an image and return the annotated image plus the plain text.

    Args:
        img: File path of the input image (the input component is declared
            with ``type='filepath'``). ``None`` means no image was provided.
        lang: Key into ``model_managers`` selecting the language model.

    Returns:
        A ``(image, text)`` tuple: the image with detections drawn by
        ``draw_ocr`` and the recognised lines joined with newlines. When
        nothing is recognised, the unannotated RGB image and an empty string
        are returned.

    Raises:
        gr.Error: If ``img`` is ``None``.
    """
    if img is None:
        raise gr.Error("Please provide an image.")

    ocr = model_managers[lang]
    # NOTE(review): assumes infer() forwards to PaddleOCR.ocr(), which yields
    # None for an image with no detected text -- confirm against the manager.
    result = ocr.infer(img, cls=True)[0] or []

    # Load the pixels and release the file handle straight away.
    with Image.open(img) as source:
        image = source.convert("RGB")

    if not result:
        return image, ""

    # Each entry is (box, (text, score)).
    boxes = [line[0] for line in result]
    txts = [line[1][0] for line in result]
    scores = [line[1][1] for line in result]
    im_show = draw_ocr(image, boxes, txts, scores, font_path="./simfang.ttf")

    # Join every recognised line so the text can be copied in one go.
    combined_text = "\n".join(txts)

    return im_show, combined_text
|
| 104 |
+
|
| 105 |
|
|
|
|
| 106 |
title = 'PaddleOCR'
|
| 107 |
description = '''
|
| 108 |
- Gradio demo for PaddleOCR. PaddleOCR demo supports Chinese, English, French, German, Korean and Japanese.
|
|
|
|
| 116 |
['jp_example.jpg','japan'],
|
| 117 |
]
|
| 118 |
|
| 119 |
+
css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"

# Inputs: the image (handed to inference() as a file path) and the language.
_input_components = [
    gr.Image(type='filepath', label='Input'),
    gr.Dropdown(choices=list(LANG_CONFIG), value='en', label='Language'),
]

# Two outputs: the annotated image and the extracted text.
_output_components = [
    gr.Image(type='pil', label='Annotated Image'),
    gr.Textbox(label="Extracted Text", lines=10, interactive=True),
]

gr.Interface(
    fn=inference,
    inputs=_input_components,
    outputs=_output_components,
    title=title,
    description=description,
    examples=examples,
    cache_examples=False,
    css=css,
    concurrency_limit=CONCURRENCY_LIMIT,
).launch(debug=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|