Spaces:

ranbac
/

PaddleOCR

Running

App Files Files Community

ranbac committed 23 days ago

Commit

9f6817b

verified ·

1 Parent(s): 819108a

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -213

app.py CHANGED Viewed

@@ -1,35 +1,34 @@
 import os
-import base64
-import io
-import json
 import logging
 import re
-import cv2
-import numpy as np
-import requests
-from PIL import Image, ImageDraw, ImageFont
 import gradio as gr
 from paddleocr import PaddleOCR
-# --- PHẦN 1: CẤU HÌNH & KHỞI TẠO PADDLEOCR (LOCAL ENGINE) ---
-os.environ["FLAGS_use_mkldnn"] = "0"
-os.environ["FLAGS_enable_mkldnn"] = "0"
-os.environ["CPP_MIN_LOG_LEVEL"] = "3"
 logging.getLogger("ppocr").setLevel(logging.WARNING)
-print("🚀 Đang khởi tạo PaddleOCR Local...")
 try:
-    # Cấu hình OCR Local
-    ocr = PaddleOCR(use_textline_orientation=True,
-                    use_doc_orientation_classify=False,
-                    use_doc_unwarping=False,
-                    lang='ch') # Có thể đổi sang 'en' hoặc 'vi'
 except Exception as e:
-    print(f"⚠️ Lỗi khởi tạo nâng cao: {e}. Dùng chế độ mặc định.")
     ocr = PaddleOCR(lang='ch')
-print("✅ Model đã sẵn sàng!")
-# Tải Font để vẽ chữ (Từ Phần 1)
 def check_and_download_font():
     font_path = "./simfang.ttf"
     if not os.path.exists(font_path):
@@ -44,19 +43,15 @@ def check_and_download_font():
 FONT_PATH = check_and_download_font()
-# --- HELPER FUNCTIONS (HỖ TRỢ XỬ LÝ ẢNH & TEXT) ---
-def pil_to_base64_html(image):
-    """Chuyển đổi PIL Image thành thẻ HTML <img> base64"""
-    buffered = io.BytesIO()
-    image.save(buffered, format="JPEG")
-    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    return f'<img src="data:image/jpeg;base64,{img_str}" alt="Result" style="width:100%; object-fit:contain;">'
 def universal_draw(image, raw_data, font_path):
-    """Hàm vẽ box lên ảnh (Từ Phần 1)"""
     if image is None: return image
-    if isinstance(image, np.ndarray): image = Image.fromarray(image)
     canvas = image.copy()
     draw = ImageDraw.Draw(canvas)
@@ -66,198 +61,161 @@ def universal_draw(image, raw_data, font_path):
     except:
         font = ImageFont.load_default()
-    boxes = [line[0] for line in raw_data[0]] if raw_data and raw_data[0] else []
-    txts = [line[1][0] for line in raw_data[0]] if raw_data and raw_data[0] else []
-    scores = [line[1][1] for line in raw_data[0]] if raw_data and raw_data[0] else []
-    for box, txt in zip(boxes, txts):
-        box = [tuple(p) for p in box]
-        draw.polygon(box, outline="red", width=3)
-        # Vẽ nền chữ
-        if hasattr(draw, "textbbox"):
-            text_bbox = draw.textbbox(box[0], txt, font=font, anchor="lb")
-            draw.rectangle(text_bbox, fill="red")
-            draw.text(box[0], txt, fill="white", font=font, anchor="lb")
-        else:
-            draw.text((box[0][0], box[0][1] - font_size), txt, fill="white", font=font)
-    return canvas
-# --- HÀM XỬ LÝ CHÍNH (LOGIC CẦU NỐI) ---
-# Hàm này nhận input từ UI Phần 2, chạy Logic Phần 1, trả về format UI Phần 2
-def local_inference(image_path, mode="Document"):
-    if not image_path:
-        return "Please upload an image.", "", ""
-    try:
-        # 1. Đọc ảnh
-        img = Image.open(image_path).convert("RGB")
-        img_np = np.array(img)
-        # 2. Chạy PaddleOCR (Local)
-        # Lưu ý: Model Local cơ bản không hỗ trợ tách bảng/công thức chuyên sâu như API
-        # nhưng ta vẫn chạy OCR để lấy text.
-        result = ocr.ocr(img_np, cls=True)
-        # 3. Xử lý kết quả để hiển thị
-        if not result or result[0] is None:
-            return "No text found.", "<p>No text detected</p>", "[]"
-        # Tạo ảnh visualization (Vẽ box)
-        annotated_img = universal_draw(img, result, FONT_PATH)
-        html_vis = pil_to_base64_html(annotated_img)
-        # Tạo Markdown Output
-        # Gom nhóm text lại thành đoạn văn
-        texts = [line[1][0] for line in result[0]]
-        if mode == "Formula":
-            md_text = "### Recognized Formula (Raw Text):\n\n" + " ".join(texts)
-            md_text += "\n\n*(Note: Local generic OCR model cannot convert to LaTeX math syntax)*"
-        elif mode == "Table":
-            md_text = "### Recognized Table Content:\n\n" + "\n".join(texts)
-            md_text += "\n\n*(Note: Local generic OCR model does not reconstruct HTML structure)*"
-        else: # Document / Generic
-            md_text = "### Document Content:\n\n" + "\n".join(texts)
-        # Raw Data (JSON string để debug)
-        raw_json = json.dumps(result[0], ensure_ascii=False, indent=2)
-        return md_text, html_vis, raw_json
     except Exception as e:
         import traceback
-        err = traceback.format_exc()
-        return f"Error: {str(e)}", f"<p style='color:red'>{str(e)}</p>", err
-# Wrapper cho các Tab khác nhau
-def run_doc_parsing(file, *args):
-    return local_inference(file, mode="Document")
-def run_element_recognition(file, prompt_label, *args):
-    # prompt_label: "Formula Recognition", "Table Recognition", etc.
-    mode = prompt_label.split()[0] # Lấy từ đầu tiên (Formula/Table...)
-    return local_inference(file, mode=mode)
-def run_spotting(file, *args):
-    # Spotting giả lập: Trả về bounding boxes của text dưới dạng JSON
-    if not file: return "", "{}"
-    img = Image.open(file).convert("RGB")
-    result = ocr.ocr(np.array(img), cls=True)
-    if not result or result[0] is None:
-        return "<p>No objects found</p>", "[]"
-    annotated_img = universal_draw(img, result, FONT_PATH)
-    html_vis = pil_to_base64_html(annotated_img)
-    # Format lại JSON cho giống spotting
-    spotting_res = []
-    for line in result[0]:
-        spotting_res.append({
-            "label": "text_block",
-            "text": line[1][0],
-            "confidence": line[1][1],
-            "box": line[0]
-        })
-    return html_vis, json.dumps(spotting_res, ensure_ascii=False, indent=2)
-# --- PHẦN 2: GIAO DIỆN (UI TỪ FILE 2) ---
-custom_css = """
-body, .gradio-container { font-family: "Noto Sans SC", sans-serif; }
-.app-header { text-align: center; margin-bottom: 20px; }
-.prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
-.prompt-grid button { height: 40px !important; }
-.notice { background: #f0f9ff; padding: 10px; border-radius: 8px; border: 1px solid #bae6fd; font-size: 14px; margin-bottom: 10px;}
-"""
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-    gr.HTML("""
-    <div class="app-header">
-        <h1>PaddleOCR Local - Pro Interface</h1>
-        <p>Giao diện nâng cao chạy trên Backend Local (CPU)</p>
-    </div>
-    <div class="notice">
-        <strong>Lưu ý:</strong> Đây là phiên bản chạy model Local.
-        Các tính năng như <em>Formula to Latex</em>, <em>Table to HTML</em> hay <em>Layout Analysis</em>
-        chỉ trả về văn bản thô (Raw OCR) do giới hạn của model cài đặt cục bộ.
-    </div>
-    """)
-    with gr.Tabs():
-        # ===================== Tab 1: Document Parsing =====================
-        with gr.Tab("Document Parsing"):
-            with gr.Row():
-                with gr.Column(scale=5):
-                    file_doc = gr.File(label="Upload Image", type="filepath", file_types=["image"])
-                    btn_parse = gr.Button("Parse Document", variant="primary")
-                    # Các tùy chọn checkbox (Dummy - vì local model config đơn giản)
-                    with gr.Row():
-                        gr.Checkbox(label="Chart parsing (N/A)", value=False, interactive=False)
-                        gr.Checkbox(label="Doc unwarping (N/A)", value=False, interactive=False)
-                with gr.Column(scale=7):
-                    with gr.Tabs():
-                        with gr.Tab("Markdown Preview"):
-                            md_preview_doc = gr.Markdown()
-                        with gr.Tab("Visualization"):
-                            vis_image_doc = gr.HTML()
-                        with gr.Tab("Raw Data"):
-                            raw_doc = gr.Code(language="json")
-            btn_parse.click(run_doc_parsing, inputs=[file_doc], outputs=[md_preview_doc, vis_image_doc, raw_doc])
-        # ===================== Tab 2: Element-level Recognition =====================
-        with gr.Tab("Element-level Recognition"):
-            with gr.Row():
-                with gr.Column(scale=5):
-                    file_vl = gr.File(label="Upload Image", type="filepath", file_types=["image"])
-                    gr.Markdown("_(Chế độ này tối ưu cho từng thành phần riêng lẻ)_")
-                    with gr.Row(elem_classes=["prompt-grid"]):
-                        btn_ocr = gr.Button("Text Recognition", variant="secondary")
-                        btn_formula = gr.Button("Formula Recognition", variant="secondary")
-                    with gr.Row(elem_classes=["prompt-grid"]):
-                        btn_table = gr.Button("Table Recognition", variant="secondary")
-                        btn_seal = gr.Button("Seal Recognition", variant="secondary")
-                with gr.Column(scale=7):
-                    with gr.Tabs():
-                        with gr.Tab("Result"):
-                            md_preview_vl = gr.Markdown()
-                        with gr.Tab("Visualization"):
-                             vis_image_vl = gr.HTML()
-                        with gr.Tab("Raw Output"):
-                            md_raw_vl = gr.Code(language="json")
-            # Gán sự kiện cho các nút
-            for btn, label in [(btn_ocr, "Text"), (btn_formula, "Formula"), (btn_table, "Table"), (btn_seal, "Seal")]:
-                btn.click(
-                    fn=run_element_recognition,
-                    inputs=[file_vl, gr.State(label)],
-                    outputs=[md_preview_vl, vis_image_vl, md_raw_vl]
-                )
-        # ===================== Tab 3: Spotting =====================
-        with gr.Tab("Spotting"):
-            with gr.Row():
-                with gr.Column(scale=5):
-                    file_spot = gr.File(label="Upload Image", type="filepath", file_types=["image"])
-                    btn_run_spot = gr.Button("Run Spotting", variant="primary")
-                    gr.Markdown("_(Phát hiện vị trí văn bản)_")
-                with gr.Column(scale=7):
-                    with gr.Tabs():
-                        with gr.Tab("Visualization"):
-                            vis_image_spot = gr.HTML()
-                        with gr.Tab("JSON Result"):
-                            json_spot = gr.Code(language="json")
-            btn_run_spot.click(run_spotting, inputs=[file_spot], outputs=[vis_image_spot, json_spot])
 if __name__ == "__main__":
-    demo.queue().launch(server_name="0.0.0.0", server_port=7860,
-        ssr_mode=False)

 import os
+# --- CẤU HÌNH HỆ THỐNG ---
+os.environ["FLAGS_use_mkldnn"] = "0"
+os.environ["FLAGS_enable_mkldnn"] = "0"
+os.environ["DN_ENABLE_MKLDNN"] = "0"
+os.environ["CPP_MIN_LOG_LEVEL"] = "3"
 import logging
 import re
 import gradio as gr
 from paddleocr import PaddleOCR
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import requests
+# Tắt log thừa
 logging.getLogger("ppocr").setLevel(logging.WARNING)
+print("Đang khởi tạo PaddleOCR (Coordinate Sync Mode)...")
 try:
+    ocr = PaddleOCR(use_textline_orientation=True, use_doc_orientation_classify=False,
+        use_doc_unwarping=False, lang='ch')
 except Exception as e:
+    print(f"Lỗi khởi tạo: {e}. Chuyển về chế độ mặc định.")
     ocr = PaddleOCR(lang='ch')
+print("Model đã sẵn sàng!")
+# --- TẢI FONT ---
 def check_and_download_font():
     font_path = "./simfang.ttf"
     if not os.path.exists(font_path):
 FONT_PATH = check_and_download_font()
+# --- HÀM VẼ ĐA NĂNG ---
 def universal_draw(image, raw_data, font_path):
     if image is None: return image
+    # Đảm bảo image là PIL
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+    # Copy để vẽ
     canvas = image.copy()
     draw = ImageDraw.Draw(canvas)
     except:
         font = ImageFont.load_default()
+    # Hàm parse box
+    def parse_box(b):
+        try:
+            if hasattr(b, 'tolist'): b = b.tolist()
+            if len(b) > 0 and isinstance(b[0], list): return [tuple(p) for p in b]
+            if len(b) == 4 and isinstance(b[0], (int, float)):
+                 return [(b[0], b[1]), (b[2], b[1]), (b[2], b[3]), (b[0], b[3])]
+            return None
+        except: return None
+    items_to_draw = []
+    # Logic tìm box/text
+    # Ưu tiên cấu trúc PaddleX: rec_texts + dt_polys
+    processed = False
+    if isinstance(raw_data, list) and len(raw_data) > 0 and isinstance(raw_data[0], dict):
+        data_dict = raw_data[0]
+        texts = data_dict.get('rec_texts')
+        boxes = data_dict.get('dt_polys', data_dict.get('rec_polys', data_dict.get('dt_boxes')))
+        if texts and boxes and isinstance(texts, list) and isinstance(boxes, list):
+            for i in range(min(len(texts), len(boxes))):
+                txt = texts[i]
+                box = parse_box(boxes[i])
+                if box and txt: items_to_draw.append((box, txt))
+            processed = True
+    # Fallback Logic
+    if not processed:
+        def hunt(data):
+            if isinstance(data, dict):
+                box = None; text = None
+                for k in ['points', 'box', 'dt_boxes', 'poly']:
+                    if k in data: box = parse_box(data[k]); break
+                for k in ['transcription', 'text', 'rec_text', 'label']:
+                    if k in data: text = data[k]; break
+                if box and text: items_to_draw.append((box, text)); return
+                for v in data.values(): hunt(v)
+            elif isinstance(data, (list, tuple)):
+                if len(data) == 2 and isinstance(data[0], list) and len(data[0]) == 4:
+                    box = parse_box(data[0])
+                    txt_obj = data[1]
+                    text = txt_obj[0] if isinstance(txt_obj, (list, tuple)) else txt_obj
+                    if box and isinstance(text, str): items_to_draw.append((box, text)); return
+                for item in data: hunt(item)
+        hunt(raw_data)
+    # Vẽ
+    for box, txt in items_to_draw:
+        try:
+            # Vẽ khung đỏ
+            draw.polygon(box, outline="red", width=3)
+            # Vẽ chữ
+            txt_x, txt_y = box[0]
+            if hasattr(draw, "textbbox"):
+                text_bbox = draw.textbbox((txt_x, txt_y), txt, font=font, anchor="lb")
+                draw.rectangle(text_bbox, fill="red")
+                draw.text((txt_x, txt_y), txt, fill="white", font=font, anchor="lb")
+            else:
+                draw.text((txt_x, txt_y - font_size), txt, fill="white", font=font)
+        except: continue
+    return canvas
+# --- HÀM XỬ LÝ TEXT ---
def deep_extract_text(data):
    """Collect every non-blank string found anywhere inside ``data``.

    The structure is walked depth-first, in order:

    * a string is kept unchanged (not stripped) when it contains at least
      one non-whitespace character;
    * lists and tuples are walked element by element;
    * dicts contribute their values only (keys are ignored);
    * any other object exposing ``__dict__`` is walked through that dict;
    * everything else is ignored.

    A container that is already being walked higher up the current path is
    skipped, so self-referential structures terminate instead of raising
    ``RecursionError``. A container that is merely shared (referenced twice
    without a cycle) still contributes once per occurrence.

    Args:
        data: Arbitrarily nested value to search.

    Returns:
        list[str]: The strings found, in traversal order.
    """
    collected = []
    on_path = set()  # ids of the containers currently being walked

    def walk(node):
        if isinstance(node, str):
            if node.strip():
                collected.append(node)
            return

        if isinstance(node, (list, tuple)):
            children = node
        elif isinstance(node, dict):
            children = node.values()
        elif hasattr(node, '__dict__'):
            children = (node.__dict__,)
        else:
            return

        marker = id(node)
        if marker in on_path:
            # Cycle: this container is one of our own ancestors.
            return
        on_path.add(marker)
        try:
            for child in children:
                walk(child)
        finally:
            on_path.discard(marker)

    walk(data)
    return collected
def clean_text_result(text_list):
    """Strip each string and drop the ones that look like noise.

    After stripping surrounding whitespace, an entry is discarded when any
    of the following holds:

    * it is shorter than two characters and has no character in the
      U+4E00..U+9FFF range;
    * its lowercase form ends with one of the listed file extensions;
    * its lowercase form is one of the listed blocked words;
    * it contains no word character at all.

    Args:
        text_list: Iterable of strings.

    Returns:
        list[str]: The stripped entries that survived, in input order.
    """
    noise_words = ['min', 'max', 'general', 'header', 'footer', 'structure']
    asset_suffixes = ('.ttf', '.json', '.pdparams', '.yml', '.log')

    def keep(candidate):
        has_cjk = any(u'\u4e00' <= ch <= u'\u9fff' for ch in candidate)
        if len(candidate) < 2 and not has_cjk:
            return False
        lowered = candidate.lower()
        if lowered.endswith(asset_suffixes) or lowered in noise_words:
            return False
        return re.search(r'[\w\u4e00-\u9fff]', candidate) is not None

    stripped = (raw.strip() for raw in text_list)
    return [entry for entry in stripped if keep(entry)]
+# --- MAIN PREDICT ---
def predict(image):
    """Run OCR on ``image`` and build the three values shown in the UI.

    Args:
        image: A PIL image or an array accepted by ``Image.fromarray``;
            ``None`` when nothing was uploaded.

    Returns:
        tuple: ``(annotated_image, text_output, debug_info)``.
        For ``None`` input the tuple is ``(None, "Chưa có ảnh.", "No Data")``.
        If anything raises, the tuple is the untouched input image, an error
        message, and the formatted traceback.
    """
    if image is None:
        return None, "Chưa có ảnh.", "No Data"

    try:
        # Keep a private PIL copy for drawing; the OCR engine gets an array.
        if isinstance(image, Image.Image):
            original_pil = image.copy()
        else:
            original_pil = Image.fromarray(image).copy()
        image_np = np.array(image)

        # 1. OCR
        raw_result = ocr.ocr(image_np)

        # 2. Pick the image to draw on. When the first result carries a
        #    'doc_preprocessor_res' entry with an 'output_img', draw on that
        #    image instead of the upload.
        #    NOTE(review): presumably the box coordinates refer to this
        #    preprocessed image -- confirm against the PaddleOCR result schema.
        target_image_for_drawing = original_pil
        used_preprocessed = False
        if isinstance(raw_result, list) and len(raw_result) > 0 and isinstance(raw_result[0], dict):
            first_result = raw_result[0]
            if 'doc_preprocessor_res' in first_result:
                proc_res = first_result['doc_preprocessor_res']
                if 'output_img' in proc_res:
                    print("Phát hiện ảnh đã qua xử lý hình học. Đang đồng bộ tọa độ...")
                    target_image_for_drawing = Image.fromarray(proc_res['output_img'])
                    used_preprocessed = True

        # 3. Draw boxes and text on the chosen image.
        annotated_image = universal_draw(target_image_for_drawing, raw_result, FONT_PATH)

        # 4. Text output.
        all_texts = deep_extract_text(raw_result)
        final_texts = clean_text_result(all_texts)
        text_output = "\n".join(final_texts) if final_texts else "Không tìm thấy văn bản."

        # Debug info. The source label comes from an explicit flag: comparing
        # the two PIL images with != compares their contents, so a
        # preprocessed image equal to the upload would be reported as
        # 'Original'.
        debug_str = str(raw_result)[:1000]
        source_label = 'Preprocessed' if used_preprocessed else 'Original'
        debug_info = f"Used Image Source: {source_label}\nData Preview:\n{debug_str}..."

        return annotated_image, text_output, debug_info
    except Exception as e:
        # UI boundary: report the failure in the text and debug panes.
        import traceback
        return image, f"Lỗi: {str(e)}", traceback.format_exc()
+# --- GIAO DIỆN ---
+with gr.Blocks(title="PaddleOCR Perfect Overlay") as iface:
+    gr.Markdown("## PaddleOCR Chinese - High Precision Overlay")
+    with gr.Row():
+        with gr.Column():
+            input_img = gr.Image(type="pil", label="Input Image")
+            submit_btn = gr.Button("RUN OCR", variant="primary")
+        with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("🖼️ Kết quả Khớp Tọa Độ"):
+                    output_img = gr.Image(type="pil", label="Overlay Result")
+                with gr.TabItem("📝 Văn bản"):
+                    output_txt = gr.Textbox(label="Text Content", lines=15)
+                with gr.TabItem("🐞 Debug"):
+                    output_debug = gr.Textbox(label="Debug Info", lines=15)
+    submit_btn.click(
+        fn=predict,
+        inputs=input_img,
+        outputs=[output_img, output_txt, output_debug]
+    )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)