Spaces:

ranbac
/

PaddleOCR

Sleeping

App Files Community

ranbac committed 21 days ago

Commit

02a63b3

verified ·

1 Parent(s): 9e8099e

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -170

app.py CHANGED Viewed

@@ -1,105 +1,77 @@
 import os
-import logging
-import re
-import requests
-import numpy as np
-from PIL import Image, ImageDraw, ImageFont
-import gradio as gr
-from paddleocr import PaddleOCR
-# --- CẤU HÌNH HỆ THỐNG (Tránh lỗi xung đột thư viện) ---
 os.environ["FLAGS_use_mkldnn"] = "0"
 os.environ["FLAGS_enable_mkldnn"] = "0"
 os.environ["DN_ENABLE_MKLDNN"] = "0"
 os.environ["CPP_MIN_LOG_LEVEL"] = "3"
-# Tắt log thừa của Paddle
 logging.getLogger("ppocr").setLevel(logging.WARNING)
-# --- QUẢN LÝ FONT CHỮ (Tự động tải font theo ngôn ngữ) ---
-FONTS_DIR = "./fonts"
-if not os.path.exists(FONTS_DIR):
-    os.makedirs(FONTS_DIR)
-def get_font_path(lang_code):
-    """
-    Trả về đường dẫn font phù hợp với ngôn ngữ.
-    Tự động tải nếu chưa có file.
-    """
-    font_config = {
-        'ch': {
-            'filename': 'simfang.ttf',
-            'url': 'https://github.com/StellarCN/scp_zh/raw/master/fonts/SimFang.ttf'
-        },
-        'vi': {
-            'filename': 'roboto.ttf',
-            'url': 'https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf'
-        }
-    }
-    cfg = font_config.get(lang_code, font_config['ch']) # Mặc định là Trung nếu lỗi
-    font_path = os.path.join(FONTS_DIR, cfg['filename'])
     if not os.path.exists(font_path):
-        print(f"📡 Đang tải font cho '{lang_code}' từ internet...")
         try:
-            r = requests.get(cfg['url'], allow_redirects=True)
             with open(font_path, 'wb') as f:
                 f.write(r.content)
-            print(f"✅ Đã tải xong: {font_path}")
-        except Exception as e:
-            print(f"❌ Lỗi tải font: {e}")
             return None
     return font_path
-# --- QUẢN LÝ MODEL (Cache Model) ---
-# Biến toàn cục lưu các model đã load để tránh khởi tạo lại nhiều lần
-loaded_models = {}
-def get_ocr_model(lang_code):
-    """
-    Lấy model OCR từ cache hoặc khởi tạo mới nếu chưa có.
-    """
-    if lang_code in loaded_models:
-        return loaded_models[lang_code]
-    print(f"🚀 Đang khởi tạo Model PaddleOCR ngôn ngữ: {lang_code}...")
-    try:
-        # use_angle_cls=True giúp xoay ảnh nếu văn bản bị nghiêng
-        model = PaddleOCR(use_angle_cls=True, lang=lang_code, show_log=False)
-        loaded_models[lang_code] = model
-        print(f"✅ Model {lang_code} đã sẵn sàng!")
-        return model
-    except Exception as e:
-        print(f"❌ Lỗi khởi tạo model: {e}")
-        return None
-# --- HÀM VẼ KHUNG VÀ CHỮ LÊN ẢNH ---
 def universal_draw(image, raw_data, font_path):
     if image is None: return image
-    # Chuyển về PIL Image để vẽ
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
     canvas = image.copy()
     draw = ImageDraw.Draw(canvas)
-    # Load Font
     try:
-        font_size = 20
         font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
     except:
         font = ImageFont.load_default()
-    # Hàm phụ: Parse tọa độ box từ nhiều định dạng dữ liệu khác nhau
     def parse_box(b):
         try:
             if hasattr(b, 'tolist'): b = b.tolist()
-            # Dạng 4 điểm [[x1,y1], [x2,y2], ...]
             if len(b) > 0 and isinstance(b[0], list): return [tuple(p) for p in b]
-            # Dạng [xmin, ymin, xmax, ymax]
             if len(b) == 4 and isinstance(b[0], (int, float)):
                  return [(b[0], b[1]), (b[2], b[1]), (b[2], b[3]), (b[0], b[3])]
             return None
@@ -107,46 +79,54 @@ def universal_draw(image, raw_data, font_path):
     items_to_draw = []
-    # Hàm đệ quy tìm text và box trong JSON
-    def hunt(data):
-        if isinstance(data, dict):
-            box = None; text = None
-            # Tìm box
-            for k in ['points', 'box', 'dt_boxes', 'poly']:
-                if k in data: box = parse_box(data[k]); break
-            # Tìm text
-            for k in ['transcription', 'text', 'rec_text', 'label']:
-                if k in data: text = data[k]; break
-            if box and text: items_to_draw.append((box, text)); return
-            for v in data.values(): hunt(v)
-        elif isinstance(data, (list, tuple)):
-            # Cấu trúc phổ biến của PaddleOCR v3/v4: [[[x,y]...], (text, conf)]
-            if len(data) == 2 and isinstance(data[0], list) and len(data[0]) == 4:
-                box = parse_box(data[0])
-                txt_obj = data[1]
-                text = txt_obj[0] if isinstance(txt_obj, (list, tuple)) else txt_obj
-                if box and isinstance(text, str): items_to_draw.append((box, text)); return
-            for item in data: hunt(item)
-    hunt(raw_data)
-    # Thực hiện vẽ
     for box, txt in items_to_draw:
         try:
             # Vẽ khung đỏ
-            draw.polygon(box, outline="red", width=2)
-            # Tính toán vị trí vẽ chữ (nền đỏ chữ trắng)
             txt_x, txt_y = box[0]
-            if hasattr(draw, "textbbox"): # Pillow mới
-                left, top, right, bottom = draw.textbbox((txt_x, txt_y), txt, font=font)
-                draw.rectangle((left-2, top-2, right+2, bottom+2), fill="red")
-                draw.text((txt_x, txt_y), txt, fill="white", font=font)
-            else: # Pillow cũ
-                draw.text((txt_x, txt_y - font_size), txt, fill="red", font=font)
         except: continue
     return canvas
@@ -161,114 +141,88 @@ def deep_extract_text(data):
         for item in data: found_texts.extend(deep_extract_text(item))
     elif isinstance(data, dict):
         for val in data.values(): found_texts.extend(deep_extract_text(val))
     return found_texts
 def clean_text_result(text_list):
     cleaned = []
-    # Các từ khóa rác hệ thống cần loại bỏ
-    block_list = ['min', 'max', 'general', 'header', 'footer', 'structure']
     for t in text_list:
         t = t.strip()
-        # Loại bỏ text quá ngắn nếu không phải tiếng Trung/Việt
-        if len(t) < 2 and not any(u'\u4e00' <= c <= u'\u9fff' for c in t): continue
         if t.lower().endswith(('.ttf', '.json', '.pdparams', '.yml', '.log')): continue
         if t.lower() in block_list: continue
         cleaned.append(t)
     return cleaned
-# --- MAIN PREDICT FUNCTION ---
-def predict(image, lang_option):
-    if image is None: return None, "Vui lòng tải ảnh lên.", "No Data"
-    # 1. Xác định ngôn ngữ
-    lang_code = 'vi' if lang_option == "Tiếng Việt" else 'ch'
     try:
-        # 2. Lấy Model và Font
-        ocr_model = get_ocr_model(lang_code)
-        font_path = get_font_path(lang_code)
-        if ocr_model is None:
-            return image, "Lỗi khởi tạo Model PaddleOCR.", "Error init model"
-        # 3. Chuẩn bị ảnh
         original_pil = image.copy() if isinstance(image, Image.Image) else Image.fromarray(image).copy()
         image_np = np.array(image)
-        # 4. CHẠY OCR
-        # cls=True: Tự động sửa góc nghiêng (ví dụ ảnh chụp ngược 180 độ)
-        raw_result = ocr_model.ocr(image_np, cls=True)
-        # 5. Xử lý hiển thị (Overlay)
-        # Kiểm tra xem Paddle có unwarp ảnh (nắn thẳng) không?
         target_image_for_drawing = original_pil
-        # Logic check unwarping (nếu dùng các model structure nâng cao)
         if isinstance(raw_result, list) and len(raw_result) > 0 and isinstance(raw_result[0], dict):
-             if 'doc_preprocessor_res' in raw_result[0]:
                 proc_res = raw_result[0]['doc_preprocessor_res']
                 if 'output_img' in proc_res:
-                    target_image_for_drawing = Image.fromarray(proc_res['output_img'])
-        # Vẽ kết quả lên ảnh
-        annotated_image = universal_draw(target_image_for_drawing, raw_result, font_path)
-        # 6. Trích xuất văn bản thuần
         all_texts = deep_extract_text(raw_result)
         final_texts = clean_text_result(all_texts)
         text_output = "\n".join(final_texts) if final_texts else "Không tìm thấy văn bản."
-        # Debug info
-        debug_info = f"Language: {lang_code}\nModel Loaded: {lang_code in loaded_models}\nFont Used: {font_path}\n\nRaw Data Preview:\n{str(raw_result)[:1000]}"
         return annotated_image, text_output, debug_info
     except Exception as e:
         import traceback
-        return image, f"Lỗi hệ thống: {str(e)}", traceback.format_exc()
-# --- GIAO DIỆN GRADIO ---
-css = """
-footer {visibility: hidden}
-.gradio-container {min-height: 0px !important}
-"""
-with gr.Blocks(title="PaddleOCR Multi-Language", css=css, theme=gr.themes.Soft()) as iface:
-    gr.Markdown("# 🈯 PaddleOCR - Hỗ trợ Tiếng Việt & Trung")
-    gr.Markdown("Tải ảnh lên, chọn ngôn ngữ và xem AI nhận diện tọa độ + văn bản.")
     with gr.Row():
-        # CỘT TRÁI: INPUT
-        with gr.Column(scale=1):
-            input_img = gr.Image(type="pil", label="Ảnh đầu vào (Input)")
-            # Dropdown chọn ngôn ngữ
-            lang_dropdown = gr.Dropdown(
-                choices=["Tiếng Trung (Chinese)", "Tiếng Việt"],
-                value="Tiếng Việt",
-                label="Chọn ngôn ngữ xử lý",
-                info="Lần đầu chọn ngôn ngữ mới sẽ tốn chút thời gian để tải model."
-            )
-            submit_btn = gr.Button("🚀 CHẠY NHẬN DIỆN (RUN OCR)", variant="primary", size="lg")
-        # CỘT PHẢI: OUTPUT
-        with gr.Column(scale=2):
             with gr.Tabs():
-                with gr.TabItem("🖼️ Kết quả Hình ảnh"):
-                    output_img = gr.Image(type="pil", label="Ảnh đã vẽ khung")
-                with gr.TabItem("📝 Văn bản (Text)"):
-                    output_txt = gr.Textbox(label="Nội dung trích xuất", lines=15)
-                with gr.TabItem("🐞 Debug Info"):
-                    output_debug = gr.Textbox(label="Dữ liệu kỹ thuật", lines=15)
-    # SỰ KIỆN CLICK
     submit_btn.click(
         fn=predict,
-        inputs=[input_img, lang_dropdown],
         outputs=[output_img, output_txt, output_debug]
     )
 if __name__ == "__main__":
-    # Server name="0.0.0.0" để có thể truy cập từ máy khác trong mạng LAN nếu cần
-    iface.launch(server_name="0.0.0.0", server_port=7860, share=False)

 import os
+# --- CẤU HÌNH HỆ THỐNG ---
 os.environ["FLAGS_use_mkldnn"] = "0"
 os.environ["FLAGS_enable_mkldnn"] = "0"
 os.environ["DN_ENABLE_MKLDNN"] = "0"
 os.environ["CPP_MIN_LOG_LEVEL"] = "3"
+import logging
+import re
+import gradio as gr
+from paddleocr import PaddleOCR
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import requests
+# Tắt log thừa
 logging.getLogger("ppocr").setLevel(logging.WARNING)
+print("Đang khởi tạo PaddleOCR (Coordinate Sync Mode) - Ngôn ngữ: Tiếng Việt...")
+try:
+    # THAY ĐỔI 1: Chuyển lang='ch' thành lang='vi'
+    ocr = PaddleOCR(use_textline_orientation=True, use_doc_orientation_classify=False,
+        use_doc_unwarping=False, lang='vi')
+except Exception as e:
+    print(f"Lỗi khởi tạo: {e}. Chuyển về chế độ mặc định.")
+    ocr = PaddleOCR(lang='vi')
+print("Model đã sẵn sàng!")
+# --- TẢI FONT (HỖ TRỢ TIẾNG VIỆT) ---
+def check_and_download_font():
+    # THAY ĐỔI 2: Sử dụng font Roboto để hiển thị đúng dấu Tiếng Việt
+    font_path = "./Roboto-Regular.ttf"
     if not os.path.exists(font_path):
         try:
+            print("Đang tải font Roboto hỗ trợ tiếng Việt...")
+            # URL Font Roboto chuẩn từ Google Fonts
+            url = "https://github.com/google/fonts/raw/main/apache/roboto/Roboto-Regular.ttf"
+            r = requests.get(url, allow_redirects=True)
             with open(font_path, 'wb') as f:
                 f.write(r.content)
+            print("Đã tải xong font.")
+        except:
+            print("Không tải được font. Sẽ sử dụng font mặc định hệ thống (có thể lỗi dấu).")
             return None
     return font_path
+FONT_PATH = check_and_download_font()
+# --- HÀM VẼ ĐA NĂNG ---
 def universal_draw(image, raw_data, font_path):
     if image is None: return image
+    # Đảm bảo image là PIL
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
+    # Copy để vẽ
     canvas = image.copy()
     draw = ImageDraw.Draw(canvas)
     try:
+        font_size = 24
         font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
     except:
         font = ImageFont.load_default()
+    # Hàm parse box
     def parse_box(b):
         try:
             if hasattr(b, 'tolist'): b = b.tolist()
             if len(b) > 0 and isinstance(b[0], list): return [tuple(p) for p in b]
             if len(b) == 4 and isinstance(b[0], (int, float)):
                  return [(b[0], b[1]), (b[2], b[1]), (b[2], b[3]), (b[0], b[3])]
             return None
     items_to_draw = []
+    # Logic tìm box/text
+    # Ưu tiên cấu trúc PaddleX: rec_texts + dt_polys
+    processed = False
+    if isinstance(raw_data, list) and len(raw_data) > 0 and isinstance(raw_data[0], dict):
+        data_dict = raw_data[0]
+        texts = data_dict.get('rec_texts')
+        boxes = data_dict.get('dt_polys', data_dict.get('rec_polys', data_dict.get('dt_boxes')))
+        if texts and boxes and isinstance(texts, list) and isinstance(boxes, list):
+            for i in range(min(len(texts), len(boxes))):
+                txt = texts[i]
+                box = parse_box(boxes[i])
+                if box and txt: items_to_draw.append((box, txt))
+            processed = True
+    # Fallback Logic
+    if not processed:
+        def hunt(data):
+            if isinstance(data, dict):
+                box = None; text = None
+                for k in ['points', 'box', 'dt_boxes', 'poly']:
+                    if k in data: box = parse_box(data[k]); break
+                for k in ['transcription', 'text', 'rec_text', 'label']:
+                    if k in data: text = data[k]; break
+                if box and text: items_to_draw.append((box, text)); return
+                for v in data.values(): hunt(v)
+            elif isinstance(data, (list, tuple)):
+                if len(data) == 2 and isinstance(data[0], list) and len(data[0]) == 4:
+                    box = parse_box(data[0])
+                    txt_obj = data[1]
+                    text = txt_obj[0] if isinstance(txt_obj, (list, tuple)) else txt_obj
+                    if box and isinstance(text, str): items_to_draw.append((box, text)); return
+                for item in data: hunt(item)
+        hunt(raw_data)
+    # Vẽ
     for box, txt in items_to_draw:
         try:
             # Vẽ khung đỏ
+            draw.polygon(box, outline="red", width=3)
+            # Vẽ chữ
             txt_x, txt_y = box[0]
+            if hasattr(draw, "textbbox"):
+                text_bbox = draw.textbbox((txt_x, txt_y), txt, font=font, anchor="lb")
+                draw.rectangle(text_bbox, fill="red")
+                draw.text((txt_x, txt_y), txt, fill="white", font=font, anchor="lb")
+            else:
+                draw.text((txt_x, txt_y - font_size), txt, fill="white", font=font)
         except: continue
     return canvas
         for item in data: found_texts.extend(deep_extract_text(item))
     elif isinstance(data, dict):
         for val in data.values(): found_texts.extend(deep_extract_text(val))
+    elif hasattr(data, '__dict__'): found_texts.extend(deep_extract_text(data.__dict__))
     return found_texts
 def clean_text_result(text_list):
     cleaned = []
+    block_list = ['min', 'max', 'general', 'header', 'footer', 'structure']
     for t in text_list:
         t = t.strip()
+        # Giữ lại nếu là ký tự Unicode thông thường (bao gồm tiếng Việt)
+        if len(t) < 2 and not re.search(r'\w', t): continue
         if t.lower().endswith(('.ttf', '.json', '.pdparams', '.yml', '.log')): continue
         if t.lower() in block_list: continue
+        if not re.search(r'[\w\u00C0-\u1EF9]', t): continue # Regex mở rộng cho tiếng Việt
         cleaned.append(t)
     return cleaned
+# --- MAIN PREDICT ---
+def predict(image):
+    if image is None: return None, "Chưa có ảnh.", "No Data"
     try:
+        # Chuẩn bị ảnh đầu vào
         original_pil = image.copy() if isinstance(image, Image.Image) else Image.fromarray(image).copy()
         image_np = np.array(image)
+        # 1. OCR
+        raw_result = ocr.ocr(image_np)
+        # 2. XỬ LÝ ẢNH ĐỂ VẼ (KEY FIX: Lấy ảnh từ Preprocessor nếu có)
         target_image_for_drawing = original_pil
+        # Kiểm tra xem Paddle có chỉnh sửa ảnh không (dựa vào key 'doc_preprocessor_res')
         if isinstance(raw_result, list) and len(raw_result) > 0 and isinstance(raw_result[0], dict):
+            if 'doc_preprocessor_res' in raw_result[0]:
                 proc_res = raw_result[0]['doc_preprocessor_res']
+                # Nếu có ảnh đầu ra đã chỉnh sửa (output_img)
                 if 'output_img' in proc_res:
+                    print("Phát hiện ảnh đã qua xử lý hình học. Đang đồng bộ tọa độ...")
+                    numpy_img = proc_res['output_img']
+                    target_image_for_drawing = Image.fromarray(numpy_img)
+        # 3. Vẽ lên ảnh ĐÚNG (Target Image)
+        annotated_image = universal_draw(target_image_for_drawing, raw_result, FONT_PATH)
+        # 4. Xử lý Text
         all_texts = deep_extract_text(raw_result)
         final_texts = clean_text_result(all_texts)
         text_output = "\n".join(final_texts) if final_texts else "Không tìm thấy văn bản."
+        # Debug Info
+        debug_str = str(raw_result)[:1000]
+        debug_info = f"Used Image Source: {'Preprocessed' if target_image_for_drawing != original_pil else 'Original'}\nData Preview:\n{debug_str}..."
         return annotated_image, text_output, debug_info
     except Exception as e:
         import traceback
+        return image, f"Lỗi: {str(e)}", traceback.format_exc()
+# --- GIAO DIỆN ---
+with gr.Blocks(title="PaddleOCR Perfect Overlay (Vietnamese)") as iface:
+    gr.Markdown("## PaddleOCR Vietnamese - High Precision Overlay")
     with gr.Row():
+        with gr.Column():
+            input_img = gr.Image(type="pil", label="Input Image")
+            submit_btn = gr.Button("RUN OCR", variant="primary")
+        with gr.Column():
             with gr.Tabs():
+                with gr.TabItem("🖼️ Kết quả Khớp Tọa Độ"):
+                    output_img = gr.Image(type="pil", label="Overlay Result")
+                with gr.TabItem("📝 Văn bản"):
+                    output_txt = gr.Textbox(label="Text Content", lines=15)
+                with gr.TabItem("🐞 Debug"):
+                    output_debug = gr.Textbox(label="Debug Info", lines=15)
     submit_btn.click(
         fn=predict,
+        inputs=input_img,
         outputs=[output_img, output_txt, output_debug]
     )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)