Spaces:

phamha
/

engineering-drawing

Sleeping

App Files Files Community

Harry Pham committed on Apr 2

Commit

f8fef9f

1 Parent(s): c87ac5f

update OCR

Browse files

Files changed (1) hide show

src/inference.py +217 -77

src/inference.py CHANGED Viewed

@@ -11,6 +11,7 @@ import json
 import numpy as np
 from pathlib import Path
 from ultralytics import RTDETR
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[INFO] Device: {DEVICE}")
@@ -20,102 +21,240 @@ CLASS_DISPLAY = {"note": "Note", "part-drawing": "PartDrawing", "table": "Table"
 COLORS        = {"note": (0,165,255), "part-drawing": (0,200,0), "table": (0,0,220)}
 _det_model  = None
-_easy_reader = None
 def get_det_model(checkpoint="best.pt"):
     global _det_model
     if _det_model is None:
-        print(f"[INFO] Loading model: {checkpoint}")
         _det_model = RTDETR(checkpoint)
     return _det_model
-def get_easy_reader():
-    global _easy_reader
-    if _easy_reader is None:
-        import easyocr
-        print("[INFO] Loading EasyOCR...")
-        _easy_reader = easyocr.Reader(["vi","en"], gpu=False, verbose=False)
-    return _easy_reader
-def preprocess_for_ocr(img_bgr):
     h, w = img_bgr.shape[:2]
     if w < 800:
         scale = 800 / w
-        img_bgr = cv2.resize(img_bgr, (int(w*scale), int(h*scale)),
-                             interpolation=cv2.INTER_CUBIC)
-    gray  = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
-    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
-    gray  = clahe.apply(gray)
-    gray  = cv2.fastNlMeansDenoising(gray, h=15,
-                templateWindowSize=7, searchWindowSize=21)
-    kernel = np.array([[0,-1,0],[-1,5,-1],[0,-1,0]])
-    gray   = cv2.filter2D(gray, -1, kernel)
-    return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
-def ocr_note(img_path):
-    img = cv2.imread(img_path)
-    if img is None:
-        return ""
-    img_proc = preprocess_for_ocr(img)
-    try:
-        reader  = get_easy_reader()
-        results = reader.readtext(img_proc, detail=1, paragraph=False,
                                   width_ths=0.7, height_ths=0.7)
-        lines = [t for (_,t,c) in results if c >= 0.2 and t.strip()]
-        return "\n".join(lines)
-    except Exception as e:
-        print(f"[WARN] ocr_note: {e}")
-        return ""
-def _group_rows(items):
     if not items:
         return []
-    items = sorted(items, key=lambda x: x["y"])
-    y_vals = [it["y"] for it in items]
     if len(y_vals) > 1:
-        gaps   = [y_vals[i+1]-y_vals[i] for i in range(len(y_vals)-1)]
-        thresh = max(8, (sum(gaps)/len(gaps)) * 0.6)
     else:
         thresh = 12
-    rows, cur = [], [items[0]]
-    for item in items[1:]:
-        if item["y"] - cur[-1]["y"] < thresh:
-            cur.append(item)
         else:
-            cur.sort(key=lambda x: x["x"])
-            rows.append([i["text"] for i in cur])
-            cur = [item]
-    cur.sort(key=lambda x: x["x"])
-    rows.append([i["text"] for i in cur])
-    return rows
-def ocr_table(img_path):
     img = cv2.imread(img_path)
     if img is None:
-        return {"rows":[], "text":""}
-    img_proc = preprocess_for_ocr(img)
-    items = []
-    try:
-        reader  = get_easy_reader()
-        results = reader.readtext(img_proc, detail=1, paragraph=False,
-                                  width_ths=0.5, height_ths=0.5)
-        for (pts, text, conf) in results:
-            if conf < 0.2 or not text.strip():
-                continue
-            items.append({
-                "text": text.strip(),
-                "y": sum(p[1] for p in pts)/4,
-                "x": sum(p[0] for p in pts)/4,
-            })
-    except Exception as e:
-        print(f"[WARN] ocr_table: {e}")
     if not items:
-        return {"rows":[], "text":""}
-    rows = _group_rows(items)
-    return {"rows": rows, "text": "\n".join(" | ".join(r) for r in rows)}
 def run_pipeline(image_path, output_dir="outputs",
-                 checkpoint="best.pt", conf_thresh=0.3):
     image_path = str(image_path)
     img_name   = Path(image_path).name
     stem       = Path(image_path).stem
@@ -147,12 +286,12 @@ def run_pipeline(image_path, output_dir="outputs",
         ocr_content = None
         if cls_raw == "note":
             print(f"[OCR] Note #{i+1}...")
-            ocr_content = ocr_note(crop_path)
-            print(f"      → {repr(ocr_content[:80]) if ocr_content else 'EMPTY'}")
         elif cls_raw == "table":
             print(f"[OCR] Table #{i+1}...")
-            ocr_content = ocr_table(crop_path)
-            preview = ocr_content.get("text","")[:80]
             print(f"      → {repr(preview) if preview else 'EMPTY'}")
         objects.append({
@@ -185,5 +324,6 @@ def run_pipeline(image_path, output_dir="outputs",
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"
-    result, _ = run_pipeline(img)
     print(json.dumps(result, ensure_ascii=False, indent=2))

 import numpy as np
 from pathlib import Path
 from ultralytics import RTDETR
+import re
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INFO] Device: {DEVICE}")
# Colour triple per raw class name (presumably BGR for OpenCV drawing - TODO confirm).
COLORS        = {"note": (0,165,255), "part-drawing": (0,200,0), "table": (0,0,220)}
# Lazily created detection model; populated by get_det_model().
_det_model  = None
_ocr_reader = None  # will hold a PaddleOCR or an EasyOCR instance
 def get_det_model(checkpoint="best.pt"):
     global _det_model
     if _det_model is None:
+        print(f"[INFO] Loading detection model: {checkpoint}")
         _det_model = RTDETR(checkpoint)
     return _det_model
def get_ocr_reader(backend="paddle"):
    """Return an OCR engine, preferring PaddleOCR and falling back to EasyOCR.

    Readers are cached per requested ``backend`` so that asking for
    ``"easyocr"`` after ``"paddle"`` (or the reverse) does not hand back the
    engine that happened to be created first. When PaddleOCR cannot be
    imported or initialised, an EasyOCR reader is returned instead and is
    remembered for the ``"paddle"`` key as well, so the failing
    initialisation is not retried on every call.

    Args:
        backend: ``"paddle"`` to try PaddleOCR first; any other value selects
            EasyOCR directly.

    Returns:
        A ``PaddleOCR`` instance or an ``easyocr.Reader`` instance. Callers
        must not assume the type from ``backend`` alone because of the
        fallback.
    """
    global _ocr_reader
    # The per-backend cache is created lazily; ``_ocr_reader`` is still kept
    # up to date with the most recently returned reader.
    cache = globals().setdefault("_ocr_reader_cache", {})
    if backend in cache:
        _ocr_reader = cache[backend]
        return _ocr_reader

    reader = None
    if backend == "paddle":
        try:
            from paddleocr import PaddleOCR
            print("[INFO] Initializing PaddleOCR (lang: vi, en)...")
            reader = PaddleOCR(
                lang='vi',                     # Vietnamese + English
                use_angle_cls=True,            # enable the text-angle classifier
                use_gpu=(DEVICE == "cuda"),
                show_log=False,
                det_db_thresh=0.3,
                det_db_box_thresh=0.5,
                rec_algorithm='SVTR_LCNet'     # recogniser suited to printed text
            )
        except ImportError:
            print("[WARN] PaddleOCR not installed, falling back to EasyOCR.")
        except Exception as e:
            # Best-effort: any init failure degrades to EasyOCR rather than crashing.
            print(f"[WARN] PaddleOCR init failed: {e}, fallback to EasyOCR.")

    if reader is None:
        # Reuse an EasyOCR reader that was already built for another key.
        reader = cache.get("easyocr")
    if reader is None:
        import easyocr
        print("[INFO] Loading EasyOCR (vi, en)...")
        reader = easyocr.Reader(["vi", "en"], gpu=(DEVICE == "cuda"), verbose=False)
        cache["easyocr"] = reader

    cache[backend] = reader
    _ocr_reader = reader
    return _ocr_reader
def preprocess_image(img_bgr, ocr_type="note"):
    """Prepare a BGR crop for OCR with a pipeline chosen by region type.

    - ``"note"``: CLAHE contrast boost, non-local-means denoising, sharpening.
    - anything else (table): adaptive binarisation, removal of long
      horizontal/vertical ruling lines, slight thickening of the strokes.

    Crops narrower than 800 px are upscaled to 800 px wide first, so any
    coordinates produced from the returned image are in the upscaled space.

    Args:
        img_bgr: Image array as returned by ``cv2.imread`` (BGR).
        ocr_type: ``"note"`` or ``"table"``.

    Returns:
        A 3-channel BGR image ready to be passed to the OCR engine.
    """
    h, w = img_bgr.shape[:2]
    # Upscale small crops; tiny glyphs are recognised poorly.
    if w < 800:
        scale = 800 / w
        img_bgr = cv2.resize(img_bgr, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    if ocr_type == "note":
        # CLAHE + denoising + sharpening
        clahe = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8))
        gray = clahe.apply(gray)
        gray = cv2.fastNlMeansDenoising(gray, h=10, templateWindowSize=7, searchWindowSize=21)
        kernel = np.array([[0,-1,0],[-1,5,-1],[0,-1,0]])
        gray = cv2.filter2D(gray, -1, kernel)
        # Back to BGR for PaddleOCR/EasyOCR
        return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

    # Table branch.
    # Morphological opening and dilation operate on WHITE pixels, so build an
    # inverted mask in which ink (text and ruling lines) is white. Running
    # them on a white-background image would "detect" the background as lines
    # and erase it, and dilation would thin the text instead of thickening it.
    # Assumes dark ink on a light background, as in scanned drawings.
    ink = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                cv2.THRESH_BINARY_INV, 11, 2)
    # Long thin kernels survive only on ruling lines, not on glyph strokes.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25,1))
    vertical_kernel   = cv2.getStructuringElement(cv2.MORPH_RECT, (1,25))
    detected_lines_h = cv2.morphologyEx(ink, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    detected_lines_v = cv2.morphologyEx(ink, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    # Remove the ruling lines from the ink mask.
    ink = cv2.bitwise_and(ink, cv2.bitwise_not(detected_lines_h))
    ink = cv2.bitwise_and(ink, cv2.bitwise_not(detected_lines_v))
    # Thicken the remaining strokes slightly.
    kernel_dilate = np.ones((2,2), np.uint8)
    ink = cv2.dilate(ink, kernel_dilate, iterations=1)
    # Restore dark-text-on-white polarity for the OCR engine.
    binary = cv2.bitwise_not(ink)
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
def ocr_with_backend(img_bgr, backend="paddle", ocr_type="note"):
    """Preprocess an image, run the OCR engine, and normalise its detections.

    Args:
        img_bgr: BGR image to recognise.
        backend: Backend name forwarded to ``get_ocr_reader``.
        ocr_type: Preprocessing profile forwarded to ``preprocess_image``.

    Returns:
        A list of dicts, one per accepted detection, with keys ``"text"``
        (stripped string), ``"conf"`` (engine confidence), ``"x"`` / ``"y"``
        (centre of the detection box, in the preprocessed image's
        coordinates) and ``"box"`` (the engine's corner points). Empty list
        when nothing is detected.
    """
    reader = get_ocr_reader(backend)
    img_for_ocr = preprocess_image(img_bgr, ocr_type)

    # Dispatch on the reader that was actually returned, not on the requested
    # backend name: get_ocr_reader falls back to EasyOCR when PaddleOCR is
    # unavailable, and an EasyOCR reader has no ``.ocr`` method.
    if hasattr(reader, "readtext"):
        # EasyOCR yields (corner_points, text, confidence) triples.
        results = reader.readtext(img_for_ocr, detail=1, paragraph=False,
                                  width_ths=0.7, height_ths=0.7)
        detections = [(pts, text, conf) for (pts, text, conf) in results]
        min_conf = 0.2
    else:
        # PaddleOCR yields [ [[box], (text, confidence)], ... ] per image.
        result = reader.ocr(img_for_ocr, cls=True)
        if not result or not result[0]:
            return []
        detections = [(box, text, conf) for box, (text, conf) in result[0]]
        min_conf = 0.3

    items = []
    for box, text, conf in detections:
        stripped = text.strip()
        if conf < min_conf or not stripped:
            continue
        # Centre of the bounding polygon.
        cx = sum(p[0] for p in box) / len(box)
        cy = sum(p[1] for p in box) / len(box)
        items.append({
            "text": stripped,
            "conf": conf,
            "x": cx,
            "y": cy,
            "box": box,
        })
    return items
def group_rows(items, vertical_thresh_ratio=0.6):
    """Cluster OCR items into table rows by their ``"y"`` coordinate.

    Items are walked in ascending ``y``; an item joins the current row when
    its ``y`` is closer than a threshold to the previously visited item,
    otherwise it starts a new row. The threshold is
    ``max(8, median_gap * vertical_thresh_ratio)`` where ``median_gap`` is the
    median distance between consecutive ``y`` values (12 for a single item).

    Args:
        items: Dicts with at least ``"text"``, ``"x"`` and ``"y"``.
        vertical_thresh_ratio: Fraction of the median gap used as threshold.

    Returns:
        A list of rows, top to bottom; each row is the list of item texts
        ordered left to right by ``"x"``.
    """
    if not items:
        return []

    ordered = sorted(items, key=lambda entry: entry["y"])

    # Estimate the row-break threshold from the typical vertical spacing.
    if len(ordered) > 1:
        ys = [entry["y"] for entry in ordered]
        spacing = [lower - upper for upper, lower in zip(ys, ys[1:])]
        row_break = max(8, np.median(spacing) * vertical_thresh_ratio)
    else:
        row_break = 12

    buckets = [[ordered[0]]]
    for entry in ordered[1:]:
        last_seen = buckets[-1][-1]
        if entry["y"] - last_seen["y"] < row_break:
            buckets[-1].append(entry)
        else:
            buckets.append([entry])

    # Within a row, read cells left to right.
    return [
        [cell["text"] for cell in sorted(bucket, key=lambda cell: cell["x"])]
        for bucket in buckets
    ]
def ocr_note(img_path, backend="paddle"):
    """OCR a Note crop and return its text, one detected line per output line.

    Args:
        img_path: Path of the cropped note image.
        backend: OCR backend name forwarded to ``ocr_with_backend``.

    Returns:
        The recognised text with lines joined by ``"\\n"``; an empty string
        when the image cannot be read or nothing is recognised.
    """
    img = cv2.imread(img_path)
    if img is None:
        return ""
    items = ocr_with_backend(img, backend, ocr_type="note")
    if not items:
        # Second attempt: apply a milder CLAHE pass to the raw crop first.
        # NOTE: ocr_with_backend still runs its own "note" preprocessing
        # (including sharpening) on top of this image.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        gray = clahe.apply(gray)
        img2 = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
        items = ocr_with_backend(img2, backend, ocr_type="note")
        if not items:
            return ""

    # Split into text lines: walking top to bottom, a jump in y of at least
    # y_thresh from the previous item starts a new line.
    y_thresh = 12  # line-break threshold in pixels
    line_groups = []
    prev_y = None
    for it in sorted(items, key=lambda d: (d["y"], d["x"])):
        if prev_y is not None and abs(it["y"] - prev_y) < y_thresh:
            line_groups[-1].append(it)
        else:
            line_groups.append([it])
        prev_y = it["y"]

    clean_lines = []
    for group in line_groups:
        # Words on one line differ slightly in y, so the (y, x) ordering above
        # can scramble them; restore left-to-right reading order per line.
        group.sort(key=lambda d: d["x"])
        line = " ".join(d["text"] for d in group)
        # Drop stray symbols but keep characters that carry meaning in
        # drawing notes (ratios "1:2", tolerances "±0.05", angles "45°").
        line = re.sub(r'[^\w\s.,\-/():;+=%°±]', '', line)
        line = re.sub(r'\s+', ' ', line).strip()
        if len(line) > 1:
            clean_lines.append(line)
    return "\n".join(clean_lines)
def ocr_table(img_path, backend="paddle"):
    """OCR a Table crop and return its cells grouped into rows.

    Args:
        img_path: Path of the cropped table image.
        backend: OCR backend name forwarded to ``ocr_with_backend``.

    Returns:
        ``{"rows": [[cell, ...], ...], "text": str}`` where ``text`` renders
        each row on its own line with cells separated by ``" | "``. Both are
        empty when the image cannot be read or nothing is recognised.
    """
    img = cv2.imread(img_path)
    if img is None:
        return {"rows": [], "text": ""}

    items = ocr_with_backend(img, backend, ocr_type="table")
    if not items:
        # Second attempt on a differently thresholded copy of the crop.
        # NOTE(review): ocr_with_backend applies the "table" preprocessing to
        # this image as well, so ruling lines are still removed on the retry.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        thresholded = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                            cv2.THRESH_BINARY, 15, 5)
        retry_img = cv2.cvtColor(thresholded, cv2.COLOR_GRAY2BGR)
        items = ocr_with_backend(retry_img, backend, ocr_type="table")
    if not items:
        return {"rows": [], "text": ""}

    rows = group_rows(items, vertical_thresh_ratio=0.6)
    # One output line per row, columns separated by ' | '.
    text = "\n".join(" | ".join(row) for row in rows if row)
    return {"rows": rows, "text": text}
 def run_pipeline(image_path, output_dir="outputs",
+                 checkpoint="best.pt", conf_thresh=0.3,
+                 ocr_backend="paddle"):
+    """
+    ocr_backend: "paddle" (khuyến nghị) hoặc "easyocr"
+    """
     image_path = str(image_path)
     img_name   = Path(image_path).name
     stem       = Path(image_path).stem
         ocr_content = None
         if cls_raw == "note":
             print(f"[OCR] Note #{i+1}...")
+            ocr_content = ocr_note(crop_path, backend=ocr_backend)
+            print(f"      → {repr(ocr_content[:100]) if ocr_content else 'EMPTY'}")
         elif cls_raw == "table":
             print(f"[OCR] Table #{i+1}...")
+            ocr_content = ocr_table(crop_path, backend=ocr_backend)
+            preview = ocr_content.get("text", "")[:100]
             print(f"      → {repr(preview) if preview else 'EMPTY'}")
         objects.append({
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"
+    # Có thể chọn backend: "paddle" (mặc định) hoặc "easyocr"
+    result, _ = run_pipeline(img, ocr_backend="paddle")
     print(json.dumps(result, ensure_ascii=False, indent=2))