Spaces:

phamha
/

engineering-drawing

Sleeping

App Files Files Community

Harry Pham committed on Apr 2

Commit

d80899e

1 Parent(s): f69131e

update OCR

Browse files

Files changed (1) hide show

src/inference.py +95 -95

src/inference.py CHANGED Viewed

@@ -35,6 +35,22 @@ def get_det_model(checkpoint="best.pt"):
         _det_model = RTDETR(checkpoint)
     return _det_model
 def get_paddle_reader(lang='vi'):
     """
@@ -239,26 +255,30 @@ def multi_pass_ocr(img_bgr, reader, ocr_type="note"):
 # ============================================================
 # DUAL-ENGINE OCR — PaddleOCR (vi) + PaddleOCR (en), chọn tốt hơn
 # ============================================================
-def dual_engine_ocr(img_bgr, ocr_type="note"):
     """
-    Chạy PaddleOCR với cả lang='vi' và lang='en',
-    chọn kết quả có confidence cao hơn.
-    Nếu PaddleOCR fail → fallback EasyOCR.
     """
-    reader_vi = get_paddle_reader('vi')
-    reader_en = get_paddle_reader('en')
-    if reader_vi is None and reader_en is None:
-        # Fallback to EasyOCR
         reader = get_easyocr_reader()
-        texts, conf = multi_pass_ocr(img_bgr, reader, ocr_type)
-        return texts, conf
     best_texts = []
     best_conf = 0.0
     best_lang = ""
-    # Try Vietnamese
     if reader_vi:
         texts_vi, conf_vi = multi_pass_ocr(img_bgr, reader_vi, ocr_type)
         if conf_vi > best_conf:
@@ -266,7 +286,6 @@ def dual_engine_ocr(img_bgr, ocr_type="note"):
             best_texts = texts_vi
             best_lang = "vi"
-    # Try English
     if reader_en:
         texts_en, conf_en = multi_pass_ocr(img_bgr, reader_en, ocr_type)
         if conf_en > best_conf:
@@ -274,7 +293,13 @@ def dual_engine_ocr(img_bgr, ocr_type="note"):
             best_texts = texts_en
             best_lang = "en"
-    print(f"      Best language: {best_lang} (conf={best_conf:.3f})")
     return best_texts, best_conf
@@ -313,22 +338,15 @@ def post_process_ocr_text(text):
 # OCR NOTE — Cải thiện
 # ============================================================
 def ocr_note(img_path, backend="paddle"):
-    """
-    OCR cho vùng Note — cải thiện:
-    1. Upscale mạnh (min 1500px width)
-    2. Multi-pass với nhiều preprocessing
-    3. Dual-engine (vi + en)
-    4. Post-processing
-    """
     img = cv2.imread(img_path)
     if img is None:
         return ""
-    texts, conf = dual_engine_ocr(img, ocr_type="note")
     # Post-process từng dòng
     processed = [post_process_ocr_text(t) for t in texts]
-    processed = [t for t in processed if t]  # remove empty
     return "\n".join(processed)
@@ -379,84 +397,52 @@ def parse_html_table(html_str):
 def ocr_table(img_path, backend="paddle"):
-    """
-    OCR cho vùng Table — cải thiện:
-    1. Thử PPStructure trước (table structure recognition tốt nhất)
-    2. Fallback: detect cells thủ công + OCR từng cell
-    3. Post-processing
-    """
     img = cv2.imread(img_path)
     if img is None:
         return {"rows": [], "text": ""}
-    # === Strategy 1: PPStructure (best for tables) ===
-    pp_engine = get_pp_structure()
-    if pp_engine is not None:
-        try:
-            # Upscale trước khi đưa vào PPStructure
-            h, w = img.shape[:2]
-            if w < 1200:
-                scale = 1200 / w
-                img_scaled = cv2.resize(img, None, fx=scale, fy=scale,
-                                        interpolation=cv2.INTER_CUBIC)
-            else:
-                img_scaled = img
-            result = pp_engine(img_scaled)
-            for item in result:
-                if item.get('type') == 'table':
-                    html = item.get('res', {}).get('html', '')
-                    if html:
-                        rows = parse_html_table(html)
-                        if rows:
-                            # Post-process mỗi cell
-                            rows = [[post_process_ocr_text(cell) for cell in row]
-                                    for row in rows]
-                            text = "\n".join(" | ".join(r) for r in rows)
-                            print(f"      PPStructure: {len(rows)} rows detected")
-                            return {"rows": rows, "text": text, "html": html}
-            # PPStructure ran but no table found → extract text
-            all_texts = []
-            for item in result:
-                res = item.get('res', [])
-                if isinstance(res, list):
-                    for line in res:
-                        if isinstance(line, dict) and 'text' in line:
-                            all_texts.append(line['text'])
-                        elif isinstance(line, (list, tuple)) and len(line) >= 2:
-                            text_info = line[1]
-                            if isinstance(text_info, (list, tuple)):
-                                all_texts.append(str(text_info[0]))
-                            else:
-                                all_texts.append(str(text_info))
-            if all_texts:
-                return {"rows": [all_texts], "text": "\n".join(all_texts)}
-        except Exception as e:
-            print(f"      PPStructure error: {e}, falling back to manual")
-    # === Strategy 2: Manual cell detection + OCR ===
     return ocr_table_manual(img, img_path, backend)
 def ocr_table_manual(img, img_path, backend="paddle"):
-    """
-    Fallback: detect table cells thủ công + OCR từng cell.
-    Cải thiện: upscale mỗi cell riêng, multi-pass OCR.
-    """
     cells = detect_table_structure(img)
     if cells:
-        reader = get_paddle_reader('vi') or get_easyocr_reader()
         ocr_results = []
         for (x1, y1, x2, y2) in cells:
-            # Bỏ cell quá lớn (toàn bộ bảng) hoặc quá nhỏ
             cell_w, cell_h = x2 - x1, y2 - y1
             img_h, img_w = img.shape[:2]
             if cell_w > img_w * 0.9 and cell_h > img_h * 0.9:
-                continue  # Skip full-table contour
             if cell_w < 15 or cell_h < 15:
                 continue
@@ -467,7 +453,7 @@ def ocr_table_manual(img, img_path, backend="paddle"):
             cx2 = min(img.shape[1], x2 + pad)
             cell_img = img[cy1:cy2, cx1:cx2]
-            text = ocr_cell_improved(cell_img, reader)
             if text:
                 ocr_results.append({
                     "text": post_process_ocr_text(text),
@@ -483,31 +469,36 @@ def ocr_table_manual(img, img_path, backend="paddle"):
                 "text": "\n".join(" | ".join(r) for r in rows)
             }
-    # === Strategy 3: OCR toàn bộ ảnh table, group theo hàng ===
     return ocr_table_fullimage(img, backend)
-def ocr_cell_improved(img_cell, reader):
     """OCR 1 cell — upscale mạnh, multi-preprocessing."""
     if img_cell.size == 0:
         return ""
     h, w = img_cell.shape[:2]
-    # Upscale cell nhỏ rất mạnh
     target_w = max(300, w)
     if w < target_w:
         scale = target_w / w
         img_cell = cv2.resize(img_cell, None, fx=scale, fy=scale,
                               interpolation=cv2.INTER_CUBIC)
-    # Try 2 variants
     best_text = ""
     best_conf = 0
     for variant in ["color", "binary"]:
         if variant == "color":
-            # Gentle enhancement
             img_proc = cv2.bilateralFilter(img_cell, 5, 50, 50)
             lab = cv2.cvtColor(img_proc, cv2.COLOR_BGR2LAB)
             l, a, b = cv2.split(lab)
@@ -531,8 +522,18 @@ def ocr_cell_improved(img_cell, reader):
 def ocr_table_fullimage(img, backend="paddle"):
-    """OCR toàn bộ ảnh table (không chia cell), group by rows."""
-    reader = get_paddle_reader('vi') or get_easyocr_reader()
     img_proc = preprocess_for_ocr(img, min_width=1500, mode="table")
     items = []
@@ -571,7 +572,6 @@ def ocr_table_fullimage(img, backend="paddle"):
     rows = group_rows(items, vertical_thresh_ratio=0.6)
     return {"rows": rows, "text": "\n".join(" | ".join(r) for r in rows)}
 # ============================================================
 # TABLE STRUCTURE DETECTION (giữ nguyên, có cải thiện nhỏ)
 # ============================================================
@@ -717,5 +717,5 @@ def run_pipeline(image_path, output_dir="outputs",
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"
-    result, _ = run_pipeline(img, ocr_backend="paddle")
     print(json.dumps(result, ensure_ascii=False, indent=2))

         _det_model = RTDETR(checkpoint)
     return _det_model
+# Thêm Surya OCR làm engine thứ 3
+from surya.ocr import run_ocr
+from surya.model.detection.model import load_det_processor, load_det_model
+from surya.model.recognition.model import load_rec_model
+from surya.model.recognition.processor import load_rec_processor
# Lazily-initialised tuple of Surya objects, in the order
# (det_processor, det_model, rec_model, rec_processor).
_surya_models = None


def _get_surya_models():
    """Return the Surya detection/recognition objects, loading them once.

    The loaders are called only on first use and the result is kept in the
    module-level ``_surya_models`` so that repeated OCR calls (for example
    one call per table cell) reuse the same objects instead of reloading
    them every time.

    Returns:
        Tuple ``(det_processor, det_model, rec_model, rec_processor)``.
    """
    global _surya_models
    if _surya_models is None:
        _surya_models = (
            load_det_processor(),
            load_det_model(),
            load_rec_model(),
            load_rec_processor(),
        )
    return _surya_models


def ocr_with_surya(img_bgr, langs=None):
    """Run Surya OCR on a BGR image and return the recognised text.

    Args:
        img_bgr: Image in OpenCV BGR channel order; it is converted to RGB
            before being handed to Surya.
        langs: Language codes passed to ``run_ocr`` for this image.
            Defaults to ``["vi", "en"]`` when omitted.

    Returns:
        The text of every recognised line joined with newlines, or an empty
        string when ``img_bgr`` is ``None`` or has no pixels.
    """
    # Nothing to recognise; also avoids an OpenCV error on empty input.
    if img_bgr is None or img_bgr.size == 0:
        return ""
    # A fresh list per call instead of a shared mutable default argument.
    if langs is None:
        langs = ["vi", "en"]

    from PIL import Image

    det_processor, det_model, rec_model, rec_processor = _get_surya_models()
    pil_img = Image.fromarray(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    # run_ocr takes a batch: one image and one language list per image.
    predictions = run_ocr([pil_img], [list(langs)], det_model, det_processor,
                          rec_model, rec_processor)
    return "\n".join(line.text for line in predictions[0].text_lines)
 def get_paddle_reader(lang='vi'):
     """
 # ============================================================
 # DUAL-ENGINE OCR — PaddleOCR (vi) + PaddleOCR (en), chọn tốt hơn
 # ============================================================
+def run_ocr_with_backend(img_bgr, backend="paddle", ocr_type="note"):
     """
+    Chạy OCR với backend được chọn.
+    backend: "paddle", "easyocr", "surya"
+    Trả về (list_of_texts, avg_confidence) - với surya, confidence luôn = 1.0
     """
+    if backend == "surya":
+        text = ocr_with_surya(img_bgr, langs=["vi", "en"])
+        lines = [line.strip() for line in text.split("\n") if line.strip()]
+        return lines, 1.0  # Surya không trả confidence, coi như 1.0
+    # logic cũ cho paddle + easyocr
+    reader_vi = get_paddle_reader('vi') if backend == "paddle" else None
+    reader_en = get_paddle_reader('en') if backend == "paddle" else None
+    if reader_vi is None and reader_en is None and backend == "paddle":
+        # fallback easyocr
         reader = get_easyocr_reader()
+        return multi_pass_ocr(img_bgr, reader, ocr_type)
     best_texts = []
     best_conf = 0.0
     best_lang = ""
     if reader_vi:
         texts_vi, conf_vi = multi_pass_ocr(img_bgr, reader_vi, ocr_type)
         if conf_vi > best_conf:
             best_texts = texts_vi
             best_lang = "vi"
     if reader_en:
         texts_en, conf_en = multi_pass_ocr(img_bgr, reader_en, ocr_type)
         if conf_en > best_conf:
             best_texts = texts_en
             best_lang = "en"
+    if best_lang:
+        print(f"      Best language: {best_lang} (conf={best_conf:.3f})")
+    else:
+        # fallback easyocr
+        reader = get_easyocr_reader()
+        best_texts, best_conf = multi_pass_ocr(img_bgr, reader, ocr_type)
     return best_texts, best_conf
 # OCR NOTE — Cải thiện
 # ============================================================
 def ocr_note(img_path, backend="paddle"):
     """OCR a note-region image file and return its cleaned text.

     Args:
         img_path: Path of the image, read with ``cv2.imread``.
         backend: OCR backend name, forwarded unchanged to
             ``run_ocr_with_backend``.

     Returns:
         The post-processed, non-empty lines joined with newlines, or an
         empty string when the image cannot be read.
     """
     note_img = cv2.imread(img_path)
     if note_img is None:
         return ""

     # The confidence value returned alongside the lines is not used here.
     raw_lines, _confidence = run_ocr_with_backend(
         note_img, backend=backend, ocr_type="note"
     )

     # Clean every line and keep only those that are still non-empty.
     cleaned_lines = [post_process_ocr_text(raw) for raw in raw_lines]
     return "\n".join(filter(None, cleaned_lines))
 def ocr_table(img_path, backend="paddle"):
     """OCR a table-region image file.

     With the "paddle" backend the PPStructure engine is tried first; if it
     is unavailable, raises, or yields no usable table, the work is handed to
     ``ocr_table_manual``. Any other backend goes straight to
     ``ocr_table_manual``.

     Args:
         img_path: Path of the table image, read with ``cv2.imread``.
         backend: OCR backend name; also forwarded to ``ocr_table_manual``.

     Returns:
         ``{"rows": [], "text": ""}`` when the image cannot be read;
         ``{"rows", "text", "html"}`` when PPStructure finds a table;
         otherwise whatever ``ocr_table_manual`` returns.
     """
     table_img = cv2.imread(img_path)
     if table_img is None:
         return {"rows": [], "text": ""}

     # PPStructure is only consulted for the paddle backend.
     pp_engine = get_pp_structure() if backend == "paddle" else None
     if pp_engine is not None:
         try:
             _, width = table_img.shape[:2]
             engine_input = table_img
             # Upscale narrow images to 1200 px wide before recognition.
             if width < 1200:
                 factor = 1200 / width
                 engine_input = cv2.resize(table_img, None, fx=factor, fy=factor,
                                           interpolation=cv2.INTER_CUBIC)

             for region in pp_engine(engine_input):
                 if region.get('type') != 'table':
                     continue
                 html = region.get('res', {}).get('html', '')
                 if not html:
                     continue
                 parsed_rows = parse_html_table(html)
                 if not parsed_rows:
                     continue
                 # First table with parsable rows wins.
                 rows = [[post_process_ocr_text(cell) for cell in parsed_row]
                         for parsed_row in parsed_rows]
                 text = "\n".join(" | ".join(r) for r in rows)
                 print(f"      PPStructure: {len(rows)} rows detected")
                 return {"rows": rows, "text": text, "html": html}
             # No usable table found: fall through to the manual strategy.
         except Exception as e:
             # Best-effort: any PPStructure failure falls back to manual OCR.
             print(f"      PPStructure error: {e}, falling back to manual")

     # Manual cell detection.
     return ocr_table_manual(table_img, img_path, backend)
 def ocr_table_manual(img, img_path, backend="paddle"):
     cells = detect_table_structure(img)
     if cells:
         ocr_results = []
         for (x1, y1, x2, y2) in cells:
             cell_w, cell_h = x2 - x1, y2 - y1
             img_h, img_w = img.shape[:2]
             if cell_w > img_w * 0.9 and cell_h > img_h * 0.9:
+                continue
             if cell_w < 15 or cell_h < 15:
                 continue
             cx2 = min(img.shape[1], x2 + pad)
             cell_img = img[cy1:cy2, cx1:cx2]
+            text = ocr_cell_improved(cell_img, backend=backend)
             if text:
                 ocr_results.append({
                     "text": post_process_ocr_text(text),
                 "text": "\n".join(" | ".join(r) for r in rows)
             }
     return ocr_table_fullimage(img, backend)
+def ocr_cell_improved(img_cell, backend="paddle"):
     """OCR 1 cell — upscale mạnh, multi-preprocessing."""
     if img_cell.size == 0:
         return ""
     h, w = img_cell.shape[:2]
     target_w = max(300, w)
     if w < target_w:
         scale = target_w / w
         img_cell = cv2.resize(img_cell, None, fx=scale, fy=scale,
                               interpolation=cv2.INTER_CUBIC)
+    if backend == "surya":
+        # Chạy Surya trực tiếp
+        text = ocr_with_surya(img_cell, langs=["vi", "en"])
+        return text.strip()
+    # logic cũ với reader (paddle/easyocr)
+    reader = get_paddle_reader('vi') if backend == "paddle" else get_easyocr_reader()
+    if reader is None:
+        reader = get_easyocr_reader()
     best_text = ""
     best_conf = 0
     for variant in ["color", "binary"]:
         if variant == "color":
             img_proc = cv2.bilateralFilter(img_cell, 5, 50, 50)
             lab = cv2.cvtColor(img_proc, cv2.COLOR_BGR2LAB)
             l, a, b = cv2.split(lab)
 def ocr_table_fullimage(img, backend="paddle"):
+    if backend == "surya":
+        # Dùng Surya OCR trên toàn bộ ảnh table
+        text = ocr_with_surya(img, langs=["vi", "en"])
+        lines = [line.strip() for line in text.split("\n") if line.strip()]
+        # Với Surya, ta không có bounding box, chỉ trả về một cột
+        rows = [[line] for line in lines]
+        return {"rows": rows, "text": text}
+    # logic cũ với paddle/easyocr
+    reader = get_paddle_reader('vi') if backend == "paddle" else get_easyocr_reader()
+    if reader is None:
+        reader = get_easyocr_reader()
     img_proc = preprocess_for_ocr(img, min_width=1500, mode="table")
     items = []
     rows = group_rows(items, vertical_thresh_ratio=0.6)
     return {"rows": rows, "text": "\n".join(" | ".join(r) for r in rows)}
 # ============================================================
 # TABLE STRUCTURE DETECTION (giữ nguyên, có cải thiện nhỏ)
 # ============================================================
 if __name__ == "__main__":
     # Script entry point: run the pipeline on the image given as the first
     # command-line argument (or "test.jpg") and print the result as JSON.
     import sys

     cli_args = sys.argv[1:]
     if cli_args:
         img = cli_args[0]
     else:
         img = "test.jpg"
     # run_pipeline returns a pair; only the first element is printed.
     result, _ = run_pipeline(img, ocr_backend="surya")
     print(json.dumps(result, ensure_ascii=False, indent=2))