Spaces:

phamha
/

engineering-drawing

Sleeping

App Files Community

Harry Pham committed on Apr 2

Commit

f407757

1 Parent(s): 5c88957

Fix: remove paddlepaddle, pin versions for Python 3.12

Browse files

Files changed (2) hide show

requirements.txt +0 -3
src/inference.py +49 -167

requirements.txt CHANGED Viewed

@@ -139,9 +139,6 @@ opt-einsum==3.3.0
 orjson==3.11.8
 overrides==7.4.0
 packaging==23.2
-paddleocr==3.4.0
-paddlepaddle==3.3.1
-paddlex==3.4.3
 pandas==2.1.1
 pandocfilters==1.5.0
 parso==0.8.3

 orjson==3.11.8
 overrides==7.4.0
 packaging==23.2
 pandas==2.1.1
 pandocfilters==1.5.0
 parso==0.8.3

src/inference.py CHANGED Viewed

@@ -1,12 +1,10 @@
 # src/inference.py
-# ── Patch torch.load — PHẢI LÀ DÒNG ĐẦU TIÊN ──────────────
 import torch
 _orig_torch_load = torch.load
 def _patched_load(*args, **kwargs):
     kwargs.setdefault("weights_only", False)
     return _orig_torch_load(*args, **kwargs)
 torch.load = _patched_load
-# ───────────────────────────────────────────────────────────
 import cv2
 import json
@@ -14,143 +12,71 @@ import numpy as np
 from pathlib import Path
 from ultralytics import RTDETR
-# ── Device ─────────────────────────────────────────────────
-DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
 print(f"[INFO] Device: {DEVICE}")
-# ── Class config ────────────────────────────────────────────
 CLASS_NAMES   = ["note", "part-drawing", "table"]
-CLASS_DISPLAY = {
-    "note":         "Note",
-    "part-drawing": "PartDrawing",
-    "table":        "Table",
-}
-COLORS = {
-    "note":         (0,  165, 255),
-    "part-drawing": (0,  200,   0),
-    "table":        (0,   0,  220),
-}
-# ───────────────────────────────────────────────────────────
-# DETECTION MODEL
-# ───────────────────────────────────────────────────────────
-_det_model = None
-def get_det_model(checkpoint: str = "best.pt") -> RTDETR:
     global _det_model
     if _det_model is None:
-        print(f"[INFO] Loading detection model: {checkpoint}")
         _det_model = RTDETR(checkpoint)
     return _det_model
-# ───────────────────────────────────────────────────────────
-# OCR ENGINES
-# ───────────────────────────────────────────────────────────
-_easy_reader   = None
-_paddle_engine = None
 def get_easy_reader():
     global _easy_reader
     if _easy_reader is None:
         import easyocr
-        print("[INFO] Loading EasyOCR (vi + en)...")
-        _easy_reader = easyocr.Reader(
-            ["vi", "en"],
-            gpu=False,
-            verbose=False,
-        )
     return _easy_reader
-def get_paddle_engine():
-    global _paddle_engine
-    if _paddle_engine is None:
-        from paddleocr import PaddleOCR
-        print("[INFO] Loading PaddleOCR (vi)...")
-        _paddle_engine = PaddleOCR(
-            use_angle_cls=True,
-            lang="vi",
-            show_log=False,
-            use_gpu=False,
-        )
-    return _paddle_engine
-# ───────────────────────────────────────────────────────────
-# PREPROCESSING
-# ───────────────────────────────────────────────────────────
-def preprocess_for_ocr(img_bgr: np.ndarray) -> np.ndarray:
     h, w = img_bgr.shape[:2]
-    # Upscale nếu quá nhỏ
     if w < 800:
-        scale   = 800 / w
-        img_bgr = cv2.resize(
-            img_bgr,
-            (int(w * scale), int(h * scale)),
-            interpolation=cv2.INTER_CUBIC,
-        )
     gray  = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
-    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
     gray  = clahe.apply(gray)
     gray  = cv2.fastNlMeansDenoising(gray, h=15,
-                                      templateWindowSize=7,
-                                      searchWindowSize=21)
-    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
     gray   = cv2.filter2D(gray, -1, kernel)
     return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
-# ───────────────────────────────────────────────────────────
-# OCR: NOTE
-# ───────────────────────────────────────────────────────────
-def ocr_note(img_path: str) -> str:
     img = cv2.imread(img_path)
     if img is None:
         return ""
     img_proc = preprocess_for_ocr(img)
-    # EasyOCR
     try:
         reader  = get_easy_reader()
         results = reader.readtext(img_proc, detail=1, paragraph=False,
                                   width_ths=0.7, height_ths=0.7)
-        lines = [t for (_, t, c) in results if c >= 0.2 and t.strip()]
-        if lines:
-            return "\n".join(lines)
-    except Exception as e:
-        print(f"[WARN] EasyOCR note: {e}")
-    # Fallback PaddleOCR
-    try:
-        ocr    = get_paddle_engine()
-        result = ocr.ocr(img_proc, cls=True)
-        if result and result[0]:
-            return "\n".join(l[1][0] for l in result[0] if l[1][1] >= 0.2)
     except Exception as e:
-        print(f"[WARN] PaddleOCR note: {e}")
-    return ""
-# ───────────────────────────────────────────────────────────
-# OCR: TABLE
-# ───────────────────────────────────────────────────────────
-def _group_rows(items: list) -> list:
     if not items:
         return []
     items = sorted(items, key=lambda x: x["y"])
     y_vals = [it["y"] for it in items]
     if len(y_vals) > 1:
-        gaps   = [y_vals[i+1] - y_vals[i] for i in range(len(y_vals)-1)]
         thresh = max(8, (sum(gaps)/len(gaps)) * 0.6)
     else:
         thresh = 12
     rows, cur = [], [items[0]]
     for item in items[1:]:
         if item["y"] - cur[-1]["y"] < thresh:
@@ -163,16 +89,12 @@ def _group_rows(items: list) -> list:
     rows.append([i["text"] for i in cur])
     return rows
-def ocr_table(img_path: str) -> dict:
     img = cv2.imread(img_path)
     if img is None:
-        return {"rows": [], "text": ""}
     img_proc = preprocess_for_ocr(img)
-    items    = []
-    # EasyOCR
     try:
         reader  = get_easy_reader()
         results = reader.readtext(img_proc, detail=1, paragraph=False,
@@ -182,81 +104,46 @@ def ocr_table(img_path: str) -> dict:
                 continue
             items.append({
                 "text": text.strip(),
-                "y": sum(p[1] for p in pts) / 4,
-                "x": sum(p[0] for p in pts) / 4,
             })
     except Exception as e:
-        print(f"[WARN] EasyOCR table: {e}")
-    # Fallback PaddleOCR
-    if not items:
-        try:
-            ocr    = get_paddle_engine()
-            result = ocr.ocr(img_proc, cls=True)
-            if result and result[0]:
-                for line in result[0]:
-                    pts, (text, conf) = line[0], line[1]
-                    if conf < 0.2 or not text.strip():
-                        continue
-                    items.append({
-                        "text": text.strip(),
-                        "y": sum(p[1] for p in pts) / 4,
-                        "x": sum(p[0] for p in pts) / 4,
-                    })
-        except Exception as e:
-            print(f"[WARN] PaddleOCR table: {e}")
     if not items:
-        return {"rows": [], "text": ""}
     rows = _group_rows(items)
-    return {
-        "rows": rows,
-        "text": "\n".join(" | ".join(r) for r in rows),
-    }
-# ───────────────────────────────────────────────────────────
-# MAIN PIPELINE
-# ───────────────────────────────────────────────────────────
-def run_pipeline(
-    image_path:  str,
-    output_dir:  str   = "outputs",
-    checkpoint:  str   = "best.pt",
-    conf_thresh: float = 0.3,
-) -> tuple:
     image_path = str(image_path)
     img_name   = Path(image_path).name
     stem       = Path(image_path).stem
     crop_dir   = Path(output_dir) / stem / "crops"
     crop_dir.mkdir(parents=True, exist_ok=True)
-    # 1. Detect
     model   = get_det_model(checkpoint)
     results = model(image_path, imgsz=1024, conf=conf_thresh,
                     iou=0.5, device=DEVICE, verbose=False)
     img_bgr = cv2.imread(image_path)
     if img_bgr is None:
-        raise ValueError(f"Không đọc được ảnh: {image_path}")
     objects = []
     for i, box in enumerate(results[0].boxes):
-        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
         cls_idx  = int(box.cls[0])
         conf_val = round(float(box.conf[0]), 4)
         cls_raw  = CLASS_NAMES[cls_idx]
         cls_show = CLASS_DISPLAY[cls_raw]
-        # 2. Crop
         pad  = 6
         crop = img_bgr[max(0,y1-pad):min(img_bgr.shape[0],y2+pad),
                        max(0,x1-pad):min(img_bgr.shape[1],x2+pad)]
         crop_path = str(crop_dir / f"{cls_show}_{i+1}.jpg")
         cv2.imwrite(crop_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 95])
-        # 3. OCR
         ocr_content = None
         if cls_raw == "note":
             print(f"[OCR] Note #{i+1}...")
@@ -265,41 +152,36 @@ def run_pipeline(
         elif cls_raw == "table":
             print(f"[OCR] Table #{i+1}...")
             ocr_content = ocr_table(crop_path)
-            print(f"      → {repr(ocr_content.get('text','')[:80]) if ocr_content else 'EMPTY'}")
         objects.append({
-            "id":          i + 1,
-            "class":       cls_show,
-            "confidence":  conf_val,
-            "bbox":        {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
-            "crop_path":   crop_path,
             "ocr_content": ocr_content,
         })
-        # 4. Vẽ bbox
         color = COLORS[cls_raw]
-        cv2.rectangle(img_bgr, (x1, y1), (x2, y2), color, 2)
         label = f"{cls_show} {conf_val:.2f}"
-        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
-        cv2.rectangle(img_bgr, (x1, y1-th-10), (x1+tw+8, y1), color, -1)
-        cv2.putText(img_bgr, label, (x1+4, y1-4),
                     cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 2)
-    # 5. Lưu visualize
-    vis_path = str(Path(output_dir) / stem / "result_vis.jpg")
     cv2.imwrite(vis_path, img_bgr)
-    # 6. Lưu JSON
     result    = {"image": img_name, "objects": objects}
-    json_path = str(Path(output_dir) / stem / "result.json")
-    with open(json_path, "w", encoding="utf-8") as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
-    print(f"\n[✓] {len(objects)} objects | vis→{vis_path} | json→{json_path}")
     return result, vis_path
-# ── CLI ──────────────────────────────────────────────────────
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"

 # src/inference.py
 import torch
 _orig_torch_load = torch.load
 def _patched_load(*args, **kwargs):
     kwargs.setdefault("weights_only", False)
     return _orig_torch_load(*args, **kwargs)
 torch.load = _patched_load
 import cv2
 import json
 from pathlib import Path
 from ultralytics import RTDETR
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[INFO] Device: {DEVICE}")
 CLASS_NAMES   = ["note", "part-drawing", "table"]
+CLASS_DISPLAY = {"note": "Note", "part-drawing": "PartDrawing", "table": "Table"}
+COLORS        = {"note": (0,165,255), "part-drawing": (0,200,0), "table": (0,0,220)}
+_det_model  = None
+_easy_reader = None
+def get_det_model(checkpoint="best.pt"):
     global _det_model
     if _det_model is None:
+        print(f"[INFO] Loading model: {checkpoint}")
         _det_model = RTDETR(checkpoint)
     return _det_model
 def get_easy_reader():
     global _easy_reader
     if _easy_reader is None:
         import easyocr
+        print("[INFO] Loading EasyOCR...")
+        _easy_reader = easyocr.Reader(["vi","en"], gpu=False, verbose=False)
     return _easy_reader
+def preprocess_for_ocr(img_bgr):
     h, w = img_bgr.shape[:2]
     if w < 800:
+        scale = 800 / w
+        img_bgr = cv2.resize(img_bgr, (int(w*scale), int(h*scale)),
+                             interpolation=cv2.INTER_CUBIC)
     gray  = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
+    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
     gray  = clahe.apply(gray)
     gray  = cv2.fastNlMeansDenoising(gray, h=15,
+                templateWindowSize=7, searchWindowSize=21)
+    kernel = np.array([[0,-1,0],[-1,5,-1],[0,-1,0]])
     gray   = cv2.filter2D(gray, -1, kernel)
     return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
+def ocr_note(img_path):
     img = cv2.imread(img_path)
     if img is None:
         return ""
     img_proc = preprocess_for_ocr(img)
     try:
         reader  = get_easy_reader()
         results = reader.readtext(img_proc, detail=1, paragraph=False,
                                   width_ths=0.7, height_ths=0.7)
+        lines = [t for (_,t,c) in results if c >= 0.2 and t.strip()]
+        return "\n".join(lines)
     except Exception as e:
+        print(f"[WARN] ocr_note: {e}")
+        return ""
+def _group_rows(items):
     if not items:
         return []
     items = sorted(items, key=lambda x: x["y"])
     y_vals = [it["y"] for it in items]
     if len(y_vals) > 1:
+        gaps   = [y_vals[i+1]-y_vals[i] for i in range(len(y_vals)-1)]
         thresh = max(8, (sum(gaps)/len(gaps)) * 0.6)
     else:
         thresh = 12
     rows, cur = [], [items[0]]
     for item in items[1:]:
         if item["y"] - cur[-1]["y"] < thresh:
     rows.append([i["text"] for i in cur])
     return rows
+def ocr_table(img_path):
     img = cv2.imread(img_path)
     if img is None:
+        return {"rows":[], "text":""}
     img_proc = preprocess_for_ocr(img)
+    items = []
     try:
         reader  = get_easy_reader()
         results = reader.readtext(img_proc, detail=1, paragraph=False,
                 continue
             items.append({
                 "text": text.strip(),
+                "y": sum(p[1] for p in pts)/4,
+                "x": sum(p[0] for p in pts)/4,
             })
     except Exception as e:
+        print(f"[WARN] ocr_table: {e}")
     if not items:
+        return {"rows":[], "text":""}
     rows = _group_rows(items)
+    return {"rows": rows, "text": "\n".join(" | ".join(r) for r in rows)}
+def run_pipeline(image_path, output_dir="outputs",
+                 checkpoint="best.pt", conf_thresh=0.3):
     image_path = str(image_path)
     img_name   = Path(image_path).name
     stem       = Path(image_path).stem
     crop_dir   = Path(output_dir) / stem / "crops"
     crop_dir.mkdir(parents=True, exist_ok=True)
     model   = get_det_model(checkpoint)
     results = model(image_path, imgsz=1024, conf=conf_thresh,
                     iou=0.5, device=DEVICE, verbose=False)
     img_bgr = cv2.imread(image_path)
     if img_bgr is None:
+        raise ValueError(f"Cannot read: {image_path}")
     objects = []
     for i, box in enumerate(results[0].boxes):
+        x1,y1,x2,y2 = map(int, box.xyxy[0].tolist())
         cls_idx  = int(box.cls[0])
         conf_val = round(float(box.conf[0]), 4)
         cls_raw  = CLASS_NAMES[cls_idx]
         cls_show = CLASS_DISPLAY[cls_raw]
         pad  = 6
         crop = img_bgr[max(0,y1-pad):min(img_bgr.shape[0],y2+pad),
                        max(0,x1-pad):min(img_bgr.shape[1],x2+pad)]
         crop_path = str(crop_dir / f"{cls_show}_{i+1}.jpg")
         cv2.imwrite(crop_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 95])
         ocr_content = None
         if cls_raw == "note":
             print(f"[OCR] Note #{i+1}...")
         elif cls_raw == "table":
             print(f"[OCR] Table #{i+1}...")
             ocr_content = ocr_table(crop_path)
+            preview = ocr_content.get("text","")[:80]
+            print(f"      → {repr(preview) if preview else 'EMPTY'}")
         objects.append({
+            "id": i+1, "class": cls_show,
+            "confidence": conf_val,
+            "bbox": {"x1":x1,"y1":y1,"x2":x2,"y2":y2},
+            "crop_path": crop_path,
             "ocr_content": ocr_content,
         })
         color = COLORS[cls_raw]
+        cv2.rectangle(img_bgr, (x1,y1), (x2,y2), color, 2)
         label = f"{cls_show} {conf_val:.2f}"
+        (tw,th),_ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
+        cv2.rectangle(img_bgr, (x1,y1-th-10), (x1+tw+8,y1), color, -1)
+        cv2.putText(img_bgr, label, (x1+4,y1-4),
                     cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,255,255), 2)
+    vis_path = str(Path(output_dir)/stem/"result_vis.jpg")
     cv2.imwrite(vis_path, img_bgr)
     result    = {"image": img_name, "objects": objects}
+    json_path = str(Path(output_dir)/stem/"result.json")
+    with open(json_path,"w",encoding="utf-8") as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
+    print(f"[✓] {len(objects)} objects | {vis_path} | {json_path}")
     return result, vis_path
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"