Spaces:

howard9963
/

testComplianceLocal

Sleeping

App Files Files Community

howard9963 committed on Aug 14, 2025

Commit

53077f6

verified ·

1 Parent(s): d25eff9

Upload app.py

Browse files

Files changed (1) hide show

app.py +29 -10

app.py CHANGED Viewed

@@ -218,23 +218,42 @@ def _ocr_page_text(page) -> str:
 def _read_pdf_text(path: str) -> Tuple[str, int]:
     """
-    讀取 PDF：若該頁抽不到文字（可能是掃描影像），則以 OCR 進行辨識。
     回傳：(全文, 頁數)
     """
     try:
-        print(f"[READ] PDF: {os.path.basename(path)}")
-        parts: List[str] = []
         with fitz.open(path) as doc:
             for page in doc:
-                txt = (page.get_text("text") or "").strip()
-                if len(txt) < 20:  # 低於門檻判定影像頁
-                    ocr_txt = _ocr_page_text(page)
-                    parts.append(ocr_txt)
                 else:
-                    parts.append(txt)
-            return "\n".join(parts).strip(), len(doc)
     except Exception as e:
-        print(f"[READ][ERROR] PDF {path}: {e}")
         return "", 0
 def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]:

 def _read_pdf_text(path: str) -> Tuple[str, int]:
     """
+    讀取 PDF；如果抽到的文字少於 200 字，就用 OCR。
     回傳：(全文, 頁數)
     """
+    text_parts = []
     try:
         with fitz.open(path) as doc:
+            total_chars = 0
             for page in doc:
+                txt = page.get_text("text") or ""
+                total_chars += len(txt)
+                text_parts.append(txt.strip())
+            if total_chars >= 200:
+                print(f"[PDF] 偵測為文字 PDF（共 {total_chars} 字）")
+                return "\n".join(text_parts).strip(), len(doc)
+            # 不足 200 字 → 做 OCR
+            print(f"[PDF] 偵測為影像 PDF（僅 {total_chars} 字），進行 OCR")
+            ocr_text_parts = []
+            for page in doc:
+                pix = page.get_pixmap(dpi=240)
+                img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+                if OCR:
+                    res = OCR.ocr(np.array(img), cls=True)
+                    if res and res[0]:
+                        page_text = "\n".join([line[1][0] for line in res[0]])
+                    else:
+                        page_text = ""
                 else:
+                    page_text = ""
+                ocr_text_parts.append(page_text.strip())
+            return "\n".join(ocr_text_parts).strip(), len(doc)
     except Exception as e:
+        print(f"[PDF] 讀取錯誤：{e}")
         return "", 0
 def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]: