Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -218,23 +218,42 @@ def _ocr_page_text(page) -> str:
|
|
| 218 |
|
| 219 |
def _read_pdf_text(path: str) -> Tuple[str, int]:
|
| 220 |
"""
|
| 221 |
-
讀取 PDF
|
| 222 |
回傳:(全文, 頁數)
|
| 223 |
"""
|
|
|
|
| 224 |
try:
|
| 225 |
-
print(f"[READ] PDF: {os.path.basename(path)}")
|
| 226 |
-
parts: List[str] = []
|
| 227 |
with fitz.open(path) as doc:
|
|
|
|
| 228 |
for page in doc:
|
| 229 |
-
txt =
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
else:
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
| 236 |
except Exception as e:
|
| 237 |
-
print(f"[
|
| 238 |
return "", 0
|
| 239 |
|
| 240 |
def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]:
|
|
|
|
| 218 |
|
| 219 |
def _read_pdf_text(
    path: str,
    *,
    min_text_chars: int = 200,
    ocr_dpi: int = 240,
) -> Tuple[str, int]:
    """Read a PDF and return its text, falling back to OCR for scanned files.

    The embedded text layer is extracted first. If it holds fewer than
    ``min_text_chars`` non-padding characters, the PDF is treated as an
    image (scanned) PDF and every page is rendered and passed to the
    module-level ``OCR`` engine.

    Args:
        path: Filesystem path of the PDF to read.
        min_text_chars: Minimum number of extracted characters (after
            stripping each page) for the PDF to count as a text PDF.
        ocr_dpi: Resolution used when rendering pages for OCR.

    Returns:
        A ``(full_text, page_count)`` tuple. On any error the function
        prints the error and returns ``("", 0)`` instead of raising.
    """
    try:
        with fitz.open(path) as doc:
            page_count = len(doc)

            # Strip each page before counting so whitespace-only output from
            # scanned pages cannot push the total over the threshold.
            text_parts = [(page.get_text("text") or "").strip() for page in doc]
            total_chars = sum(len(part) for part in text_parts)
            extracted_text = "\n".join(text_parts).strip()

            if total_chars >= min_text_chars:
                print(f"[PDF] 偵測為文字 PDF(共 {total_chars} 字)")
                return extracted_text, page_count

            # Below the threshold -> treat as a scanned PDF and run OCR.
            print(f"[PDF] 偵測為影像 PDF(僅 {total_chars} 字),進行 OCR")

            if not OCR:
                # No OCR engine: skip the expensive page rendering and keep
                # whatever little text was extracted rather than returning "".
                return extracted_text, page_count

            # NOTE(review): the file also defines `_ocr_page_text(page)`;
            # consider reusing it here if its behavior matches — TODO confirm.
            ocr_text_parts = []
            for page in doc:
                pix = page.get_pixmap(dpi=ocr_dpi)
                img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
                res = OCR.ocr(np.array(img), cls=True)
                # The recognized string is read from line[1][0] of each entry
                # in res[0]; an empty/None result means no text on the page.
                lines = res[0] if res and res[0] else []
                page_text = "\n".join(line[1][0] for line in lines)
                ocr_text_parts.append(page_text.strip())

            ocr_text = "\n".join(ocr_text_parts).strip()
            # If OCR found nothing, fall back to the sparse extracted text.
            return (ocr_text or extracted_text), page_count

    except Exception as e:
        # Best-effort reader: callers receive an empty result on failure.
        print(f"[PDF] 讀取錯誤:{e}")
        return "", 0
|
| 258 |
|
| 259 |
def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]:
|