howard9963 committed on
Commit
53077f6
·
verified ·
1 Parent(s): d25eff9

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -10
app.py CHANGED
@@ -218,23 +218,42 @@ def _ocr_page_text(page) -> str:
218
 
219
  def _read_pdf_text(path: str) -> Tuple[str, int]:
220
  """
221
- 讀取 PDF:若該頁抓不到文字(可能是掃描影像)則以 OCR 進行辨識
222
  回傳:(全文, 頁數)
223
  """
 
224
  try:
225
- print(f"[READ] PDF: {os.path.basename(path)}")
226
- parts: List[str] = []
227
  with fitz.open(path) as doc:
 
228
  for page in doc:
229
- txt = (page.get_text("text") or "").strip()
230
- if len(txt) < 20: # 低於門檻判定影像頁
231
- ocr_txt = _ocr_page_text(page)
232
- parts.append(ocr_txt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  else:
234
- parts.append(txt)
235
- return "\n".join(parts).strip(), len(doc)
 
 
 
236
  except Exception as e:
237
- print(f"[READ][ERROR] PDF {path}: {e}")
238
  return "", 0
239
 
240
  def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]:
 
218
 
219
  def _read_pdf_text(path: str) -> Tuple[str, int]:
220
  """
221
+ 讀取 PDF;如果抽到文字少於 200 字就用 OCR。
222
  回傳:(全文, 頁數)
223
  """
224
+ text_parts = []
225
  try:
 
 
226
  with fitz.open(path) as doc:
227
+ total_chars = 0
228
  for page in doc:
229
+ txt = page.get_text("text") or ""
230
+ total_chars += len(txt)
231
+ text_parts.append(txt.strip())
232
+
233
+ if total_chars >= 200:
234
+ print(f"[PDF] 偵測為文字 PDF(共 {total_chars} 字)")
235
+ return "\n".join(text_parts).strip(), len(doc)
236
+
237
+ # 不足 200 字 → 做 OCR
238
+ print(f"[PDF] 偵測為影像 PDF(僅 {total_chars} 字),進行 OCR")
239
+ ocr_text_parts = []
240
+ for page in doc:
241
+ pix = page.get_pixmap(dpi=240)
242
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
243
+ if OCR:
244
+ res = OCR.ocr(np.array(img), cls=True)
245
+ if res and res[0]:
246
+ page_text = "\n".join([line[1][0] for line in res[0]])
247
+ else:
248
+ page_text = ""
249
  else:
250
+ page_text = ""
251
+ ocr_text_parts.append(page_text.strip())
252
+
253
+ return "\n".join(ocr_text_parts).strip(), len(doc)
254
+
255
  except Exception as e:
256
+ print(f"[PDF] 讀取錯誤:{e}")
257
  return "", 0
258
 
259
  def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]: