Spaces:

Corin1998
/

Score

Sleeping

Corin1998 committed on Aug 27, 2025

Commit

41a0a5e

verified ·

1 Parent(s): 5a9913b

Create pdf_utils.py

Files changed (1) hide show

core/pdf_utils.py ADDED Viewed

+import io, shutil
+from typing import List
+from pdf2image import convert_from_path
+import pdfplumber
+def pdf_to_images(pdf_path: str, dpi: int = 220, max_pages: int = 6) -> List[bytes]:
+    if not shutil.which("pdftoppm"):
+        # 環境になくても UI は出す。上位でテキスト抽出にフォールバックさせる。
+        raise RuntimeError("pdftoppm（poppler-utils）が見つかりません。画像化はスキップします。")
+    pages = convert_from_path(pdf_path, dpi=dpi, fmt="png")
+    out: List[bytes] = []
+    for i, p in enumerate(pages):
+        if i >= max_pages:
+            break
+        buf = io.BytesIO()
+        p.save(buf, format="PNG")
+        out.append(buf.getvalue())
+    return out
+def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
+    chunks: List[str] = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for i, page in enumerate(pdf.pages):
+            t = (page.extract_text() or "").strip()
+            if t:
+                chunks.append(f"[page {i+1}]\n{t}")
+            if sum(len(c) for c in chunks) > max_chars:
+                break
+    return "\n\n".join(chunks)[:max_chars]