Spaces:

Corin1998
/

Score

Sleeping

Corin1998 committed on Aug 27, 2025

Commit

2c42056

verified ·

1 Parent(s): cfc19b1

Create pdf_io.py

Files changed (1) hide show

core/pdf_io.py ADDED Viewed

+from __future__ import annotations
+import io, base64
+from typing import List
+import fitz  # PyMuPDF
+import pdfplumber
# Rasterization (for Vision-model input)
def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF as PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        dpi: Target resolution. Pages are scaled by ``dpi / 72``.
        max_pages: Maximum number of pages to render, counted from the
            first page of the document.

    Returns:
        One PNG byte string per rendered page, in page order. The list is
        empty when ``max_pages`` is not positive.

    Raises:
        ValueError: If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")
    if max_pages <= 0:
        return []

    # 72 dpi is the baseline (zoom 1.0). The scale is the same for every
    # page, so the matrix is built once instead of once per page.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            # alpha=False: render without a transparency channel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            images.append(pix.tobytes("png"))
    return images
# Text extraction (fallback for text-based PDFs)
def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
    """Extract page text from a PDF, truncated to ``max_chars`` characters.

    Each page that yields non-empty text contributes one chunk of the form
    ``"[page N]\\n<text>"`` (N is the 1-based page number). Chunks are
    joined with a blank line between them.

    Args:
        pdf_path: Path of the PDF file, opened with ``pdfplumber``.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The joined page chunks, cut to at most ``max_chars`` characters.
        An empty string when ``max_chars`` is not positive.
    """
    # A negative limit would make the final slice drop characters from the
    # end instead of limiting the length, so treat it like zero.
    if max_chars <= 0:
        return ""

    chunks = []
    total = 0  # running length of the collected chunks (separators excluded)
    with pdfplumber.open(pdf_path) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            text = (page.extract_text() or "").strip()
            if not text:
                continue
            chunk = f"[page {number}]\n{text}"
            chunks.append(chunk)
            total += len(chunk)
            # Enough text collected; the slice below trims the overshoot.
            if total > max_chars:
                break
    return "\n\n".join(chunks)[:max_chars]
def b64(img: bytes) -> str:
    """Return ``img`` encoded as a Base64 text string."""
    encoded = base64.b64encode(img)
    return encoded.decode("utf-8")