Spaces:

Corin1998
/

Score

Sleeping

Corin1998 committed on Aug 28, 2025

Commit

2394322

verified ·

1 Parent(s): 86c3008

Update core/pdf_io.py

Files changed (1) hide show

core/pdf_io.py CHANGED Viewed

@@ -1,33 +1,24 @@
 from __future__ import annotations
-import io, base64
 from typing import List
 import fitz  # PyMuPDF
 import pdfplumber
-# 画像化（Vision用）
 def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
     images: List[bytes] = []
     with fitz.open(pdf_path) as doc:
         for i, page in enumerate(doc):
-            if i >= max_pages:
-                break
-            zoom = dpi / 72.0  # 72dpi基準
             mat = fitz.Matrix(zoom, zoom)
             pix = page.get_pixmap(matrix=mat, alpha=False)
             images.append(pix.tobytes("png"))
     return images
-# テキスト抽出（テキストPDF用のフォールバック）
-def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
     chunks = []
     with pdfplumber.open(pdf_path) as pdf:
-        for i, page in enumerate(pdf.pages):
             t = (page.extract_text() or "").strip()
-            if t:
-                chunks.append(f"[page {i+1}]\n{t}")
-            if sum(len(c) for c in chunks) > max_chars:
-                break
     return "\n\n".join(chunks)[:max_chars]
-def b64(img: bytes) -> str:
-    return base64.b64encode(img).decode("utf-8")

 from __future__ import annotations
 from typing import List
 import fitz  # PyMuPDF
 import pdfplumber
 def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
     images: List[bytes] = []
     with fitz.open(pdf_path) as doc:
         for i, page in enumerate(doc):
+            if i >= max_pages: break
+            zoom = dpi / 72.0
             mat = fitz.Matrix(zoom, zoom)
             pix = page.get_pixmap(matrix=mat, alpha=False)
             images.append(pix.tobytes("png"))
     return images
+def pdf_to_text(pdf_path: str, max_chars: int = 15000, pages:int=3) -> str:
     chunks = []
     with pdfplumber.open(pdf_path) as pdf:
+        for i, page in enumerate(pdf.pages[:pages]):
             t = (page.extract_text() or "").strip()
+            if t: chunks.append(f"[page {i+1}]\n{t}")
+            if sum(len(c) for c in chunks) > max_chars: break
     return "\n\n".join(chunks)[:max_chars]