Corin1998 committed on
Commit
2c42056
·
verified ·
1 Parent(s): cfc19b1

Create pdf_io.py

Browse files
Files changed (1) hide show
  1. core/pdf_io.py +33 -0
core/pdf_io.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import io, base64
3
+ from typing import List
4
+ import fitz # PyMuPDF
5
+ import pdfplumber
6
+
7
# Rasterization (for Vision input)
def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF to PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        dpi: Target resolution; converted to a zoom factor relative to
            the 72 dpi baseline.
        max_pages: Maximum number of pages to render, counted from the
            first page.

    Returns:
        One PNG byte string per rendered page, in page order. The list is
        empty when ``max_pages`` is zero or negative, or the document has
        no pages.
    """
    # The scale depends only on dpi, so build the matrix once rather than
    # once per page.
    zoom = dpi / 72.0  # 72 dpi baseline
    scale = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        for index, page in enumerate(doc):
            if index >= max_pages:
                break
            # alpha=False: render without an alpha channel.
            pixmap = page.get_pixmap(matrix=scale, alpha=False)
            images.append(pixmap.tobytes("png"))
    return images
19
+
20
# Text extraction (fallback for text-based PDFs)
def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
    """Extract page text from a PDF, capped at ``max_chars`` characters.

    Each page that yields non-empty text (after stripping surrounding
    whitespace) contributes a chunk of the form ``"[page N]\\n<text>"``,
    where N is the 1-based page number. Pages without extractable text are
    skipped. Chunks are joined with a blank line between them.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        max_chars: Upper bound on the length of the returned string.

    Returns:
        The joined chunks, sliced to at most ``max_chars`` characters.
    """
    chunks: List[str] = []
    # Running total of chunk lengths; avoids re-summing every chunk on
    # every page.
    collected = 0
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            # extract_text() may return None; treat that as empty text.
            text = (page.extract_text() or "").strip()
            if text:
                chunk = f"[page {page_no}]\n{text}"
                chunks.append(chunk)
                collected += len(chunk)
            # Stop reading further pages once enough text is gathered;
            # the final slice below enforces the exact limit.
            if collected > max_chars:
                break
    return "\n\n".join(chunks)[:max_chars]
31
+
32
def b64(img: bytes) -> str:
    """Return the Base64 encoding of *img* as a text string.

    Args:
        img: Raw bytes to encode.

    Returns:
        The standard Base64 representation of ``img``, decoded to ``str``.
    """
    encoded = base64.b64encode(img)
    return encoded.decode("utf-8")