Corin1998 committed on
Commit
41a0a5e
·
verified ·
1 Parent(s): 5a9913b

Create pdf_utils.py

Browse files
Files changed (1) hide show
  1. core/pdf_utils.py +29 -0
core/pdf_utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, shutil
2
+ from typing import List
3
+ from pdf2image import convert_from_path
4
+ import pdfplumber
5
+
6
def pdf_to_images(pdf_path: str, dpi: int = 220, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF as PNG-encoded byte strings.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        max_pages: Upper bound on the number of pages rendered, counted from
            the first page. Zero or a negative value yields an empty list.

    Returns:
        One PNG byte string per rendered page, in page order.

    Raises:
        RuntimeError: If the ``pdftoppm`` executable cannot be found on PATH.
    """
    # No pages requested: there is nothing to render, so skip both the
    # poppler probe and the conversion.
    if max_pages <= 0:
        return []
    if not shutil.which("pdftoppm"):
        # The UI should still come up when poppler is not installed; the
        # caller is expected to fall back to text extraction.
        raise RuntimeError("pdftoppm(poppler-utils)が見つかりません。画像化はスキップします。")
    # Request only the pages that will be kept instead of rasterizing the
    # whole document and discarding the tail.
    pages = convert_from_path(pdf_path, dpi=dpi, fmt="png", last_page=max_pages)
    out: List[bytes] = []
    # The slice is a defensive cap in case more pages come back than asked.
    for page in pages[:max_pages]:
        buf = io.BytesIO()
        page.save(buf, format="PNG")
        out.append(buf.getvalue())
    return out
19
+
20
def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
    """Extract page-labelled text from a PDF, capped at ``max_chars``.

    Each page that yields non-blank text contributes a chunk of the form
    ``"[page N]\\n<text>"`` (N is 1-based); chunks are joined with a blank
    line. Reading stops once the collected chunks exceed the budget, and the
    joined result is truncated to exactly ``max_chars`` characters.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum length of the returned string. Zero or a negative
            value yields an empty string.

    Returns:
        The concatenated page text, at most ``max_chars`` characters long.
    """
    # A negative budget would make the final slice trim from the end rather
    # than cap the length, so treat every non-positive budget as "no text".
    if max_chars <= 0:
        return ""
    chunks: List[str] = []
    collected = 0  # running length of all chunks, separators excluded
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            text = (page.extract_text() or "").strip()
            if not text:
                continue
            chunk = f"[page {page_no}]\n{text}"
            chunks.append(chunk)
            collected += len(chunk)
            # Stop reading further pages once the budget is exceeded; the
            # slice below enforces the exact limit.
            if collected > max_chars:
                break
    return "\n\n".join(chunks)[:max_chars]