Corin1998 committed on
Commit
2394322
·
verified ·
1 Parent(s): 86c3008

Update core/pdf_io.py

Browse files
Files changed (1) hide show
  1. core/pdf_io.py +6 -15
core/pdf_io.py CHANGED
@@ -1,33 +1,24 @@
1
  from __future__ import annotations
2
- import io, base64
3
  from typing import List
4
  import fitz # PyMuPDF
5
  import pdfplumber
6
 
# Rasterize PDF pages to images (input for the Vision step).
def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the leading pages of a PDF as PNG-encoded images.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        dpi: Target resolution; must be positive. The zoom factor is
            ``dpi / 72`` because 72 dpi is the baseline resolution.
        max_pages: Upper bound on the number of pages rendered, counted
            from the first page. Zero or a negative value renders nothing.

    Returns:
        One PNG byte string per rendered page, in page order.

    Raises:
        ValueError: If ``dpi`` is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # The scale is identical for every page, so build the matrix once
    # instead of once per loop iteration.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            # alpha=False: render without a transparency channel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            images.append(pix.tobytes("png"))
    return images
19
 
# Text extraction (fallback for PDFs that carry a text layer).
def pdf_to_text(pdf_path: str, max_chars: int = 15000) -> str:
    """Extract page text from a PDF, capped at ``max_chars`` characters.

    Pages are read in order. Each page that yields non-empty text contributes
    one chunk of the form ``"[page N]\\n<text>"`` (N is 1-based); chunks are
    joined with a blank line. Reading stops as soon as the collected chunks
    exceed ``max_chars``, and the joined result is truncated to that length.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        max_chars: Maximum length of the returned string.

    Returns:
        The concatenated page texts, at most ``max_chars`` characters long.
    """
    chunks = []
    # Running total of chunk lengths; avoids re-summing every chunk on
    # each page.
    collected = 0
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            # extract_text() may return None, hence the ``or ""``.
            t = (page.extract_text() or "").strip()
            if t:
                chunk = f"[page {i+1}]\n{t}"
                chunks.append(chunk)
                collected += len(chunk)
            if collected > max_chars:
                break
    return "\n\n".join(chunks)[:max_chars]
31
-
def b64(img: bytes) -> str:
    """Return the Base64 encoding of ``img`` as a text string.

    Args:
        img: Raw bytes to encode.

    Returns:
        The Base64 representation, decoded from bytes to ``str`` as UTF-8.
    """
    return base64.b64encode(img).decode("utf-8")
 
1
  from __future__ import annotations
 
2
  from typing import List
3
  import fitz # PyMuPDF
4
  import pdfplumber
5
 
 
def pdf_to_images(pdf_path: str, dpi: int = 200, max_pages: int = 6) -> List[bytes]:
    """Render the first pages of a PDF to PNG byte strings.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.
        dpi: Rendering resolution; must be greater than zero. Pages are
            scaled by ``dpi / 72.0`` on both axes.
        max_pages: Maximum number of pages to render, starting at the first
            page. A value of zero or below yields an empty list.

    Returns:
        A list holding one PNG-encoded image per rendered page, in order.

    Raises:
        ValueError: If ``dpi`` is zero or negative.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # Loop-invariant: every page uses the same scale, so the matrix is
    # created once up front.
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    images: List[bytes] = []
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            # Rendered without an alpha channel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            images.append(pix.tobytes("png"))
    return images
16
 
def pdf_to_text(pdf_path: str, max_chars: int = 15000, pages: int = 3) -> str:
    """Extract text from the leading pages of a PDF, capped at ``max_chars``.

    Only ``pdf.pages[:pages]`` is read. Each page that yields non-empty text
    contributes one chunk of the form ``"[page N]\\n<text>"`` (N is 1-based
    within that slice); chunks are joined with a blank line. Reading stops
    once the collected chunks exceed ``max_chars``, and the joined result is
    truncated to that length.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.
        max_chars: Maximum length of the returned string.
        pages: Number of leading pages to consider; applied as the slice
            bound ``pdf.pages[:pages]``.

    Returns:
        The concatenated page texts, at most ``max_chars`` characters long.
    """
    chunks = []
    # Running total of chunk lengths; replaces re-summing all chunks after
    # every page.
    collected = 0
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages[:pages]):
            # extract_text() may return None, hence the ``or ""``.
            t = (page.extract_text() or "").strip()
            if t:
                chunk = f"[page {i+1}]\n{t}"
                chunks.append(chunk)
                collected += len(chunk)
            if collected > max_chars:
                break
    return "\n\n".join(chunks)[:max_chars]