Corin1998 committed on
Commit
45b5c62
·
verified ·
1 Parent(s): 19256c1

Update core/extract.py

Browse files
Files changed (1) hide show
  1. core/extract.py +8 -24
core/extract.py CHANGED
@@ -16,7 +16,6 @@ def env_summary() -> str:
16
  return " / ".join(out)
17
 
18
  def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
19
- """pdfminer ベースの純テキスト抽出(速い)"""
20
  log = []
21
  chunks = []
22
  try:
@@ -35,33 +34,21 @@ def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
35
  return "", "\n".join(log)
36
 
37
  def _pick_business_text(raw_text: str) -> str:
38
- """事業説明/会社概要っぽい段落を拾う(AI補足用)"""
39
  keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
40
  best = ""
41
  for block in raw_text.split("\n\n"):
42
  if any(k in block for k in keys):
43
- best = block if len(block) > len(best) else best
44
- return (best or raw_text[:1200])
 
45
 
46
- def parse_pdf(
47
- file_paths: List[str],
48
- force_ocr: bool = False,
49
- dpi: int = 220,
50
- max_pages: int = 8
51
- ) -> Tuple[List[bytes], str, str, str]:
52
- """
53
- Returns:
54
- images : Vision へ渡せる PNG バイト列(最大 max_pages)
55
- raw_text : テキスト抽出結果(テキストモデルのフォールバック用)
56
- business : 事業説明に近いテキスト(AI所見の市場/製品補足用)
57
- debug_log : 抽出ログ(UI に表示)
58
- """
59
  if not file_paths:
60
  raise ExtractError("PDFが指定されていません。")
61
 
62
  debug_lines = [f"[env] {env_summary()}"]
63
 
64
- # ---- まずは全ファイルからテキスト抽出(速い・確実)
65
  all_text = []
66
  for p in file_paths:
67
  txt, lg = _pdf_to_text(p)
@@ -69,14 +56,14 @@ def parse_pdf(
69
  all_text.append(txt)
70
  raw_text = "\n\n".join(all_text)
71
 
72
- # ---- 画像化(Vision 用)。テキストが薄い/OCR強制なら実行
73
  images: List[bytes] = []
74
  need_images = force_ocr or (len(raw_text) < 500)
75
  if need_images:
76
  try:
77
  for p in file_paths:
78
  pages = convert_from_path(p, dpi=dpi, fmt="png")
79
- for i, pg in enumerate(pages):
80
  if len(images) >= max_pages:
81
  break
82
  buf = io.BytesIO()
@@ -84,13 +71,10 @@ def parse_pdf(
84
  images.append(buf.getvalue())
85
  debug_lines.append(f"[image] generated pages: {len(images)}")
86
  except Exception as e:
87
- # Poppler 未導入や壊れ PDF を丁寧に通知
88
  debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
89
  if shutil.which("pdftoppm") is None:
90
  raise ExtractError(
91
- "PDFの画像化に失敗しました(Poppler 未検出)。"
92
- "Space の packages.txt に `poppler-utils` を入れて再ビルドしてください。"
93
  )
94
- # 画像化は諦め、テキストのみで続行
95
  business = _pick_business_text(raw_text)
96
  return images, raw_text, business, "\n".join(debug_lines)
 
16
  return " / ".join(out)
17
 
18
  def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
 
19
  log = []
20
  chunks = []
21
  try:
 
34
  return "", "\n".join(log)
35
 
36
  def _pick_business_text(raw_text: str) -> str:
 
37
  keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
38
  best = ""
39
  for block in raw_text.split("\n\n"):
40
  if any(k in block for k in keys):
41
+ if len(block) > len(best):
42
+ best = block
43
+ return best or raw_text[:1200]
44
 
45
+ def parse_pdf(file_paths: List[str], force_ocr: bool = False, dpi: int = 220, max_pages: int = 8) -> Tuple[List[bytes], str, str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
46
  if not file_paths:
47
  raise ExtractError("PDFが指定されていません。")
48
 
49
  debug_lines = [f"[env] {env_summary()}"]
50
 
51
+ # テキスト抽出
52
  all_text = []
53
  for p in file_paths:
54
  txt, lg = _pdf_to_text(p)
 
56
  all_text.append(txt)
57
  raw_text = "\n\n".join(all_text)
58
 
59
+ # 画像化(必要なときだけ)
60
  images: List[bytes] = []
61
  need_images = force_ocr or (len(raw_text) < 500)
62
  if need_images:
63
  try:
64
  for p in file_paths:
65
  pages = convert_from_path(p, dpi=dpi, fmt="png")
66
+ for pg in pages:
67
  if len(images) >= max_pages:
68
  break
69
  buf = io.BytesIO()
 
71
  images.append(buf.getvalue())
72
  debug_lines.append(f"[image] generated pages: {len(images)}")
73
  except Exception as e:
 
74
  debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
75
  if shutil.which("pdftoppm") is None:
76
  raise ExtractError(
77
+ "PDFの画像化に失敗しました(Poppler 未検出)。packages.txt に `poppler-utils` を追加して再ビルドしてください。"
 
78
  )
 
79
  business = _pick_business_text(raw_text)
80
  return images, raw_text, business, "\n".join(debug_lines)