Update core/extract.py
Browse files- core/extract.py +8 -24
core/extract.py
CHANGED
|
@@ -16,7 +16,6 @@ def env_summary() -> str:
|
|
| 16 |
return " / ".join(out)
|
| 17 |
|
| 18 |
def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
|
| 19 |
-
"""pdfminer ベースの純テキスト抽出(速い)"""
|
| 20 |
log = []
|
| 21 |
chunks = []
|
| 22 |
try:
|
|
@@ -35,33 +34,21 @@ def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
|
|
| 35 |
return "", "\n".join(log)
|
| 36 |
|
| 37 |
def _pick_business_text(raw_text: str) -> str:
|
| 38 |
-
"""事業説明/会社概要っぽい段落を拾う(AI補足用)"""
|
| 39 |
keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
|
| 40 |
best = ""
|
| 41 |
for block in raw_text.split("\n\n"):
|
| 42 |
if any(k in block for k in keys):
|
| 43 |
-
|
| 44 |
-
|
|
|
|
| 45 |
|
| 46 |
-
def parse_pdf(
|
| 47 |
-
file_paths: List[str],
|
| 48 |
-
force_ocr: bool = False,
|
| 49 |
-
dpi: int = 220,
|
| 50 |
-
max_pages: int = 8
|
| 51 |
-
) -> Tuple[List[bytes], str, str, str]:
|
| 52 |
-
"""
|
| 53 |
-
Returns:
|
| 54 |
-
images : Vision へ渡せる PNG バイト列(最大 max_pages)
|
| 55 |
-
raw_text : テキスト抽出結果(テキストモデルのフォールバック用)
|
| 56 |
-
business : 事業説明に近いテキスト(AI所見の市場/製品補足用)
|
| 57 |
-
debug_log : 抽出ログ(UI に表示)
|
| 58 |
-
"""
|
| 59 |
if not file_paths:
|
| 60 |
raise ExtractError("PDFが指定されていません。")
|
| 61 |
|
| 62 |
debug_lines = [f"[env] {env_summary()}"]
|
| 63 |
|
| 64 |
-
#
|
| 65 |
all_text = []
|
| 66 |
for p in file_paths:
|
| 67 |
txt, lg = _pdf_to_text(p)
|
|
@@ -69,14 +56,14 @@ def parse_pdf(
|
|
| 69 |
all_text.append(txt)
|
| 70 |
raw_text = "\n\n".join(all_text)
|
| 71 |
|
| 72 |
-
#
|
| 73 |
images: List[bytes] = []
|
| 74 |
need_images = force_ocr or (len(raw_text) < 500)
|
| 75 |
if need_images:
|
| 76 |
try:
|
| 77 |
for p in file_paths:
|
| 78 |
pages = convert_from_path(p, dpi=dpi, fmt="png")
|
| 79 |
-
for
|
| 80 |
if len(images) >= max_pages:
|
| 81 |
break
|
| 82 |
buf = io.BytesIO()
|
|
@@ -84,13 +71,10 @@ def parse_pdf(
|
|
| 84 |
images.append(buf.getvalue())
|
| 85 |
debug_lines.append(f"[image] generated pages: {len(images)}")
|
| 86 |
except Exception as e:
|
| 87 |
-
# Poppler 未導入や壊れ PDF を丁寧に通知
|
| 88 |
debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
|
| 89 |
if shutil.which("pdftoppm") is None:
|
| 90 |
raise ExtractError(
|
| 91 |
-
"PDFの画像化に失敗しました(Poppler 未検出)。"
|
| 92 |
-
"Space の packages.txt に `poppler-utils` を入れて再ビルドしてください。"
|
| 93 |
)
|
| 94 |
-
# 画像化は諦め、テキストのみで続行
|
| 95 |
business = _pick_business_text(raw_text)
|
| 96 |
return images, raw_text, business, "\n".join(debug_lines)
|
|
|
|
| 16 |
return " / ".join(out)
|
| 17 |
|
| 18 |
def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
|
|
|
|
| 19 |
log = []
|
| 20 |
chunks = []
|
| 21 |
try:
|
|
|
|
| 34 |
return "", "\n".join(log)
|
| 35 |
|
| 36 |
def _pick_business_text(raw_text: str) -> str:
|
|
|
|
| 37 |
keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
|
| 38 |
best = ""
|
| 39 |
for block in raw_text.split("\n\n"):
|
| 40 |
if any(k in block for k in keys):
|
| 41 |
+
if len(block) > len(best):
|
| 42 |
+
best = block
|
| 43 |
+
return best or raw_text[:1200]
|
| 44 |
|
| 45 |
+
def parse_pdf(file_paths: List[str], force_ocr: bool = False, dpi: int = 220, max_pages: int = 8) -> Tuple[List[bytes], str, str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if not file_paths:
|
| 47 |
raise ExtractError("PDFが指定されていません。")
|
| 48 |
|
| 49 |
debug_lines = [f"[env] {env_summary()}"]
|
| 50 |
|
| 51 |
+
# テキスト抽出
|
| 52 |
all_text = []
|
| 53 |
for p in file_paths:
|
| 54 |
txt, lg = _pdf_to_text(p)
|
|
|
|
| 56 |
all_text.append(txt)
|
| 57 |
raw_text = "\n\n".join(all_text)
|
| 58 |
|
| 59 |
+
# 画像化(必要なときだけ)
|
| 60 |
images: List[bytes] = []
|
| 61 |
need_images = force_ocr or (len(raw_text) < 500)
|
| 62 |
if need_images:
|
| 63 |
try:
|
| 64 |
for p in file_paths:
|
| 65 |
pages = convert_from_path(p, dpi=dpi, fmt="png")
|
| 66 |
+
for pg in pages:
|
| 67 |
if len(images) >= max_pages:
|
| 68 |
break
|
| 69 |
buf = io.BytesIO()
|
|
|
|
| 71 |
images.append(buf.getvalue())
|
| 72 |
debug_lines.append(f"[image] generated pages: {len(images)}")
|
| 73 |
except Exception as e:
|
|
|
|
| 74 |
debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
|
| 75 |
if shutil.which("pdftoppm") is None:
|
| 76 |
raise ExtractError(
|
| 77 |
+
"PDFの画像化に失敗しました(Poppler 未検出)。packages.txt に `poppler-utils` を追加して再ビルドしてください。"
|
|
|
|
| 78 |
)
|
|
|
|
| 79 |
business = _pick_business_text(raw_text)
|
| 80 |
return images, raw_text, business, "\n".join(debug_lines)
|