Spaces:

Corin1998
/

Score

Sleeping

App Files Files Community

Corin1998 committed on Aug 28, 2025

Commit

45b5c62

verified ·

1 Parent(s): 19256c1

Update core/extract.py

Browse files

Files changed (1) hide show

core/extract.py +8 -24

core/extract.py CHANGED Viewed

@@ -16,7 +16,6 @@ def env_summary() -> str:
     return " / ".join(out)
 def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
-    """pdfminer ベースの純テキスト抽出（速い）"""
     log = []
     chunks = []
     try:
@@ -35,33 +34,21 @@ def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
         return "", "\n".join(log)
 def _pick_business_text(raw_text: str) -> str:
-    """事業説明/会社概要っぽい段落を拾う（AI補足用）"""
     keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
     best = ""
     for block in raw_text.split("\n\n"):
         if any(k in block for k in keys):
-            best = block if len(block) > len(best) else best
-    return (best or raw_text[:1200])
-def parse_pdf(
-    file_paths: List[str],
-    force_ocr: bool = False,
-    dpi: int = 220,
-    max_pages: int = 8
-) -> Tuple[List[bytes], str, str, str]:
-    """
-    Returns:
-      images     : Vision へ渡せる PNG バイト列（最大 max_pages）
-      raw_text   : テキスト抽出結果（テキストモデルのフォールバック用）
-      business   : 事業説明に近いテキスト（AI所見の市場/製品補足用）
-      debug_log  : 抽出ログ（UI に表示）
-    """
     if not file_paths:
         raise ExtractError("PDFが指定されていません。")
     debug_lines = [f"[env] {env_summary()}"]
-    # ---- まずは全ファイルからテキスト抽出（速い・確実）
     all_text = []
     for p in file_paths:
         txt, lg = _pdf_to_text(p)
@@ -69,14 +56,14 @@ def parse_pdf(
         all_text.append(txt)
     raw_text = "\n\n".join(all_text)
-    # ---- 画像化（Vision 用）。テキストが薄い／OCR強制なら実行
     images: List[bytes] = []
     need_images = force_ocr or (len(raw_text) < 500)
     if need_images:
         try:
             for p in file_paths:
                 pages = convert_from_path(p, dpi=dpi, fmt="png")
-                for i, pg in enumerate(pages):
                     if len(images) >= max_pages:
                         break
                     buf = io.BytesIO()
@@ -84,13 +71,10 @@ def parse_pdf(
                     images.append(buf.getvalue())
             debug_lines.append(f"[image] generated pages: {len(images)}")
         except Exception as e:
-            # Poppler 未導入や壊れ PDF を丁寧に通知
             debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
             if shutil.which("pdftoppm") is None:
                 raise ExtractError(
-                    "PDFの画像化に失敗しました（Poppler 未検出）。"
-                    "Space の packages.txt に `poppler-utils` を入れて再ビルドしてください。"
                 )
-            # 画像化は諦め、テキストのみで続行
     business = _pick_business_text(raw_text)
     return images, raw_text, business, "\n".join(debug_lines)

     return " / ".join(out)
 def _pdf_to_text(path: str, max_chars: int = 20000) -> Tuple[str, str]:
     log = []
     chunks = []
     try:
         return "", "\n".join(log)
 def _pick_business_text(raw_text: str) -> str:
     keys = ("事業内容", "会社概要", "製品", "サービス", "沿革")
     best = ""
     for block in raw_text.split("\n\n"):
         if any(k in block for k in keys):
+            if len(block) > len(best):
+                best = block
+    return best or raw_text[:1200]
+def parse_pdf(file_paths: List[str], force_ocr: bool = False, dpi: int = 220, max_pages: int = 8) -> Tuple[List[bytes], str, str, str]:
     if not file_paths:
         raise ExtractError("PDFが指定されていません。")
     debug_lines = [f"[env] {env_summary()}"]
+    # テキスト抽出
     all_text = []
     for p in file_paths:
         txt, lg = _pdf_to_text(p)
         all_text.append(txt)
     raw_text = "\n\n".join(all_text)
+    # 画像化（必要なときだけ）
     images: List[bytes] = []
     need_images = force_ocr or (len(raw_text) < 500)
     if need_images:
         try:
             for p in file_paths:
                 pages = convert_from_path(p, dpi=dpi, fmt="png")
+                for pg in pages:
                     if len(images) >= max_pages:
                         break
                     buf = io.BytesIO()
                     images.append(buf.getvalue())
             debug_lines.append(f"[image] generated pages: {len(images)}")
         except Exception as e:
             debug_lines.append(f"[image] error: {type(e).__name__}: {e}")
             if shutil.which("pdftoppm") is None:
                 raise ExtractError(
+                    "PDFの画像化に失敗しました（Poppler 未検出）。packages.txt に `poppler-utils` を追加して再ビルドしてください。"
                 )
     business = _pick_business_text(raw_text)
     return images, raw_text, business, "\n".join(debug_lines)