Spaces:

yzweak
/

AutoPR

Build error

yzweak committed on Sep 24, 2025

Commit

43aac1a

1 Parent(s): 9648b0a

Add progress reporting to figure extraction

Files changed (2) hide show

app.py CHANGED Viewed

@@ -244,7 +244,7 @@ async def process_pdf(
         progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
         extraction_work_dir = work_dir / "figure_extraction"
         extraction_work_dir.mkdir()
-        paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir))
         if not paired_dir or not any(Path(paired_dir).iterdir()):
             raise gr.Error("Failed to extract any figures from the PDF.")

         progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
         extraction_work_dir = work_dir / "figure_extraction"
         extraction_work_dir.mkdir()
+        paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir), progress=progress)
         if not paired_dir or not any(Path(paired_dir).iterdir()):
             raise gr.Error("Failed to extract any figures from the PDF.")

pragent/backend/figure_table_pipeline.py CHANGED Viewed

@@ -8,7 +8,9 @@ from pragent.backend.loader import ImagePDFLoader
 from pragent.backend.yolo import extract_and_save_layout_components
 from tqdm.asyncio import tqdm
-def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
     """
     一个完整的、从PDF提取并配对图表的流程。
     这是被 app.py 调用的主函数。
@@ -16,6 +18,7 @@ def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
     Args:
         pdf_path (str): 用户上传的PDF的路径。
         base_work_dir (str): 本次会话的临时工作目录。
     Returns:
         str: 最终配对结果的目录路径，如果失败则返回 None。
@@ -45,7 +48,10 @@ def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
     tqdm.write(f"\n--- 步骤 2/3: 分析页面布局以裁剪图和表 ---")
     cropped_results_dir = os.path.join(base_work_dir, "cropped_results", pdf_stem)
-    for path in page_image_paths:
         page_num_str = Path(path).stem
         page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
         extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir, imgsz=640)

 from pragent.backend.yolo import extract_and_save_layout_components
 from tqdm.asyncio import tqdm
+from typing import Any
+def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | None = None) -> str:
     """
     一个完整的、从PDF提取并配对图表的流程。
     这是被 app.py 调用的主函数。
     Args:
         pdf_path (str): 用户上传的PDF的路径。
         base_work_dir (str): 本次会话的临时工作目录。
+        progress (Any | None): Gradio progress object.
     Returns:
         str: 最终配对结果的目录路径，如果失败则返回 None。
     tqdm.write(f"\n--- 步骤 2/3: 分析页面布局以裁剪图和表 ---")
     cropped_results_dir = os.path.join(base_work_dir, "cropped_results", pdf_stem)
+    num_pages = len(page_image_paths)
+    for i, path in enumerate(page_image_paths):
+        if progress:
+            progress(0.3 + (i / num_pages) * 0.2, desc=f"Analyzing page {i+1}/{num_pages}")
         page_num_str = Path(path).stem
         page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
         extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir, imgsz=640)