Spaces:
Running
Running
Add progress reporting to figure extraction
Browse files
- app.py +1 -1
- pragent/backend/figure_table_pipeline.py +8 -2
app.py
CHANGED
|
@@ -244,7 +244,7 @@ async def process_pdf(
|
|
| 244 |
progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
|
| 245 |
extraction_work_dir = work_dir / "figure_extraction"
|
| 246 |
extraction_work_dir.mkdir()
|
| 247 |
-
paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir))
|
| 248 |
if not paired_dir or not any(Path(paired_dir).iterdir()):
|
| 249 |
raise gr.Error("Failed to extract any figures from the PDF.")
|
| 250 |
|
|
|
|
| 244 |
progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
|
| 245 |
extraction_work_dir = work_dir / "figure_extraction"
|
| 246 |
extraction_work_dir.mkdir()
|
| 247 |
+
paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir), progress=progress)
|
| 248 |
if not paired_dir or not any(Path(paired_dir).iterdir()):
|
| 249 |
raise gr.Error("Failed to extract any figures from the PDF.")
|
| 250 |
|
pragent/backend/figure_table_pipeline.py
CHANGED
|
@@ -8,7 +8,9 @@ from pragent.backend.loader import ImagePDFLoader
|
|
| 8 |
from pragent.backend.yolo import extract_and_save_layout_components
|
| 9 |
from tqdm.asyncio import tqdm
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
一个完整的、从PDF提取并配对图表的流程。
|
| 14 |
这是被 app.py 调用的主函数。
|
|
@@ -16,6 +18,7 @@ def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
|
|
| 16 |
Args:
|
| 17 |
pdf_path (str): 用户上传的PDF的路径。
|
| 18 |
base_work_dir (str): 本次会话的临时工作目录。
|
|
|
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
str: 最终配对结果的目录路径,如果失败则返回 None。
|
|
@@ -45,7 +48,10 @@ def run_figure_extraction(pdf_path: str, base_work_dir: str) -> str:
|
|
| 45 |
|
| 46 |
tqdm.write(f"\n--- 步骤 2/3: 分析页面布局以裁剪图和表 ---")
|
| 47 |
cropped_results_dir = os.path.join(base_work_dir, "cropped_results", pdf_stem)
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
| 49 |
page_num_str = Path(path).stem
|
| 50 |
page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
|
| 51 |
extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir, imgsz=640)
|
|
|
|
| 8 |
from pragent.backend.yolo import extract_and_save_layout_components
|
| 9 |
from tqdm.asyncio import tqdm
|
| 10 |
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | None = None) -> str:
|
| 14 |
"""
|
| 15 |
一个完整的、从PDF提取并配对图表的流程。
|
| 16 |
这是被 app.py 调用的主函数。
|
|
|
|
| 18 |
Args:
|
| 19 |
pdf_path (str): 用户上传的PDF的路径。
|
| 20 |
base_work_dir (str): 本次会话的临时工作目录。
|
| 21 |
+
progress (Any | None): Gradio progress object.
|
| 22 |
|
| 23 |
Returns:
|
| 24 |
str: 最终配对结果的目录路径,如果失败则返回 None。
|
|
|
|
| 48 |
|
| 49 |
tqdm.write(f"\n--- 步骤 2/3: 分析页面布局以裁剪图和表 ---")
|
| 50 |
cropped_results_dir = os.path.join(base_work_dir, "cropped_results", pdf_stem)
|
| 51 |
+
num_pages = len(page_image_paths)
|
| 52 |
+
for i, path in enumerate(page_image_paths):
|
| 53 |
+
if progress:
|
| 54 |
+
progress(0.3 + (i / num_pages) * 0.2, desc=f"Analyzing page {i+1}/{num_pages}")
|
| 55 |
page_num_str = Path(path).stem
|
| 56 |
page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
|
| 57 |
extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir, imgsz=640)
|