Spaces:
Sleeping
Sleeping
Refactor figure extraction to be async
Browse files
- app.py +1 -1
- pragent/backend/figure_table_pipeline.py +12 -2
app.py
CHANGED
|
@@ -244,7 +244,7 @@ async def process_pdf(
|
|
| 244 |
progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
|
| 245 |
extraction_work_dir = work_dir / "figure_extraction"
|
| 246 |
extraction_work_dir.mkdir()
|
| 247 |
-
paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir), progress=progress)
|
| 248 |
if not paired_dir or not any(Path(paired_dir).iterdir()):
|
| 249 |
raise gr.Error("Failed to extract any figures from the PDF.")
|
| 250 |
|
|
|
|
| 244 |
progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
|
| 245 |
extraction_work_dir = work_dir / "figure_extraction"
|
| 246 |
extraction_work_dir.mkdir()
|
| 247 |
+
paired_dir = await run_figure_extraction(str(pdf_path), str(extraction_work_dir), progress=progress)
|
| 248 |
if not paired_dir or not any(Path(paired_dir).iterdir()):
|
| 249 |
raise gr.Error("Failed to extract any figures from the PDF.")
|
| 250 |
|
pragent/backend/figure_table_pipeline.py
CHANGED
|
@@ -8,9 +8,10 @@ from pragent.backend.loader import ImagePDFLoader
|
|
| 8 |
from pragent.backend.yolo import extract_and_save_layout_components
|
| 9 |
from tqdm.asyncio import tqdm
|
| 10 |
|
|
|
|
| 11 |
from typing import Any
|
| 12 |
|
| 13 |
-
def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | None = None) -> str:
|
| 14 |
"""
|
| 15 |
一个完整的、从PDF提取并配对图表的流程。
|
| 16 |
这是被 app.py 调用的主函数。
|
|
@@ -52,9 +53,18 @@ def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | Non
|
|
| 52 |
for i, path in enumerate(page_image_paths):
|
| 53 |
if progress:
|
| 54 |
progress(0.3 + (i / num_pages) * 0.2, desc=f"Analyzing page {i+1}/{num_pages}")
|
|
|
|
| 55 |
page_num_str = Path(path).stem
|
| 56 |
page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
tqdm.write(f"[*] 所有裁剪结果已保存至: {cropped_results_dir}")
|
| 59 |
|
| 60 |
tqdm.write(f"\n--- 步骤 3/3: 对裁剪出的组件进行配对 ---")
|
|
|
|
| 8 |
from pragent.backend.yolo import extract_and_save_layout_components
|
| 9 |
from tqdm.asyncio import tqdm
|
| 10 |
|
| 11 |
+
import asyncio
|
| 12 |
from typing import Any
|
| 13 |
|
| 14 |
+
async def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | None = None) -> str:
|
| 15 |
"""
|
| 16 |
一个完整的、从PDF提取并配对图表的流程。
|
| 17 |
这是被 app.py 调用的主函数。
|
|
|
|
| 53 |
for i, path in enumerate(page_image_paths):
|
| 54 |
if progress:
|
| 55 |
progress(0.3 + (i / num_pages) * 0.2, desc=f"Analyzing page {i+1}/{num_pages}")
|
| 56 |
+
|
| 57 |
page_num_str = Path(path).stem
|
| 58 |
page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
|
| 59 |
+
|
| 60 |
+
await asyncio.to_thread(
|
| 61 |
+
extract_and_save_layout_components,
|
| 62 |
+
image_path=path,
|
| 63 |
+
model_path=model_path,
|
| 64 |
+
save_base_dir=page_crop_dir,
|
| 65 |
+
imgsz=640
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
tqdm.write(f"[*] 所有裁剪结果已保存至: {cropped_results_dir}")
|
| 69 |
|
| 70 |
tqdm.write(f"\n--- 步骤 3/3: 对裁剪出的组件进行配对 ---")
|