yzweak committed on
Commit
ec5f146
·
1 Parent(s): 43aac1a

Refactor figure extraction to be async

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. pragent/backend/figure_table_pipeline.py +12 -2
app.py CHANGED
@@ -244,7 +244,7 @@ async def process_pdf(
244
  progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
245
  extraction_work_dir = work_dir / "figure_extraction"
246
  extraction_work_dir.mkdir()
247
- paired_dir = run_figure_extraction(str(pdf_path), str(extraction_work_dir), progress=progress)
248
  if not paired_dir or not any(Path(paired_dir).iterdir()):
249
  raise gr.Error("Failed to extract any figures from the PDF.")
250
 
 
244
  progress(0.3, desc="Step 2/5: Extracting figures from PDF...")
245
  extraction_work_dir = work_dir / "figure_extraction"
246
  extraction_work_dir.mkdir()
247
+ paired_dir = await run_figure_extraction(str(pdf_path), str(extraction_work_dir), progress=progress)
248
  if not paired_dir or not any(Path(paired_dir).iterdir()):
249
  raise gr.Error("Failed to extract any figures from the PDF.")
250
 
pragent/backend/figure_table_pipeline.py CHANGED
@@ -8,9 +8,10 @@ from pragent.backend.loader import ImagePDFLoader
8
  from pragent.backend.yolo import extract_and_save_layout_components
9
  from tqdm.asyncio import tqdm
10
 
 
11
  from typing import Any
12
 
13
- def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | None = None) -> str:
14
  """
15
  一个完整的、从PDF提取并配对图表的流程。
16
  这是被 app.py 调用的主函数。
@@ -52,9 +53,18 @@ def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | Non
52
  for i, path in enumerate(page_image_paths):
53
  if progress:
54
  progress(0.3 + (i / num_pages) * 0.2, desc=f"Analyzing page {i+1}/{num_pages}")
 
55
  page_num_str = Path(path).stem
56
  page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
57
- extract_and_save_layout_components(image_path=path, model_path=model_path, save_base_dir=page_crop_dir, imgsz=640)
 
 
 
 
 
 
 
 
58
  tqdm.write(f"[*] 所有裁剪结果已保存至: {cropped_results_dir}")
59
 
60
  tqdm.write(f"\n--- 步骤 3/3: 对裁剪出的组件进行配对 ---")
 
8
  from pragent.backend.yolo import extract_and_save_layout_components
9
  from tqdm.asyncio import tqdm
10
 
11
+ import asyncio
12
  from typing import Any
13
 
14
+ async def run_figure_extraction(pdf_path: str, base_work_dir: str, progress: Any | None = None) -> str:
15
  """
16
  一个完整的、从PDF提取并配对图表的流程。
17
  这是被 app.py 调用的主函数。
 
53
  for i, path in enumerate(page_image_paths):
54
  if progress:
55
  progress(0.3 + (i / num_pages) * 0.2, desc=f"Analyzing page {i+1}/{num_pages}")
56
+
57
  page_num_str = Path(path).stem
58
  page_crop_dir = os.path.join(cropped_results_dir, page_num_str)
59
+
60
+ await asyncio.to_thread(
61
+ extract_and_save_layout_components,
62
+ image_path=path,
63
+ model_path=model_path,
64
+ save_base_dir=page_crop_dir,
65
+ imgsz=640
66
+ )
67
+
68
  tqdm.write(f"[*] 所有裁剪结果已保存至: {cropped_results_dir}")
69
 
70
  tqdm.write(f"\n--- 步骤 3/3: 对裁剪出的组件进行配对 ---")