File size: 1,739 Bytes
ec3d86e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# data_loader.py
import asyncio
import aiofiles
from pathlib import Path
import re
from typing import List, Dict
from tqdm.asyncio import tqdm
async def load_plain_text(txt_path: str) -> str:
    """Asynchronously read a UTF-8 encoded text file.

    Args:
        txt_path: Path of the ``.txt`` file to read.

    Returns:
        The complete file contents. If anything goes wrong while opening
        or reading, the error is reported through ``tqdm.write`` and an
        empty string is returned instead.
    """
    try:
        async with aiofiles.open(
            txt_path, mode='r', encoding='utf-8'
        ) as handle:
            content = await handle.read()
    except Exception as e:
        # Best-effort loader: report the problem and fall back to "".
        tqdm.write(f"[!] 读取文本文件 '{txt_path}' 时出错: {e}")
        return ""
    return content

def load_paired_image_paths(base_dir: Path) -> List[Dict]:
    """Collect (main image, caption image) path pairs from 'paired_*' folders.

    Recursively scans ``base_dir`` for directories matching ``paired_*``.
    Inside each one, ``*.jpg`` entries whose name contains ``"caption"`` are
    treated as caption images and every other ``*.jpg`` entry as the main
    image. Folders with fewer than two ``.jpg`` entries, or missing either
    kind, are skipped.

    The result is deterministic: folders are ordered by name (ties broken by
    full path) and the files inside each folder are ordered by name, so the
    output does not depend on the order in which the filesystem happens to
    list entries. When a folder holds several candidates of the same kind,
    the one whose name sorts last is used.

    Args:
        base_dir: Root directory to search.

    Returns:
        A list of dicts with the keys ``"type"`` (``"figure"`` when the
        folder name contains "figure", otherwise ``"table"``),
        ``"item_path"`` and ``"caption_path"`` (resolved paths as strings).
        The list is empty when ``base_dir`` is not a directory.
    """
    items: List[Dict] = []
    if not base_dir.is_dir():
        tqdm.write(f"[!] 错误: 找不到配对结果的基础文件夹: {base_dir}")
        return items

    tqdm.write(f"[*] 正在从 {base_dir} 递归加载图文对...")

    # Directory enumeration order is filesystem-dependent. Sorting by name
    # alone leaves same-named folders under different parents in that
    # arbitrary order, so the full path is used as a tie-breaker.
    item_dirs = sorted(
        (d for d in base_dir.rglob('paired_*') if d.is_dir()),
        key=lambda p: (p.name, str(p)),
    )

    for item_dir in item_dirs:
        # Sort the files too, so that "last match wins" below picks the same
        # file on every platform when a folder has more than two images.
        item_files = sorted(item_dir.glob('*.jpg'), key=lambda p: p.name)
        if len(item_files) < 2:
            continue

        main_item_path, caption_path = None, None
        for f in item_files:
            if "caption" in f.name:
                caption_path = f
            else:
                main_item_path = f

        if main_item_path is None or caption_path is None:
            # e.g. two captions and no main image, or the reverse.
            continue

        items.append({
            "type": "figure" if "figure" in item_dir.name else "table",
            "item_path": str(main_item_path.resolve()),
            "caption_path": str(caption_path.resolve()),
        })

    tqdm.write(f"[*] 加载完成,共找到 {len(items)} 个图文对。")
    return items