File size: 4,151 Bytes
cb2428f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from pathlib import Path
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS

def download_interaction_pair(local_dir="/root/autodl-tmp/seamless", num_pairs=200):
    """
    Download multiple interaction pairs with resume support.

    - First run: auto-samples pairs, saves their keys.
    - Subsequent runs: only downloads remaining files.
    """
    config = DatasetConfig(label="improvised", split="dev", preferred_vendors_only=True, local_dir=local_dir)
    fs = SeamlessInteractionFS(config=config)

    # 文件路径
    keys_file = Path(local_dir) / "interaction_keys.txt"
    all_file_ids_file = Path(local_dir) / "all_file_ids.txt"
    downloaded_ids_file = Path(local_dir) / "downloaded_file_ids.txt"

    Path(local_dir).mkdir(parents=True, exist_ok=True)

    # 第一次运行: 采样并保存 interaction_keys
    if not keys_file.exists():
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        interaction_keys = [pair[0].rsplit("_", 1)[0] for pair in pairs]
        with keys_file.open("w") as f:
            for k in interaction_keys:
                f.write(k + "\n")
        print(f"✅ 初次采样 {len(interaction_keys)} 个interaction_keys 已保存: {keys_file}")
    else:
        # 后续运行: 从文件加载
        interaction_keys = [line.strip() for line in keys_file.open("r")]
        print(f"✅ 已加载 {len(interaction_keys)} 个interaction_keys: {keys_file}")

    # 将interaction_keys解析成file_ids
    file_ids = []
    for key in interaction_keys:
        pairs = fs.get_interaction_pairs(interaction_keys=[key])
        if pairs:
            file_ids.extend(pairs[0])

    # 保存file_ids到all_file_ids.txt
    if not all_file_ids_file.exists():
        with all_file_ids_file.open("w") as f:
            for fid in file_ids:
                f.write(fid + "\n")
        print(f"✅ 已保存所有file IDs: {all_file_ids_file}")

    # 加载已下载file_id
    downloaded = set()
    if downloaded_ids_file.exists():
        downloaded = set(line.strip() for line in downloaded_ids_file.open())

    # 下载
    for fid in file_ids:
        if fid in downloaded:
            print(f"⏭️ 已下载,跳过: {fid}")
            continue
        try:
            fs.gather_file_id_data_from_s3(fid, local_dir=local_dir)
            print(f"✅ 下载完成: {fid}")
            with downloaded_ids_file.open("a") as f:
                f.write(fid + "\n")
        except Exception as e:
            print(f"❌ 下载失败 {fid}: {e}")

    print("🎉 所有任务完成")




def main():
    """
    Demonstrate S3-based flexible download options with intelligent sampling.

    All functions support both manual key specification and automatic sampling.
    Auto-sampling prioritizes smaller vendors (V00, V01).
    """
    print("🔍 S3 Download Options with Intelligent Sampling:")
    print("1. Single example (~100MB) - Quick exploration")
    print("2. Interaction pair (~200MB) - Conversational dynamics")
    print("3. Sample set (~1GB) - Initial prototyping")
    print("4. Session exploration (~400MB/session) - Deep context study")
    print()
    print("💡 All options auto-sample from preferred vendors if no keys provided")
    print("   Preferred: V00, V01 (smaller files)")
    print("   Avoided: V03 (larger 100MB-800MB videos)")
    print()
    print("📍 You can also specify exact keys:")
    print("   Interaction key: V00_S0809_I00000582")
    print("   Session key: V00_S0809")

    # Uncomment desired download scenario:
    #download_single_example()  # Auto-samples if no file_id provided
    # download_single_example("V01_S0223_I00000127_P1505")  # Specific file
    download_interaction_pair()  # Auto-samples interaction pairs
    # download_interaction_pair("V00_S0809_I00000582")  # Specific interaction
    # download_samples_1gb()  # Auto-samples 10 diverse files
    # download_samples_1gb(num_samples=20)  # Auto-samples 20 files (~2GB)
    # download_session_exploration()  # Auto-samples 1 rich session
    # download_session_exploration("V00_S0809")  # Specific session


if __name__ == "__main__":
    main()