from pathlib import Path

from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS


def _read_lines(path):
    """Return the stripped, non-empty lines of the text file at *path*.

    The file is opened in a ``with`` block so the handle is always closed,
    and blank lines (e.g. a trailing newline or a hand-edited file) are
    dropped so they never turn into empty keys / file ids.
    """
    with path.open("r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]


def _write_lines(path, lines):
    """Overwrite the text file at *path* with one entry of *lines* per line."""
    with path.open("w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)


def download_interaction_pair(
    local_dir: str = "/root/autodl-tmp/seamless",
    num_pairs: int = 200,
) -> None:
    """Download multiple interaction pairs with resume support.

    State is kept in three text files inside ``local_dir``:

    - ``interaction_keys.txt``: the sampled interaction keys. Created on the
      first run; later runs load it instead of sampling again.
    - ``all_file_ids.txt``: every file id resolved from those keys. Written
      only if it does not exist yet.
    - ``downloaded_file_ids.txt``: appended to after each successful
      download; ids listed here are skipped on later runs.

    Args:
        local_dir: Directory that receives the downloaded data and the state
            files. Created (with parents) if missing.
        num_pairs: Number of interaction pairs to sample on the first run.
            Ignored once ``interaction_keys.txt`` exists.
    """
    config = DatasetConfig(
        label="improvised",
        split="dev",
        preferred_vendors_only=True,
        local_dir=local_dir,
    )
    fs = SeamlessInteractionFS(config=config)

    # Paths of the state files used for resuming.
    base_dir = Path(local_dir)
    keys_file = base_dir / "interaction_keys.txt"
    all_file_ids_file = base_dir / "all_file_ids.txt"
    downloaded_ids_file = base_dir / "downloaded_file_ids.txt"

    base_dir.mkdir(parents=True, exist_ok=True)

    if not keys_file.exists():
        # First run: sample pairs and persist their interaction keys.
        pairs = fs.get_interaction_pairs(num_pairs=num_pairs)
        # The key is the first file id of the pair minus its last
        # "_"-separated component.
        interaction_keys = [pair[0].rsplit("_", 1)[0] for pair in pairs]
        _write_lines(keys_file, interaction_keys)
        print(f"✅ 初次采样 {len(interaction_keys)} 个interaction_keys 已保存: {keys_file}")
    else:
        # Later runs: reuse the keys saved by the first run.
        interaction_keys = _read_lines(keys_file)
        print(f"✅ 已加载 {len(interaction_keys)} 个interaction_keys: {keys_file}")

    # Resolve every interaction key into its file ids (first returned pair
    # only; keys that resolve to nothing are skipped).
    file_ids = []
    for key in interaction_keys:
        pairs = fs.get_interaction_pairs(interaction_keys=[key])
        if pairs:
            file_ids.extend(pairs[0])

    # Persist the full file id list once; an existing list is left untouched.
    if not all_file_ids_file.exists():
        _write_lines(all_file_ids_file, file_ids)
        print(f"✅ 已保存所有file IDs: {all_file_ids_file}")

    # Load the ids that earlier runs already downloaded.
    downloaded = set()
    if downloaded_ids_file.exists():
        downloaded = set(_read_lines(downloaded_ids_file))

    # Download whatever is still missing.
    for fid in file_ids:
        if fid in downloaded:
            print(f"⏭️ 已下载,跳过: {fid}")
            continue
        # Deliberately broad: one failing file must not abort the whole
        # batch; it is reported and will be retried on the next run because
        # it is never recorded as downloaded.
        try:
            fs.gather_file_id_data_from_s3(fid, local_dir=local_dir)
            print(f"✅ 下载完成: {fid}")
            with downloaded_ids_file.open("a", encoding="utf-8") as f:
                f.write(fid + "\n")
            # Remember the id in-process too, so a duplicate later in
            # file_ids is skipped instead of being downloaded again.
            downloaded.add(fid)
        except Exception as e:
            print(f"❌ 下载失败 {fid}: {e}")

    print("🎉 所有任务完成")


def main():
    """Print the S3 download options and run the interaction-pair download.

    The menu text describes several download scenarios; only
    ``download_interaction_pair`` is defined in this file, and it is the one
    that is actually invoked (with its default arguments).
    """
    print("🔍 S3 Download Options with Intelligent Sampling:")
    print("1. Single example (~100MB) - Quick exploration")
    print("2. Interaction pair (~200MB) - Conversational dynamics")
    print("3. Sample set (~1GB) - Initial prototyping")
    print("4. Session exploration (~400MB/session) - Deep context study")
    print()
    print("💡 All options auto-sample from preferred vendors if no keys provided")
    print(" Preferred: V00, V01 (smaller files)")
    print(" Avoided: V03 (larger 100MB-800MB videos)")
    print()
    print("📍 You can also specify exact keys:")
    print(" Interaction key: V00_S0809_I00000582")
    print(" Session key: V00_S0809")

    # Uncomment desired download scenario:
    # NOTE: the other download_* helpers referenced below are not defined in
    # this file; they must be provided before their lines are uncommented.
    # download_single_example()  # Auto-samples if no file_id provided
    # download_single_example("V01_S0223_I00000127_P1505")  # Specific file
    download_interaction_pair()  # Auto-samples interaction pairs
    # NOTE: download_interaction_pair() here takes (local_dir, num_pairs), so
    # the call below would treat the key as the download directory.
    # download_interaction_pair("V00_S0809_I00000582")  # Specific interaction
    # download_samples_1gb()  # Auto-samples 10 diverse files
    # download_samples_1gb(num_samples=20)  # Auto-samples 20 files (~2GB)
    # download_session_exploration()  # Auto-samples 1 rich session
    # download_session_exploration("V00_S0809")  # Specific session


if __name__ == "__main__":
    main()