# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS


def download_1gb_sample_archive():
    """
    Download ~1GB of samples using selective archives.

    Traditional archive-based approach for quick exploration on laptops.
    """
    config = DatasetConfig(label="improvised", split="dev", num_workers=4)
    fs = SeamlessInteractionFS(config=config)

    # Download specific archives (~1GB total)
    fs.download_batch_from_hf(batch_idx=0, archive_list=[0])
    print("✅ Downloaded ~1GB sample from HF (archive-based)")


def download_single_batch():
    """
    Download a complete batch (~50-100GB).

    Good for substantial local exploration and development.
    """
    config = DatasetConfig(label="improvised", split="dev", num_workers=8)
    fs = SeamlessInteractionFS(config=config)

    # Download the complete batch
    fs.download_batch_from_hf(batch_idx=0)
    print("✅ Downloaded single batch (~50-100GB)")


def download_multiple_batches():
    """
    Download multiple batches for training datasets.

    Suitable for model training and large-scale analysis.
    """
    config = DatasetConfig(label="improvised", split="train", num_workers=8)
    fs = SeamlessInteractionFS(config=config)

    # Download the first 3 batches of training data (~150GB+)
    for batch_idx in range(3):
        fs.download_batch_from_hf(batch_idx=batch_idx)
        print(f"✅ Downloaded batch {batch_idx}")
    print("✅ Downloaded multiple batches (~150GB+)")


def download_different_splits():
    """
    Download data from different labels and splits.

    Covers both improvised/naturalistic labels across the dev and test splits.
    """
    # Download from different combinations
    splits_to_download = [
        ("improvised", "dev", 0),
        ("naturalistic", "dev", 0),
        ("improvised", "test", 0),
        ("naturalistic", "test", 0),
    ]

    for label, split, batch_idx in splits_to_download:
        config = DatasetConfig(label=label, num_workers=4)
        fs = SeamlessInteractionFS(config=config)

        # Download only the first few archives to keep size manageable
        # (~3GB per split, assuming ~1GB per archive as in the sample above)
        fs.download_batch_from_hf(
            split=split, batch_idx=batch_idx, archive_list=[0, 1, 2]
        )
        print(f"✅ Downloaded {label}/{split} sample")
    print("✅ Downloaded samples from different splits")


def download_whole_dataset():
    """
    Download the complete dataset (~27TB).

    ⚠️ CAUTION: This will download the entire dataset!
    Only use on high-capacity storage with a fast internet connection.
    """
    confirm = input(
        "Are you sure you want to download the entire dataset (~27TB)? (y/n): "
    )
    if confirm.lower() not in ["y", "yes"]:
        print("Download cancelled.")
        return

    # Method 1: Batch-by-batch download (recommended for control)
    labels = ["improvised", "naturalistic"]
    splits = ["train", "dev", "test"]

    for label in labels:
        for split in splits:
            print(f"Downloading all {label}/{split} batches...")
            config = DatasetConfig(label=label, num_workers=16)
            fs = SeamlessInteractionFS(config=config)
            fs.download_batch_from_hf(
                split=split,
                batch_idx=None,  # Download all batches
            )

    # Method 2: HuggingFace snapshot (alternative)
    # from huggingface_hub import snapshot_download
    # snapshot_download(
    #     repo_id="facebook/seamless-interaction",
    #     repo_type="dataset",
    #     local_dir="~/datasets/seamless_interaction_full",
    # )
    print("✅ Downloaded complete dataset (~27TB)")


def main():
    """
    Demonstrate HuggingFace-based flexible download options.
    """
    print("📦 HuggingFace Download Options:")
    print("1. Sample set (~1GB) - Traditional archive-based")
    print("2. Single batch (~50-100GB)")
    print("3. Multiple batches (~150GB+)")
    print("4. Different splits (improvised/naturalistic, dev/test)")
    print("5. Whole dataset (~27TB)")

    # Uncomment the desired download scenario:
    download_1gb_sample_archive()
    # download_single_batch()
    # download_multiple_batches()
    # download_different_splits()
    # download_whole_dataset()  # ⚠️ CAUTION: Very large!
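
    # Alternatively, a small dispatch sketch (left commented out, like Method 2
    # above) that selects a scenario at runtime instead of editing the file;
    # purely illustrative, not part of the original example:
    # scenarios = {
    #     "1": download_1gb_sample_archive,
    #     "2": download_single_batch,
    #     "3": download_multiple_batches,
    #     "4": download_different_splits,
    #     "5": download_whole_dataset,
    # }
    # choice = input("Select option (1-5): ").strip()
    # scenarios.get(choice, lambda: print("Invalid option"))()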


if __name__ == "__main__":
    main()