Student0809's picture
Add files using upload-large-folder tool
cb2428f verified
raw
history blame
4.53 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from seamless_interaction.fs import DatasetConfig, SeamlessInteractionFS
def download_1gb_sample_archive() -> None:
    """Download a ~1GB sample by selecting a single archive.

    Archive-based approach intended for quick exploration on laptops.
    Builds a ``DatasetConfig`` for the ``improvised`` label and ``dev``
    split (``num_workers=4``) and requests only archive ``0`` of batch
    ``0`` from Hugging Face.
    """
    config = DatasetConfig(label="improvised", split="dev", num_workers=4)
    fs = SeamlessInteractionFS(config=config)

    # Restrict the request to one archive (~1GB total) rather than the
    # complete batch.
    fs.download_batch_from_hf(batch_idx=0, archive_list=[0])

    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("✅ Downloaded ~1GB sample from HF (archive-based)")
def download_single_batch() -> None:
    """Download one complete batch (~50-100GB).

    Good for substantial local exploration and development. Uses the
    ``improvised`` label and ``dev`` split with ``num_workers=8`` and
    requests batch ``0`` without restricting the archive list.
    """
    config = DatasetConfig(label="improvised", split="dev", num_workers=8)
    fs = SeamlessInteractionFS(config=config)

    # No archive_list is passed, so the complete batch is requested.
    fs.download_batch_from_hf(batch_idx=0)

    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("✅ Downloaded single batch (~50-100GB)")
def download_multiple_batches() -> None:
    """Download the first three batches of the training split.

    Suitable for model training and large-scale analysis. Uses the
    ``improvised`` label and ``train`` split with ``num_workers=8`` and
    requests batches ``0``, ``1`` and ``2`` one after another, printing
    a progress line after each.
    """
    config = DatasetConfig(label="improvised", split="train", num_workers=8)
    fs = SeamlessInteractionFS(config=config)

    # First 3 batches of training data (~150GB+ in total).
    for batch_idx in range(3):
        fs.download_batch_from_hf(batch_idx=batch_idx)
        print(f"✅ Downloaded batch {batch_idx}")

    # The check marks were previously stored as mis-decoded UTF-8 bytes.
    print("✅ Downloaded multiple batches (~150GB+)")
def download_different_splits() -> None:
    """Download small samples from several label/split combinations.

    Covers the ``improvised`` and ``naturalistic`` labels for the
    ``dev`` and ``test`` splits (the ``train`` split is not part of this
    sample). For each combination a fresh ``DatasetConfig`` is built for
    the label and the split is passed directly to the download call.
    """
    # (label, split, batch index) combinations to sample.
    splits_to_download = [
        ("improvised", "dev", 0),
        ("naturalistic", "dev", 0),
        ("improvised", "test", 0),
        ("naturalistic", "test", 0),
    ]

    for label, split, batch_idx in splits_to_download:
        config = DatasetConfig(label=label, num_workers=4)
        fs = SeamlessInteractionFS(config=config)

        # Only the first three archives are requested to keep each
        # sample's size manageable.
        fs.download_batch_from_hf(
            split=split, batch_idx=batch_idx, archive_list=[0, 1, 2]
        )
        print(f"✅ Downloaded {label}/{split} sample")

    # The check marks were previously stored as mis-decoded UTF-8 bytes.
    print("✅ Downloaded samples from different splits")
def download_whole_dataset() -> None:
    """Download the complete dataset (~27TB) after interactive confirmation.

    ⚠️ CAUTION: This will download the entire dataset!
    Only use on high-capacity storage with fast internet.

    The user is prompted on stdin first. Any answer other than "y" or
    "yes" (compared case-insensitively, ignoring surrounding whitespace)
    cancels the download and returns without creating any config.
    """
    confirm = input(
        "Are you sure you want to download the entire dataset (~27TB)? (y/n): "
    )
    # Normalise the answer so inputs such as "y " or "yEs" are not
    # mistaken for a refusal; every previously accepted spelling still
    # passes.
    if confirm.strip().lower() not in {"y", "yes"}:
        print("Download cancelled.")
        return

    # Method 1: Using batch-by-batch download (recommended for control)
    labels = ["improvised", "naturalistic"]
    splits = ["train", "dev", "test"]

    for label in labels:
        for split in splits:
            print(f"Downloading all {label}/{split} batches...")
            config = DatasetConfig(label=label, num_workers=16)
            fs = SeamlessInteractionFS(config=config)
            fs.download_batch_from_hf(
                split=split,
                batch_idx=None,  # Download all batches
            )

    # Method 2: Using HuggingFace snapshot (alternative)
    # from huggingface_hub import snapshot_download
    # snapshot_download(
    #     repo_id="facebook/seamless-interaction",
    #     repo_type="dataset",
    #     local_dir="~/datasets/seamless_interaction_full"
    # )

    # The check mark was previously stored as mis-decoded UTF-8 bytes.
    print("✅ Downloaded complete dataset (~27TB)")
def main() -> None:
    """Demonstrate the HuggingFace-based flexible download options.

    Prints a menu of the available download scenarios and then runs the
    one that is currently enabled below (the ~1GB sample by default).
    The other scenarios are left commented out so a user can enable the
    one they need by editing this function.
    """
    # The package emoji was previously stored as mis-decoded UTF-8 bytes.
    print("📦 HuggingFace Download Options:")
    print("1. Sample set (~1GB) - Traditional archive-based")
    print("2. Single batch (~50-100GB)")
    print("3. Multiple batches (~150GB+)")
    print("4. Different splits (improvised/naturalistic, train/dev/test)")
    print("5. Whole dataset (~27TB)")

    # Uncomment desired download scenario:
    download_1gb_sample_archive()
    # download_single_batch()
    # download_multiple_batches()
    # download_different_splits()
    # download_whole_dataset()  # ⚠️ CAUTION: Very large!
# Run the demo only when this file is executed as a script, so importing
# the module does not start a download.
if __name__ == "__main__":
    main()