File size: 695 Bytes
40b3335 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import os

# Download the "realnewslike" configuration of the allenai/c4 dataset and
# report how many samples its train and validation splits contain.

# Placeholder value (as is the token below): substitute real values before
# running the script.
cache_dir = 'Your_Cache_Dir'

# Publish the Hugging Face access token and the cache root (HF_HOME) through
# the process environment.
os.environ.update({
    "HF_TOKEN": "Your_HF_Token",
    "HF_HOME": cache_dir,
})

# NOTE: imported only after the environment is configured -- presumably so the
# library sees HF_HOME / HF_TOKEN when it is imported; keep this ordering.
from datasets import load_dataset  # noqa: E402

# Local directory handed to load_dataset as its cache_dir.
dataset_path = "./c4_realnewslike"

# Fetch only the "realnewslike" configuration of the dataset.
dataset = load_dataset(
    path="allenai/c4",
    name="realnewslike",
    cache_dir=dataset_path,
)

# Confirm where the data was placed.
print(f"Dataset downloaded and stored at: {dataset_path}")

# Report the sample count of each split, train first and then validation.
for split_label, split_key in (("training", "train"), ("validation", "validation")):
    print(f"Number of {split_label} samples:", len(dataset[split_key]))
|