"""Download the "realnewslike" configuration of the allenai/c4 dataset.

Run as a script. It optionally points the Hugging Face tooling at a custom
token and cache home, downloads the dataset into ``dataset_path``, and prints
the number of samples in the train and validation splits.
"""
import os

# Placeholder values. Replace them with real values, or leave them untouched
# to keep whatever HF_TOKEN / HF_HOME the surrounding environment provides.
_TOKEN_PLACEHOLDER = "Your_HF_Token"
_CACHE_DIR_PLACEHOLDER = "Your_Cache_Dir"

hf_token = _TOKEN_PLACEHOLDER
cache_dir = _CACHE_DIR_PLACEHOLDER

# Local directory passed to load_dataset() as its cache_dir.
dataset_path = "./c4_realnewslike"


def configure_hf_environment(token, hf_home, environ=None):
    """Write HF_TOKEN / HF_HOME into *environ* when real values are supplied.

    A value that is empty or still equal to its placeholder is skipped, so a
    token or cache home already present in the environment is not overwritten
    with a dummy string.

    Args:
        token: Access token to store under ``HF_TOKEN``.
        hf_home: Directory to store under ``HF_HOME``.
        environ: Mapping to update; defaults to ``os.environ``.

    Returns:
        The mapping that was updated.
    """
    # Compare against None explicitly: an empty dict passed by a caller is a
    # legitimate target and must not fall back to os.environ.
    env = os.environ if environ is None else environ
    if token and token != _TOKEN_PLACEHOLDER:
        env["HF_TOKEN"] = token
    if hf_home and hf_home != _CACHE_DIR_PLACEHOLDER:
        env["HF_HOME"] = hf_home
    return env


def main():
    """Download the dataset, report split sizes, and return the loaded dataset."""
    configure_hf_environment(hf_token, cache_dir)

    # Imported only after the environment is configured, preserving the
    # original ordering (HF_HOME is set before `datasets` is imported).
    from datasets import load_dataset

    # No `split` argument is given, so every split of the "realnewslike"
    # configuration is loaded; "train" and "validation" are read below.
    dataset = load_dataset("allenai/c4", "realnewslike", cache_dir=dataset_path)

    # Print confirmation
    print("Dataset downloaded and stored at:", dataset_path)

    # Print the number of samples in each subset
    print("Number of training samples:", len(dataset["train"]))
    print("Number of validation samples:", len(dataset["validation"]))
    return dataset


if __name__ == "__main__":
    main()