"""Download the "realnewslike" configuration of allenai/c4 and report split sizes.

Before running, replace the two placeholders below ("Your_HF_Token" and
'Your_Cache_Dir') with real values.
"""
import os

# Access token used to authenticate against the Hugging Face Hub.
# NOTE: this is a placeholder; avoid committing a real token to version control.
os.environ["HF_TOKEN"] = "Your_HF_Token"

# Ensure the HF_HOME environment variable points to your desired cache location.
cache_dir = 'Your_Cache_Dir'
os.environ['HF_HOME'] = cache_dir

# The import is deliberately placed after the environment setup so that the
# variables above are already in place when the library is loaded. Do not move
# it to the top of the file.
from datasets import load_dataset  # noqa: E402

# Local directory handed to load_dataset as its cache_dir for this call.
dataset_path = "./c4_realnewslike"

# Load the "realnewslike" configuration of C4. No split is requested here, so
# the result is indexed by split name below ("train" and "validation").
dataset = load_dataset("allenai/c4", "realnewslike", cache_dir=dataset_path)

# Print confirmation
print("Dataset downloaded and stored at:", dataset_path)

# Print the number of samples in each split
print("Number of training samples:", len(dataset["train"]))
print("Number of validation samples:", len(dataset["validation"]))