File size: 695 Bytes
40b3335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import os

# Hugging Face configuration is placed in the environment *before* `datasets`
# is imported below, so keep these lines ahead of that import.
#   HF_TOKEN - access token placeholder; replace with your own token.
#   HF_HOME  - desired Hugging Face cache location; replace the placeholder.
cache_dir = 'Your_Cache_Dir'
os.environ.update({"HF_TOKEN": "Your_HF_Token", "HF_HOME": cache_dir})

from datasets import load_dataset

# Local directory handed to `load_dataset` as its cache_dir.
dataset_path = "./c4_realnewslike"

# Fetch the "realnewslike" configuration of allenai/c4. No split is requested,
# so the result is indexed by split name ("train", "validation") below.
dataset = load_dataset(
    "allenai/c4",
    "realnewslike",
    cache_dir=dataset_path,
)

# Confirm where the data was stored.
print(f"Dataset downloaded and stored at: {dataset_path}")

# Report how many samples each split contains.
for label, split in (("training", "train"), ("validation", "validation")):
    print(f"Number of {label} samples:", len(dataset[split]))