File size: 1,210 Bytes
40b3335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
# Hugging Face credentials and cache location.
# NOTE(review): these variables are assigned before `datasets` is imported
# below, presumably so the library picks them up when it initialises; keep
# this ordering unless the Hugging Face docs confirm it is not required.
# Placeholder value: replace with your own access token (and avoid committing
# a real token to version control).
os.environ["HF_TOKEN"] = "Your_HF_Token"
# Placeholder value: replace with the directory to use as the cache location.
cache_dir = 'Your_Cache_Dir'
# Point the HF_HOME environment variable at the cache location chosen above.
os.environ['HF_HOME'] = cache_dir
import json
from datasets import load_dataset


# Build a JSON snapshot of a CNN/DailyMail subset: the first 20,000 training
# examples followed by the first 1,000 test examples, each tagged with the
# split it came from. The snapshot is only built when it does not exist yet.
save_path = "cnn.json"


def _split_records(split_data, split_name):
    """Return one record dict per example in *split_data*, tagged *split_name*.

    *split_data* must be indexable by column name; its "article",
    "highlights" and "id" columns are walked in lockstep. Each record holds
    those three values plus a "type" key set to *split_name*.
    """
    return [
        {
            "id": data_id,
            "article": article,
            "highlights": highlights,
            "type": split_name,
        }
        for article, highlights, data_id in zip(
            split_data["article"], split_data["highlights"], split_data["id"]
        )
    ]


if os.path.exists(save_path):
    # Nothing is downloaded or written when the snapshot is already present,
    # so do not claim that data was saved on this run.
    print(f"{save_path} already exists; skipping download")
else:
    dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")
    train_data = dataset["train"][:20000]
    test_data = dataset["test"][:1000]

    # Training records first, then test records.
    data_subset = (
        _split_records(train_data, "train") + _split_records(test_data, "test")
    )

    # Dump to a temporary sibling file and move it into place afterwards, so
    # an interrupted write never leaves a truncated file that the existence
    # check above would mistake for a complete snapshot on the next run.
    tmp_path = save_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data_subset, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, save_path)

    print(f"Data saved to {save_path}")