|
|
"""Export a fixed subset of the CNN/DailyMail dataset to a local JSON file.

The first 20000 training examples and the first 1000 test examples are
written to ``cnn.json`` as a list of records, each tagged with the split it
came from. Nothing is downloaded or written if the output file already exists.
"""

import os

# Placeholder values: replace with a real Hugging Face token and cache
# directory before running the script.
os.environ["HF_TOKEN"] = "Your_HF_Token"
cache_dir = 'Your_Cache_Dir'
os.environ['HF_HOME'] = cache_dir

# NOTE: these imports deliberately come after the environment setup above,
# presumably so the Hugging Face libraries see HF_HOME when first imported.
# Do not hoist them to the top of the file without confirming that.
import json  # noqa: E402

from datasets import load_dataset  # noqa: E402

save_path = "cnn.json"


def _tagged_records(split_slice, split_name):
    """Turn a column-oriented slice into a list of per-example records.

    Args:
        split_slice: Mapping with parallel "article", "highlights" and "id"
            sequences (what slicing a dataset split returns in this script).
        split_name: Value stored under the "type" key of every record.

    Returns:
        A list of dicts with keys "id", "article", "highlights", "type",
        in the same order as the input columns.
    """
    return [
        {
            "id": data_id,
            "article": article,
            "highlights": highlights,
            "type": split_name,
        }
        for article, highlights, data_id in zip(
            split_slice["article"], split_slice["highlights"], split_slice["id"]
        )
    ]


# Skip the download/export entirely when the output file is already present.
if not os.path.exists(save_path):
    dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

    train_data = dataset["train"][:20000]
    test_data = dataset["test"][:1000]

    # Training records first, then test records, in one flat list.
    data_subset = (
        _tagged_records(train_data, "train") + _tagged_records(test_data, "test")
    )

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(data_subset, f, ensure_ascii=False, indent=4)

    # NOTE(review): the original indentation was lost; the message is kept
    # inside this branch so it is only shown when a file was actually written.
    print(f"Data saved to {save_path}")