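"""Rebuild fine-tuning splits: for every dataset saved under hf_raw/,
merge all available splits, shuffle, and re-split 80/10/10 into
tab-separated train/dev/test files under ft_data/<dataset>/split/."""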
from datasets import load_from_disk, concatenate_datasets
from pathlib import Path
src_root = Path("hf_raw")   # dataset directories written by save_to_disk
dst_root = Path("ft_data")  # destination for the regenerated splits
seed = 42                   # fixed seed so shuffling and splitting are reproducible
for ds_dir in src_root.iterdir():
    if not ds_dir.is_dir():
        continue
    ds = load_from_disk(ds_dir)
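    # Added guard (assumption): if a directory was saved from a single
    # Dataset rather than a DatasetDict, load_from_disk returns an object
    # without .keys(); wrap it so the concatenation below works either way.
    if not hasattr(ds, "keys"):
        ds = {"train": ds}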
    # Combine all available splits, shuffle, then 80/10/10
    full = concatenate_datasets([ds[s] for s in ds.keys()])
    full = full.shuffle(seed=seed)
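    # Carve off 20%, then split that half-and-half: 80% train, 10% dev, 10% test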
    split1 = full.train_test_split(test_size=0.2, seed=seed)
    train = split1["train"]
    split2 = split1["test"].train_test_split(test_size=0.5, seed=seed)
    dev, test = split2["train"], split2["test"]
    out = dst_root / ds_dir.name / "split"
    out.mkdir(parents=True, exist_ok=True)
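    # Keep only the seq/label columns, renamed to sequence/labels;
    # note the files are tab-separated despite the .csv extension.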
    for name, subset in [("train", train), ("dev", dev), ("test", test)]:
        subset.to_pandas()[["seq", "label"]].rename(
            columns={"seq": "sequence", "label": "labels"}
        ).to_csv(out / f"{name}.csv", sep="\t", index=False)
        print(f"Wrote {out / f'{name}.csv'}")
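# Expected layout per dataset (name taken from the source directory):
#   ft_data/<dataset>/split/{train,dev,test}.csv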