from datasets import DatasetDict, concatenate_datasets, load_from_disk
from pathlib import Path

src_root = Path("hf_raw")
dst_root = Path("ft_data")
seed = 42

for ds_dir in sorted(src_root.iterdir()):
    if not ds_dir.is_dir():
        continue
    ds = load_from_disk(str(ds_dir))

    # Combine all available splits, shuffle, then split 80/10/10.
    # load_from_disk returns a DatasetDict for a directory of named splits
    # but a bare Dataset otherwise, so normalize before concatenating.
    if isinstance(ds, DatasetDict):
        full = concatenate_datasets([ds[s] for s in ds.keys()])
    else:
        full = ds
    full = full.shuffle(seed=seed)
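    # 80/10/10: hold out 20%, then halve the held-out portion into dev
    # and test (0.2 * 0.5 = 0.1 of the full set each).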
    split1 = full.train_test_split(test_size=0.2, seed=seed)
    train = split1["train"]
    split2 = split1["test"].train_test_split(test_size=0.5, seed=seed)
    dev, test = split2["train"], split2["test"]

    out = dst_root / ds_dir.name / "split"
    out.mkdir(parents=True, exist_ok=True)
    for name, subset in [("train", train), ("dev", dev), ("test", test)]:
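        # Keep only the sequence and label columns and rename them to what
        # the downstream fine-tuning code expects; note the output is
        # tab-separated despite the .csv extension.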
        path = out / f"{name}.csv"
        subset.to_pandas()[["seq", "label"]].rename(
            columns={"seq": "sequence", "label": "labels"}
        ).to_csv(path, sep="\t", index=False)
        print(f"Wrote {path}")