"""Re-split locally saved Hugging Face datasets into 80/10/10 train/dev/test files.

For every dataset directory under ``hf_raw/``, all available splits are
concatenated, shuffled with a fixed seed, and re-cut into 80% train,
10% dev, 10% test. Each subset is written as a tab-separated file with
columns renamed ``seq`` -> ``sequence`` and ``label`` -> ``labels``.
"""
from pathlib import Path

import pandas as pd  # noqa: F401 — kept; file may be a partial view and rely on it
from datasets import concatenate_datasets, load_from_disk

src_root = Path("hf_raw")   # input: one sub-directory per saved dataset
dst_root = Path("ft_data")  # output root for the re-split files
seed = 42                   # fixed seed so shuffle + splits are reproducible


def main() -> None:
    """Process every dataset directory under ``src_root``."""
    # sorted() makes the processing order deterministic across filesystems;
    # bare iterdir() order is OS-dependent, which would defeat the fixed seed's
    # reproducibility guarantee.
    for ds_dir in sorted(src_root.iterdir()):
        if not ds_dir.is_dir():
            continue
        # str() for compatibility with older `datasets` versions that do not
        # accept os.PathLike here.
        ds = load_from_disk(str(ds_dir))

        # Combine all available splits, shuffle, then cut 80/10/10.
        full = concatenate_datasets(list(ds.values())).shuffle(seed=seed)
        split1 = full.train_test_split(test_size=0.2, seed=seed)
        train = split1["train"]
        # Halve the held-out 20% into dev and test (10% each of the total).
        split2 = split1["test"].train_test_split(test_size=0.5, seed=seed)
        dev, test = split2["train"], split2["test"]

        out = dst_root / ds_dir.name / "split"
        out.mkdir(parents=True, exist_ok=True)
        for name, subset in (("train", train), ("dev", dev), ("test", test)):
            # NOTE(review): the files are tab-separated but carry a .csv suffix.
            # Kept as-is because downstream consumers may expect this exact
            # name — confirm before renaming to .tsv.
            path = out / f"{name}.csv"
            subset.to_pandas()[["seq", "label"]].rename(
                columns={"seq": "sequence", "label": "labels"}
            ).to_csv(path, sep="\t", index=False)
            print(f"Wrote {path}")


if __name__ == "__main__":
    main()