| from datasets import load_from_disk, concatenate_datasets |
| from pathlib import Path |
| import pandas as pd |
|
|
# --- Configuration ---
src_root = Path("hf_raw")   # input root: one saved-to-disk HF dataset per subdirectory
dst_root = Path("ft_data")  # output root for the generated train/dev/test files
seed = 42                   # fixed RNG seed so shuffling and splitting are reproducible
|
|
# For every saved HF dataset under `src_root`: merge its splits, reshuffle,
# re-split 80/10/10 into train/dev/test, and write each split as a
# tab-separated file under `dst_root/<dataset>/split/`.
# NOTE: iteration is sorted so processing order (and the logged output) is
# deterministic across runs/filesystems; `iterdir()` alone gives no order.
for ds_dir in sorted(src_root.iterdir()):
    if not ds_dir.is_dir():
        continue  # skip stray files next to the dataset directories
    # str() for compatibility with older `datasets` versions that reject Path.
    ds = load_from_disk(str(ds_dir))

    # `load_from_disk` returns a DatasetDict when the saved dataset has named
    # splits, but a bare Dataset when it does not — handle both instead of
    # crashing on `ds.keys()` for the bare case.
    if hasattr(ds, "keys"):
        full = concatenate_datasets([ds[split] for split in ds.keys()])
    else:
        full = ds

    # 80% train; the remaining 20% is halved into 10% dev / 10% test.
    # (train_test_split shuffles too, but the explicit shuffle keeps the
    # original behavior byte-for-byte.)
    full = full.shuffle(seed=seed)
    split1 = full.train_test_split(test_size=0.2, seed=seed)
    train = split1["train"]
    split2 = split1["test"].train_test_split(test_size=0.5, seed=seed)
    dev, test = split2["train"], split2["test"]

    out = dst_root / ds_dir.name / "split"
    out.mkdir(parents=True, exist_ok=True)
    for name, subset in [("train", train), ("dev", dev), ("test", test)]:
        # Files are TAB-separated despite the ".csv" extension — kept as-is
        # because downstream consumers presumably expect this; verify before
        # changing. Columns are renamed seq->sequence, label->labels.
        subset.to_pandas()[["seq", "label"]].rename(
            columns={"seq": "sequence", "label": "labels"}
        ).to_csv(out / f"{name}.csv", sep="\t", index=False)
        print(f"Wrote {out/f'{name}.csv'}")
|
|