Spaces:
Running
Running
| import os | |
| import json | |
| from sklearn.model_selection import train_test_split | |
| from baseline import load_data | |
def create_splits(data_dir, splits_dir):
    """Split the raw dataset into train/val/test JSON files, per class.

    Samples returned by ``load_data(data_dir)`` are divided into two groups
    according to the truthiness of their ``"is_hallucination"`` entry. Each
    group is split 70 / 15 / 15 on its own, so both classes are represented
    in every split in roughly the same proportion. The per-class parts are
    then concatenated (hallucinated samples first, clean samples second) and
    written to ``train.json``, ``val.json`` and ``test.json`` in
    ``splits_dir``, which is created if it does not exist.

    Args:
        data_dir: Location of the raw data, passed straight to ``load_data``.
        splits_dir: Directory that receives the three JSON files.
    """
    print("Loading raw dataset...")
    samples = load_data(data_dir)

    def _is_hallucinated(sample):
        # A missing key yields None from .get(), so it counts as "clean".
        return bool(sample.get("is_hallucination"))

    def _split_70_15_15(items):
        # Carve off 30% first, then halve that holdout into val and test.
        # The fixed seed keeps the partition reproducible between runs.
        train_part, holdout = train_test_split(
            items, test_size=0.30, random_state=42
        )
        val_part, test_part = train_test_split(
            holdout, test_size=0.50, random_state=42
        )
        return train_part, val_part, test_part

    hallucinated = list(filter(_is_hallucinated, samples))
    clean = [sample for sample in samples if not _is_hallucinated(sample)]

    hal_train, hal_val, hal_test = _split_70_15_15(hallucinated)
    cln_train, cln_val, cln_test = _split_70_15_15(clean)

    train = hal_train + cln_train
    val = hal_val + cln_val
    test = hal_test + cln_test

    os.makedirs(splits_dir, exist_ok=True)
    outputs = (
        ("train.json", train),
        ("val.json", val),
        ("test.json", test),
    )
    for filename, subset in outputs:
        target = os.path.join(splits_dir, filename)
        with open(target, "w", encoding='utf-8') as handle:
            json.dump(subset, handle, indent=4)

    # Summary of what was written, with the per-class counts of each split.
    print(f"Splits saved to {splits_dir}")
    print(f"Train: {len(train)} (Hal: {len(hal_train)}, Clean: {len(cln_train)})")
    print(f"Val: {len(val)} (Hal: {len(hal_val)}, Clean: {len(cln_val)})")
    print(f"Test: {len(test)} (Hal: {len(hal_test)}, Clean: {len(cln_test)})")
if __name__ == "__main__":
    # Paths are resolved relative to this file: the project root is taken to
    # be two directory levels above the directory containing this script.
    here = os.path.dirname(os.path.abspath(__file__))
    root = os.path.join(here, "..", "..")
    create_splits(
        os.path.join(root, "data", "raw"),
        os.path.join(root, "data", "splits"),
    )