# Source: agentsight-api/src/data/splitter.py (Hugging Face Spaces file view)
# Last commit: 2aed081 "Deploy to Hugging Face Spaces" (Minato Namikaze)
# File size: 1.93 kB
import os
import json
from sklearn.model_selection import train_test_split
from baseline import load_data
def _split_70_15_15(samples, label, random_state):
    """Return ``(train, val, test)`` lists for one class of samples.

    ``label`` is only used to make the error message identify which class
    could not be split.
    """
    try:
        # 30% is held out first, then halved into validation and test.
        train_part, holdout = train_test_split(
            samples, test_size=0.30, random_state=random_state
        )
        val_part, test_part = train_test_split(
            holdout, test_size=0.50, random_state=random_state
        )
    except ValueError as err:
        # train_test_split rejects inputs that would leave a part empty, but
        # its message does not say which class was the problem.
        raise ValueError(
            f"Cannot make a 70/15/15 split of the {len(samples)} "
            f"{label} sample(s): {err}"
        ) from err
    return train_part, val_part, test_part


def create_splits(data_dir, splits_dir, *, random_state=42):
    """Write per-class 70/15/15 train/val/test splits of the raw dataset.

    Samples returned by ``load_data(data_dir)`` are partitioned by the
    truthiness of their ``"is_hallucination"`` entry. Each partition is split
    independently, and the matching parts are concatenated (hallucinated
    samples first, then clean ones) and dumped to ``train.json``,
    ``val.json`` and ``test.json`` inside ``splits_dir``. A short summary of
    the resulting sizes is printed.

    Args:
        data_dir: Directory handed to ``load_data``.
        splits_dir: Output directory; created if it does not exist.
        random_state: Seed forwarded to every ``train_test_split`` call.
            The default of 42 reproduces the previously hard-coded seed.

    Raises:
        ValueError: If either class cannot be split three ways (for example
            because it has too few samples). Nothing is written in that case.
    """
    print("Loading raw dataset...")
    all_samples = load_data(data_dir)

    hal = [s for s in all_samples if s.get("is_hallucination")]
    clean = [s for s in all_samples if not s.get("is_hallucination")]

    # Splitting each class on its own keeps the class ratio roughly equal
    # across train, val and test.
    hal_train, hal_val, hal_test = _split_70_15_15(
        hal, "hallucinated", random_state
    )
    cln_train, cln_val, cln_test = _split_70_15_15(
        clean, "clean", random_state
    )

    train = hal_train + cln_train
    val = hal_val + cln_val
    test = hal_test + cln_test

    os.makedirs(splits_dir, exist_ok=True)
    outputs = (("train.json", train), ("val.json", val), ("test.json", test))
    for filename, samples in outputs:
        with open(os.path.join(splits_dir, filename), "w", encoding='utf-8') as f:
            json.dump(samples, f, indent=4)

    print(f"Splits saved to {splits_dir}")
    print(f"Train: {len(train)} (Hal: {len(hal_train)}, Clean: {len(cln_train)})")
    print(f"Val: {len(val)} (Hal: {len(hal_val)}, Clean: {len(cln_val)})")
    print(f"Test: {len(test)} (Hal: {len(hal_test)}, Clean: {len(cln_test)})")
if __name__ == "__main__":
    # Input and output directories are located relative to this file: the
    # project root is taken to be two directory levels above it.
    here = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.join(here, os.pardir, os.pardir)
    create_splits(
        os.path.join(repo_root, "data", "raw"),
        os.path.join(repo_root, "data", "splits"),
    )