import os
import json
from sklearn.model_selection import train_test_split
from baseline import load_data

def _split_70_15_15(samples, label, random_state):
    """Split one class of samples into 70% train / 15% val / 15% test.

    Args:
        samples: List of samples that all belong to the same class.
        label: Human-readable class name; used only in the error message.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        A ``(train, val, test)`` tuple of lists.

    Raises:
        ValueError: If ``train_test_split`` rejects the input, e.g. because
            the class is empty or too small to populate all three parts.
            The original scikit-learn error is chained as the cause.
    """
    try:
        # Carve off 30% first, then halve that remainder into val and test.
        train, held_out = train_test_split(
            samples, test_size=0.30, random_state=random_state
        )
        val, test = train_test_split(
            held_out, test_size=0.50, random_state=random_state
        )
    except ValueError as err:
        raise ValueError(
            f"Cannot split {len(samples)} '{label}' sample(s) into "
            "train/val/test (70/15/15)."
        ) from err
    return train, val, test


def create_splits(data_dir, splits_dir, *, random_state=42):
    """Create per-class 70/15/15 train/val/test splits and save them as JSON.

    Samples are loaded with ``load_data(data_dir)`` and partitioned by the
    truthiness of their ``"is_hallucination"`` entry. Each class is split
    independently (70/15/15) and the per-class parts are concatenated, with
    hallucinated samples first, into ``train.json``, ``val.json`` and
    ``test.json`` inside ``splits_dir``. A short summary is printed.

    Args:
        data_dir: Directory passed to ``load_data``.
        splits_dir: Output directory; created if it does not exist.
        random_state: Seed for the shuffling done by ``train_test_split``.
            The default of 42 reproduces the previously hard-coded seed.

    Raises:
        ValueError: If either class cannot be split three ways (for example
            when it is empty).
    """
    print("Loading raw dataset...")
    all_samples = load_data(data_dir)

    # Single pass: route each sample to its class bucket, preserving order.
    hal, clean = [], []
    for sample in all_samples:
        bucket = hal if sample.get("is_hallucination") else clean
        bucket.append(sample)

    # Splitting each class separately keeps the class ratio across splits.
    hal_train, hal_val, hal_test = _split_70_15_15(
        hal, "hallucinated", random_state
    )
    cln_train, cln_val, cln_test = _split_70_15_15(
        clean, "clean", random_state
    )

    train = hal_train + cln_train
    val = hal_val + cln_val
    test = hal_test + cln_test

    os.makedirs(splits_dir, exist_ok=True)

    for split_name, split_samples in (
        ("train", train),
        ("val", val),
        ("test", test),
    ):
        out_path = os.path.join(splits_dir, f"{split_name}.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(split_samples, f, indent=4)

    print(f"Splits saved to {splits_dir}")
    print(f"Train: {len(train)} (Hal: {len(hal_train)}, Clean: {len(cln_train)})")
    print(f"Val:   {len(val)} (Hal: {len(hal_val)}, Clean: {len(cln_val)})")
    print(f"Test:  {len(test)} (Hal: {len(hal_test)}, Clean: {len(cln_test)})")

if __name__ == "__main__":
    # Locate <project root>/data relative to this script, which sits two
    # directory levels below the project root.
    here = os.path.dirname(os.path.abspath(__file__))
    raw_dir = os.path.join(here, "..", "..", "data", "raw")
    out_dir = os.path.join(here, "..", "..", "data", "splits")

    create_splits(raw_dir, out_dir)