Spaces:
Sleeping
Sleeping
File size: 870 Bytes
492754f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | import os
import pandas as pd
from sklearn.model_selection import train_test_split
# Split the processed dataset into stratified train / validation / test CSVs.
#
# The script reads one CSV, splits it 70 / 15 / 15 while keeping the class
# proportions of the label column in every split, writes the three splits to
# disk, and prints a short summary.

# Seed shared by both splits so repeated runs produce the same partition.
RANDOM_STATE = 42

# Input/output locations and the column used for stratification.
INPUT_CSV = "data/processed/processed_bbc.csv"
SPLIT_DIR = "data/splits"
LABEL_COLUMN = "label_text"

# 30% of the rows are held out first, then that hold-out is halved into
# validation and test, which yields a 70 / 15 / 15 split overall.
HOLDOUT_FRACTION = 0.30
TEST_FRACTION_OF_HOLDOUT = 0.50

df = pd.read_csv(INPUT_CSV)

# Fail early with an explicit message instead of a bare KeyError from pandas.
if LABEL_COLUMN not in df.columns:
    raise KeyError(
        f"Column {LABEL_COLUMN!r} not found in {INPUT_CSV}; "
        f"available columns: {list(df.columns)}"
    )

# First split: training set versus the temporary hold-out.
train_df, temp_df = train_test_split(
    df,
    test_size=HOLDOUT_FRACTION,
    random_state=RANDOM_STATE,
    stratify=df[LABEL_COLUMN],
)

# Second split: divide the hold-out into validation and test, stratifying on
# the hold-out's own labels.
val_df, test_df = train_test_split(
    temp_df,
    test_size=TEST_FRACTION_OF_HOLDOUT,
    random_state=RANDOM_STATE,
    stratify=temp_df[LABEL_COLUMN],
)

os.makedirs(SPLIT_DIR, exist_ok=True)

# Write each split without the pandas index column.
for split_name, split_df in (
    ("train", train_df),
    ("val", val_df),
    ("test", test_df),
):
    split_df.to_csv(os.path.join(SPLIT_DIR, f"{split_name}.csv"), index=False)

print("Train/Validation/Test split completed.")
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)
print("\nTrain class distribution:")
print(train_df[LABEL_COLUMN].value_counts())