# Spaces: Sleeping  (appears to be page-status residue captured with this file; not part of the script)
# Split the processed BBC dataset into train / validation / test CSV files.
#
# The split runs in two stratified stages on the "label_text" column:
#   1. 70% train / 30% hold-out
#   2. the hold-out is halved into validation and test
#      (about 15% of the full dataset each)
# The three frames are written to data/splits/ and a short summary is printed.
import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Seed passed to both split calls so the same input yields the same partition.
RANDOM_STATE = 42

df = pd.read_csv("data/processed/processed_bbc.csv")

# Stage 1: carve off 30% of the rows, keeping label proportions.
train_df, temp_df = train_test_split(
    df,
    stratify=df["label_text"],
    test_size=0.30,
    random_state=RANDOM_STATE,
)

# Stage 2: divide the hold-out evenly into validation and test.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["label_text"],
    test_size=0.50,
    random_state=RANDOM_STATE,
)

# Persist each split; the output directory is created on demand.
os.makedirs("data/splits", exist_ok=True)
for out_path, frame in (
    ("data/splits/train.csv", train_df),
    ("data/splits/val.csv", val_df),
    ("data/splits/test.csv", test_df),
):
    frame.to_csv(out_path, index=False)

# Report the resulting sizes and the class balance of the training split.
print("Train/Validation/Test split completed.")
for caption, frame in (
    ("Train shape:", train_df),
    ("Validation shape:", val_df),
    ("Test shape:", test_df),
):
    print(caption, frame.shape)

print("\nTrain class distribution:")
print(train_df["label_text"].value_counts())