File size: 1,398 Bytes
0e038ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd


def prepare_data(
    dataset_name: str = "David-Egea/phishing-texts",
    *,
    output_dir: str = "data",
    label_column: str = "phishing",
    random_state: int = 42,
) -> None:
    """Load a dataset and write stratified train/val/test CSV splits.

    The ``train`` split of ``dataset_name`` is loaded with ``load_dataset``,
    converted to a pandas DataFrame and divided into roughly 80% train,
    10% validation and 10% test, stratified on ``label_column``. The splits
    are written to ``train.csv``, ``val.csv`` and ``test.csv`` inside
    ``output_dir`` and progress is printed to stdout.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        output_dir: Directory that receives the CSV files; created if it
            does not exist.
        label_column: Column holding the class label used for stratification.
        random_state: Seed passed to both splits so results are reproducible.

    Raises:
        KeyError: If ``label_column`` is not a column of the loaded data.
    """
    print(f"Loading dataset: {dataset_name}...")
    # The dataset usually loads into a 'train' split if not specified
    ds = load_dataset(dataset_name)

    # Convert to pandas for easier manipulation/splitting
    df: pd.DataFrame = ds["train"].to_pandas()  # type: ignore

    print(f"Total samples: {len(df)}")

    # Fail with an explicit message instead of an opaque pandas lookup error.
    if label_column not in df.columns:
        raise KeyError(
            f"Label column {label_column!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    print(f"Class distribution:\n{df[label_column].value_counts(normalize=True)}")

    # 80% Train, 20% Temp (Val + Test)
    train_df, temp_df = train_test_split(
        df, test_size=0.2, random_state=random_state, stratify=df[label_column]
    )

    # Split temp into 50% Val, 50% Test (results in 10% each of total)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        random_state=random_state,
        stratify=temp_df[label_column],
    )

    print(f"Train samples: {len(train_df)}")
    print(f"Val samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save splits, one CSV per split, without the DataFrame index column
    splits = {"train": train_df, "val": val_df, "test": test_df}
    for split_name, split_df in splits.items():
        split_df.to_csv(os.path.join(output_dir, f"{split_name}.csv"), index=False)
    print(f"Splits saved to {output_dir}/ folder.")


# Script entry point: run the split with the default dataset when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    prepare_data()