File size: 2,712 Bytes
caf26c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Handles the train/test split and fitting of the classifier.

Model: LinearSVC (Linear Support Vector Classifier)
  - Designed for high-dimensional sparse feature spaces, which is exactly
    what TF-IDF character n-gram matrices produce.
  - Performs multi-class classification using a one-vs-rest strategy
    (one binary classifier per class; the highest-scoring class wins).
  - class_weight='balanced' adjusts the penalty weight inversely
    proportional to class frequency, which corrects for the strong
    imbalance caused by English being the dominant donor language.
  - Significantly faster to train than kernel SVMs on large sparse inputs.
"""

import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split


# Fraction of samples held out for evaluation (the remaining 80 % is used
# for training).
TEST_SIZE: float = 0.2

# Seed shared by the split and the classifier so repeated runs are reproducible.
RANDOM_STATE: int = 42


def split_data(X, y, df):
    """
    Partition features, labels, and the source DataFrame into train/test sets.

    A single call to train_test_split divides X, y, and an array of row
    positions in one pass, so the rows selected from df correspond one-to-one
    with the rows of X_test / y_test (and likewise for the training side).
    That alignment is what the later CSV export relies on.

    The split is stratified on y so both partitions keep the same class
    proportions, which matters because the data is heavily skewed toward
    English entries. The held-out fraction and the seed come from the
    module-level TEST_SIZE and RANDOM_STATE constants.

    Args:
        X  : Sparse TF-IDF feature matrix (n_samples, n_features).
        y  : Integer label array of shape (n_samples,).
        df : Cleaned DataFrame aligned with X and y (same row order).

    Returns:
        Tuple of (X_train, X_test, y_train, y_test, df_train, df_test).
        Both DataFrames are re-indexed from 0.
    """
    # Positional row numbers travel through the split alongside X and y, so
    # the DataFrame can be sliced afterwards with exactly the chosen rows.
    row_positions = np.arange(len(df))

    partitions = train_test_split(
        X,
        y,
        row_positions,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=y,
    )
    X_train, X_test, y_train, y_test, train_rows, test_rows = partitions

    # Select by position, then drop the old index so each frame starts at 0.
    df_train, df_test = (
        df.iloc[rows].reset_index(drop=True) for rows in (train_rows, test_rows)
    )

    return X_train, X_test, y_train, y_test, df_train, df_test


def train_model(X_train, y_train) -> LinearSVC:
    """
    Build a LinearSVC, fit it on the training split, and return it.

    Args:
        X_train : Sparse training feature matrix.
        y_train : Integer training labels.

    Returns:
        The fitted LinearSVC, ready for prediction and evaluation.
    """
    svc_options = {
        # Re-weight classes to offset the English-heavy label imbalance.
        "class_weight": "balanced",
        # Allow extra iterations so larger datasets can converge.
        "max_iter": 2000,
        "random_state": RANDOM_STATE,
        # Solve the primal rather than the dual optimisation problem.
        "dual": False,
    }

    classifier = LinearSVC(**svc_options)
    classifier.fit(X_train, y_train)
    return classifier