Spaces:
Sleeping
Sleeping
File size: 2,770 Bytes
"""
Trains the gairaigo origin classifier on the training portion of a train/test
split (reported as 80/20), evaluates it on the held-out test portion, and saves
the trained artifacts to disk so that predict.py can load and use them without
needing to re-train.
Saved artifacts (written to models/):
model.joblib — the fitted LinearSVC classifier
vectorizer.joblib — the fitted TfidfVectorizer (char n-gram)
encoder.joblib — the fitted LabelEncoder (integer ↔ language name)
Usage:
python scripts/train.py
"""
import sys
import os
# Allow imports from the project root regardless of where the script is called from
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
import joblib
from src.loader import load_gairaigo
from src.preprocessor import preprocess, build_features
JMDICT_PATH = "data/JMdict"
MODEL_DIR = "models"
def main():
    """Run the training pipeline end to end and persist its artifacts.

    Steps, in order: load entries from JMDICT_PATH, preprocess them, build
    features, split into train/test sets, fit the model on the training
    portion, evaluate it on the test portion, and finally dump the model,
    the vectorizer and the label encoder into MODEL_DIR with joblib.

    JMDICT_PATH and MODEL_DIR are relative paths, so they are resolved
    against the current working directory of the process.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Stage 1: load the raw entries.
    print("\n[train] Loading gairaigo entries from JMdict...")
    raw_entries = load_gairaigo(JMDICT_PATH)
    print(f" {len(raw_entries):,} entries loaded.")

    # Stage 2: preprocess; also yields the encoder used for class names.
    print("\n[train] Preprocessing...")
    frame, encoder = preprocess(raw_entries)
    print(f" {len(frame):,} entries after dedup and class consolidation.")
    print(f" Classes: {list(encoder.classes_)}")

    # Stage 3: featurize.
    # NOTE(review): features are built from the whole "katakana" column
    # before the split below, so presumably the vectorizer is fit on rows
    # that later land in the test set — confirm this is intended.
    print("\n[train] Building character n-gram features...")
    features, vectorizer = build_features(frame["katakana"])
    labels = frame["label"].values
    n_samples = features.shape[0]
    n_features = features.shape[1]
    print(f" Feature matrix: {n_samples:,} samples × {n_features:,} features")

    # Stage 4: split. These imports stay local and at this point in the
    # sequence, exactly where the original performed them.
    from src.trainer import split_data, train_model
    from src.evaluator import evaluate

    print("\n[train] Splitting into train/test sets (80/20)...")
    # The two trailing dataframe splits are returned but not used here.
    X_train, X_test, y_train, y_test, _frame_train, _frame_test = split_data(
        features, labels, frame
    )
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    print(f" Train: {n_train:,} samples | Test: {n_test:,} samples")

    # Stage 5: fit on the training portion only.
    print("\n[train] Fitting LinearSVC on training set...")
    classifier = train_model(X_train, y_train)
    print(" Training complete.")

    # Stage 6: evaluate on the held-out portion.
    print("\n[train] Evaluating on test set...")
    evaluate(classifier, X_test, y_test, encoder)

    # Stage 7: persist all three artifacts, then report where they went.
    print(f"\n[train] Saving model artifacts to {MODEL_DIR}/ ...")
    artifacts = (
        (classifier, "model.joblib"),
        (vectorizer, "vectorizer.joblib"),
        (encoder, "encoder.joblib"),
    )
    saved_paths = []
    for artifact, filename in artifacts:
        target = os.path.join(MODEL_DIR, filename)
        joblib.dump(artifact, target)
        saved_paths.append(target)
    for target in saved_paths:
        print(f" Saved: {target}")

    print("\n[train] Done. Run predict.py to classify new katakana words.\n")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|