kataklassifer / main.py
krislette's picture
Initial commit
caf26c9
"""
main.py
Entry point for the Gairaigo Origin Language Classifier.
This script orchestrates the full machine learning pipeline in eight steps:
Step 1 — Load : Parse JMdict XML into a (katakana, language) DataFrame.
Step 2 — Preprocess : Remove duplicates, consolidate rare classes, encode labels.
Step 3 — Featurize : Build a TF-IDF character n-gram feature matrix.
Step 4 — Split : Divide data into training (80%) and test (20%) sets.
Step 5 — Train : Fit a LinearSVC classifier on the training features.
Step 6 — Evaluate : Compute accuracy, F1, and confusion matrix on the test set.
Step 7 — Visualize : Save all charts to output/plots/.
Step 8 — Export : Write per-word predictions to output/results/.
Usage:
python main.py
Make sure the JMdict file is placed at data/JMdict before running.
Download it from: https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project
"""
import os
import pandas as pd
from src.loader import load_gairaigo
from src.preprocessor import preprocess, build_features
from src.trainer import split_data, train_model
from src.evaluator import evaluate
from src.visualizer import (
save_class_distribution,
save_confusion_matrix,
save_top_features,
)
# Location of the JMdict file; passed to load_gairaigo() in Step 1.
JMDICT_PATH = "data/JMdict"
# Directory created in Step 8 to hold classified_loanwords.csv.
RESULTS_DIR = "output/results"
def main():
    """Run the gairaigo origin-language classification pipeline end to end.

    Executes the eight steps in order (load, preprocess, featurize, split,
    train, evaluate, visualize, export), printing progress to stdout.
    The test-split predictions are written to
    ``<RESULTS_DIR>/classified_loanwords.csv`` and the first ten rows are
    echoed to the console as a quick sanity check.

    Takes no arguments and returns ``None``; input and output locations come
    from the module-level ``JMDICT_PATH`` and ``RESULTS_DIR`` constants.
    """
    # ------------------------------------------------------------------
    # Step 1: Load
    # ------------------------------------------------------------------
    print("\n[Step 1] Loading gairaigo entries from JMdict...")
    df_raw = load_gairaigo(JMDICT_PATH)
    print(f" Loaded : {len(df_raw):,} gairaigo entries")
    print(f' Languages : {df_raw["language"].nunique()} unique donor languages')

    # ------------------------------------------------------------------
    # Step 2: Preprocess
    # ------------------------------------------------------------------
    print("\n[Step 2] Preprocessing...")
    df, label_encoder = preprocess(df_raw)
    print(f" After dedup + class consolidation: {len(df):,} entries")

    # Only needed for the summary printed below, so it is imported locally.
    from src.preprocessor import KEEP_LANGUAGES

    print(
        f" Class selection criteria : only {len(KEEP_LANGUAGES)} target languages kept"
    )
    print(f" Target languages : {sorted(KEEP_LANGUAGES)}")
    print(
        f" Final classes ({len(label_encoder.classes_)}): {list(label_encoder.classes_)}"
    )

    # ------------------------------------------------------------------
    # Step 3: Featurize
    # ------------------------------------------------------------------
    print("\n[Step 3] Building character n-gram feature matrix...")
    X, vectorizer = build_features(df["katakana"])
    y = df["label"].values
    print(f" Feature matrix : {X.shape[0]:,} samples × {X.shape[1]:,} n-gram features")

    # ------------------------------------------------------------------
    # Step 4: Split
    # ------------------------------------------------------------------
    print("\n[Step 4] Splitting into train / test sets (80 / 20, stratified)...")
    # The training rows of the DataFrame are not used afterwards; only the
    # test rows are needed to label the exported predictions.
    X_train, X_test, y_train, y_test, _df_train, df_test = split_data(X, y, df)
    print(f" Train size : {X_train.shape[0]:,} samples")
    print(f" Test size : {X_test.shape[0]:,} samples")

    # ------------------------------------------------------------------
    # Step 5: Train
    # ------------------------------------------------------------------
    print("\n[Step 5] Training LinearSVC classifier...")
    model = train_model(X_train, y_train)
    print(" Training complete.")

    # ------------------------------------------------------------------
    # Step 6: Evaluate
    # ------------------------------------------------------------------
    print("\n[Step 6] Evaluating on test set...")
    # `results` is indexed below by "confusion_matrix", "class_names" and
    # "y_pred".
    results = evaluate(model, X_test, y_test, label_encoder)

    # ------------------------------------------------------------------
    # Step 7: Visualize
    # ------------------------------------------------------------------
    print("\n[Step 7] Generating visualizations...")
    save_class_distribution(df)
    save_confusion_matrix(results["confusion_matrix"], results["class_names"])
    save_top_features(model, vectorizer, label_encoder)

    # ------------------------------------------------------------------
    # Step 8: Export
    # ------------------------------------------------------------------
    print("\n[Step 8] Exporting predictions to CSV...")
    os.makedirs(RESULTS_DIR, exist_ok=True)
    export_df = pd.DataFrame(
        {
            "katakana": df_test["katakana"].values,
            # Map encoded labels back to language names for readability.
            "true_language": label_encoder.inverse_transform(y_test),
            "predicted_language": label_encoder.inverse_transform(results["y_pred"]),
        }
    )
    export_df["correct"] = export_df["true_language"] == export_df["predicted_language"]
    csv_path = os.path.join(RESULTS_DIR, "classified_loanwords.csv")
    # "utf-8-sig" prepends a BOM -- presumably so spreadsheet tools detect
    # the encoding of the katakana column; confirm before changing.
    export_df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f" Exported {len(export_df):,} predictions → {csv_path}")

    # Print a small sample of predictions so we can do a quick sanity check
    print("\n Sample predictions (first 10 rows):")
    print(export_df.head(10).to_string(index=False))
    print("\n[Done] All steps complete.\n")
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()