""" Generates and saves all output charts for the gairaigo origin classifier. Charts produced: 1. class_distribution.png Bar chart showing how many loanword samples exist per origin language. Useful for understanding class imbalance before modeling. 2. confusion_matrix.png Heatmap of true vs. predicted language labels on the test set. The diagonal shows correct predictions; off-diagonal cells reveal which language pairs the model confuses most. 3. top_features.png Horizontal bar chart of the highest-coefficient character n-grams for each language class (from the LinearSVC weight vectors). Shows which phonetic patterns the model learned to associate with each donor language, the linguistically interesting output. """ import os import numpy as np import matplotlib import matplotlib.pyplot as plt import matplotlib.ticker as ticker import matplotlib.font_manager as fm import seaborn as sns import pandas as pd OUTPUT_DIR = "output/plots" _JAPANESE_FONT_CANDIDATES = [ "Meiryo", # Windows, bundled since Vista "Yu Gothic", # Windows 8.1+ "MS Gothic", # Windows, older fallback "Hiragino Sans", # macOS "Hiragino Kaku Gothic Pro", # macOS, older "Noto Sans CJK JP", # Linux / cross-platform "IPAGothic", # Linux "IPAPGothic", # Linux alternate ] def _configure_japanese_font() -> None: """ Detect and activate a Japanese-capable font for matplotlib. Scans the system font list once and sets rcParams['font.family'] to the first candidate found. Prints a one-line status so it is visible in the console without being noisy. Called automatically when this module loads. """ available_fonts = {f.name for f in fm.fontManager.ttflist} for font_name in _JAPANESE_FONT_CANDIDATES: if font_name in available_fonts: matplotlib.rcParams["font.family"] = font_name print(f' [font] Using "{font_name}" for katakana rendering.') return # No Japanese font found, charts will still save but glyphs may appear as boxes print( " [font] Warning: no Japanese font found on this system.\n" " Katakana labels in top_features.png may render as boxes.\n" " Install Meiryo, Yu Gothic, or Noto Sans CJK JP and rerun." ) # Run font detection once at import time so every chart function picks it up _configure_japanese_font() def save_class_distribution(df: pd.DataFrame): """ Plot and save a horizontal bar chart of loanword sample counts by origin language. Horizontal layout keeps full language names (e.g. 'Ancient Greek') readable on the left axis without overlapping, which would happen with vertical bars once names exceed about six characters. Args: df : Cleaned DataFrame with a 'language' column. """ os.makedirs(OUTPUT_DIR, exist_ok=True) # Sort ascending so the longest bar sits at the top of the chart counts = df["language"].value_counts().sort_values(ascending=True) fig, ax = plt.subplots(figsize=(10, 7)) bars = ax.barh( counts.index, counts.values, color="steelblue", edgecolor="white", height=0.65 ) # Annotate each bar with its exact sample count, placed just past the bar tip max_count = counts.values.max() for bar, count in zip(bars, counts.values): ax.text( bar.get_width() + max_count * 0.01, bar.get_y() + bar.get_height() / 2, str(count), ha="left", va="center", fontsize=9, color="#333333", ) ax.set_title("Gairaigo Sample Count by Origin Language", fontsize=14, pad=14) ax.set_xlabel("Number of Loanword Entries", fontsize=11) ax.set_ylabel("Origin Language", fontsize=11) ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True)) # Give the count labels room on the right so they are not clipped ax.set_xlim(right=max_count * 1.12) ax.spines[["top", "right"]].set_visible(False) plt.tight_layout() plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=150) plt.close() print(" [saved] class_distribution.png") def save_confusion_matrix(cm: np.ndarray, class_names: list): """ Plot and save a labeled heatmap of the confusion matrix. Args: cm : Confusion matrix array from sklearn.metrics.confusion_matrix. class_names : Ordered list of class label strings. """ os.makedirs(OUTPUT_DIR, exist_ok=True) fig, ax = plt.subplots(figsize=(9, 7)) sns.heatmap( cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, linewidths=0.5, ax=ax, ) ax.set_title("Confusion Matrix — Gairaigo Origin Classifier", fontsize=13, pad=14) ax.set_xlabel("Predicted Language", fontsize=11) ax.set_ylabel("True Language", fontsize=11) plt.tight_layout() plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150) plt.close() print(" [saved] confusion_matrix.png") def save_top_features(model, vectorizer, label_encoder, top_n: int = 10): """ Plot the top-weighted character n-grams for each language class. LinearSVC assigns a coefficient to every feature for every class. Higher coefficients mean the n-gram is a stronger positive signal for that class. Visualizing these reveals which phonetic sub-patterns the model learned to associate with each donor language. Args: model : Fitted LinearSVC model. vectorizer : Fitted TfidfVectorizer used during featurization. label_encoder : LabelEncoder used during preprocessing. top_n : Number of top features to show per class. """ os.makedirs(OUTPUT_DIR, exist_ok=True) feature_names = np.array(vectorizer.get_feature_names_out()) class_names = label_encoder.classes_ n_classes = len(class_names) # LinearSVC stores one coefficient row per class (one-vs-rest strategy) coefficients = model.coef_ cols = 3 rows = (n_classes + cols - 1) // cols fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 3.8)) axes = axes.flatten() for i, (class_name, coef_row) in enumerate(zip(class_names, coefficients)): top_indices = np.argsort(coef_row)[-top_n:] top_features = feature_names[top_indices] top_values = coef_row[top_indices] axes[i].barh(top_features, top_values, color="steelblue", edgecolor="white") axes[i].set_title(class_name, fontsize=11, fontweight="bold") axes[i].set_xlabel("Coefficient Weight", fontsize=8) axes[i].tick_params(axis="y", labelsize=8) axes[i].spines[["top", "right"]].set_visible(False) # Hide unused subplot panels when n_classes is not a multiple of cols for j in range(i + 1, len(axes)): axes[j].set_visible(False) fig.suptitle( f"Top {top_n} Character N-Gram Features per Language Class\n" f"(LinearSVC Coefficient Weights)", fontsize=13, y=1.01, ) plt.tight_layout() plt.savefig( os.path.join(OUTPUT_DIR, "top_features.png"), dpi=150, bbox_inches="tight" ) plt.close() print(" [saved] top_features.png")