# File size: 7,364 Bytes
# caf26c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""
Generates and saves all output charts for the gairaigo origin classifier.

Charts produced:
  1. class_distribution.png
       Bar chart showing how many loanword samples exist per origin language.
       Useful for understanding class imbalance before modeling.

  2. confusion_matrix.png
       Heatmap of true vs. predicted language labels on the test set.
       The diagonal shows correct predictions; off-diagonal cells reveal
       which language pairs the model confuses most.

  3. top_features.png
       Horizontal bar chart of the highest-coefficient character n-grams
       for each language class (from the LinearSVC weight vectors).
       Shows which phonetic patterns the model learned to associate with
       each donor language; this is the linguistically interesting output.
"""

import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.font_manager as fm
import seaborn as sns
import pandas as pd


# Directory every chart is written into. The path is relative, so it resolves
# against the current working directory of the running process.
OUTPUT_DIR = "output/plots"

# Font family names tried, in this order, by _configure_japanese_font().
# The first name that matplotlib's font manager reports as installed wins.
_JAPANESE_FONT_CANDIDATES = [
    "Meiryo",  # Windows, bundled since Vista
    "Yu Gothic",  # Windows 8.1+
    "MS Gothic",  # Windows, older fallback
    "Hiragino Sans",  # macOS
    "Hiragino Kaku Gothic Pro",  # macOS, older
    "Noto Sans CJK JP",  # Linux / cross-platform
    "IPAGothic",  # Linux
    "IPAPGothic",  # Linux alternate
]


def _configure_japanese_font() -> None:
    """
    Pick a font able to draw Japanese text and make it matplotlib's default.

    The names in _JAPANESE_FONT_CANDIDATES are checked, in order, against the
    fonts known to matplotlib's font manager. The first match is written to
    rcParams['font.family'] and reported with a one-line message. When nothing
    matches, rcParams is left untouched and a warning is printed instead.
    """
    installed = set()
    for entry in fm.fontManager.ttflist:
        installed.add(entry.name)

    chosen = next(
        (name for name in _JAPANESE_FONT_CANDIDATES if name in installed),
        None,
    )

    if chosen is None:
        # Charts are still written; katakana glyphs may simply show up as boxes
        print(
            "  [font] Warning: no Japanese font found on this system.\n"
            "         Katakana labels in top_features.png may render as boxes.\n"
            "         Install Meiryo, Yu Gothic, or Noto Sans CJK JP and rerun."
        )
        return

    matplotlib.rcParams["font.family"] = chosen
    print(f'  [font] Using "{chosen}" for katakana rendering.')


# Run font detection once at import time so every chart function picks it up.
# NOTE: this is an import-time side effect. Importing the module prints one
# status message and, when a candidate font is installed, changes matplotlib's
# global rcParams['font.family'].
_configure_japanese_font()


def save_class_distribution(df: pd.DataFrame):
    """
    Plot and save a horizontal bar chart of loanword sample counts by origin language.

    Horizontal layout keeps full language names (e.g. 'Ancient Greek') readable
    on the left axis without overlapping, which would happen with vertical bars
    once names exceed about six characters.

    The chart is written to OUTPUT_DIR/class_distribution.png (the directory is
    created if needed) and a one-line confirmation is printed.

    Args:
        df : Cleaned DataFrame with a 'language' column.

    Raises:
        KeyError   : If df has no 'language' column.
        ValueError : If the 'language' column holds no countable values
                     (empty DataFrame or only missing values), since there
                     would be nothing to plot.
    """
    # Sort ascending so the longest bar sits at the top of the chart
    counts = df["language"].value_counts().sort_values(ascending=True)

    # Fail before touching the filesystem or creating a figure: with no rows
    # the max() below would otherwise raise an opaque NumPy reduction error.
    if counts.empty:
        raise ValueError(
            "Cannot plot class distribution: the 'language' column has no values."
        )

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        bars = ax.barh(
            counts.index,
            counts.values,
            color="steelblue",
            edgecolor="white",
            height=0.65,
        )

        # Annotate each bar with its exact sample count, placed just past the bar tip
        max_count = counts.values.max()
        for bar, count in zip(bars, counts.values):
            ax.text(
                bar.get_width() + max_count * 0.01,
                bar.get_y() + bar.get_height() / 2,
                str(count),
                ha="left",
                va="center",
                fontsize=9,
                color="#333333",
            )

        ax.set_title("Gairaigo Sample Count by Origin Language", fontsize=14, pad=14)
        ax.set_xlabel("Number of Loanword Entries", fontsize=11)
        ax.set_ylabel("Origin Language", fontsize=11)
        ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

        # Give the count labels room on the right so they are not clipped
        ax.set_xlim(right=max_count * 1.12)

        ax.spines[["top", "right"]].set_visible(False)
        fig.tight_layout()
        fig.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=150)
    finally:
        # Close this specific figure even if drawing or saving fails, so a
        # failed call does not leave an open figure behind.
        plt.close(fig)
    print("  [saved] class_distribution.png")


def save_confusion_matrix(cm: np.ndarray, class_names: list):
    """
    Plot and save a labeled heatmap of the confusion matrix.

    Cells are annotated with integer counts when the matrix has an integer
    dtype, and with two-decimal values otherwise (e.g. a normalized matrix).
    The chart is written to OUTPUT_DIR/confusion_matrix.png (the directory is
    created if needed) and a one-line confirmation is printed.

    Args:
        cm          : Confusion matrix array from sklearn.metrics.confusion_matrix.
        class_names : Ordered list of class label strings.

    Raises:
        ValueError : If cm is not a square 2-D matrix with one row and one
                     column per entry in class_names.
    """
    cm = np.asarray(cm)
    labels = list(class_names)
    n_labels = len(labels)

    # A size mismatch would produce a mislabeled heatmap or an error deep in
    # the plotting stack; reject it up front with an actionable message.
    if cm.ndim != 2 or cm.shape != (n_labels, n_labels):
        raise ValueError(
            f"Confusion matrix shape {cm.shape} does not match "
            f"{n_labels} class name(s); expected ({n_labels}, {n_labels})."
        )

    # The "d" format only accepts integers; float matrices need a float format
    cell_format = "d" if np.issubdtype(cm.dtype, np.integer) else ".2f"

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    fig, ax = plt.subplots(figsize=(9, 7))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt=cell_format,
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            linewidths=0.5,
            ax=ax,
        )
        ax.set_title(
            "Confusion Matrix — Gairaigo Origin Classifier", fontsize=13, pad=14
        )
        ax.set_xlabel("Predicted Language", fontsize=11)
        ax.set_ylabel("True Language", fontsize=11)
        fig.tight_layout()
        fig.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
    finally:
        # Close this specific figure even if drawing or saving fails
        plt.close(fig)
    print("  [saved] confusion_matrix.png")


def _coefficient_rows_per_class(coefficients, n_classes: int) -> np.ndarray:
    """Return a 2-D array with exactly one coefficient row per class."""
    coefficients = np.atleast_2d(np.asarray(coefficients))

    if n_classes == 2 and coefficients.shape[0] == 1:
        # A binary linear model stores a single weight vector whose positive
        # direction points at classes_[1]; its negation is the evidence for
        # classes_[0]. Without this, only one panel would be drawn and it
        # would carry the wrong class label.
        coefficients = np.vstack([-coefficients[0], coefficients[0]])

    if coefficients.shape[0] != n_classes:
        raise ValueError(
            f"Model has {coefficients.shape[0]} coefficient row(s) but the "
            f"label encoder has {n_classes} class(es)."
        )
    return coefficients


def save_top_features(model, vectorizer, label_encoder, top_n: int = 10):
    """
    Plot the top-weighted character n-grams for each language class.

    LinearSVC assigns a coefficient to every feature for every class.
    Higher coefficients mean the n-gram is a stronger positive signal for
    that class. Visualizing these reveals which phonetic sub-patterns
    the model learned to associate with each donor language.

    For a two-class model, which stores a single coefficient row, the row is
    used as-is for the second class and negated for the first, so both
    classes get a correctly labeled panel.

    The chart is written to OUTPUT_DIR/top_features.png (the directory is
    created if needed) and a one-line confirmation is printed.

    Args:
        model         : Fitted LinearSVC model.
        vectorizer    : Fitted TfidfVectorizer used during featurization.
        label_encoder : LabelEncoder used during preprocessing.
        top_n         : Number of top features to show per class (at least 1).

    Raises:
        ValueError : If top_n is smaller than 1, or if the number of
                     coefficient rows cannot be matched to the classes.
    """
    # A slice of [-0:] would select every feature, so reject it explicitly
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}.")

    feature_names = np.asarray(vectorizer.get_feature_names_out())
    class_names = label_encoder.classes_
    n_classes = len(class_names)
    coefficients = _coefficient_rows_per_class(model.coef_, n_classes)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    cols = 3
    rows = (n_classes + cols - 1) // cols
    # squeeze=False always returns a 2-D array of axes, so flatten() is safe
    fig, axes = plt.subplots(
        rows, cols, figsize=(cols * 5, rows * 3.8), squeeze=False
    )
    axes = axes.flatten()

    try:
        for ax, class_name, coef_row in zip(axes, class_names, coefficients):
            # argsort is ascending, so the tail holds the largest weights and
            # the strongest feature ends up as the top bar
            top_indices = np.argsort(coef_row)[-top_n:]

            ax.barh(
                feature_names[top_indices],
                coef_row[top_indices],
                color="steelblue",
                edgecolor="white",
            )
            ax.set_title(class_name, fontsize=11, fontweight="bold")
            ax.set_xlabel("Coefficient Weight", fontsize=8)
            ax.tick_params(axis="y", labelsize=8)
            ax.spines[["top", "right"]].set_visible(False)

        # Hide unused subplot panels when n_classes is not a multiple of cols
        for unused_ax in axes[n_classes:]:
            unused_ax.set_visible(False)

        fig.suptitle(
            f"Top {top_n} Character N-Gram Features per Language Class\n"
            f"(LinearSVC Coefficient Weights)",
            fontsize=13,
            y=1.01,
        )
        fig.tight_layout()
        fig.savefig(
            os.path.join(OUTPUT_DIR, "top_features.png"),
            dpi=150,
            bbox_inches="tight",
        )
    finally:
        # Close this specific figure even if drawing or saving fails
        plt.close(fig)
    print("  [saved] top_features.png")