Spaces:

krislette
/

kataklassifer

Sleeping

App Files Files Community

kataklassifer / src /visualizer.py

krislette

Initial commit

caf26c9 about 1 month ago

raw

history blame contribute delete

7.36 kB

	"""
	Generates and saves all output charts for the gairaigo origin classifier.

	Charts produced:
	1. class_distribution.png
	Bar chart showing how many loanword samples exist per origin language.
	Useful for understanding class imbalance before modeling.

	2. confusion_matrix.png
	Heatmap of true vs. predicted language labels on the test set.
	The diagonal shows correct predictions; off-diagonal cells reveal
	which language pairs the model confuses most.

	3. top_features.png
	Horizontal bar chart of the highest-coefficient character n-grams
	for each language class (from the LinearSVC weight vectors).
	Shows which phonetic patterns the model learned to associate with
	each donor language, the linguistically interesting output.
	"""

	import os
	import numpy as np
	import matplotlib
	import matplotlib.pyplot as plt
	import matplotlib.ticker as ticker
	import matplotlib.font_manager as fm
	import seaborn as sns
	import pandas as pd


	OUTPUT_DIR = "output/plots"

	_JAPANESE_FONT_CANDIDATES = [
	"Meiryo", # Windows, bundled since Vista
	"Yu Gothic", # Windows 8.1+
	"MS Gothic", # Windows, older fallback
	"Hiragino Sans", # macOS
	"Hiragino Kaku Gothic Pro", # macOS, older
	"Noto Sans CJK JP", # Linux / cross-platform
	"IPAGothic", # Linux
	"IPAPGothic", # Linux alternate
	]


	def _configure_japanese_font() -> None:
	"""
	Detect and activate a Japanese-capable font for matplotlib.

	Scans the system font list once and sets rcParams['font.family'] to the
	first candidate found. Prints a one-line status so it is visible in the
	console without being noisy. Called automatically when this module loads.
	"""
	available_fonts = {f.name for f in fm.fontManager.ttflist}

	for font_name in _JAPANESE_FONT_CANDIDATES:
	if font_name in available_fonts:
	matplotlib.rcParams["font.family"] = font_name
	print(f' [font] Using "{font_name}" for katakana rendering.')
	return

	# No Japanese font found, charts will still save but glyphs may appear as boxes
	print(
	" [font] Warning: no Japanese font found on this system.\n"
	" Katakana labels in top_features.png may render as boxes.\n"
	" Install Meiryo, Yu Gothic, or Noto Sans CJK JP and rerun."
	)


	# Run font detection once at import time so every chart function picks it up
	_configure_japanese_font()


	def save_class_distribution(df: pd.DataFrame):
	"""
	Plot and save a horizontal bar chart of loanword sample counts by origin language.

	Horizontal layout keeps full language names (e.g. 'Ancient Greek') readable
	on the left axis without overlapping, which would happen with vertical bars
	once names exceed about six characters.

	Args:
	df : Cleaned DataFrame with a 'language' column.
	"""
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# Sort ascending so the longest bar sits at the top of the chart
	counts = df["language"].value_counts().sort_values(ascending=True)

	fig, ax = plt.subplots(figsize=(10, 7))
	bars = ax.barh(
	counts.index, counts.values, color="steelblue", edgecolor="white", height=0.65
	)

	# Annotate each bar with its exact sample count, placed just past the bar tip
	max_count = counts.values.max()
	for bar, count in zip(bars, counts.values):
	ax.text(
	bar.get_width() + max_count * 0.01,
	bar.get_y() + bar.get_height() / 2,
	str(count),
	ha="left",
	va="center",
	fontsize=9,
	color="#333333",
	)

	ax.set_title("Gairaigo Sample Count by Origin Language", fontsize=14, pad=14)
	ax.set_xlabel("Number of Loanword Entries", fontsize=11)
	ax.set_ylabel("Origin Language", fontsize=11)
	ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

	# Give the count labels room on the right so they are not clipped
	ax.set_xlim(right=max_count * 1.12)

	ax.spines[["top", "right"]].set_visible(False)
	plt.tight_layout()
	plt.savefig(os.path.join(OUTPUT_DIR, "class_distribution.png"), dpi=150)
	plt.close()
	print(" [saved] class_distribution.png")


	def save_confusion_matrix(cm: np.ndarray, class_names: list):
	"""
	Plot and save a labeled heatmap of the confusion matrix.

	Args:
	cm : Confusion matrix array from sklearn.metrics.confusion_matrix.
	class_names : Ordered list of class label strings.
	"""
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	fig, ax = plt.subplots(figsize=(9, 7))
	sns.heatmap(
	cm,
	annot=True,
	fmt="d",
	cmap="Blues",
	xticklabels=class_names,
	yticklabels=class_names,
	linewidths=0.5,
	ax=ax,
	)
	ax.set_title("Confusion Matrix — Gairaigo Origin Classifier", fontsize=13, pad=14)
	ax.set_xlabel("Predicted Language", fontsize=11)
	ax.set_ylabel("True Language", fontsize=11)
	plt.tight_layout()
	plt.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix.png"), dpi=150)
	plt.close()
	print(" [saved] confusion_matrix.png")


	def save_top_features(model, vectorizer, label_encoder, top_n: int = 10):
	"""
	Plot the top-weighted character n-grams for each language class.

	LinearSVC assigns a coefficient to every feature for every class.
	Higher coefficients mean the n-gram is a stronger positive signal for
	that class. Visualizing these reveals which phonetic sub-patterns
	the model learned to associate with each donor language.

	Args:
	model : Fitted LinearSVC model.
	vectorizer : Fitted TfidfVectorizer used during featurization.
	label_encoder : LabelEncoder used during preprocessing.
	top_n : Number of top features to show per class.
	"""
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	feature_names = np.array(vectorizer.get_feature_names_out())
	class_names = label_encoder.classes_
	n_classes = len(class_names)

	# LinearSVC stores one coefficient row per class (one-vs-rest strategy)
	coefficients = model.coef_

	cols = 3
	rows = (n_classes + cols - 1) // cols
	fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 3.8))
	axes = axes.flatten()

	for i, (class_name, coef_row) in enumerate(zip(class_names, coefficients)):
	top_indices = np.argsort(coef_row)[-top_n:]
	top_features = feature_names[top_indices]
	top_values = coef_row[top_indices]

	axes[i].barh(top_features, top_values, color="steelblue", edgecolor="white")
	axes[i].set_title(class_name, fontsize=11, fontweight="bold")
	axes[i].set_xlabel("Coefficient Weight", fontsize=8)
	axes[i].tick_params(axis="y", labelsize=8)
	axes[i].spines[["top", "right"]].set_visible(False)

	# Hide unused subplot panels when n_classes is not a multiple of cols
	for j in range(i + 1, len(axes)):
	axes[j].set_visible(False)

	fig.suptitle(
	f"Top {top_n} Character N-Gram Features per Language Class\n"
	f"(LinearSVC Coefficient Weights)",
	fontsize=13,
	y=1.01,
	)
	plt.tight_layout()
	plt.savefig(
	os.path.join(OUTPUT_DIR, "top_features.png"), dpi=150, bbox_inches="tight"
	)
	plt.close()
	print(" [saved] top_features.png")