lexir / src / plot_eval.py
irinaqqq's picture
ADDED MORE GRAPHS
c6cece9
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
def read_json(path):
    """Parse and return the JSON document stored at ``path`` (UTF-8)."""
    raw = Path(path).read_text(encoding="utf-8")
    return json.loads(raw)
def pick_models(files):
    """Load each JSON report, pairing it with its file stem.

    Files that are missing or unparseable are skipped silently so that one
    broken report does not block the remaining plots.
    """
    loaded = []
    for path in files:
        try:
            report = read_json(path)
            loaded.append((Path(path).stem, report))
        except Exception:
            continue
    return loaded
def metric_value(obj, scope, lang, metric):
    """Return a metric from the requested scope of a report, or None.

    ``scope`` is "overall" or "by_lang" (with ``lang`` selecting the
    language sub-section); any other scope yields None.
    """
    if scope == "overall":
        bucket = obj.get("overall", {})
    elif scope == "by_lang":
        bucket = obj.get("by_lang", {}).get(lang, {})
    else:
        return None
    return bucket.get(metric, None)
def section(obj, scope, lang):
    """Return the report sub-dict for ``scope`` (empty dict when unknown)."""
    if scope == "overall":
        found = obj.get("overall", {})
    elif scope == "by_lang":
        found = obj.get("by_lang", {}).get(lang, {})
    else:
        found = {}
    return found
def rank_stat_value(obj, scope, lang, key):
    """Fetch ``key`` from the scope's rank_stats block, or None if absent."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("rank_stats", {}).get(key, None)
def score_stat_value(obj, scope, lang, group, key):
    """Fetch score_stats[group][key] for the scope, or None if absent."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("score_stats", {}).get(group, {}).get(key, None)
def coverage_value(obj, scope, lang, key):
    """Fetch ``key`` from the scope's coverage block, or None if absent."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("coverage", {}).get(key, None)
def distribution_value(obj, scope, lang, key):
    """Fetch the list under distributions[key] for the scope ([] if absent)."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("distributions", {}).get(key, [])
def save_recall_plot(models, scope, lang, out_path):
    """Write a grouped bar chart of recall@{1,3,5,10}, one bar group per k.

    models: list of (file_stem, report_dict); a report's "model" field, when
    present, is used as the legend label.  Missing metrics plot as 0.0.
    """
    ks = [1, 3, 5, 10]
    x = np.arange(len(ks))
    # Share a 0.8-wide group among all models.
    width = 0.8 / max(1, len(models))
    plt.figure()
    ymax = 0.0
    for i, (name, obj) in enumerate(models):
        vals = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"recall@{k}")
            vals.append(0.0 if v is None else float(v))
        # Track the tallest bar here instead of re-querying every metric
        # in a second pass after the loop (the original recomputed them).
        ymax = max(ymax, max(vals))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    plt.xticks(x, [f"@{k}" for k in ks])
    title = "Recall@k"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    # 20% headroom above the tallest bar, clamped to [0.05, 1.0].
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_metrics_plot(models, scope, lang, out_path):
    """Grouped bar chart of MRR@10 and nDCG@10 for each model."""
    metrics = ["mrr@10", "ndcg@10"]
    positions = np.arange(len(metrics))
    bar_w = 0.8 / max(1, len(models))
    plt.figure()
    peak = 0.0
    for idx, (stem, report) in enumerate(models):
        raw = [metric_value(report, scope, lang, m) for m in metrics]
        heights = [0.0 if v is None else float(v) for v in raw]
        peak = max(peak, max(heights))
        offset = (idx - (len(models) - 1) / 2) * bar_w
        plt.bar(positions + offset, heights, width=bar_w, label=report.get("model", stem))
    plt.xticks(positions, metrics)
    where = "overall" if scope == "overall" else lang
    plt.title(f"Ranking metrics ({where})")
    plt.ylabel("score")
    plt.ylim(0, min(1.0, max(0.05, peak * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_precision_plot(models, scope, lang, out_path):
    """Grouped bar chart of precision@k; skipped entirely when no report
    provides any precision values for this scope."""
    ks = [1, 3, 5, 10]
    positions = np.arange(len(ks))
    bar_w = 0.8 / max(1, len(models))
    plt.figure()
    have_values = False
    peak = 0.0
    for idx, (stem, report) in enumerate(models):
        raw = [metric_value(report, scope, lang, f"precision@{k}") for k in ks]
        if any(v is not None for v in raw):
            have_values = True
        heights = [0.0 if v is None else float(v) for v in raw]
        peak = max(peak, max(heights))
        offset = (idx - (len(models) - 1) / 2) * bar_w
        plt.bar(positions + offset, heights, width=bar_w, label=report.get("model", stem))
    if not have_values:
        # Nothing to show: discard the figure without writing a file.
        plt.close()
        return
    plt.xticks(positions, [f"@{k}" for k in ks])
    where = "overall" if scope == "overall" else lang
    plt.title(f"Precision@k (single-positive) ({where})")
    plt.ylabel("score")
    plt.ylim(0, min(1.0, max(0.05, peak * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_recall_curve_plot(models, scope, lang, out_path):
    """Line plot of recall@k as a function of k, one line per model."""
    ks = [1, 3, 5, 10]
    xs = np.array(ks, dtype=float)
    plt.figure()
    peak = 0.0
    for stem, report in models:
        raw = [metric_value(report, scope, lang, f"recall@{k}") for k in ks]
        ys = [0.0 if v is None else float(v) for v in raw]
        peak = max(peak, max(ys))
        plt.plot(xs, ys, marker="o", label=report.get("model", stem))
    plt.xticks(xs, [f"@{k}" for k in ks])
    where = "overall" if scope == "overall" else lang
    plt.title(f"Recall@k vs k ({where})")
    plt.xlabel("k")
    plt.ylabel("recall")
    plt.ylim(0, min(1.0, max(0.05, peak * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_stats_plot(models, scope, lang, out_path):
    """Grouped bars of mean/median/p90 rank (1-based); skipped when no
    report carries rank_stats for this scope."""
    metrics = [("mean_rank", "Mean"), ("median_rank", "Median"), ("p90_rank", "P90")]
    positions = np.arange(len(metrics))
    bar_w = 0.8 / max(1, len(models))
    plt.figure()
    have_values = False
    for idx, (stem, report) in enumerate(models):
        raw = [rank_stat_value(report, scope, lang, key) for key, _ in metrics]
        if any(v is not None for v in raw):
            have_values = True
        # Missing stats become NaN so matplotlib leaves a gap, not a 0 bar.
        heights = [np.nan if v is None else float(v) for v in raw]
        offset = (idx - (len(models) - 1) / 2) * bar_w
        plt.bar(positions + offset, heights, width=bar_w, label=report.get("model", stem))
    if not have_values:
        plt.close()
        return
    plt.xticks(positions, [label for _, label in metrics])
    where = "overall" if scope == "overall" else lang
    plt.title(f"Rank stats (1-based) ({where})")
    plt.ylabel("rank")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_distribution_plot(models, scope, lang, out_path, not_found_out_path=None):
    """Grouped bars of where the positive document lands among top_k results.

    Only "found" queries (rank within [0, top_k)) contribute to the bucket
    shares; the rest are tallied as a per-model not-found rate, which is
    optionally plotted as a second chart to ``not_found_out_path``.
    """
    # The bucket count comes from the first report that declares top_k;
    # without it the x-axis is undefined, so nothing is drawn.
    top_k = None
    for _, obj in models:
        if "top_k" in obj:
            top_k = int(obj["top_k"])
            break
    if top_k is None:
        return
    x = np.arange(top_k)
    width = 0.8 / max(1, len(models))
    plt.figure()
    any_data = False
    not_found_rates = []
    not_found_labels = []
    for i, (name, obj) in enumerate(models):
        ranks = distribution_value(obj, scope, lang, "ranks")
        if not ranks:
            continue
        any_data = True
        buckets = [0] * top_k
        not_found = 0
        for r in ranks:
            # A None or out-of-range rank means the positive was not retrieved.
            if r is None or r < 0 or r >= top_k:
                not_found += 1
            else:
                buckets[int(r)] += 1
        total = max(1, len(ranks))  # guard against division by zero
        not_found_rates.append(not_found / total)
        not_found_labels.append(obj.get("model", name))
        found_total = total - not_found
        # Normalize by found queries only, so each model's bars sum to 1.
        if found_total <= 0:
            vals = [0.0] * top_k
        else:
            vals = [b / found_total for b in buckets]
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    if not any_data:
        plt.close()
        return
    labels = [str(i + 1) for i in range(top_k)]  # 1-based tick labels
    plt.xticks(x, labels)
    title = "Rank distribution (found only)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("share of found queries")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
    # Optional companion chart: one bar per model with its not-found share.
    if not_found_out_path and not_found_rates:
        plt.figure()
        x_nf = np.arange(len(not_found_rates))
        plt.bar(x_nf, not_found_rates)
        plt.xticks(x_nf, not_found_labels, rotation=15, ha="right")
        title = "Not found rate (NF)"
        if scope == "overall":
            plt.title(f"{title} (overall)")
        else:
            plt.title(f"{title} ({lang})")
        plt.ylabel("share of queries")
        plt.ylim(0, 1.0)
        Path(not_found_out_path).parent.mkdir(parents=True, exist_ok=True)
        plt.tight_layout()
        plt.savefig(not_found_out_path, dpi=180)
        plt.close()
def save_margin_boxplot(models, scope, lang, out_path):
    """Box plot (outliers hidden) of the top1-minus-top2 score margin."""
    data, labels = [], []
    for stem, report in models:
        margins = distribution_value(report, scope, lang, "margins")
        if margins:
            data.append(margins)
            labels.append(report.get("model", stem))
    if not data:
        return
    plt.figure(figsize=(8, 4.5))
    plt.boxplot(data, labels=labels, showfliers=False)
    where = "overall" if scope == "overall" else lang
    plt.title(f"Score margin (top1 - top2) ({where})")
    plt.ylabel("margin")
    plt.xticks(rotation=15, ha="right")
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_coverage_plot(models, scope, lang, out_path):
    """One bar per model showing its coverage_ratio; skipped when no model
    reports coverage for this scope."""
    ratios, labels = [], []
    for stem, report in models:
        v = coverage_value(report, scope, lang, "coverage_ratio")
        if v is not None:
            ratios.append(float(v))
            labels.append(report.get("model", stem))
    if not ratios:
        return
    x = np.arange(len(ratios))
    plt.figure()
    plt.bar(x, ratios)
    plt.xticks(x, labels, rotation=15, ha="right")
    where = "overall" if scope == "overall" else lang
    plt.title(f"Coverage ratio (unique docs / corpus) ({where})")
    plt.ylabel("ratio")
    plt.ylim(0, 1.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def _grouped_model_bars(models, value_fn, out_path, title, ylabel):
    """Three-bars-per-model chart comparing overall vs ru vs kz values.

    ``value_fn(report, scope, lang)`` supplies each value; None plots as 0.
    """
    labels = []
    series = {"overall": [], "ru": [], "kz": []}
    for stem, report in models:
        labels.append(report.get("model", stem))
        series["overall"].append(value_fn(report, "overall", None))
        series["ru"].append(value_fn(report, "by_lang", "ru"))
        series["kz"].append(value_fn(report, "by_lang", "kz"))
    if not labels:
        return
    x = np.arange(len(labels))
    width = 0.25
    plt.figure(figsize=(9, 4.8))
    for offset, key in zip((-width, 0.0, width), ("overall", "ru", "kz")):
        heights = [0.0 if v is None else float(v) for v in series[key]]
        plt.bar(x + offset, heights, width, label=key)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0, 1.0)
    plt.xticks(x, labels, rotation=15, ha="right")
    plt.grid(axis="y", alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_coverage_grouped(models, out_path):
    """Grouped coverage-ratio chart across overall/ru/kz scopes."""
    _grouped_model_bars(
        models,
        lambda report, scope, lang: coverage_value(report, scope, lang, "coverage_ratio"),
        out_path,
        "Coverage (overall/ru/kz)",
        "ratio",
    )
def save_not_found_grouped(models, out_path):
    """Grouped not-found-rate chart across overall/ru/kz scopes.

    Falls back to the rank_stats block when the metric is not present at
    the section's top level.
    """
    def value_fn(report, scope, lang):
        v = metric_value(report, scope, lang, "not_found_rate")
        return v if v is not None else rank_stat_value(report, scope, lang, "not_found_rate")
    _grouped_model_bars(
        models,
        value_fn,
        out_path,
        "Not found rate (overall/ru/kz)",
        "share of queries",
    )
def save_top1_score_hist(models, scope, lang, out_dir):
    """Per-model histograms of top-1 scores, split by whether the top-1 hit
    was the positive document.

    One PNG per model is written into ``out_dir``; models with neither a
    true-positive nor a false-positive score list are skipped.
    """
    for name, obj in models:
        tp = distribution_value(obj, scope, lang, "top1_scores_tp")
        fp = distribution_value(obj, scope, lang, "top1_scores_fp")
        if not tp and not fp:
            continue
        plt.figure()
        if tp:
            plt.hist(tp, bins=20, alpha=0.6, label="top-1 is positive")
        if fp:
            plt.hist(fp, bins=20, alpha=0.6, label="top-1 is not positive")
        title = "Top-1 score distribution"
        label = obj.get("model", name)
        if scope == "overall":
            plt.title(f"{title} ({label}, overall)")
        else:
            plt.title(f"{title} ({label}, {lang})")
        plt.xlabel("similarity score")
        plt.ylabel("count")
        plt.legend()
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        # File name encodes a normalized model key plus scope/lang suffix
        # (model_label_key is defined further down in this module).
        out_path = (
            Path(out_dir)
            / f"top1_score_tp_fp_{model_label_key(obj, name)}_{scope if scope else 'overall'}{'' if lang is None else '_' + lang}.png"
        )
        plt.tight_layout()
        plt.savefig(out_path, dpi=180)
        plt.close()
def save_metrics_heatmap(models, out_path):
    """Heatmap of the headline overall metrics, one row per model."""
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10", "not_found_rate"]
    labels, rows = [], []
    for stem, report in models:
        labels.append(report.get("model", stem))
        row = []
        for m in metrics:
            v = metric_value(report, "overall", None, m)
            if v is None:
                # Some reports keep not_found_rate inside rank_stats.
                v = rank_stat_value(report, "overall", None, m)
            row.append(0.0 if v is None else float(v))
        rows.append(row)
    if not rows:
        return
    grid = np.array(rows)
    plt.figure(figsize=(9, 3.8))
    image = plt.imshow(grid, aspect="auto", cmap="viridis")
    plt.yticks(np.arange(len(labels)), labels)
    plt.xticks(np.arange(len(metrics)), metrics, rotation=30, ha="right")
    plt.title("Metrics heatmap (overall)")
    plt.colorbar(image, fraction=0.046, pad=0.04)
    plt.tight_layout()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_cdf(models, out_path):
    """Cumulative share of queries whose positive appears within the top k."""
    # Take top_k from the first report that declares it.
    top_k = None
    for _, report in models:
        if "top_k" in report:
            top_k = int(report["top_k"])
            break
    if top_k is None:
        return
    ks = np.arange(1, top_k + 1)
    plt.figure(figsize=(8.5, 4.5))
    for stem, report in models:
        ranks = distribution_value(report, "overall", None, "ranks")
        if not ranks:
            continue
        total = max(1, len(ranks))
        ys = [sum(1 for r in ranks if r is not None and 0 <= r < k) / total for k in ks]
        plt.plot(ks, ys, marker="o", label=report.get("model", stem))
    plt.title("Rank CDF (overall)")
    plt.xlabel("k")
    plt.ylabel("share of queries with rank ≤ k")
    plt.ylim(0, 1.0)
    plt.grid(alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_confidence_scatter(models, out_path):
    """Scatter of mean top-1 similarity vs MRR@10, one labelled point per model."""
    points = []
    for stem, report in models:
        mrr = metric_value(report, "overall", None, "mrr@10")
        top1 = score_stat_value(report, "overall", None, "top1_score", "mean")
        if mrr is None or top1 is None:
            continue
        points.append((float(top1), float(mrr), report.get("model", stem)))
    if not points:
        return
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    plt.figure(figsize=(6.5, 4.5))
    plt.scatter(xs, ys, s=60)
    for px, py, label in points:
        plt.text(px, py, f" {label}", fontsize=9, ha="left", va="center")
    plt.title("Top-1 confidence vs MRR@10 (overall)")
    plt.xlabel("mean top-1 score")
    plt.ylabel("mrr@10")
    plt.grid(alpha=0.2)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def model_label_key(obj, name):
    """Normalize a model identifier to a short key: labse/finetuned/base.

    Falls back to the lowercased file stem for unrecognized models.  The
    original had two branches returning "base": one matched the full
    "paraphrase-multilingual-mpnet-base-v2" name and one matched "mpnet";
    the first was redundant because the full name contains "mpnet".
    """
    s = str(obj.get("model", name)).lower()
    if "labse" in s:
        return "labse"
    if "finetuned" in s or "artifacts" in s:
        return "finetuned"
    # Covers both the bare "mpnet" tag and the full
    # "paraphrase-multilingual-mpnet-base-v2" model name.
    if "mpnet" in s:
        return "base"
    return name.lower()
def select_model(models, key):
    """Return the first (name, report) pair whose label key matches, else None."""
    matches = (pair for pair in models if model_label_key(pair[1], pair[0]) == key)
    return next(matches, None)
def save_relative_improvement_plot(models, scope, lang, out_path):
    """Bar chart of the fine-tuned model's percentage gain over the base
    model on the headline metrics; skipped unless both models are present."""
    fin = select_model(models, "finetuned")
    base = select_model(models, "base")
    if fin is None or base is None:
        return
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10"]
    labels = ["R@1", "R@3", "R@5", "R@10", "MRR@10", "nDCG@10"]
    deltas = []
    for m in metrics:
        fv = metric_value(fin[1], scope, lang, m)
        bv = metric_value(base[1], scope, lang, m)
        fv = float(fv) if fv is not None else 0.0
        bv = float(bv) if bv is not None else 0.0
        # A non-positive baseline makes the ratio meaningless; leave a gap.
        deltas.append(np.nan if bv <= 0 else (fv - bv) / bv * 100.0)
    x = np.arange(len(metrics))
    plt.figure()
    plt.bar(x, deltas)
    plt.xticks(x, labels)
    where = "overall" if scope == "overall" else lang
    plt.title(f"Relative improvement vs base (%) ({where})")
    plt.ylabel("%")
    plt.axhline(0.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def main():
    """Load every eval_*.json report and render all comparison figures.

    Figures go to artifacts/reports/figures; a figures_summary.json listing
    the loaded reports and produced images is written alongside.
    """
    reports_dir = Path("artifacts/reports")
    files = sorted([str(p) for p in reports_dir.glob("eval_*.json")])
    models = pick_models(files)
    if not models:
        raise SystemExit("No eval_*.json found in artifacts/reports")
    fig_dir = reports_dir / "figures"
    fig_dir.mkdir(parents=True, exist_ok=True)
    # Overall-scope charts.
    save_recall_plot(models, "overall", None, fig_dir / "recall_overall.png")
    save_rank_metrics_plot(models, "overall", None, fig_dir / "rank_metrics_overall.png")
    save_recall_curve_plot(models, "overall", None, fig_dir / "recall_curve_overall.png")
    save_relative_improvement_plot(models, "overall", None, fig_dir / "relative_improvement_overall.png")
    save_precision_plot(models, "overall", None, fig_dir / "precision_overall.png")
    save_rank_stats_plot(models, "overall", None, fig_dir / "rank_stats_overall.png")
    save_rank_distribution_plot(
        models,
        "overall",
        None,
        fig_dir / "rank_distribution_overall.png",
        None,
    )
    save_margin_boxplot(models, "overall", None, fig_dir / "score_margin_overall.png")
    # Per-scope coverage plots removed in favor of grouped chart.
    save_top1_score_hist(models, "overall", None, fig_dir)
    # Cross-scope (overall/ru/kz) grouped charts and model comparisons.
    save_coverage_grouped(models, fig_dir / "coverage_grouped.png")
    save_not_found_grouped(models, fig_dir / "not_found_grouped.png")
    save_metrics_heatmap(models, fig_dir / "metrics_heatmap_overall.png")
    save_rank_cdf(models, fig_dir / "rank_cdf_overall.png")
    save_confidence_scatter(models, fig_dir / "confidence_scatter_overall.png")
    # Per-language versions of the per-scope charts.
    for lang in ["ru", "kz"]:
        save_recall_plot(models, "by_lang", lang, fig_dir / f"recall_{lang}.png")
        save_rank_metrics_plot(
            models, "by_lang", lang, fig_dir / f"rank_metrics_{lang}.png"
        )
        save_recall_curve_plot(
            models, "by_lang", lang, fig_dir / f"recall_curve_{lang}.png"
        )
        save_relative_improvement_plot(
            models, "by_lang", lang, fig_dir / f"relative_improvement_{lang}.png"
        )
        save_precision_plot(models, "by_lang", lang, fig_dir / f"precision_{lang}.png")
        save_rank_stats_plot(models, "by_lang", lang, fig_dir / f"rank_stats_{lang}.png")
        save_rank_distribution_plot(
            models,
            "by_lang",
            lang,
            fig_dir / f"rank_distribution_{lang}.png",
            None,
        )
    # Manifest of what was generated, for downstream tooling.
    summary = {
        "loaded_reports": [Path(f).name for f in files],
        "figures": [p.name for p in sorted(fig_dir.glob("*.png"))],
    }
    (reports_dir / "figures_summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
# Script entry point: generate all evaluation figures.
if __name__ == "__main__":
    main()