"""Render comparison figures for retrieval-evaluation reports.

Reads ``artifacts/reports/eval_*.json`` files (one per model) and writes a
set of PNG charts — recall@k, ranking metrics, rank distributions,
coverage, score margins, etc. — plus a ``figures_summary.json`` manifest.

Expected report schema (as read by the accessors below): top-level keys
``model``, ``top_k``, ``overall`` and ``by_lang.<lang>``; each scope section
may contain flat metrics (``recall@k``, ``mrr@10`` …) and nested
``rank_stats`` / ``score_stats`` / ``coverage`` / ``distributions`` groups.
"""

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np


def read_json(path):
    """Load a UTF-8 JSON file and return the parsed object."""
    return json.loads(Path(path).read_text(encoding="utf-8"))


def pick_models(files):
    """Return ``[(file_stem, report_dict), ...]`` for every readable report.

    Unreadable or malformed files are skipped silently (deliberate
    best-effort loading so one broken report does not kill all plots).
    """
    items = []
    for p in files:
        try:
            j = read_json(p)
            items.append((Path(p).stem, j))
        except Exception:
            pass  # best-effort: ignore unreadable/invalid report files
    return items


def metric_value(obj, scope, lang, metric):
    """Fetch a flat metric from the given scope, or None if absent.

    scope is "overall" (ignores *lang*) or "by_lang" (indexes by *lang*).
    """
    if scope == "overall":
        return obj.get("overall", {}).get(metric, None)
    if scope == "by_lang":
        return obj.get("by_lang", {}).get(lang, {}).get(metric, None)
    return None


def section(obj, scope, lang):
    """Return the dict for the requested scope ({} when missing/unknown)."""
    if scope == "overall":
        return obj.get("overall", {})
    if scope == "by_lang":
        return obj.get("by_lang", {}).get(lang, {})
    return {}


def rank_stat_value(obj, scope, lang, key):
    """Fetch ``rank_stats[key]`` from the scope section, or None."""
    return section(obj, scope, lang).get("rank_stats", {}).get(key, None)


def score_stat_value(obj, scope, lang, group, key):
    """Fetch ``score_stats[group][key]`` from the scope section, or None."""
    return section(obj, scope, lang).get("score_stats", {}).get(group, {}).get(key, None)


def coverage_value(obj, scope, lang, key):
    """Fetch ``coverage[key]`` from the scope section, or None."""
    return section(obj, scope, lang).get("coverage", {}).get(key, None)


def distribution_value(obj, scope, lang, key):
    """Fetch ``distributions[key]`` from the scope section ([] when missing)."""
    return section(obj, scope, lang).get("distributions", {}).get(key, [])


def save_recall_plot(models, scope, lang, out_path):
    """Grouped bar chart of recall@{1,3,5,10} for each model."""
    ks = [1, 3, 5, 10]
    x = np.arange(len(ks))
    width = 0.8 / max(1, len(models))  # split the 0.8 band between models
    plt.figure()
    for i, (name, obj) in enumerate(models):
        vals = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"recall@{k}")
            vals.append(0.0 if v is None else float(v))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    plt.xticks(x, [f"@{k}" for k in ks])
    title = "Recall@k"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    # Scale the y-axis to the largest observed value (with 20% headroom),
    # clamped to [0.05, 1.0] so near-zero data still renders readably.
    ymax = max(
        [0.0]
        + [
            max([metric_value(o, scope, lang, f"recall@{k}") or 0.0 for k in ks])
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_rank_metrics_plot(models, scope, lang, out_path):
    """Grouped bar chart of MRR@10 and nDCG@10 for each model."""
    metrics = ["mrr@10", "ndcg@10"]
    x = np.arange(len(metrics))
    width = 0.8 / max(1, len(models))
    plt.figure()
    for i, (name, obj) in enumerate(models):
        vals = []
        for m in metrics:
            v = metric_value(obj, scope, lang, m)
            vals.append(0.0 if v is None else float(v))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    plt.xticks(x, metrics)
    title = "Ranking metrics"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    ymax = max(
        [0.0]
        + [
            max([metric_value(o, scope, lang, m) or 0.0 for m in metrics])
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_precision_plot(models, scope, lang, out_path):
    """Grouped bar chart of precision@{1,3,5,10}; skipped if no model has it."""
    ks = [1, 3, 5, 10]
    x = np.arange(len(ks))
    width = 0.8 / max(1, len(models))
    plt.figure()
    any_data = False
    for i, (name, obj) in enumerate(models):
        vals = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"precision@{k}")
            if v is not None:
                any_data = True
            vals.append(0.0 if v is None else float(v))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    if not any_data:
        plt.close()  # nothing to show; drop the empty figure
        return
    plt.xticks(x, [f"@{k}" for k in ks])
    title = "Precision@k (single-positive)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    ymax = max(
        [0.0]
        + [
            max([metric_value(o, scope, lang, f"precision@{k}") or 0.0 for k in ks])
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_recall_curve_plot(models, scope, lang, out_path):
    """Line plot of recall@k versus k for each model."""
    ks = [1, 3, 5, 10]
    xs = np.array(ks, dtype=float)
    plt.figure()
    for name, obj in models:
        ys = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"recall@{k}")
            ys.append(0.0 if v is None else float(v))
        plt.plot(xs, ys, marker="o", label=obj.get("model", name))
    plt.xticks(xs, [f"@{k}" for k in ks])
    title = "Recall@k vs k"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.xlabel("k")
    plt.ylabel("recall")
    ymax = max(
        [0.0]
        + [
            max([metric_value(o, scope, lang, f"recall@{k}") or 0.0 for k in ks])
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_rank_stats_plot(models, scope, lang, out_path):
    """Grouped bar chart of mean/median/p90 rank; skipped if no data."""
    metrics = [("mean_rank", "Mean"), ("median_rank", "Median"), ("p90_rank", "P90")]
    x = np.arange(len(metrics))
    width = 0.8 / max(1, len(models))
    plt.figure()
    any_data = False
    for i, (name, obj) in enumerate(models):
        vals = []
        for key, _ in metrics:
            v = rank_stat_value(obj, scope, lang, key)
            if v is not None:
                any_data = True
            # NaN (not 0) for missing values so absent bars are not drawn
            vals.append(np.nan if v is None else float(v))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    if not any_data:
        plt.close()
        return
    plt.xticks(x, [m[1] for m in metrics])
    title = "Rank stats (1-based)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("rank")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_rank_distribution_plot(models, scope, lang, out_path, not_found_out_path=None):
    """Histogram of 0-based ranks (found queries only) per model.

    Optionally also writes a "not found rate" bar chart when
    *not_found_out_path* is given. Uses the first model's ``top_k`` as the
    bucket count; returns early if no model declares ``top_k``.
    """
    top_k = None
    for _, obj in models:
        if "top_k" in obj:
            top_k = int(obj["top_k"])
            break
    if top_k is None:
        return
    x = np.arange(top_k)
    width = 0.8 / max(1, len(models))
    plt.figure()
    any_data = False
    not_found_rates = []
    not_found_labels = []
    for i, (name, obj) in enumerate(models):
        ranks = distribution_value(obj, scope, lang, "ranks")
        if not ranks:
            continue
        any_data = True
        buckets = [0] * top_k
        not_found = 0
        for r in ranks:
            # None or out-of-window rank means the positive was not retrieved
            if r is None or r < 0 or r >= top_k:
                not_found += 1
            else:
                buckets[int(r)] += 1
        total = max(1, len(ranks))
        not_found_rates.append(not_found / total)
        not_found_labels.append(obj.get("model", name))
        found_total = total - not_found
        if found_total <= 0:
            vals = [0.0] * top_k
        else:
            # Normalize over found queries only, so models with different
            # not-found rates remain comparable bucket-by-bucket.
            vals = [b / found_total for b in buckets]
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    if not any_data:
        plt.close()
        return
    labels = [str(i + 1) for i in range(top_k)]  # display ranks 1-based
    plt.xticks(x, labels)
    title = "Rank distribution (found only)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("share of found queries")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
    if not_found_out_path and not_found_rates:
        plt.figure()
        x_nf = np.arange(len(not_found_rates))
        plt.bar(x_nf, not_found_rates)
        plt.xticks(x_nf, not_found_labels, rotation=15, ha="right")
        title = "Not found rate (NF)"
        if scope == "overall":
            plt.title(f"{title} (overall)")
        else:
            plt.title(f"{title} ({lang})")
        plt.ylabel("share of queries")
        plt.ylim(0, 1.0)
        Path(not_found_out_path).parent.mkdir(parents=True, exist_ok=True)
        plt.tight_layout()
        plt.savefig(not_found_out_path, dpi=180)
        plt.close()


def save_margin_boxplot(models, scope, lang, out_path):
    """Box plot of the (top1 - top2) score margin per model; no-op if empty."""
    data = []
    labels = []
    for name, obj in models:
        margins = distribution_value(obj, scope, lang, "margins")
        if margins:
            data.append(margins)
            labels.append(obj.get("model", name))
    if not data:
        return
    plt.figure(figsize=(8, 4.5))
    plt.boxplot(data, labels=labels, showfliers=False)
    title = "Score margin (top1 - top2)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("margin")
    plt.xticks(rotation=15, ha="right")
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_coverage_plot(models, scope, lang, out_path):
    """Bar chart of coverage_ratio per model for a single scope.

    NOTE(review): currently unused by main() — superseded by
    save_coverage_grouped; kept for ad-hoc use.
    """
    vals = []
    labels = []
    for name, obj in models:
        v = coverage_value(obj, scope, lang, "coverage_ratio")
        if v is not None:
            vals.append(float(v))
            labels.append(obj.get("model", name))
    if not vals:
        return
    x = np.arange(len(vals))
    plt.figure()
    plt.bar(x, vals)
    plt.xticks(x, labels, rotation=15, ha="right")
    title = "Coverage ratio (unique docs / corpus)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("ratio")
    plt.ylim(0, 1.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def _grouped_model_bars(models, value_fn, out_path, title, ylabel):
    """Shared helper: per-model bars grouped by overall/ru/kz scopes.

    *value_fn(obj, scope, lang)* extracts the plotted value; None → 0.0.
    """
    labels = []
    overall_vals = []
    ru_vals = []
    kz_vals = []
    for name, obj in models:
        label = obj.get("model", name)
        labels.append(label)
        overall_vals.append(value_fn(obj, "overall", None))
        ru_vals.append(value_fn(obj, "by_lang", "ru"))
        kz_vals.append(value_fn(obj, "by_lang", "kz"))
    if not labels:
        return
    x = np.arange(len(labels))
    width = 0.25
    plt.figure(figsize=(9, 4.8))
    plt.bar(x - width, [0.0 if v is None else float(v) for v in overall_vals], width, label="overall")
    plt.bar(x, [0.0 if v is None else float(v) for v in ru_vals], width, label="ru")
    plt.bar(x + width, [0.0 if v is None else float(v) for v in kz_vals], width, label="kz")
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0, 1.0)
    plt.xticks(x, labels, rotation=15, ha="right")
    plt.grid(axis="y", alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_coverage_grouped(models, out_path):
    """Coverage ratio per model, grouped by overall/ru/kz."""
    def value_fn(obj, scope, lang):
        return coverage_value(obj, scope, lang, "coverage_ratio")

    _grouped_model_bars(
        models,
        value_fn,
        out_path,
        "Coverage (overall/ru/kz)",
        "ratio",
    )


def save_not_found_grouped(models, out_path):
    """Not-found rate per model, grouped by overall/ru/kz.

    Falls back to ``rank_stats.not_found_rate`` when the flat metric is
    absent (reports have stored it in either place).
    """
    def value_fn(obj, scope, lang):
        v = metric_value(obj, scope, lang, "not_found_rate")
        if v is None:
            v = rank_stat_value(obj, scope, lang, "not_found_rate")
        return v

    _grouped_model_bars(
        models,
        value_fn,
        out_path,
        "Not found rate (overall/ru/kz)",
        "share of queries",
    )


def save_top1_score_hist(models, scope, lang, out_dir):
    """Per-model histograms of top-1 scores, split by TP vs FP top-1.

    One PNG per model is written into *out_dir*; models lacking both
    distributions are skipped.
    """
    for name, obj in models:
        tp = distribution_value(obj, scope, lang, "top1_scores_tp")
        fp = distribution_value(obj, scope, lang, "top1_scores_fp")
        if not tp and not fp:
            continue
        plt.figure()
        if tp:
            plt.hist(tp, bins=20, alpha=0.6, label="top-1 is positive")
        if fp:
            plt.hist(fp, bins=20, alpha=0.6, label="top-1 is not positive")
        title = "Top-1 score distribution"
        label = obj.get("model", name)
        if scope == "overall":
            plt.title(f"{title} ({label}, overall)")
        else:
            plt.title(f"{title} ({label}, {lang})")
        plt.xlabel("similarity score")
        plt.ylabel("count")
        plt.legend()
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        out_path = (
            Path(out_dir)
            / f"top1_score_tp_fp_{model_label_key(obj, name)}_{scope if scope else 'overall'}{'' if lang is None else '_' + lang}.png"
        )
        plt.tight_layout()
        plt.savefig(out_path, dpi=180)
        plt.close()


def save_metrics_heatmap(models, out_path):
    """Heatmap of the headline overall metrics, one row per model."""
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10", "not_found_rate"]
    data = []
    labels = []
    for name, obj in models:
        labels.append(obj.get("model", name))
        row = []
        for m in metrics:
            v = metric_value(obj, "overall", None, m)
            if v is None:
                # not_found_rate may live under rank_stats instead
                v = rank_stat_value(obj, "overall", None, m)
            row.append(0.0 if v is None else float(v))
        data.append(row)
    if not data:
        return
    arr = np.array(data)
    plt.figure(figsize=(9, 3.8))
    im = plt.imshow(arr, aspect="auto", cmap="viridis")
    plt.yticks(np.arange(len(labels)), labels)
    plt.xticks(np.arange(len(metrics)), metrics, rotation=30, ha="right")
    plt.title("Metrics heatmap (overall)")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.tight_layout()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_rank_cdf(models, out_path):
    """CDF over k of the share of queries whose 0-based rank is < k."""
    top_k = None
    for _, obj in models:
        if "top_k" in obj:
            top_k = int(obj["top_k"])
            break
    if top_k is None:
        return
    ks = np.arange(1, top_k + 1)
    plt.figure(figsize=(8.5, 4.5))
    for name, obj in models:
        ranks = distribution_value(obj, "overall", None, "ranks")
        if not ranks:
            continue
        total = max(1, len(ranks))
        ys = []
        for k in ks:
            found = sum(1 for r in ranks if r is not None and r >= 0 and r < k)
            ys.append(found / total)
        plt.plot(ks, ys, marker="o", label=obj.get("model", name))
    plt.title("Rank CDF (overall)")
    plt.xlabel("k")
    plt.ylabel("share of queries with rank ≤ k")
    plt.ylim(0, 1.0)
    plt.grid(alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def save_confidence_scatter(models, out_path):
    """Scatter of mean top-1 score (x) vs MRR@10 (y), one point per model."""
    xs = []
    ys = []
    labels = []
    for name, obj in models:
        mrr = metric_value(obj, "overall", None, "mrr@10")
        top1 = score_stat_value(obj, "overall", None, "top1_score", "mean")
        if mrr is None or top1 is None:
            continue
        xs.append(float(top1))
        ys.append(float(mrr))
        labels.append(obj.get("model", name))
    if not xs:
        return
    plt.figure(figsize=(6.5, 4.5))
    plt.scatter(xs, ys, s=60)
    for x, y, label in zip(xs, ys, labels):
        plt.text(x, y, f" {label}", fontsize=9, ha="left", va="center")
    plt.title("Top-1 confidence vs MRR@10 (overall)")
    plt.xlabel("mean top-1 score")
    plt.ylabel("mrr@10")
    plt.grid(alpha=0.2)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def model_label_key(obj, name):
    """Map a model name to a short canonical key (labse/finetuned/base).

    Falls back to the lowercased file stem when no known pattern matches.
    """
    s = str(obj.get("model", name)).lower()
    if "labse" in s:
        return "labse"
    if "finetuned" in s or "artifacts" in s:
        return "finetuned"
    if "paraphrase-multilingual-mpnet-base-v2" in s:
        return "base"
    if "mpnet" in s:
        return "base"
    return name.lower()


def select_model(models, key):
    """Return the first ``(name, obj)`` whose canonical key equals *key*."""
    for name, obj in models:
        if model_label_key(obj, name) == key:
            return (name, obj)
    return None


def save_relative_improvement_plot(models, scope, lang, out_path):
    """Bar chart of the finetuned model's % improvement over the base model.

    Requires both a "finetuned" and a "base" model (per model_label_key);
    returns early otherwise. Metrics with a non-positive base value are
    plotted as NaN (no bar) to avoid division by zero.
    """
    fin = select_model(models, "finetuned")
    base = select_model(models, "base")
    if fin is None or base is None:
        return
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10"]
    labels = ["R@1", "R@3", "R@5", "R@10", "MRR@10", "nDCG@10"]
    fin_obj = fin[1]
    base_obj = base[1]
    vals = []
    for m in metrics:
        fv = metric_value(fin_obj, scope, lang, m)
        bv = metric_value(base_obj, scope, lang, m)
        fv = 0.0 if fv is None else float(fv)
        bv = 0.0 if bv is None else float(bv)
        if bv <= 0:
            vals.append(np.nan)
        else:
            vals.append((fv - bv) / bv * 100.0)
    x = np.arange(len(metrics))
    plt.figure()
    plt.bar(x, vals)
    plt.xticks(x, labels)
    title = "Relative improvement vs base (%)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("%")
    plt.axhline(0.0)  # zero line: above = improvement, below = regression
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def main():
    """Load all eval reports and emit every figure plus a JSON manifest."""
    reports_dir = Path("artifacts/reports")
    files = sorted([str(p) for p in reports_dir.glob("eval_*.json")])
    models = pick_models(files)
    if not models:
        raise SystemExit("No eval_*.json found in artifacts/reports")
    fig_dir = reports_dir / "figures"
    fig_dir.mkdir(parents=True, exist_ok=True)
    save_recall_plot(models, "overall", None, fig_dir / "recall_overall.png")
    save_rank_metrics_plot(models, "overall", None, fig_dir / "rank_metrics_overall.png")
    save_recall_curve_plot(models, "overall", None, fig_dir / "recall_curve_overall.png")
    save_relative_improvement_plot(models, "overall", None, fig_dir / "relative_improvement_overall.png")
    save_precision_plot(models, "overall", None, fig_dir / "precision_overall.png")
    save_rank_stats_plot(models, "overall", None, fig_dir / "rank_stats_overall.png")
    save_rank_distribution_plot(
        models,
        "overall",
        None,
        fig_dir / "rank_distribution_overall.png",
        None,
    )
    save_margin_boxplot(models, "overall", None, fig_dir / "score_margin_overall.png")
    # Per-scope coverage plots removed in favor of grouped chart.
    save_top1_score_hist(models, "overall", None, fig_dir)
    save_coverage_grouped(models, fig_dir / "coverage_grouped.png")
    save_not_found_grouped(models, fig_dir / "not_found_grouped.png")
    save_metrics_heatmap(models, fig_dir / "metrics_heatmap_overall.png")
    save_rank_cdf(models, fig_dir / "rank_cdf_overall.png")
    save_confidence_scatter(models, fig_dir / "confidence_scatter_overall.png")
    for lang in ["ru", "kz"]:
        save_recall_plot(models, "by_lang", lang, fig_dir / f"recall_{lang}.png")
        save_rank_metrics_plot(
            models, "by_lang", lang, fig_dir / f"rank_metrics_{lang}.png"
        )
        save_recall_curve_plot(
            models, "by_lang", lang, fig_dir / f"recall_curve_{lang}.png"
        )
        save_relative_improvement_plot(
            models, "by_lang", lang, fig_dir / f"relative_improvement_{lang}.png"
        )
        save_precision_plot(models, "by_lang", lang, fig_dir / f"precision_{lang}.png")
        save_rank_stats_plot(models, "by_lang", lang, fig_dir / f"rank_stats_{lang}.png")
        save_rank_distribution_plot(
            models,
            "by_lang",
            lang,
            fig_dir / f"rank_distribution_{lang}.png",
            None,
        )
    summary = {
        "loaded_reports": [Path(f).name for f in files],
        "figures": [p.name for p in sorted(fig_dir.glob("*.png"))],
    }
    (reports_dir / "figures_summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()