|
|
import json |
|
|
from pathlib import Path |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
def read_json(path):
    """Read the UTF-8 file at *path* (str or Path) and parse it as JSON."""
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text)
|
|
|
|
|
|
|
|
def pick_models(files):
    """Load evaluation reports, returning ``(stem, parsed_json)`` pairs.

    Files that cannot be read or parsed are skipped so a single corrupt
    report does not abort figure generation.

    Args:
        files: iterable of filesystem paths to JSON report files.

    Returns:
        List of ``(name, obj)`` tuples where ``name`` is the file stem and
        ``obj`` is the decoded JSON object.
    """
    items = []
    for p in files:
        try:
            j = read_json(p)
        except (OSError, ValueError):
            # OSError: unreadable file; ValueError covers
            # json.JSONDecodeError.  Anything else is a programming error
            # and should surface instead of being silently swallowed.
            continue
        items.append((Path(p).stem, j))
    return items
|
|
|
|
|
|
|
|
def metric_value(obj, scope, lang, metric):
    """Return a flat metric (e.g. "recall@5") from a report, or None.

    scope is "overall" (lang ignored) or "by_lang" (lang selects the
    sub-dict); any other scope yields None.
    """
    if scope == "overall":
        block = obj.get("overall", {})
    elif scope == "by_lang":
        block = obj.get("by_lang", {}).get(lang, {})
    else:
        return None
    return block.get(metric)
|
|
|
|
|
|
|
|
def section(obj, scope, lang):
    """Select the stats sub-dict for *scope*; unknown scopes yield {}."""
    if scope == "by_lang":
        return obj.get("by_lang", {}).get(lang, {})
    if scope == "overall":
        return obj.get("overall", {})
    return {}
|
|
|
|
|
|
|
|
def rank_stat_value(obj, scope, lang, key):
    """Look up one entry of the scope's "rank_stats" dict, or None."""
    stats = section(obj, scope, lang).get("rank_stats", {})
    return stats.get(key)
|
|
|
|
|
|
|
|
def score_stat_value(obj, scope, lang, group, key):
    """Fetch score_stats[group][key] for the given scope, or None."""
    group_stats = section(obj, scope, lang).get("score_stats", {}).get(group, {})
    return group_stats.get(key)
|
|
|
|
|
|
|
|
def coverage_value(obj, scope, lang, key):
    """Fetch coverage[key] for the given scope, or None."""
    coverage = section(obj, scope, lang).get("coverage", {})
    return coverage.get(key)
|
|
|
|
|
|
|
|
def distribution_value(obj, scope, lang, key):
    """Fetch distributions[key] for the given scope; missing keys yield []."""
    dists = section(obj, scope, lang).get("distributions", {})
    return dists.get(key, [])
|
|
|
|
|
|
|
|
def save_recall_plot(models, scope, lang, out_path):
    """Grouped bar chart of recall@k (k in 1/3/5/10), one bar group per k.

    models: list of (name, report_dict) pairs as produced by pick_models().
    scope/lang: "overall" (lang ignored) or "by_lang" with a language code.
    Missing metrics are plotted as 0.0 bars.
    """
    ks = [1, 3, 5, 10]
    x = np.arange(len(ks))
    # All models share a 0.8-wide band around each tick.
    width = 0.8 / max(1, len(models))

    plt.figure()
    for i, (name, obj) in enumerate(models):
        vals = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"recall@{k}")
            vals.append(0.0 if v is None else float(v))
        plt.bar(
            # Offset so the group of model bars is centered on the tick.
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )

    plt.xticks(x, [f"@{k}" for k in ks])
    title = "Recall@k"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    # 20% headroom above the tallest bar, clamped into [0.05, 1.0].
    ymax = max(
        [0.0]
        + [
            max(
                [
                    metric_value(o, scope, lang, f"recall@{k}") or 0.0
                    for k in ks
                ]
            )
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_rank_metrics_plot(models, scope, lang, out_path):
    """Grouped bar chart of MRR@10 and nDCG@10 per model.

    Missing metrics render as 0.0 bars.
    """
    metrics = ["mrr@10", "ndcg@10"]
    x = np.arange(len(metrics))
    # All models share a 0.8-wide band around each tick.
    width = 0.8 / max(1, len(models))

    plt.figure()
    for i, (name, obj) in enumerate(models):
        vals = []
        for m in metrics:
            v = metric_value(obj, scope, lang, m)
            vals.append(0.0 if v is None else float(v))
        plt.bar(
            # Offset so the group of model bars is centered on the tick.
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )

    plt.xticks(x, metrics)
    title = "Ranking metrics"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    # 20% headroom above the tallest bar, clamped into [0.05, 1.0].
    ymax = max(
        [0.0]
        + [
            max([metric_value(o, scope, lang, m) or 0.0 for m in metrics])
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_precision_plot(models, scope, lang, out_path):
    """Grouped bar chart of precision@k; writes nothing if no report has it.

    precision@k may be absent from reports (it is only meaningful for
    single-positive queries), so the figure is dropped unless at least one
    model provides a value.
    """
    ks = [1, 3, 5, 10]
    x = np.arange(len(ks))
    width = 0.8 / max(1, len(models))

    plt.figure()
    any_data = False
    for i, (name, obj) in enumerate(models):
        vals = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"precision@{k}")
            if v is not None:
                any_data = True
            vals.append(0.0 if v is None else float(v))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )

    if not any_data:
        # No model reported precision: discard the figure without saving.
        plt.close()
        return

    plt.xticks(x, [f"@{k}" for k in ks])
    title = "Precision@k (single-positive)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    # 20% headroom above the tallest bar, clamped into [0.05, 1.0].
    ymax = max(
        [0.0]
        + [
            max(
                [
                    metric_value(o, scope, lang, f"precision@{k}") or 0.0
                    for k in ks
                ]
            )
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_recall_curve_plot(models, scope, lang, out_path):
    """Line plot of recall@k versus k (1/3/5/10), one curve per model.

    Missing metrics plot as 0.0 points.
    """
    ks = [1, 3, 5, 10]
    xs = np.array(ks, dtype=float)

    plt.figure()
    for name, obj in models:
        ys = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"recall@{k}")
            ys.append(0.0 if v is None else float(v))
        plt.plot(xs, ys, marker="o", label=obj.get("model", name))

    plt.xticks(xs, [f"@{k}" for k in ks])
    title = "Recall@k vs k"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.xlabel("k")
    plt.ylabel("recall")
    # 20% headroom above the highest point, clamped into [0.05, 1.0].
    ymax = max(
        [0.0]
        + [
            max(
                [
                    metric_value(o, scope, lang, f"recall@{k}") or 0.0
                    for k in ks
                ]
            )
            for _, o in models
        ]
    )
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_rank_stats_plot(models, scope, lang, out_path):
    """Grouped bars for mean/median/p90 of the (1-based) rank statistics.

    Skips saving when no report carries rank_stats.  Missing individual
    values are appended as NaN, which matplotlib draws as a gap rather
    than a misleading zero-height bar.
    """
    metrics = [("mean_rank", "Mean"), ("median_rank", "Median"), ("p90_rank", "P90")]
    x = np.arange(len(metrics))
    width = 0.8 / max(1, len(models))

    plt.figure()
    any_data = False
    for i, (name, obj) in enumerate(models):
        vals = []
        for key, _ in metrics:
            v = rank_stat_value(obj, scope, lang, key)
            if v is not None:
                any_data = True
            vals.append(np.nan if v is None else float(v))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )

    if not any_data:
        # No model reported rank stats: discard the figure without saving.
        plt.close()
        return

    plt.xticks(x, [m[1] for m in metrics])
    title = "Rank stats (1-based)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("rank")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_rank_distribution_plot(models, scope, lang, out_path, not_found_out_path=None):
    """Histogram of where the positive lands (ranks 1..top_k) per model.

    Entries in the "ranks" distribution are treated as 0-based (bucket 0 is
    labeled "1"); anything None, negative, or >= top_k counts as "not
    found".  Bars are normalized by the number of *found* queries only.
    When not_found_out_path is given, a second figure with each model's
    not-found rate is written as well.
    """
    # Take top_k from the first report that declares it; without one the
    # histogram cannot be sized, so bail out quietly.
    top_k = None
    for _, obj in models:
        if "top_k" in obj:
            top_k = int(obj["top_k"])
            break
    if top_k is None:
        return

    x = np.arange(top_k)
    width = 0.8 / max(1, len(models))

    plt.figure()
    any_data = False
    not_found_rates = []
    not_found_labels = []
    for i, (name, obj) in enumerate(models):
        ranks = distribution_value(obj, scope, lang, "ranks")
        if not ranks:
            continue
        any_data = True
        buckets = [0] * top_k
        not_found = 0
        for r in ranks:
            if r is None or r < 0 or r >= top_k:
                not_found += 1
            else:
                buckets[int(r)] += 1
        # max(1, ...) guards the divisions below against an empty list.
        total = max(1, len(ranks))
        not_found_rates.append(not_found / total)
        not_found_labels.append(obj.get("model", name))
        found_total = total - not_found
        if found_total <= 0:
            vals = [0.0] * top_k
        else:
            # Share of *found* queries per rank bucket.
            vals = [b / found_total for b in buckets]
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )

    if not any_data:
        # No model provided a ranks distribution: nothing to save.
        plt.close()
        return

    # Ticks show 1-based ranks for readability.
    labels = [str(i + 1) for i in range(top_k)]
    plt.xticks(x, labels)
    title = "Rank distribution (found only)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("share of found queries")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()

    # Optional companion chart: per-model not-found rate.
    if not_found_out_path and not_found_rates:
        plt.figure()
        x_nf = np.arange(len(not_found_rates))
        plt.bar(x_nf, not_found_rates)
        plt.xticks(x_nf, not_found_labels, rotation=15, ha="right")
        title = "Not found rate (NF)"
        if scope == "overall":
            plt.title(f"{title} (overall)")
        else:
            plt.title(f"{title} ({lang})")
        plt.ylabel("share of queries")
        plt.ylim(0, 1.0)
        Path(not_found_out_path).parent.mkdir(parents=True, exist_ok=True)
        plt.tight_layout()
        plt.savefig(not_found_out_path, dpi=180)
        plt.close()
|
|
|
|
|
|
|
|
def save_margin_boxplot(models, scope, lang, out_path):
    """Box plot (fliers hidden) of the top1-minus-top2 score margin per model.

    Models lacking a "margins" distribution are skipped; when none has one,
    no file is written.
    """
    data = []
    labels = []
    for name, obj in models:
        margins = distribution_value(obj, scope, lang, "margins")
        if margins:
            data.append(margins)
            labels.append(obj.get("model", name))

    if not data:
        return

    plt.figure(figsize=(8, 4.5))
    # NOTE(review): boxplot's `labels=` kwarg was renamed `tick_labels=` in
    # matplotlib 3.9 — confirm the pinned matplotlib version still accepts it.
    plt.boxplot(data, labels=labels, showfliers=False)
    title = "Score margin (top1 - top2)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("margin")
    plt.xticks(rotation=15, ha="right")
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_coverage_plot(models, scope, lang, out_path):
    """Bar chart of each model's coverage_ratio (unique docs / corpus).

    Models without the statistic are skipped; when none has it, no file is
    written.
    """
    vals = []
    labels = []
    for name, obj in models:
        v = coverage_value(obj, scope, lang, "coverage_ratio")
        if v is not None:
            vals.append(float(v))
            labels.append(obj.get("model", name))

    if not vals:
        return

    x = np.arange(len(vals))
    plt.figure()
    plt.bar(x, vals)
    plt.xticks(x, labels, rotation=15, ha="right")
    title = "Coverage ratio (unique docs / corpus)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("ratio")
    # Coverage is a share, so the axis is pinned to [0, 1].
    plt.ylim(0, 1.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def _grouped_model_bars(models, value_fn, out_path, title, ylabel):
    """Shared renderer: per-model bars for overall/ru/kz side by side.

    value_fn(obj, scope, lang) -> float | None; None is drawn as 0.0.
    The y-axis is fixed to [0, 1], so value_fn is expected to return
    share-like values.
    """
    labels = []
    overall_vals = []
    ru_vals = []
    kz_vals = []

    for name, obj in models:
        label = obj.get("model", name)
        labels.append(label)
        overall_vals.append(value_fn(obj, "overall", None))
        ru_vals.append(value_fn(obj, "by_lang", "ru"))
        kz_vals.append(value_fn(obj, "by_lang", "kz"))

    if not labels:
        return

    x = np.arange(len(labels))
    # Three fixed-width bars (overall/ru/kz) per model tick.
    width = 0.25

    plt.figure(figsize=(9, 4.8))
    plt.bar(x - width, [0.0 if v is None else float(v) for v in overall_vals], width, label="overall")
    plt.bar(x, [0.0 if v is None else float(v) for v in ru_vals], width, label="ru")
    plt.bar(x + width, [0.0 if v is None else float(v) for v in kz_vals], width, label="kz")
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0, 1.0)
    plt.xticks(x, labels, rotation=15, ha="right")
    plt.grid(axis="y", alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_coverage_grouped(models, out_path):
    """Grouped coverage-ratio bars (overall/ru/kz) for every model."""

    def _coverage(obj, scope, lang):
        # Delegate the per-scope lookup to the shared accessor.
        return coverage_value(obj, scope, lang, "coverage_ratio")

    _grouped_model_bars(models, _coverage, out_path, "Coverage (overall/ru/kz)", "ratio")
|
|
|
|
|
|
|
|
def save_not_found_grouped(models, out_path):
    """Grouped not-found-rate bars (overall/ru/kz) for every model.

    The rate is read from the flat metrics first, falling back to the
    "rank_stats" section when absent there.
    """

    def _nf_rate(obj, scope, lang):
        rate = metric_value(obj, scope, lang, "not_found_rate")
        if rate is not None:
            return rate
        return rank_stat_value(obj, scope, lang, "not_found_rate")

    _grouped_model_bars(models, _nf_rate, out_path, "Not found rate (overall/ru/kz)", "share of queries")
|
|
|
|
|
|
|
|
def save_top1_score_hist(models, scope, lang, out_dir):
    """One histogram per model: top-1 scores split by TP vs FP outcome.

    Writes files named ``top1_score_tp_fp_<model-key>_<scope>[_<lang>].png``
    into *out_dir*; models lacking both distributions are skipped.
    """
    for name, obj in models:
        tp = distribution_value(obj, scope, lang, "top1_scores_tp")
        fp = distribution_value(obj, scope, lang, "top1_scores_fp")
        if not tp and not fp:
            continue
        plt.figure()
        if tp:
            plt.hist(tp, bins=20, alpha=0.6, label="top-1 is positive")
        if fp:
            plt.hist(fp, bins=20, alpha=0.6, label="top-1 is not positive")
        title = "Top-1 score distribution"
        label = obj.get("model", name)
        if scope == "overall":
            plt.title(f"{title} ({label}, overall)")
        else:
            plt.title(f"{title} ({label}, {lang})")
        plt.xlabel("similarity score")
        plt.ylabel("count")
        plt.legend()
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        # Filename encodes normalized model key, scope, and optional language.
        out_path = (
            Path(out_dir)
            / f"top1_score_tp_fp_{model_label_key(obj, name)}_{scope if scope else 'overall'}{'' if lang is None else '_' + lang}.png"
        )
        plt.tight_layout()
        plt.savefig(out_path, dpi=180)
        plt.close()
|
|
|
|
|
|
|
|
def save_metrics_heatmap(models, out_path):
    """Models-by-metrics heatmap over the overall scope.

    Each metric is looked up in the flat metrics first, then in the
    "rank_stats" section; values still missing render as 0.0.
    """
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10", "not_found_rate"]
    data = []
    labels = []
    for name, obj in models:
        labels.append(obj.get("model", name))
        row = []
        for m in metrics:
            v = metric_value(obj, "overall", None, m)
            if v is None:
                v = rank_stat_value(obj, "overall", None, m)
            row.append(0.0 if v is None else float(v))
        data.append(row)

    if not data:
        return

    arr = np.array(data)
    plt.figure(figsize=(9, 3.8))
    im = plt.imshow(arr, aspect="auto", cmap="viridis")
    plt.yticks(np.arange(len(labels)), labels)
    plt.xticks(np.arange(len(metrics)), metrics, rotation=30, ha="right")
    plt.title("Metrics heatmap (overall)")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.tight_layout()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_rank_cdf(models, out_path):
    """Cumulative curves: share of queries found within the top k, k=1..top_k.

    Uses the overall-scope "ranks" distribution of each model; models
    without one are skipped.
    """
    # Take top_k from the first report that declares it; otherwise the
    # x-axis cannot be sized.
    top_k = None
    for _, obj in models:
        if "top_k" in obj:
            top_k = int(obj["top_k"])
            break
    if top_k is None:
        return

    ks = np.arange(1, top_k + 1)
    plt.figure(figsize=(8.5, 4.5))
    for name, obj in models:
        ranks = distribution_value(obj, "overall", None, "ranks")
        if not ranks:
            continue
        total = max(1, len(ranks))
        ys = []
        for k in ks:
            # Ranks are treated as 0-based, so r < k means "within top k".
            found = sum(1 for r in ranks if r is not None and r >= 0 and r < k)
            ys.append(found / total)
        plt.plot(ks, ys, marker="o", label=obj.get("model", name))

    plt.title("Rank CDF (overall)")
    plt.xlabel("k")
    plt.ylabel("share of queries with rank ≤ k")
    plt.ylim(0, 1.0)
    plt.grid(alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def save_confidence_scatter(models, out_path):
    """Scatter of mean top-1 similarity vs MRR@10 (overall), one point per model.

    Models missing either statistic are skipped; nothing is written when no
    model qualifies.
    """
    xs = []
    ys = []
    labels = []
    for name, obj in models:
        mrr = metric_value(obj, "overall", None, "mrr@10")
        top1 = score_stat_value(obj, "overall", None, "top1_score", "mean")
        if mrr is None or top1 is None:
            continue
        xs.append(float(top1))
        ys.append(float(mrr))
        labels.append(obj.get("model", name))

    if not xs:
        return

    plt.figure(figsize=(6.5, 4.5))
    plt.scatter(xs, ys, s=60)
    for x, y, label in zip(xs, ys, labels):
        # Leading space nudges the annotation off the marker.
        plt.text(x, y, f" {label}", fontsize=9, ha="left", va="center")
    plt.title("Top-1 confidence vs MRR@10 (overall)")
    plt.xlabel("mean top-1 score")
    plt.ylabel("mrr@10")
    plt.grid(alpha=0.2)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def model_label_key(obj, name):
    """Normalize a model identifier to a short key for filenames/lookups.

    Matches known substrings of the report's "model" field (falling back to
    *name*) in priority order: "labse", then "finetuned"/"artifacts", then
    any mpnet variant.  Unknown models map to the lower-cased *name*.

    Returns one of "labse", "finetuned", "base", or ``name.lower()``.
    """
    s = str(obj.get("model", name)).lower()
    if "labse" in s:
        return "labse"
    if "finetuned" in s or "artifacts" in s:
        return "finetuned"
    # The full HF id "paraphrase-multilingual-mpnet-base-v2" contains
    # "mpnet", so a single substring check covers both it and any other
    # mpnet variant (the original had two redundant branches).
    if "mpnet" in s:
        return "base"
    return name.lower()
|
|
|
|
|
|
|
|
def select_model(models, key):
    """Return the first (name, obj) pair whose normalized label is *key*, else None."""
    matches = (pair for pair in models if model_label_key(pair[1], pair[0]) == key)
    return next(matches, None)
|
|
|
|
|
|
|
|
def save_relative_improvement_plot(models, scope, lang, out_path):
    """Bar chart of the finetuned model's % gain over the base model per metric.

    Requires both a "finetuned" and a "base" model (per model_label_key);
    otherwise nothing is written.  Metrics where the base value is <= 0 are
    plotted as NaN to avoid division by zero.
    """
    fin = select_model(models, "finetuned")
    base = select_model(models, "base")
    if fin is None or base is None:
        return

    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10"]
    labels = ["R@1", "R@3", "R@5", "R@10", "MRR@10", "nDCG@10"]

    fin_obj = fin[1]
    base_obj = base[1]

    vals = []
    for m in metrics:
        fv = metric_value(fin_obj, scope, lang, m)
        bv = metric_value(base_obj, scope, lang, m)
        fv = 0.0 if fv is None else float(fv)
        bv = 0.0 if bv is None else float(bv)
        if bv <= 0:
            vals.append(np.nan)
        else:
            # Relative gain in percent: (finetuned - base) / base.
            vals.append((fv - bv) / bv * 100.0)

    x = np.arange(len(metrics))
    plt.figure()
    plt.bar(x, vals)
    plt.xticks(x, labels)
    title = "Relative improvement vs base (%)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("%")
    # Zero line marks "no change vs base".
    plt.axhline(0.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
|
|
|
|
|
|
|
|
def main():
    """Generate all comparison figures from artifacts/reports/eval_*.json.

    Writes PNGs under artifacts/reports/figures plus a figures_summary.json
    manifest listing the loaded reports and produced files.  Exits with an
    error when no parseable reports are found.
    """
    reports_dir = Path("artifacts/reports")
    files = sorted([str(p) for p in reports_dir.glob("eval_*.json")])
    models = pick_models(files)

    if not models:
        raise SystemExit("No eval_*.json found in artifacts/reports")

    fig_dir = reports_dir / "figures"
    fig_dir.mkdir(parents=True, exist_ok=True)

    # Overall-scope figures.
    save_recall_plot(models, "overall", None, fig_dir / "recall_overall.png")
    save_rank_metrics_plot(models, "overall", None, fig_dir / "rank_metrics_overall.png")
    save_recall_curve_plot(models, "overall", None, fig_dir / "recall_curve_overall.png")
    save_relative_improvement_plot(models, "overall", None, fig_dir / "relative_improvement_overall.png")
    save_precision_plot(models, "overall", None, fig_dir / "precision_overall.png")
    save_rank_stats_plot(models, "overall", None, fig_dir / "rank_stats_overall.png")
    save_rank_distribution_plot(
        models,
        "overall",
        None,
        fig_dir / "rank_distribution_overall.png",
        None,
    )
    save_margin_boxplot(models, "overall", None, fig_dir / "score_margin_overall.png")

    # Cross-scope and per-model figures.
    save_top1_score_hist(models, "overall", None, fig_dir)
    save_coverage_grouped(models, fig_dir / "coverage_grouped.png")
    save_not_found_grouped(models, fig_dir / "not_found_grouped.png")
    save_metrics_heatmap(models, fig_dir / "metrics_heatmap_overall.png")
    save_rank_cdf(models, fig_dir / "rank_cdf_overall.png")
    save_confidence_scatter(models, fig_dir / "confidence_scatter_overall.png")

    # Per-language variants of the scope-aware figures.
    for lang in ["ru", "kz"]:
        save_recall_plot(models, "by_lang", lang, fig_dir / f"recall_{lang}.png")
        save_rank_metrics_plot(
            models, "by_lang", lang, fig_dir / f"rank_metrics_{lang}.png"
        )
        save_recall_curve_plot(
            models, "by_lang", lang, fig_dir / f"recall_curve_{lang}.png"
        )
        save_relative_improvement_plot(
            models, "by_lang", lang, fig_dir / f"relative_improvement_{lang}.png"
        )
        save_precision_plot(models, "by_lang", lang, fig_dir / f"precision_{lang}.png")
        save_rank_stats_plot(models, "by_lang", lang, fig_dir / f"rank_stats_{lang}.png")
        save_rank_distribution_plot(
            models,
            "by_lang",
            lang,
            fig_dir / f"rank_distribution_{lang}.png",
            None,
        )

    # Manifest of inputs and outputs for downstream tooling.
    summary = {
        "loaded_reports": [Path(f).name for f in files],
        "figures": [p.name for p in sorted(fig_dir.glob("*.png"))],
    }
    (reports_dir / "figures_summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: regenerate all figures from the saved reports.
    main()
|
|
|