lexir / src / plot_eval.py
irinaqqq's picture
ADDED MORE GRAPHS
c6cece9
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
def read_json(path):
    """Parse and return the JSON document stored at ``path`` (UTF-8)."""
    raw = Path(path).read_text(encoding="utf-8")
    return json.loads(raw)
def pick_models(files):
    """Load each JSON report, pairing it with its file stem.

    Files that are missing or unparseable are skipped silently so that one
    broken report does not block the remaining plots.
    """
    loaded = []
    for path in files:
        try:
            report = read_json(path)
            loaded.append((Path(path).stem, report))
        except Exception:
            continue
    return loaded
def metric_value(obj, scope, lang, metric):
    """Return a metric from the requested scope of a report, or None.

    ``scope`` is "overall" or "by_lang" (with ``lang`` selecting the
    language sub-section); any other scope yields None.
    """
    if scope == "overall":
        bucket = obj.get("overall", {})
    elif scope == "by_lang":
        bucket = obj.get("by_lang", {}).get(lang, {})
    else:
        return None
    return bucket.get(metric, None)
def section(obj, scope, lang):
    """Return the report sub-dict for ``scope`` (empty dict when unknown)."""
    if scope == "overall":
        found = obj.get("overall", {})
    elif scope == "by_lang":
        found = obj.get("by_lang", {}).get(lang, {})
    else:
        found = {}
    return found
def rank_stat_value(obj, scope, lang, key):
    """Fetch ``key`` from the scope's rank_stats block, or None if absent."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("rank_stats", {}).get(key, None)
def score_stat_value(obj, scope, lang, group, key):
    """Fetch score_stats[group][key] for the scope, or None if absent."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("score_stats", {}).get(group, {}).get(key, None)
def coverage_value(obj, scope, lang, key):
    """Fetch ``key`` from the scope's coverage block, or None if absent."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("coverage", {}).get(key, None)
def distribution_value(obj, scope, lang, key):
    """Fetch the list under distributions[key] for the scope ([] if absent)."""
    if scope == "overall":
        sec = obj.get("overall", {})
    elif scope == "by_lang":
        sec = obj.get("by_lang", {}).get(lang, {})
    else:
        sec = {}
    return sec.get("distributions", {}).get(key, [])
def save_recall_plot(models, scope, lang, out_path):
    """Write a grouped bar chart of recall@{1,3,5,10}, one bar group per k.

    models: list of (file_stem, report_dict); a report's "model" field, when
    present, is used as the legend label.  Missing metrics plot as 0.0.
    """
    ks = [1, 3, 5, 10]
    x = np.arange(len(ks))
    # Share a 0.8-wide group among all models.
    width = 0.8 / max(1, len(models))
    plt.figure()
    ymax = 0.0
    for i, (name, obj) in enumerate(models):
        vals = []
        for k in ks:
            v = metric_value(obj, scope, lang, f"recall@{k}")
            vals.append(0.0 if v is None else float(v))
        # Track the tallest bar here instead of re-querying every metric
        # in a second pass after the loop (the original recomputed them).
        ymax = max(ymax, max(vals))
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    plt.xticks(x, [f"@{k}" for k in ks])
    title = "Recall@k"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("score")
    # 20% headroom above the tallest bar, clamped to [0.05, 1.0].
    plt.ylim(0, min(1.0, max(0.05, ymax * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_metrics_plot(models, scope, lang, out_path):
    """Grouped bar chart of MRR@10 and nDCG@10 for each model."""
    metrics = ["mrr@10", "ndcg@10"]
    positions = np.arange(len(metrics))
    bar_w = 0.8 / max(1, len(models))
    plt.figure()
    peak = 0.0
    for idx, (stem, report) in enumerate(models):
        raw = [metric_value(report, scope, lang, m) for m in metrics]
        heights = [0.0 if v is None else float(v) for v in raw]
        peak = max(peak, max(heights))
        offset = (idx - (len(models) - 1) / 2) * bar_w
        plt.bar(positions + offset, heights, width=bar_w, label=report.get("model", stem))
    plt.xticks(positions, metrics)
    where = "overall" if scope == "overall" else lang
    plt.title(f"Ranking metrics ({where})")
    plt.ylabel("score")
    plt.ylim(0, min(1.0, max(0.05, peak * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_precision_plot(models, scope, lang, out_path):
    """Grouped bar chart of precision@k; skipped entirely when no report
    provides any precision values for this scope."""
    ks = [1, 3, 5, 10]
    positions = np.arange(len(ks))
    bar_w = 0.8 / max(1, len(models))
    plt.figure()
    have_values = False
    peak = 0.0
    for idx, (stem, report) in enumerate(models):
        raw = [metric_value(report, scope, lang, f"precision@{k}") for k in ks]
        if any(v is not None for v in raw):
            have_values = True
        heights = [0.0 if v is None else float(v) for v in raw]
        peak = max(peak, max(heights))
        offset = (idx - (len(models) - 1) / 2) * bar_w
        plt.bar(positions + offset, heights, width=bar_w, label=report.get("model", stem))
    if not have_values:
        # Nothing to show: discard the figure without writing a file.
        plt.close()
        return
    plt.xticks(positions, [f"@{k}" for k in ks])
    where = "overall" if scope == "overall" else lang
    plt.title(f"Precision@k (single-positive) ({where})")
    plt.ylabel("score")
    plt.ylim(0, min(1.0, max(0.05, peak * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_recall_curve_plot(models, scope, lang, out_path):
    """Line plot of recall@k as a function of k, one line per model."""
    ks = [1, 3, 5, 10]
    xs = np.array(ks, dtype=float)
    plt.figure()
    peak = 0.0
    for stem, report in models:
        raw = [metric_value(report, scope, lang, f"recall@{k}") for k in ks]
        ys = [0.0 if v is None else float(v) for v in raw]
        peak = max(peak, max(ys))
        plt.plot(xs, ys, marker="o", label=report.get("model", stem))
    plt.xticks(xs, [f"@{k}" for k in ks])
    where = "overall" if scope == "overall" else lang
    plt.title(f"Recall@k vs k ({where})")
    plt.xlabel("k")
    plt.ylabel("recall")
    plt.ylim(0, min(1.0, max(0.05, peak * 1.2)))
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_stats_plot(models, scope, lang, out_path):
    """Grouped bars of mean/median/p90 rank (1-based); skipped when no
    report carries rank_stats for this scope."""
    metrics = [("mean_rank", "Mean"), ("median_rank", "Median"), ("p90_rank", "P90")]
    positions = np.arange(len(metrics))
    bar_w = 0.8 / max(1, len(models))
    plt.figure()
    have_values = False
    for idx, (stem, report) in enumerate(models):
        raw = [rank_stat_value(report, scope, lang, key) for key, _ in metrics]
        if any(v is not None for v in raw):
            have_values = True
        # Missing stats become NaN so matplotlib leaves a gap, not a 0 bar.
        heights = [np.nan if v is None else float(v) for v in raw]
        offset = (idx - (len(models) - 1) / 2) * bar_w
        plt.bar(positions + offset, heights, width=bar_w, label=report.get("model", stem))
    if not have_values:
        plt.close()
        return
    plt.xticks(positions, [label for _, label in metrics])
    where = "overall" if scope == "overall" else lang
    plt.title(f"Rank stats (1-based) ({where})")
    plt.ylabel("rank")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_distribution_plot(models, scope, lang, out_path, not_found_out_path=None):
    """Grouped bars of where the positive document lands among top_k results.

    Only "found" queries (rank within [0, top_k)) contribute to the bucket
    shares; the rest are tallied as a per-model not-found rate, which is
    optionally plotted as a second chart to ``not_found_out_path``.
    """
    # The bucket count comes from the first report that declares top_k;
    # without it the x-axis is undefined, so nothing is drawn.
    top_k = None
    for _, obj in models:
        if "top_k" in obj:
            top_k = int(obj["top_k"])
            break
    if top_k is None:
        return
    x = np.arange(top_k)
    width = 0.8 / max(1, len(models))
    plt.figure()
    any_data = False
    not_found_rates = []
    not_found_labels = []
    for i, (name, obj) in enumerate(models):
        ranks = distribution_value(obj, scope, lang, "ranks")
        if not ranks:
            continue
        any_data = True
        buckets = [0] * top_k
        not_found = 0
        for r in ranks:
            # A None or out-of-range rank means the positive was not retrieved.
            if r is None or r < 0 or r >= top_k:
                not_found += 1
            else:
                buckets[int(r)] += 1
        total = max(1, len(ranks))  # guard against division by zero
        not_found_rates.append(not_found / total)
        not_found_labels.append(obj.get("model", name))
        found_total = total - not_found
        # Normalize by found queries only, so each model's bars sum to 1.
        if found_total <= 0:
            vals = [0.0] * top_k
        else:
            vals = [b / found_total for b in buckets]
        plt.bar(
            x + (i - (len(models) - 1) / 2) * width,
            vals,
            width=width,
            label=obj.get("model", name),
        )
    if not any_data:
        plt.close()
        return
    labels = [str(i + 1) for i in range(top_k)]  # 1-based tick labels
    plt.xticks(x, labels)
    title = "Rank distribution (found only)"
    if scope == "overall":
        plt.title(f"{title} (overall)")
    else:
        plt.title(f"{title} ({lang})")
    plt.ylabel("share of found queries")
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
    # Optional companion chart: one bar per model with its not-found share.
    if not_found_out_path and not_found_rates:
        plt.figure()
        x_nf = np.arange(len(not_found_rates))
        plt.bar(x_nf, not_found_rates)
        plt.xticks(x_nf, not_found_labels, rotation=15, ha="right")
        title = "Not found rate (NF)"
        if scope == "overall":
            plt.title(f"{title} (overall)")
        else:
            plt.title(f"{title} ({lang})")
        plt.ylabel("share of queries")
        plt.ylim(0, 1.0)
        Path(not_found_out_path).parent.mkdir(parents=True, exist_ok=True)
        plt.tight_layout()
        plt.savefig(not_found_out_path, dpi=180)
        plt.close()
def save_margin_boxplot(models, scope, lang, out_path):
    """Box plot (outliers hidden) of the top1-minus-top2 score margin."""
    data, labels = [], []
    for stem, report in models:
        margins = distribution_value(report, scope, lang, "margins")
        if margins:
            data.append(margins)
            labels.append(report.get("model", stem))
    if not data:
        return
    plt.figure(figsize=(8, 4.5))
    plt.boxplot(data, labels=labels, showfliers=False)
    where = "overall" if scope == "overall" else lang
    plt.title(f"Score margin (top1 - top2) ({where})")
    plt.ylabel("margin")
    plt.xticks(rotation=15, ha="right")
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_coverage_plot(models, scope, lang, out_path):
    """One bar per model showing its coverage_ratio; skipped when no model
    reports coverage for this scope."""
    ratios, labels = [], []
    for stem, report in models:
        v = coverage_value(report, scope, lang, "coverage_ratio")
        if v is not None:
            ratios.append(float(v))
            labels.append(report.get("model", stem))
    if not ratios:
        return
    x = np.arange(len(ratios))
    plt.figure()
    plt.bar(x, ratios)
    plt.xticks(x, labels, rotation=15, ha="right")
    where = "overall" if scope == "overall" else lang
    plt.title(f"Coverage ratio (unique docs / corpus) ({where})")
    plt.ylabel("ratio")
    plt.ylim(0, 1.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def _grouped_model_bars(models, value_fn, out_path, title, ylabel):
    """Three-bars-per-model chart comparing overall vs ru vs kz values.

    ``value_fn(report, scope, lang)`` supplies each value; None plots as 0.
    """
    labels = []
    series = {"overall": [], "ru": [], "kz": []}
    for stem, report in models:
        labels.append(report.get("model", stem))
        series["overall"].append(value_fn(report, "overall", None))
        series["ru"].append(value_fn(report, "by_lang", "ru"))
        series["kz"].append(value_fn(report, "by_lang", "kz"))
    if not labels:
        return
    x = np.arange(len(labels))
    width = 0.25
    plt.figure(figsize=(9, 4.8))
    for offset, key in zip((-width, 0.0, width), ("overall", "ru", "kz")):
        heights = [0.0 if v is None else float(v) for v in series[key]]
        plt.bar(x + offset, heights, width, label=key)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0, 1.0)
    plt.xticks(x, labels, rotation=15, ha="right")
    plt.grid(axis="y", alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_coverage_grouped(models, out_path):
    """Grouped coverage-ratio chart across overall/ru/kz scopes."""
    _grouped_model_bars(
        models,
        lambda report, scope, lang: coverage_value(report, scope, lang, "coverage_ratio"),
        out_path,
        "Coverage (overall/ru/kz)",
        "ratio",
    )
def save_not_found_grouped(models, out_path):
    """Grouped not-found-rate chart across overall/ru/kz scopes.

    Falls back to the rank_stats block when the metric is not present at
    the section's top level.
    """
    def value_fn(report, scope, lang):
        v = metric_value(report, scope, lang, "not_found_rate")
        return v if v is not None else rank_stat_value(report, scope, lang, "not_found_rate")
    _grouped_model_bars(
        models,
        value_fn,
        out_path,
        "Not found rate (overall/ru/kz)",
        "share of queries",
    )
def save_top1_score_hist(models, scope, lang, out_dir):
    """Per-model histograms of top-1 scores, split by whether the top-1 hit
    was the positive document.

    One PNG per model is written into ``out_dir``; models with neither a
    true-positive nor a false-positive score list are skipped.
    """
    for name, obj in models:
        tp = distribution_value(obj, scope, lang, "top1_scores_tp")
        fp = distribution_value(obj, scope, lang, "top1_scores_fp")
        if not tp and not fp:
            continue
        plt.figure()
        if tp:
            plt.hist(tp, bins=20, alpha=0.6, label="top-1 is positive")
        if fp:
            plt.hist(fp, bins=20, alpha=0.6, label="top-1 is not positive")
        title = "Top-1 score distribution"
        label = obj.get("model", name)
        if scope == "overall":
            plt.title(f"{title} ({label}, overall)")
        else:
            plt.title(f"{title} ({label}, {lang})")
        plt.xlabel("similarity score")
        plt.ylabel("count")
        plt.legend()
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        # File name encodes a normalized model key plus scope/lang suffix
        # (model_label_key is defined further down in this module).
        out_path = (
            Path(out_dir)
            / f"top1_score_tp_fp_{model_label_key(obj, name)}_{scope if scope else 'overall'}{'' if lang is None else '_' + lang}.png"
        )
        plt.tight_layout()
        plt.savefig(out_path, dpi=180)
        plt.close()
def save_metrics_heatmap(models, out_path):
    """Heatmap of the headline overall metrics, one row per model."""
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10", "not_found_rate"]
    labels, rows = [], []
    for stem, report in models:
        labels.append(report.get("model", stem))
        row = []
        for m in metrics:
            v = metric_value(report, "overall", None, m)
            if v is None:
                # Some reports keep not_found_rate inside rank_stats.
                v = rank_stat_value(report, "overall", None, m)
            row.append(0.0 if v is None else float(v))
        rows.append(row)
    if not rows:
        return
    grid = np.array(rows)
    plt.figure(figsize=(9, 3.8))
    image = plt.imshow(grid, aspect="auto", cmap="viridis")
    plt.yticks(np.arange(len(labels)), labels)
    plt.xticks(np.arange(len(metrics)), metrics, rotation=30, ha="right")
    plt.title("Metrics heatmap (overall)")
    plt.colorbar(image, fraction=0.046, pad=0.04)
    plt.tight_layout()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_rank_cdf(models, out_path):
    """Cumulative share of queries whose positive appears within the top k."""
    # Take top_k from the first report that declares it.
    top_k = None
    for _, report in models:
        if "top_k" in report:
            top_k = int(report["top_k"])
            break
    if top_k is None:
        return
    ks = np.arange(1, top_k + 1)
    plt.figure(figsize=(8.5, 4.5))
    for stem, report in models:
        ranks = distribution_value(report, "overall", None, "ranks")
        if not ranks:
            continue
        total = max(1, len(ranks))
        ys = [sum(1 for r in ranks if r is not None and 0 <= r < k) / total for k in ks]
        plt.plot(ks, ys, marker="o", label=report.get("model", stem))
    plt.title("Rank CDF (overall)")
    plt.xlabel("k")
    plt.ylabel("share of queries with rank ≤ k")
    plt.ylim(0, 1.0)
    plt.grid(alpha=0.2)
    plt.legend()
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def save_confidence_scatter(models, out_path):
    """Scatter of mean top-1 similarity vs MRR@10, one labelled point per model."""
    points = []
    for stem, report in models:
        mrr = metric_value(report, "overall", None, "mrr@10")
        top1 = score_stat_value(report, "overall", None, "top1_score", "mean")
        if mrr is None or top1 is None:
            continue
        points.append((float(top1), float(mrr), report.get("model", stem)))
    if not points:
        return
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    plt.figure(figsize=(6.5, 4.5))
    plt.scatter(xs, ys, s=60)
    for px, py, label in points:
        plt.text(px, py, f" {label}", fontsize=9, ha="left", va="center")
    plt.title("Top-1 confidence vs MRR@10 (overall)")
    plt.xlabel("mean top-1 score")
    plt.ylabel("mrr@10")
    plt.grid(alpha=0.2)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def model_label_key(obj, name):
    """Normalize a model identifier to a short key: labse/finetuned/base.

    Falls back to the lowercased file stem for unrecognized models.  The
    original had two branches returning "base": one matched the full
    "paraphrase-multilingual-mpnet-base-v2" name and one matched "mpnet";
    the first was redundant because the full name contains "mpnet".
    """
    s = str(obj.get("model", name)).lower()
    if "labse" in s:
        return "labse"
    if "finetuned" in s or "artifacts" in s:
        return "finetuned"
    # Covers both the bare "mpnet" tag and the full
    # "paraphrase-multilingual-mpnet-base-v2" model name.
    if "mpnet" in s:
        return "base"
    return name.lower()
def select_model(models, key):
    """Return the first (name, report) pair whose label key matches, else None."""
    matches = (pair for pair in models if model_label_key(pair[1], pair[0]) == key)
    return next(matches, None)
def save_relative_improvement_plot(models, scope, lang, out_path):
    """Bar chart of the fine-tuned model's percentage gain over the base
    model on the headline metrics; skipped unless both models are present."""
    fin = select_model(models, "finetuned")
    base = select_model(models, "base")
    if fin is None or base is None:
        return
    metrics = ["recall@1", "recall@3", "recall@5", "recall@10", "mrr@10", "ndcg@10"]
    labels = ["R@1", "R@3", "R@5", "R@10", "MRR@10", "nDCG@10"]
    deltas = []
    for m in metrics:
        fv = metric_value(fin[1], scope, lang, m)
        bv = metric_value(base[1], scope, lang, m)
        fv = float(fv) if fv is not None else 0.0
        bv = float(bv) if bv is not None else 0.0
        # A non-positive baseline makes the ratio meaningless; leave a gap.
        deltas.append(np.nan if bv <= 0 else (fv - bv) / bv * 100.0)
    x = np.arange(len(metrics))
    plt.figure()
    plt.bar(x, deltas)
    plt.xticks(x, labels)
    where = "overall" if scope == "overall" else lang
    plt.title(f"Relative improvement vs base (%) ({where})")
    plt.ylabel("%")
    plt.axhline(0.0)
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def main():
    """Load every eval_*.json report and render all comparison figures.

    Figures go to artifacts/reports/figures; a figures_summary.json listing
    the loaded reports and produced images is written alongside.
    """
    reports_dir = Path("artifacts/reports")
    files = sorted([str(p) for p in reports_dir.glob("eval_*.json")])
    models = pick_models(files)
    if not models:
        raise SystemExit("No eval_*.json found in artifacts/reports")
    fig_dir = reports_dir / "figures"
    fig_dir.mkdir(parents=True, exist_ok=True)
    # Overall-scope charts.
    save_recall_plot(models, "overall", None, fig_dir / "recall_overall.png")
    save_rank_metrics_plot(models, "overall", None, fig_dir / "rank_metrics_overall.png")
    save_recall_curve_plot(models, "overall", None, fig_dir / "recall_curve_overall.png")
    save_relative_improvement_plot(models, "overall", None, fig_dir / "relative_improvement_overall.png")
    save_precision_plot(models, "overall", None, fig_dir / "precision_overall.png")
    save_rank_stats_plot(models, "overall", None, fig_dir / "rank_stats_overall.png")
    save_rank_distribution_plot(
        models,
        "overall",
        None,
        fig_dir / "rank_distribution_overall.png",
        None,
    )
    save_margin_boxplot(models, "overall", None, fig_dir / "score_margin_overall.png")
    # Per-scope coverage plots removed in favor of grouped chart.
    save_top1_score_hist(models, "overall", None, fig_dir)
    # Cross-scope (overall/ru/kz) grouped charts and model comparisons.
    save_coverage_grouped(models, fig_dir / "coverage_grouped.png")
    save_not_found_grouped(models, fig_dir / "not_found_grouped.png")
    save_metrics_heatmap(models, fig_dir / "metrics_heatmap_overall.png")
    save_rank_cdf(models, fig_dir / "rank_cdf_overall.png")
    save_confidence_scatter(models, fig_dir / "confidence_scatter_overall.png")
    # Per-language versions of the per-scope charts.
    for lang in ["ru", "kz"]:
        save_recall_plot(models, "by_lang", lang, fig_dir / f"recall_{lang}.png")
        save_rank_metrics_plot(
            models, "by_lang", lang, fig_dir / f"rank_metrics_{lang}.png"
        )
        save_recall_curve_plot(
            models, "by_lang", lang, fig_dir / f"recall_curve_{lang}.png"
        )
        save_relative_improvement_plot(
            models, "by_lang", lang, fig_dir / f"relative_improvement_{lang}.png"
        )
        save_precision_plot(models, "by_lang", lang, fig_dir / f"precision_{lang}.png")
        save_rank_stats_plot(models, "by_lang", lang, fig_dir / f"rank_stats_{lang}.png")
        save_rank_distribution_plot(
            models,
            "by_lang",
            lang,
            fig_dir / f"rank_distribution_{lang}.png",
            None,
        )
    # Manifest of what was generated, for downstream tooling.
    summary = {
        "loaded_reports": [Path(f).name for f in files],
        "figures": [p.name for p in sorted(fig_dir.glob("*.png"))],
    }
    (reports_dir / "figures_summary.json").write_text(
        json.dumps(summary, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
# Script entry point: generate all evaluation figures.
if __name__ == "__main__":
    main()