Spaces:

representational-alignment
/

iclr2026-realign-challenge

Running

App Files Files Community

iclr2026-realign-challenge / src /hackathon /scoring.py

siddsuresh97

Thread hf_link to Modal and require unique submitter names

ebf9b32 about 2 months ago

raw

history blame contribute delete

4.84 kB

	from __future__ import annotations

	from itertools import combinations
	from typing import Iterable

	import numpy as np

	from src.cka.compute import linear_cka
	from src.hackathon.data import get_dummy_model_embeddings, list_dummy_stimuli, resolve_stimulus_indices
	from src.hackathon.modal_client import (
	is_modal_enabled,
	score_blue_with_pairwise as modal_score_blue_with_pairwise,
	score_red_with_pairwise as modal_score_red_with_pairwise,
	)


	def _validate_models(model_names: Iterable[str], embeddings_by_model: dict[str, np.ndarray]) -> list[str]:
	names = [name.strip() for name in model_names if name.strip()]
	if len(names) < 2:
	raise ValueError("Select at least two models.")
	if len(names) != len(set(names)):
	raise ValueError("Model selections must be unique.")

	missing = [name for name in names if name not in embeddings_by_model]
	if missing:
	missing_str = ", ".join(missing)
	raise ValueError(f"Unknown models requested: {missing_str}")

	return names


	def _format_score(score: float) -> float:
	return round(float(score), 4)


	def _pairwise_scores(
	model_names: Iterable[str],
	embeddings_by_model: dict[str, np.ndarray],
	) -> tuple[float, list[dict[str, float \| str]]]:
	scores = []
	pairwise: list[dict[str, float \| str]] = []
	for model_a, model_b in combinations(model_names, 2):
	score = linear_cka(embeddings_by_model[model_a], embeddings_by_model[model_b])
	scores.append(score)
	pairwise.append(
	{
	"Model A": model_a,
	"Model B": model_b,
	"CKA": _format_score(score),
	}
	)

	if not scores:
	return 0.0, []
	return float(np.mean(scores)), pairwise


	def score_blue_with_pairwise(
	model_names: Iterable[str],
	*,
	embeddings_by_model: dict[str, np.ndarray] \| None = None,
	submission_id: str \| None = None,
	submitter: str \| None = None,
	hf_link: str \| None = None,
	) -> tuple[float, list[dict[str, float \| str]]]:
	if embeddings_by_model is None and is_modal_enabled():
	avg_cka, pairwise = modal_score_blue_with_pairwise(
	model_names, submission_id=submission_id, submitter=submitter, hf_link=hf_link,
	)
	formatted = [
	{"Model A": item["model_a"], "Model B": item["model_b"], "CKA": _format_score(item["cka"])}
	for item in pairwise
	]
	return float(avg_cka), formatted

	if embeddings_by_model is None:
	embeddings_by_model = get_dummy_model_embeddings()

	model_names = _validate_models(model_names, embeddings_by_model)
	avg_cka, pairwise = _pairwise_scores(model_names, embeddings_by_model)
	return float(avg_cka), pairwise


	def score_blue(
	model_names: Iterable[str],
	*,
	embeddings_by_model: dict[str, np.ndarray] \| None = None,
	) -> float:
	avg_cka, _ = score_blue_with_pairwise(model_names, embeddings_by_model=embeddings_by_model)
	return float(avg_cka)


	def score_red_with_pairwise(
	selected_stimuli: Iterable[dict[str, str] \| str],
	*,
	embeddings_by_model: dict[str, np.ndarray] \| None = None,
	stimuli_catalog: Iterable[dict[str, str]] \| None = None,
	submission_id: str \| None = None,
	submitter: str \| None = None,
	hf_link: str \| None = None,
	) -> tuple[float, list[dict[str, float \| str]]]:
	if embeddings_by_model is None and is_modal_enabled():
	score, pairwise = modal_score_red_with_pairwise(
	selected_stimuli, submission_id=submission_id, submitter=submitter, hf_link=hf_link,
	)
	formatted = [
	{"Model A": item["model_a"], "Model B": item["model_b"], "CKA": _format_score(item["cka"])}
	for item in pairwise
	]
	return float(score), formatted

	if embeddings_by_model is None:
	embeddings_by_model = get_dummy_model_embeddings()
	if stimuli_catalog is None:
	stimuli_catalog = list_dummy_stimuli()

	model_names = _validate_models(embeddings_by_model.keys(), embeddings_by_model)
	stimulus_indices = resolve_stimulus_indices(selected_stimuli, stimuli_catalog)
	if len(stimulus_indices) < 2:
	raise ValueError("Select at least two stimuli.")

	filtered = {name: embeddings_by_model[name][stimulus_indices] for name in model_names}
	avg_cka, pairwise = _pairwise_scores(model_names, filtered)
	return float(1.0 - avg_cka), pairwise


	def score_red(
	selected_stimuli: Iterable[dict[str, str] \| str],
	*,
	embeddings_by_model: dict[str, np.ndarray] \| None = None,
	stimuli_catalog: Iterable[dict[str, str]] \| None = None,
	) -> float:
	score, _ = score_red_with_pairwise(
	selected_stimuli,
	embeddings_by_model=embeddings_by_model,
	stimuli_catalog=stimuli_catalog,
	)
	return float(score)