leaderboard / tests /test_viewer_data.py
hotchpotch's picture
Deploy leaderboard viewer header update
b240723 verified
from __future__ import annotations
import logging
from pathlib import Path
import duckdb
import pytest
from hakari_bench.viewer.data import CURRENT_DUCKDB_SCHEMA_VERSION, TaskResultRecord, TaskResultsRepository
from hakari_bench.viewer.variant_display import VariantDisplayFlags
def test_task_results_repository_returns_pydantic_records_and_base_rows_by_default(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192, None, 768, None),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.80,
10,
12,
8192,
"quantize_uint8_docs",
768,
"uint8",
),
("model/b", "BenchA", "bench/a", "BenchA", "split", "a1", "a1", None, 20, 24, 4096, None, 512, None),
("model/c", "BenchB", "bench/b", "BenchB", "split", "b1", "b1", 0.70, 30, 36, 2048, None, 256, None),
],
)
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert records == [
TaskResultRecord(
model_name="model/a",
benchmark="BenchA",
dataset_id="bench/a",
dataset_name="BenchA",
split_name="",
task_name="a1",
task_key="a1",
score=0.90,
active_parameters=10,
total_parameters=12,
max_seq_length=8192,
embedding_variant_name=None,
embedding_dim=768,
quantization=None,
)
]
def test_task_results_repository_rejects_missing_current_viewer_task_results(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
con = duckdb.connect(str(db_path))
try:
con.execute("CREATE TABLE meta_database (schema_version VARCHAR)")
con.execute("INSERT INTO meta_database VALUES (?)", [CURRENT_DUCKDB_SCHEMA_VERSION])
con.execute("CREATE TABLE task_results (model_name VARCHAR, benchmark VARCHAR, task_key VARCHAR, score DOUBLE)")
finally:
con.close()
with pytest.raises(RuntimeError, match="viewer_task_results"):
TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
def test_task_results_repository_rejects_old_schema_version(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192),
],
schema_version="2",
)
with pytest.raises(RuntimeError, match="schema version"):
TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
def test_task_results_repository_accepts_schema_three_without_text_length_columns(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192),
],
schema_version="3",
include_text_length_columns=False,
)
rows = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert [(row.model_name, row.query_mean_chars, row.document_mean_chars) for row in rows] == [
("model/a", None, None)
]
def test_task_results_repository_can_return_lightweight_rows_for_hot_path(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192),
],
rows_override_languages=[("ja", ["ja"])],
)
rows = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert rows[0].model_name == "model/a"
assert rows[0].task_key == "a1"
assert rows[0].languages == ("ja",)
assert rows[0].prompt_summary == "model default"
assert not isinstance(rows[0], TaskResultRecord)
def test_task_results_repository_can_fetch_non_default_metric_scores(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192),
("model/b", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.80, 20, 24, 8192),
],
)
_write_metric_tables(
db_path,
[
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@1", 0.20, "a.json"),
("model/b", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@1", 0.95, "b.json"),
],
)
rows = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
score_metric="accuracy@1",
)
assert [(row.model_name, row.score) for row in rows] == [("model/a", 0.20), ("model/b", 0.95)]
def test_task_results_repository_limits_display_metric_options_to_research_focused_set(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192),
],
)
_write_metric_tables(
db_path,
[
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@1", 0.20, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@3", 0.30, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@5", 0.40, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@10", 0.50, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_precision@1", 0.20, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_precision@10", 0.50, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_recall@1", 0.20, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_recall@10", 0.50, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_mrr@10", 0.70, "a.json"),
("model/a", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_map@100", 0.80, "a.json"),
],
)
assert TaskResultsRepository(db_path).fetch_score_metric_options() == [
"ndcg@10",
"accuracy@1",
"accuracy@10",
"precision@10",
"recall@10",
"mrr@10",
"map@100",
]
def test_task_results_repository_uses_rerank_metrics_for_reranking_target(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("cross-encoder/reranker", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192),
],
)
_write_metric_tables(
db_path,
[
("cross-encoder/reranker", "BenchA", "bench/a", "a1", "a1", "BenchA_a1_cosine_accuracy@1", 0.20, "r.json"),
(
"cross-encoder/reranker",
"BenchA",
"bench/a",
"a1",
"a1",
"BenchA_a1_cosine_bm25_top100_rerank_accuracy@1",
0.85,
"r.json",
),
],
score_target="reranking",
)
rows = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
score_target="reranking",
score_metric="accuracy@1",
)
assert [(row.model_name, row.score) for row in rows] == [("cross-encoder/reranker", 0.85)]
def test_task_results_repository_can_fetch_embedding_variant_rows_when_requested(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192, None, 768, None),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.80,
10,
12,
8192,
"quantize_uint8_docs",
768,
"uint8",
),
],
)
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=True,
)
assert [(record.embedding_variant_name, record.quantization) for record in records] == [
(None, None),
("quantize_uint8_docs", "uint8"),
]
def test_task_results_repository_pushes_variant_display_flags_into_sql(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192, None, 768, None),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.80,
10,
12,
8192,
"quantize_uint8_docs",
768,
"uint8",
),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.83,
10,
12,
8192,
"truncate_dim_384",
384,
None,
),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.82,
10,
12,
8192,
"truncate_dim_256_quantize_int8_docs",
256,
"int8",
),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.81,
10,
12,
8192,
"binary_rescore",
768,
"binary",
),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.75,
10,
12,
8192,
"custom_variant",
2048,
None,
),
],
)
quantization_records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=True,
variant_display_flags=VariantDisplayFlags(quantization=True),
)
cross_variant_records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=True,
variant_display_flags=VariantDisplayFlags(quantization=True, truncate=True),
)
rescore_records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=True,
variant_display_flags=VariantDisplayFlags(rescore=True),
)
other_records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=True,
variant_display_flags=VariantDisplayFlags(other=True),
)
assert [(record.embedding_variant_name, record.quantization) for record in quantization_records] == [
(None, None),
("quantize_uint8_docs", "uint8"),
]
assert [(record.embedding_variant_name, record.quantization) for record in cross_variant_records] == [
(None, None),
("quantize_uint8_docs", "uint8"),
("truncate_dim_384", None),
("truncate_dim_256_quantize_int8_docs", "int8"),
]
assert [record.embedding_variant_name for record in rescore_records] == [None, "binary_rescore"]
assert [record.embedding_variant_name for record in other_records] == [None, "custom_variant"]
def test_task_results_repository_logs_duckdb_query_and_record_transform_timing(
tmp_path: Path,
caplog: pytest.LogCaptureFixture,
) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "a1", 0.90, 10, 12, 8192, None, 768, None),
],
)
with caplog.at_level(logging.INFO, logger="hakari_bench.viewer"):
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert len(records) == 1
messages = [record.getMessage() for record in caplog.records]
assert any(
"viewer.duckdb.query" in message
and "operation=fetch_task_results" in message
and "row_count=1" in message
for message in messages
)
assert any(
"viewer.transform" in message
and "operation=fetch_task_results.records" in message
and "deduped_row_count=1" in message
for message in messages
)
def test_task_results_repository_reads_runtime_option_columns_when_present(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
(
"model/e5",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.90,
10,
12,
8192,
"bf16",
"flash_attention_2",
"query: ",
"passage: ",
None,
None,
None,
None,
False,
),
(
"model/gemma",
"BenchA",
"bench/a",
"BenchA",
None,
"a1",
"a1",
0.80,
20,
24,
2048,
"bf16",
"sdpa",
None,
None,
"query",
"document",
None,
None,
True,
),
],
)
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert records[0].dtype == "bf16"
assert records[0].attn_implementation == "flash_attention_2"
assert records[0].query_prompt == "query: "
assert records[0].document_prompt == "passage: "
assert records[0].prompt_summary == "explicit prefixes"
assert records[0].trust_remote_code is False
assert records[1].attn_implementation == "sdpa"
assert records[1].query_prompt_name == "query"
assert records[1].document_prompt_name == "document"
assert records[1].prompt_summary == "prompt names"
assert records[1].trust_remote_code is True
def test_task_results_repository_reads_dataset_languages_when_present(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "task-ja", 0.90, 10, 12, 8192),
("model/a", "BenchA", "bench/a", "BenchA", None, "a2", "task-multi", 0.80, 10, 12, 8192),
],
rows_override_languages=[
("ja", ["ja"]),
("multilingual", ["en", "ja"]),
],
)
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert [(record.task_key, record.language, record.languages) for record in records] == [
("task-ja", "ja", ["ja"]),
("task-multi", "multilingual", ["en", "ja"]),
]
def test_task_results_repository_reads_materialized_viewer_task_results(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "task-ja", 0.90, 10, 12, 8192),
],
rows_override_languages=[("viewer-ja", ["viewer-ja"])],
)
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert [(record.task_key, record.language, record.languages) for record in records] == [
("task-ja", "viewer-ja", ["viewer-ja"])
]
def test_task_results_repository_reads_optional_model_type_column(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/sparse", "BenchA", "bench/a", "BenchA", None, "a1", "task-ja", 0.90, 10, 12, 8192),
],
include_model_type_column=True,
row_model_types=["sparse"],
)
records = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert [(record.model_name, record.model_type) for record in records] == [("model/sparse", "sparse")]
def test_task_results_repository_reads_task_text_length_metadata(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
_write_viewer_task_results(
db_path,
[
("model/a", "BenchA", "bench/a", "BenchA", None, "a1", "task-ja", 0.90, 10, 12, 8192),
],
rows_override_text_lengths=[(512.5, 1536.0)],
)
records = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert records[0].query_mean_chars == pytest.approx(512.5)
assert records[0].document_mean_chars == pytest.approx(1536.0)
def test_task_results_repository_falls_back_to_dataset_metadata_for_text_lengths(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
con = duckdb.connect(str(db_path))
try:
con.execute("CREATE TABLE meta_database (schema_version VARCHAR)")
con.execute("INSERT INTO meta_database VALUES (?)", [CURRENT_DUCKDB_SCHEMA_VERSION])
columns = (
"model_name VARCHAR",
"benchmark VARCHAR",
"dataset_id VARCHAR",
"dataset_name VARCHAR",
"split_name VARCHAR",
"task_name VARCHAR",
"task_key VARCHAR",
"score_target VARCHAR",
"score DOUBLE",
"language VARCHAR",
"languages VARCHAR[]",
"active_parameters BIGINT",
"total_parameters BIGINT",
"max_seq_length INTEGER",
)
con.execute(f"CREATE TABLE viewer_task_results ({', '.join(columns)})")
con.execute(
f"INSERT INTO viewer_task_results VALUES ({', '.join('?' for _ in columns)})",
("model/a", "BenchA", "bench/a", "BenchA", "", "a1", "task-ja", "all", 0.90, "ja", ["ja"], 10, 12, 8192),
)
con.execute(
"""
CREATE TABLE dataset_metadata (
benchmark VARCHAR,
dataset_id VARCHAR,
task_key VARCHAR,
query_mean_chars DOUBLE,
document_mean_chars DOUBLE
)
"""
)
con.execute("INSERT INTO dataset_metadata VALUES (?, ?, ?, ?, ?)", ("BenchA", "bench/a", "task-ja", 640.0, 2048.0))
finally:
con.close()
records = TaskResultsRepository(db_path).fetch_task_result_rows(
benchmarks=["BenchA"],
include_embedding_variants=False,
)
assert records[0].query_mean_chars == pytest.approx(640.0)
assert records[0].document_mean_chars == pytest.approx(2048.0)
def test_task_results_repository_reads_score_target_from_materialized_viewer_table(tmp_path: Path) -> None:
db_path = tmp_path / "results.duckdb"
con = duckdb.connect(str(db_path))
try:
con.execute("CREATE TABLE meta_database (schema_version VARCHAR)")
con.execute("INSERT INTO meta_database VALUES (?)", [CURRENT_DUCKDB_SCHEMA_VERSION])
con.execute(
"""
CREATE TABLE viewer_task_results (
model_name VARCHAR,
benchmark VARCHAR,
dataset_id VARCHAR,
dataset_name VARCHAR,
split_name VARCHAR,
task_name VARCHAR,
task_key VARCHAR,
score_target VARCHAR,
score DOUBLE,
language VARCHAR,
languages VARCHAR[],
active_parameters BIGINT,
total_parameters BIGINT,
max_seq_length INTEGER,
dtype VARCHAR,
attn_implementation VARCHAR,
query_prompt VARCHAR,
document_prompt VARCHAR,
query_prompt_name VARCHAR,
document_prompt_name VARCHAR,
query_encode_task VARCHAR,
document_encode_task VARCHAR,
trust_remote_code BOOLEAN,
embedding_variant_name VARCHAR,
embedding_dim INTEGER,
quantization VARCHAR
)
"""
)
con.executemany(
"INSERT INTO viewer_task_results VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
[
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
"",
"a1",
"task-a1",
"all",
0.80,
"ja",
["ja"],
10,
12,
8192,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
768,
None,
),
(
"model/a",
"BenchA",
"bench/a",
"BenchA",
"",
"a1",
"task-a1",
"reranking",
0.92,
"ja",
["ja"],
10,
12,
8192,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
768,
None,
),
],
)
finally:
con.close()
records = TaskResultsRepository(db_path).fetch_task_results(
benchmarks=["BenchA"],
score_target="reranking",
include_embedding_variants=False,
)
assert [(record.task_key, record.score) for record in records] == [("task-a1", 0.92)]
def _write_viewer_task_results(
db_path: Path,
rows: list[tuple],
*,
rows_override_languages: list[tuple[str, list[str]]] | None = None,
rows_override_text_lengths: list[tuple[float | None, float | None]] | None = None,
schema_version: str = CURRENT_DUCKDB_SCHEMA_VERSION,
include_text_length_columns: bool = True,
include_model_type_column: bool = False,
row_model_types: list[str | None] | None = None,
) -> None:
con = duckdb.connect(str(db_path))
try:
con.execute("CREATE TABLE meta_database (schema_version VARCHAR)")
con.execute("INSERT INTO meta_database VALUES (?)", [schema_version])
columns = [
"model_name VARCHAR",
"benchmark VARCHAR",
"dataset_id VARCHAR",
"dataset_name VARCHAR",
"split_name VARCHAR",
"task_name VARCHAR",
"task_key VARCHAR",
"score_target VARCHAR",
"score DOUBLE",
"language VARCHAR",
"languages VARCHAR[]",
"active_parameters BIGINT",
"total_parameters BIGINT",
"max_seq_length INTEGER",
"dtype VARCHAR",
"attn_implementation VARCHAR",
"query_prompt VARCHAR",
"document_prompt VARCHAR",
"query_prompt_name VARCHAR",
"document_prompt_name VARCHAR",
"query_encode_task VARCHAR",
"document_encode_task VARCHAR",
"trust_remote_code BOOLEAN",
"embedding_variant_name VARCHAR",
"embedding_dim INTEGER",
"quantization VARCHAR",
]
if include_model_type_column:
columns.insert(1, "model_type VARCHAR")
if include_text_length_columns:
columns.extend(
[
"query_mean_chars DOUBLE",
"document_mean_chars DOUBLE",
]
)
con.execute(f"CREATE TABLE viewer_task_results ({', '.join(columns)})")
normalized_rows = []
for index, row in enumerate(rows):
language, languages = (
rows_override_languages[index]
if rows_override_languages is not None
else (None, [])
)
query_mean_chars, document_mean_chars = (
rows_override_text_lengths[index]
if rows_override_text_lengths is not None
else (None, None)
)
normalized_row = _viewer_task_result_row(
row,
language=language,
languages=languages,
query_mean_chars=query_mean_chars,
document_mean_chars=document_mean_chars,
)
if not include_text_length_columns:
normalized_row = normalized_row[:-2]
if include_model_type_column:
model_type = row_model_types[index] if row_model_types is not None else None
normalized_row = (normalized_row[0], model_type, *normalized_row[1:])
normalized_rows.append(normalized_row)
placeholders = ", ".join("?" for _ in columns)
con.executemany(f"INSERT INTO viewer_task_results VALUES ({placeholders})", normalized_rows)
finally:
con.close()
def _write_metric_tables(db_path: Path, rows: list[tuple], *, score_target: str = "all") -> None:
con = duckdb.connect(str(db_path))
try:
con.execute(
"""
CREATE TABLE dim_metric (
metric_id BIGINT,
metric_name VARCHAR,
metric_family VARCHAR,
cutoff INTEGER
)
"""
)
metric_names = sorted({row[5] for row in rows})
for metric_id, metric_name in enumerate(metric_names, start=1):
family, cutoff = metric_name.rsplit("@", 1)
family = family.rsplit("_", 1)[-1]
con.execute(
"INSERT INTO dim_metric VALUES (?, ?, ?, ?)",
[metric_id, metric_name, family, int(cutoff)],
)
con.execute(
"""
CREATE TABLE fact_metric_score (
metric_id BIGINT,
model_dir VARCHAR,
model_name VARCHAR,
benchmark VARCHAR,
dataset_id VARCHAR,
task_name VARCHAR,
metric_value DOUBLE,
result_path VARCHAR
)
"""
)
con.execute(
"""
CREATE TABLE fact_task_score (
model_dir VARCHAR,
model_name VARCHAR,
benchmark VARCHAR,
dataset_id VARCHAR,
dataset_name VARCHAR,
split_name VARCHAR,
task_name VARCHAR,
task_key VARCHAR,
score_target VARCHAR,
score DOUBLE,
active_parameters BIGINT,
total_parameters BIGINT,
max_seq_length INTEGER,
dtype VARCHAR,
attn_implementation VARCHAR,
query_prompt VARCHAR,
document_prompt VARCHAR,
query_prompt_name VARCHAR,
document_prompt_name VARCHAR,
query_encode_task VARCHAR,
document_encode_task VARCHAR,
trust_remote_code BOOLEAN,
embedding_variant_name VARCHAR,
embedding_dim INTEGER,
quantization VARCHAR,
result_path VARCHAR
)
"""
)
metric_id_by_name = {name: index for index, name in enumerate(metric_names, start=1)}
for model_name, benchmark, dataset_id, task_name, task_key, metric_name, metric_value, result_path in rows:
con.execute(
"INSERT INTO fact_metric_score VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
[
metric_id_by_name[metric_name],
model_name.replace("/", "__"),
model_name,
benchmark,
dataset_id,
task_name,
metric_value,
result_path,
],
)
con.execute(
"INSERT INTO fact_task_score VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
[
model_name.replace("/", "__"),
model_name,
benchmark,
dataset_id,
benchmark,
"",
task_name,
task_key,
score_target,
0.0,
10,
12,
8192,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
result_path,
],
)
finally:
con.close()
def _viewer_task_result_row(
row: tuple,
*,
language: str | None,
languages: list[str],
query_mean_chars: float | None = None,
document_mean_chars: float | None = None,
) -> tuple:
base = row[:11]
remaining = row[11:]
dtype = None
attn_implementation = None
query_prompt = None
document_prompt = None
query_prompt_name = None
document_prompt_name = None
query_encode_task = None
document_encode_task = None
trust_remote_code = None
embedding_variant_name = None
embedding_dim = None
quantization = None
if len(remaining) == 3:
embedding_variant_name, embedding_dim, quantization = remaining
elif len(remaining) == 9:
(
dtype,
attn_implementation,
query_prompt,
document_prompt,
query_prompt_name,
document_prompt_name,
query_encode_task,
document_encode_task,
trust_remote_code,
) = remaining
elif len(remaining) == 0:
pass
else:
raise AssertionError(f"Unexpected row shape: {row!r}")
return (
*base[:7],
"all",
base[7],
language,
languages,
*base[8:11],
dtype,
attn_implementation,
query_prompt,
document_prompt,
query_prompt_name,
document_prompt_name,
query_encode_task,
document_encode_task,
trust_remote_code,
embedding_variant_name,
embedding_dim,
quantization,
query_mean_chars,
document_mean_chars,
)