# evalica/app.py — Gradio demo application for the Evalica library.
#!/usr/bin/env python3
from __future__ import annotations
# Copyright 2023 Dmitry Ustalov
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = "Dmitry Ustalov"
__license__ = "Apache 2.0"
from typing import Protocol, cast
import evalica
import gradio as gr
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats
from evalica import AlphaBootstrapResult, Winner, alpha_bootstrap
from plotly.graph_objects import Figure
# Convergence settings shared by the iterative algorithms
# (Bradley-Terry, eigenvector, PageRank, Newman).
TOLERANCE, LIMIT = 1e-6, 100

# Markdown footer with project links, appended to both interfaces' articles.
MORE_EVALICA = """
**More Evalica:**
- Paper: [2025.coling-demos.6](https://aclanthology.org/2025.coling-demos.6/) ([arXiv](https://arxiv.org/abs/2412.11314))
- GitHub: <https://github.com/dustalov/evalica>
- PyPI: <https://pypi.org/project/evalica/>
- conda-forge: <https://anaconda.org/conda-forge/evalica>
- crates.io: <https://crates.io/crates/evalica>
- LLMFAO: <https://evalovernite.substack.com/p/llmfao-human-ranking>
""".strip()
def visualize(df_pairwise: pd.DataFrame) -> Figure:
    """Render the pairwise win-fraction matrix as an annotated heatmap.

    Rows are winners, columns are losers; cell values show the fraction of wins.
    """
    heatmap = px.imshow(df_pairwise, color_continuous_scale="RdBu", text_auto=".2f")
    # Put the loser axis on top so the matrix reads like a table header.
    heatmap.update_layout(xaxis_title="Loser", yaxis_title="Winner", xaxis_side="top")
    heatmap.update_traces(hovertemplate="Winner: %{y}<br>Loser: %{x}<br>Fraction of Wins: %{z}<extra></extra>")
    return heatmap
def counting(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Score every item by its raw number of wins via Evalica's counting method."""
    return evalica.counting(xs, ys, ws, index=index).scores
def average_win_rate(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Score every item via Evalica's average-win-rate implementation."""
    return evalica.average_win_rate(xs, ys, ws, index=index).scores
def bradley_terry(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Fit Bradley-Terry scores iteratively with the module-wide TOLERANCE/LIMIT settings."""
    return evalica.bradley_terry(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
def elo(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Compute Elo ratings via Evalica with its default parameters."""
    return evalica.elo(xs, ys, ws, index=index).scores
def eigen(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Compute eigenvector-based scores iteratively with the module-wide TOLERANCE/LIMIT settings."""
    return evalica.eigen(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
def pagerank(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Compute PageRank scores iteratively with the module-wide TOLERANCE/LIMIT settings."""
    return evalica.pagerank(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
def newman(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Compute Newman (2023) scores iteratively with the module-wide TOLERANCE/LIMIT settings."""
    return evalica.newman(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
class CallableAlgorithm(Protocol):
    """Structural type for the ranking functions above: (xs, ys, ws, index) -> scores."""

    def __call__(
        self,
        xs: pd.Series[str],
        ys: pd.Series[str],
        ws: pd.Series[Winner],
        index: pd.Index,
    ) -> pd.Series[float]: ...
# Maps each UI dropdown label to the scoring function it runs.
ALGORITHMS: dict[str, CallableAlgorithm] = {
    "Counting": counting,
    "Average Win Rate": average_win_rate,
    "Bradley-Terry (1952)": bradley_terry,
    "Elo (1960)": elo,
    "Eigenvector (1987)": eigen,
    "PageRank (1998)": pagerank,
    "Newman (2023)": newman,
}
def largest_strongly_connected_component(df_pairs: pd.DataFrame) -> set[str]:
    """Return the items of the largest strongly-connected component of the comparison graph.

    Every pair contributes a left -> right edge; draws additionally contribute a
    right -> left edge, making tied comparisons bidirectional in the combined graph.
    """
    forward = nx.from_pandas_edgelist(df_pairs, source="left", target="right", create_using=nx.DiGraph)
    backward = nx.from_pandas_edgelist(
        df_pairs[df_pairs["winner"] == Winner.Draw],
        source="right",
        target="left",
        create_using=nx.DiGraph,
    )
    combined = nx.compose(forward, backward)
    biggest = max(nx.strongly_connected_components(combined), key=len)
    return cast("set[str]", biggest)
def estimate(
    df_pairs: pd.DataFrame,
    algorithm: CallableAlgorithm,
    index: pd.Index,
) -> pd.DataFrame:
    """Run *algorithm* once over the comparisons and return a one-column 'score' frame.

    The resulting DataFrame is indexed by item (index name 'item').
    """
    scores = algorithm(df_pairs["left"], df_pairs["right"], df_pairs["winner"], index)
    ranking = pd.DataFrame({"score": scores}, index=index)
    ranking.index.name = "item"
    return ranking
def pairwise_bootstrap(
df_pairs: pd.DataFrame,
algorithm: CallableAlgorithm,
index: pd.Index,
rounds: int,
) -> pd.DataFrame:
left = df_pairs["left"].to_numpy()
right = df_pairs["right"].to_numpy()
winner = df_pairs["winner"].to_numpy()
def statistic(xs: np.ndarray, ys: np.ndarray, ws: np.ndarray) -> np.ndarray:
scores = algorithm(pd.Series(xs), pd.Series(ys), pd.Series(ws), index)
return scores.to_numpy()
result = scipy.stats.bootstrap(
(left, right, winner),
statistic,
n_resamples=rounds,
paired=True,
method="percentile",
random_state=0,
)
ratings = pd.Series(
np.median(result.bootstrap_distribution, axis=1),
index=index,
)
ci = pd.Series(
list(zip(result.confidence_interval.low, result.confidence_interval.high, strict=True)),
index=index,
)
df_result = pd.DataFrame({"score": ratings, "ci": ci})
df_result.index.name = "item"
return df_result
def handler(
    file: str | None,
    algorithm: str,
    filtered: bool,
    truncated: bool,
    rounds: int,
) -> tuple[pd.DataFrame, Figure]:
    """Gradio callback for the pairwise-ranking tab.

    Reads a CSV of (left, right, winner) comparisons, scores the items with the
    selected algorithm (bootstrapped when rounds > 0), and returns the ranking
    table together with a pairwise win-chance heatmap.

    Raises:
        gr.Error: when no file is uploaded, the algorithm is unknown, the CSV
            cannot be parsed, required columns are missing, or winner values
            are not in {left, right, tie}.
    """
    if file is None:
        raise gr.Error("File must be uploaded")
    if algorithm not in ALGORITHMS:
        raise gr.Error(f"Unknown algorithm: {algorithm}")
    try:
        # dtype=str keeps item identifiers verbatim (no numeric coercion).
        df_pairs = pd.read_csv(file, dtype=str)
    except ValueError as e:
        raise gr.Error(f"Parsing error: {e}") from e
    if not pd.Series(["left", "right", "winner"]).isin(df_pairs.columns).all():
        raise gr.Error("Columns must exist: left, right, winner")
    if not df_pairs["winner"].str.lower().isin(pd.Series(["left", "right", "tie"])).all():
        raise gr.Error("Allowed winner values: left, right, tie")
    # Keep only the relevant columns and map winner labels onto Evalica's enum.
    df_pairs = df_pairs[["left", "right", "winner"]]
    df_pairs["winner"] = (
        df_pairs["winner"]
        .str.lower()
        .map(
            {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw},
        )
    )
    # Drop rows with missing values (e.g. empty left/right cells in the CSV).
    df_pairs = df_pairs.loc[df_pairs.notna().all(axis=1)]
    if filtered:
        # Restrict to the largest strongly-connected component, as required by
        # the iterative algorithms (see the "Largest SCC" checkbox help text).
        largest = largest_strongly_connected_component(df_pairs)
        mask = df_pairs["left"].isin(largest) & df_pairs["right"].isin(largest)
        df_pairs = df_pairs.loc[mask]
    *_, index = evalica.indexing(xs=df_pairs["left"], ys=df_pairs["right"])
    if rounds:
        df_result = pairwise_bootstrap(df_pairs, ALGORITHMS[algorithm], index, rounds)
    else:
        df_result = estimate(df_pairs, ALGORITHMS[algorithm], index)
    # Count how many comparisons each item participated in (either side).
    df_result["pairs"] = (
        pd.Series(0, dtype=int, index=index)
        .add(
            df_pairs.groupby("left")["left"].count(),
            fill_value=0,
        )
        .add(
            df_pairs.groupby("right")["right"].count(),
            fill_value=0,
        )
        .astype(int)
    )
    df_result["rank"] = df_result["score"].rank(na_option="bottom", ascending=False).astype(int)
    # Missing scores sort last: replace them with -inf before ordering.
    df_result = df_result.fillna(-np.inf)
    df_result = df_result.sort_values(by=["rank", "score"], ascending=[True, False])
    df_result = df_result.reset_index()
    if truncated:
        # Show only the top five and bottom five items, without overlap.
        df_result = pd.concat((df_result.head(5), df_result.tail(5)))
        df_result = df_result[~df_result.index.duplicated(keep="last")]
    pairwise = evalica.pairwise_scores(df_result["score"].to_numpy())
    df_pairwise = pd.DataFrame(data=pairwise, index=df_result["item"], columns=df_result["item"])
    fig = visualize(df_pairwise)
    if "ci" in df_result.columns:
        # Format the CI as offsets below/above the score, e.g. "(0.012; 0.015)".
        df_result["ci"] = df_result.apply(
            lambda row: f"({row['score'] - row['ci'][0]:.03f}; {row['ci'][1] - row['score']:.03f})",
            axis=1,
        )
    df_result["score"] = df_result["score"].apply(lambda x: f"{x:.03f}")
    return df_result, fig
def visualize_alpha_ci(bootstrap_result: AlphaBootstrapResult) -> Figure:
    """Plot the bootstrap distribution of alpha with the point estimate and CI bounds marked."""
    fig = px.histogram(
        bootstrap_result.distribution,
        nbins=50,
        labels={"value": "Alpha", "count": "Frequency"},
    )
    # One vertical marker per statistic: (x, dash style, color, label, label position).
    markers = (
        (bootstrap_result.alpha, "dash", "red", f"alpha = {bootstrap_result.alpha:.3f}", "top right"),
        (bootstrap_result.low, "dot", "blue", f"low = {bootstrap_result.low:.3f}", "top left"),
        (bootstrap_result.high, "dot", "blue", f"high = {bootstrap_result.high:.3f}", "top right"),
    )
    for x, dash, color, text, position in markers:
        fig.add_vline(
            x=x,
            line_dash=dash,
            line_color=color,
            annotation_text=text,
            annotation_position=position,
        )
    fig.update_layout(
        xaxis_title="Alpha",
        yaxis_title="Frequency",
        showlegend=False,
    )
    return fig
def alpha_handler(file: str | None, distance: str, rounds: int) -> tuple[pd.DataFrame, Figure | None]:
    """Gradio callback for the Krippendorff's alpha tab.

    Reads a header-less CSV of ratings, computes alpha with a bootstrap, and
    returns the metrics table plus (when rounds > 0) the distribution plot.
    """
    if file is None:
        raise gr.Error("File must be uploaded")
    try:
        ratings = pd.read_csv(file, header=None, dtype=str)
    except ValueError as e:
        raise gr.Error(f"Parsing error: {e}") from e
    if ratings.empty:
        raise gr.Error("The file is empty")
    try:
        # alpha_bootstrap needs at least one resample even when the plot is skipped.
        result = alpha_bootstrap(
            ratings,
            distance=distance,  # type: ignore[arg-type]
            n_resamples=rounds or 1,
        )
    except evalica.InsufficientRatingsError as e:
        raise gr.Error("Insufficient ratings: no units have at least 2 ratings") from e
    except evalica.UnknownDistanceError as e:
        raise gr.Error(f"Unknown distance: {e}") from e
    except Exception as e:
        raise gr.Error(f"Computation error: {e}") from e
    metrics = pd.DataFrame(
        {
            "Metric": ["Alpha", "Observed Disagreement", "Expected Disagreement"],
            "Value": [result.alpha, result.observed, result.expected],
        },
    )
    figure = visualize_alpha_ci(result) if rounds else None
    return metrics, figure
def alpha_interface() -> gr.Interface:
    """Build the Gradio interface for the Krippendorff's alpha tab."""
    return gr.Interface(
        fn=alpha_handler,
        inputs=[
            gr.File(
                file_types=[".csv", ".tsv"],
                label="Ratings",
            ),
            gr.Dropdown(
                choices=["nominal", "ordinal", "interval", "ratio"],
                value="nominal",
                label="Distance",
                info="Nominal for categorical, ordinal for ordered categories, interval/ratio for numeric scales",
            ),
            gr.Number(
                value=0,
                minimum=0,
                maximum=10000,
                label="Bootstrap Rounds",
                info="Number of bootstrap resamples for the confidence interval plot. Set to 0 to skip.",
            ),
        ],
        outputs=[
            gr.Dataframe(
                headers=["Metric", "Value"],
                label="Inter-Rater Reliability",
            ),
            gr.Plot(
                label="Bootstrap Distribution of Alpha",
            ),
        ],
        # Example files are expected to ship alongside this app.
        examples=[
            ["codings.csv", "ordinal", 1000],
            ["gcl.csv", "nominal", 1000],
        ],
        title="Krippendorff's Alpha",
        article=(
            f"""
This tool computes Krippendorff's alpha, an inter-rater reliability coefficient.
As an input, it expects a comma-separated (CSV) file without a header:
rows are raters (observers), columns are units (items), and cell values are ratings.
As the output, this tool provides alpha together with observed and expected disagreement.
{MORE_EVALICA}
""".strip()
        ),
        analytics_enabled=False,
        flagging_mode="never",
    )
def main() -> None:
    """Assemble the two-tab Gradio app (pairwise ranking + Krippendorff's alpha) and launch it."""
    pairwise_iface = gr.Interface(
        fn=handler,
        inputs=[
            gr.File(
                file_types=[".tsv", ".csv"],
                label="Comparisons",
            ),
            gr.Dropdown(
                choices=list(ALGORITHMS),
                value="Bradley-Terry (1952)",
                label="Algorithm",
            ),
            gr.Checkbox(
                value=False,
                label="Largest SCC",
                info="Bradley-Terry, Eigenvector, and Newman algorithms require the comparison graph "
                "to be strongly-connected. "
                "This option keeps only the largest strongly-connected component (SCC) of the input graph. "
                "Some items might be missing as a result of this filtering.",
            ),
            gr.Checkbox(
                value=False,
                label="Truncate Output",
                info="Perform the entire computation but output only five head and five tail items, avoiding overlap.",
            ),
            gr.Number(
                value=0,
                minimum=0,
                maximum=10000,
                label="Bootstrap Rounds",
                info="Number of bootstrap rounds to perform for estimating the confidence interval.",
            ),
        ],
        outputs=[
            gr.Dataframe(
                headers=["item", "score", "ci", "pairs", "rank"],
                label="Ranking",
            ),
            gr.Plot(
                label="Pairwise Chances of Winning the Comparison",
            ),
        ],
        # Example files are expected to ship alongside this app.
        examples=[
            ["food.csv", "Counting", False, False, 0],
            ["food.csv", "Bradley-Terry (1952)", False, False, 1000],
            ["food.csv", "Eigenvector (1987)", False, False, 1000],
            ["food.csv", "PageRank (1998)", False, False, 1000],
            ["food.csv", "Newman (2023)", False, False, 1000],
            ["llmfao.csv", "Average Win Rate", False, True, 100],
            ["llmfao.csv", "Bradley-Terry (1952)", False, True, 100],
            ["llmfao.csv", "Elo (1960)", False, True, 100],
        ],
        title="Pairwise Comparisons",
        article=(
            f"""
This easy-to-use tool transforms pairwise comparisons (*aka* side-by-side) to a meaningful ranking of items.
As an input, it expects a comma-separated (CSV) file with a header containing the following columns:
- `left`: the first compared item
- `right`: the second compared item
- `winner`: the label indicating the winning item
Possible values for `winner` are `left`, `right`, or `tie`. The provided examples might be a good starting point.
As the output, this tool provides a table with items, their estimated scores, and ranks.
{MORE_EVALICA}
""".strip()
        ),
        flagging_mode="never",
        analytics_enabled=False,
    )
    iface = gr.TabbedInterface(
        [pairwise_iface, alpha_interface()],
        ["Pairwise Ranking", "Krippendorff's Alpha"],
        title="Evalica",
        analytics_enabled=False,
    )
    iface.launch()
# Standard script entry point: build and launch the Gradio app.
if __name__ == "__main__":
    main()