| |
|
|
| from __future__ import annotations |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| __author__ = "Dmitry Ustalov" |
| __license__ = "Apache 2.0" |
|
|
| from typing import Protocol, cast |
|
|
| import evalica |
| import gradio as gr |
| import networkx as nx |
| import numpy as np |
| import pandas as pd |
| import plotly.express as px |
| import scipy.stats |
| from evalica import AlphaBootstrapResult, Winner, alpha_bootstrap |
| from plotly.graph_objects import Figure |
|
|
# Shared convergence settings for the iterative algorithms
# (Bradley-Terry, Eigenvector, PageRank, Newman).
TOLERANCE, LIMIT = 1e-6, 100

# Markdown footer with project links; appended to the article text of both interfaces.
MORE_EVALICA = """
**More Evalica:**

- Paper: [2025.coling-demos.6](https://aclanthology.org/2025.coling-demos.6/) ([arXiv](https://arxiv.org/abs/2412.11314))
- GitHub: <https://github.com/dustalov/evalica>
- PyPI: <https://pypi.org/project/evalica/>
- conda-forge: <https://anaconda.org/conda-forge/evalica>
- crates.io: <https://crates.io/crates/evalica>
- LLMFAO: <https://evalovernite.substack.com/p/llmfao-human-ranking>
""".strip()
|
|
|
|
def visualize(df_pairwise: pd.DataFrame) -> Figure:
    """Render the pairwise score matrix as an annotated heatmap.

    Winners run along the rows, losers along the columns; each cell shows
    the fraction of wins with a custom hover template.
    """
    figure = px.imshow(df_pairwise, color_continuous_scale="RdBu", text_auto=".2f")
    figure.update_traces(hovertemplate="Winner: %{y}<br>Loser: %{x}<br>Fraction of Wins: %{z}<extra></extra>")
    figure.update_layout(xaxis_title="Loser", yaxis_title="Winner", xaxis_side="top")
    return figure
|
|
|
|
def counting(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.counting``: run it and return only the scores series."""
    return evalica.counting(xs, ys, ws, index=index).scores
|
|
|
|
def average_win_rate(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.average_win_rate``: run it and return only the scores series."""
    return evalica.average_win_rate(xs, ys, ws, index=index).scores
|
|
|
|
def bradley_terry(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.bradley_terry`` using the module-wide convergence settings."""
    return evalica.bradley_terry(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
|
|
|
|
def elo(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.elo``: run it and return only the scores series."""
    return evalica.elo(xs, ys, ws, index=index).scores
|
|
|
|
def eigen(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.eigen`` using the module-wide convergence settings."""
    return evalica.eigen(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
|
|
|
|
def pagerank(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.pagerank`` using the module-wide convergence settings."""
    return evalica.pagerank(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
|
|
|
|
def newman(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Adapter over ``evalica.newman`` using the module-wide convergence settings."""
    return evalica.newman(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT).scores
|
|
|
|
class CallableAlgorithm(Protocol):
    """Structural type shared by all ranking adapters in this module.

    An algorithm takes the two compared-item series (``xs``, ``ys``), the
    series of outcomes (``ws``), and the item index, and returns one score
    per item of that index.
    """

    def __call__(
        self,
        xs: pd.Series[str],
        ys: pd.Series[str],
        ws: pd.Series[Winner],
        index: pd.Index,
    ) -> pd.Series[float]: ...
|
|
|
|
# Display name -> scoring routine; insertion order defines the dropdown order
# in the pairwise-comparison interface.
ALGORITHMS: dict[str, CallableAlgorithm] = {
    "Counting": counting,
    "Average Win Rate": average_win_rate,
    "Bradley-Terry (1952)": bradley_terry,
    "Elo (1960)": elo,
    "Eigenvector (1987)": eigen,
    "PageRank (1998)": pagerank,
    "Newman (2023)": newman,
}
|
|
|
|
def largest_strongly_connected_component(df_pairs: pd.DataFrame) -> set[str]:
    """Return the item set of the largest strongly-connected component.

    Builds a directed graph with an edge from every ``left`` item to its
    ``right`` counterpart, then adds reversed edges for the draws so that a
    tie connects both directions, and finally picks the biggest SCC.
    """
    wins = nx.from_pandas_edgelist(df_pairs, source="left", target="right", create_using=nx.DiGraph)
    draws_reversed = nx.from_pandas_edgelist(
        df_pairs[df_pairs["winner"] == Winner.Draw],
        source="right",
        target="left",
        create_using=nx.DiGraph,
    )
    combined = nx.compose(wins, draws_reversed)
    return cast("set[str]", max(nx.strongly_connected_components(combined), key=len))
|
|
|
|
def estimate(
    df_pairs: pd.DataFrame,
    algorithm: CallableAlgorithm,
    index: pd.Index,
) -> pd.DataFrame:
    """Run ``algorithm`` once on the comparisons and wrap the scores in a frame.

    The resulting frame is indexed by item (index name ``"item"``) and has a
    single ``score`` column aligned to ``index``.
    """
    scores = algorithm(df_pairs["left"], df_pairs["right"], df_pairs["winner"], index)
    return pd.DataFrame(data={"score": scores}, index=index).rename_axis("item")
|
|
|
|
| def pairwise_bootstrap( |
| df_pairs: pd.DataFrame, |
| algorithm: CallableAlgorithm, |
| index: pd.Index, |
| rounds: int, |
| ) -> pd.DataFrame: |
| left = df_pairs["left"].to_numpy() |
| right = df_pairs["right"].to_numpy() |
| winner = df_pairs["winner"].to_numpy() |
|
|
| def statistic(xs: np.ndarray, ys: np.ndarray, ws: np.ndarray) -> np.ndarray: |
| scores = algorithm(pd.Series(xs), pd.Series(ys), pd.Series(ws), index) |
| return scores.to_numpy() |
|
|
| result = scipy.stats.bootstrap( |
| (left, right, winner), |
| statistic, |
| n_resamples=rounds, |
| paired=True, |
| method="percentile", |
| random_state=0, |
| ) |
|
|
| ratings = pd.Series( |
| np.median(result.bootstrap_distribution, axis=1), |
| index=index, |
| ) |
|
|
| ci = pd.Series( |
| list(zip(result.confidence_interval.low, result.confidence_interval.high, strict=True)), |
| index=index, |
| ) |
|
|
| df_result = pd.DataFrame({"score": ratings, "ci": ci}) |
| df_result.index.name = "item" |
|
|
| return df_result |
|
|
|
|
def handler(
    file: str | None,
    algorithm: str,
    filtered: bool,
    truncated: bool,
    rounds: int,
) -> tuple[pd.DataFrame, Figure]:
    """Rank items from an uploaded pairwise-comparison CSV.

    Args:
        file: Path of the uploaded CSV with ``left``, ``right``, ``winner`` columns.
        algorithm: Display name of an entry in ``ALGORITHMS``.
        filtered: Keep only pairs within the largest strongly-connected component.
        truncated: Output only the five head and five tail rows of the ranking.
        rounds: Number of bootstrap resamples; ``0`` computes a single point estimate.

    Returns:
        The ranking table and a heatmap of pairwise winning chances.

    Raises:
        gr.Error: On a missing file, unknown algorithm, parsing failure,
            missing columns, or invalid winner values.
    """
    if file is None:
        raise gr.Error("File must be uploaded")

    if algorithm not in ALGORITHMS:
        raise gr.Error(f"Unknown algorithm: {algorithm}")

    # Read everything as strings; the winner column is normalized below.
    try:
        df_pairs = pd.read_csv(file, dtype=str)
    except ValueError as e:
        raise gr.Error(f"Parsing error: {e}") from e

    if not pd.Series(["left", "right", "winner"]).isin(df_pairs.columns).all():
        raise gr.Error("Columns must exist: left, right, winner")

    if not df_pairs["winner"].str.lower().isin(pd.Series(["left", "right", "tie"])).all():
        raise gr.Error("Allowed winner values: left, right, tie")

    # Map the textual outcomes onto evalica's Winner variants (case-insensitive).
    df_pairs = df_pairs[["left", "right", "winner"]]
    df_pairs["winner"] = (
        df_pairs["winner"]
        .str.lower()
        .map(
            {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw},
        )
    )

    # Drop rows with any missing value in left/right/winner.
    df_pairs = df_pairs.loc[df_pairs.notna().all(axis=1)]

    if filtered:
        # Restrict to the largest SCC; some algorithms require a
        # strongly-connected comparison graph (see the UI checkbox info).
        largest = largest_strongly_connected_component(df_pairs)
        mask = df_pairs["left"].isin(largest) & df_pairs["right"].isin(largest)
        df_pairs = df_pairs.loc[mask]

    # Build the item index from the (possibly filtered) comparisons.
    *_, index = evalica.indexing(xs=df_pairs["left"], ys=df_pairs["right"])

    if rounds:
        df_result = pairwise_bootstrap(df_pairs, ALGORITHMS[algorithm], index, rounds)
    else:
        df_result = estimate(df_pairs, ALGORITHMS[algorithm], index)

    # Count how many comparisons each item participates in, on either side.
    df_result["pairs"] = (
        pd.Series(0, dtype=int, index=index)
        .add(
            df_pairs.groupby("left")["left"].count(),
            fill_value=0,
        )
        .add(
            df_pairs.groupby("right")["right"].count(),
            fill_value=0,
        )
        .astype(int)
    )

    # Higher score -> better rank; missing scores are ranked last.
    df_result["rank"] = df_result["score"].rank(na_option="bottom", ascending=False).astype(int)

    # Replace remaining NaNs with -inf so sorting and formatting stay well-defined.
    df_result = df_result.fillna(-np.inf)
    df_result = df_result.sort_values(by=["rank", "score"], ascending=[True, False])
    df_result = df_result.reset_index()

    if truncated:
        # Keep five head and five tail rows; drop duplicates when they overlap.
        df_result = pd.concat((df_result.head(5), df_result.tail(5)))
        df_result = df_result[~df_result.index.duplicated(keep="last")]

    # Pairwise winning chances derived from the (possibly truncated) scores.
    pairwise = evalica.pairwise_scores(df_result["score"].to_numpy())

    df_pairwise = pd.DataFrame(data=pairwise, index=df_result["item"], columns=df_result["item"])

    fig = visualize(df_pairwise)

    # Present bootstrap intervals as "(-delta; +delta)" around the score.
    if "ci" in df_result.columns:
        df_result["ci"] = df_result.apply(
            lambda row: f"({row['score'] - row['ci'][0]:.03f}; {row['ci'][1] - row['score']:.03f})",
            axis=1,
        )

    # Format scores for display only after all numeric work is done.
    df_result["score"] = df_result["score"].apply(lambda x: f"{x:.03f}")

    return df_result, fig
|
|
|
|
def visualize_alpha_ci(bootstrap_result: AlphaBootstrapResult) -> Figure:
    """Plot the bootstrap distribution of alpha with its point estimate and bounds.

    Draws a histogram of the resampled alphas, a dashed red line at the
    estimate, and dotted blue lines at the low/high interval bounds.
    """
    figure = px.histogram(
        bootstrap_result.distribution,
        nbins=50,
        labels={"value": "Alpha", "count": "Frequency"},
    )

    # (x position, dash style, color, annotation text, annotation position)
    markers = [
        (bootstrap_result.alpha, "dash", "red", f"alpha = {bootstrap_result.alpha:.3f}", "top right"),
        (bootstrap_result.low, "dot", "blue", f"low = {bootstrap_result.low:.3f}", "top left"),
        (bootstrap_result.high, "dot", "blue", f"high = {bootstrap_result.high:.3f}", "top right"),
    ]

    for x, dash, color, text, position in markers:
        figure.add_vline(
            x=x,
            line_dash=dash,
            line_color=color,
            annotation_text=text,
            annotation_position=position,
        )

    figure.update_layout(
        xaxis_title="Alpha",
        yaxis_title="Frequency",
        showlegend=False,
    )

    return figure
|
|
|
|
def alpha_handler(file: str | None, distance: str, rounds: int) -> tuple[pd.DataFrame, Figure | None]:
    """Compute Krippendorff's alpha for an uploaded ratings CSV.

    Returns the metrics table and, when ``rounds`` is non-zero, the bootstrap
    distribution plot (otherwise ``None``).

    Raises:
        gr.Error: On a missing/empty/unparsable file or a failed computation.
    """
    if file is None:
        raise gr.Error("File must be uploaded")

    try:
        ratings = pd.read_csv(file, header=None, dtype=str)
    except ValueError as e:
        raise gr.Error(f"Parsing error: {e}") from e

    if ratings.empty:
        raise gr.Error("The file is empty")

    try:
        # A single resample still yields the point estimate when rounds == 0.
        result = alpha_bootstrap(
            ratings,
            distance=distance,
            n_resamples=rounds if rounds else 1,
        )
    except evalica.InsufficientRatingsError as e:
        raise gr.Error("Insufficient ratings: no units have at least 2 ratings") from e
    except evalica.UnknownDistanceError as e:
        raise gr.Error(f"Unknown distance: {e}") from e
    except Exception as e:
        # Boundary handler: surface anything unexpected as a UI error.
        raise gr.Error(f"Computation error: {e}") from e

    metrics = pd.DataFrame(
        {
            "Metric": ["Alpha", "Observed Disagreement", "Expected Disagreement"],
            "Value": [result.alpha, result.observed, result.expected],
        },
    )

    figure = visualize_alpha_ci(result) if rounds else None
    return metrics, figure
|
|
|
|
def alpha_interface() -> gr.Interface:
    """Build the Gradio tab computing Krippendorff's alpha from a ratings file.

    Returns:
        A configured ``gr.Interface`` wired to :func:`alpha_handler`.
    """
    return gr.Interface(
        fn=alpha_handler,
        inputs=[
            gr.File(
                file_types=[".csv", ".tsv"],
                label="Ratings",
            ),
            gr.Dropdown(
                choices=["nominal", "ordinal", "interval", "ratio"],
                value="nominal",
                label="Distance",
                info="Nominal for categorical, ordinal for ordered categories, interval/ratio for numeric scales",
            ),
            gr.Number(
                value=0,
                minimum=0,
                maximum=10000,
                label="Bootstrap Rounds",
                info="Number of bootstrap resamples for the confidence interval plot. Set to 0 to skip.",
            ),
        ],
        outputs=[
            gr.Dataframe(
                headers=["Metric", "Value"],
                label="Inter-Rater Reliability",
            ),
            gr.Plot(
                label="Bootstrap Distribution of Alpha",
            ),
        ],
        # Example files are expected alongside the app — TODO confirm deployment layout.
        examples=[
            ["codings.csv", "ordinal", 1000],
            ["gcl.csv", "nominal", 1000],
        ],
        title="Krippendorff's Alpha",
        article=(
            f"""
This tool computes Krippendorff's alpha, an inter-rater reliability coefficient.

As an input, it expects a comma-separated (CSV) file without a header:
rows are raters (observers), columns are units (items), and cell values are ratings.

As the output, this tool provides alpha together with observed and expected disagreement.

{MORE_EVALICA}
""".strip()
        ),
        analytics_enabled=False,
        flagging_mode="never",
    )
|
|
|
|
def main() -> None:
    """Launch the demo: a tabbed app with pairwise ranking and Krippendorff's alpha."""
    pairwise_iface = gr.Interface(
        fn=handler,
        inputs=[
            gr.File(
                file_types=[".tsv", ".csv"],
                label="Comparisons",
            ),
            gr.Dropdown(
                choices=list(ALGORITHMS),
                value="Bradley-Terry (1952)",
                label="Algorithm",
            ),
            gr.Checkbox(
                value=False,
                label="Largest SCC",
                info="Bradley-Terry, Eigenvector, and Newman algorithms require the comparison graph "
                "to be strongly-connected. "
                "This option keeps only the largest strongly-connected component (SCC) of the input graph. "
                "Some items might be missing as a result of this filtering.",
            ),
            gr.Checkbox(
                value=False,
                label="Truncate Output",
                info="Perform the entire computation but output only five head and five tail items, avoiding overlap.",
            ),
            gr.Number(
                value=0,
                minimum=0,
                maximum=10000,
                label="Bootstrap Rounds",
                info="Number of bootstrap rounds to perform for estimating the confidence interval.",
            ),
        ],
        outputs=[
            gr.Dataframe(
                headers=["item", "score", "ci", "pairs", "rank"],
                label="Ranking",
            ),
            gr.Plot(
                label="Pairwise Chances of Winning the Comparison",
            ),
        ],
        # Example files are expected alongside the app — TODO confirm deployment layout.
        examples=[
            ["food.csv", "Counting", False, False, 0],
            ["food.csv", "Bradley-Terry (1952)", False, False, 1000],
            ["food.csv", "Eigenvector (1987)", False, False, 1000],
            ["food.csv", "PageRank (1998)", False, False, 1000],
            ["food.csv", "Newman (2023)", False, False, 1000],
            ["llmfao.csv", "Average Win Rate", False, True, 100],
            ["llmfao.csv", "Bradley-Terry (1952)", False, True, 100],
            ["llmfao.csv", "Elo (1960)", False, True, 100],
        ],
        title="Pairwise Comparisons",
        article=(
            f"""
This easy-to-use tool transforms pairwise comparisons (*aka* side-by-side) to a meaningful ranking of items.

As an input, it expects a comma-separated (CSV) file with a header containing the following columns:

- `left`: the first compared item
- `right`: the second compared item
- `winner`: the label indicating the winning item

Possible values for `winner` are `left`, `right`, or `tie`. The provided examples might be a good starting point.

As the output, this tool provides a table with items, their estimated scores, and ranks.

{MORE_EVALICA}
""".strip()
        ),
        flagging_mode="never",
        analytics_enabled=False,
    )

    # Combine both tools into one tabbed application.
    iface = gr.TabbedInterface(
        [pairwise_iface, alpha_interface()],
        ["Pairwise Ranking", "Krippendorff's Alpha"],
        title="Evalica",
        analytics_enabled=False,
    )

    iface.launch()
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|
|