dustalov committed on
Commit
0366c98
·
verified ·
1 Parent(s): 0cc5c99

Update app.py

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. app.py +185 -64
  3. mypy.ini +0 -1
  4. requirements.txt +1 -1
  5. ruff.toml +1 -1
README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 💞
4
  colorFrom: green
5
  colorTo: purple
6
  sdk: gradio
7
- python_version: 3.11
8
- sdk_version: 5.12.0
9
  app_file: app.py
10
  pinned: true
11
  license: apache-2.0
 
4
  colorFrom: green
5
  colorTo: purple
6
  sdk: gradio
7
+ python_version: 3.14
8
+ sdk_version: 6.5.1
9
  app_file: app.py
10
  pinned: true
11
  license: apache-2.0
app.py CHANGED
@@ -1,5 +1,7 @@
1
  #!/usr/bin/env python3
2
 
 
 
3
  # Copyright 2023 Dmitry Ustalov
4
  #
5
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,8 +19,7 @@
17
  __author__ = "Dmitry Ustalov"
18
  __license__ = "Apache 2.0"
19
 
20
- from collections.abc import Callable
21
- from typing import BinaryIO, cast
22
 
23
  import evalica
24
  import gradio as gr
@@ -27,7 +28,9 @@ import numpy as np
27
  import pandas as pd
28
  import plotly.express as px
29
  from evalica import Winner
30
- from plotly.graph_objects import Figure
 
 
31
 
32
  TOLERANCE, LIMIT = 1e-6, 100
33
 
@@ -42,49 +45,87 @@ def visualize(df_pairwise: pd.DataFrame) -> Figure:
42
  return fig
43
 
44
 
45
- def counting(xs: "pd.Series[str]", ys: "pd.Series[str]",
46
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
 
 
 
 
47
  result = evalica.counting(xs, ys, ws, index=index)
48
  return result.scores
49
 
50
 
51
- def average_win_rate(xs: "pd.Series[str]", ys: "pd.Series[str]",
52
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
53
- result = evalica.counting(xs, ys, ws, index=index)
 
 
 
 
54
  return result.scores
55
 
56
 
57
- def bradley_terry(xs: "pd.Series[str]", ys: "pd.Series[str]",
58
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
 
 
 
 
59
  result = evalica.bradley_terry(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
60
  return result.scores
61
 
62
 
63
- def elo(xs: "pd.Series[str]", ys: "pd.Series[str]",
64
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
 
 
 
 
65
  result = evalica.elo(xs, ys, ws, index=index)
66
  return result.scores
67
 
68
 
69
- def eigen(xs: "pd.Series[str]", ys: "pd.Series[str]",
70
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
 
 
 
 
71
  result = evalica.eigen(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
72
  return result.scores
73
 
74
 
75
- def pagerank(xs: "pd.Series[str]", ys: "pd.Series[str]",
76
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
 
 
 
 
77
  result = evalica.pagerank(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
78
  return result.scores
79
 
80
 
81
- def newman(xs: "pd.Series[str]", ys: "pd.Series[str]",
82
- ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": # type: ignore[type-var]
 
 
 
 
83
  result = evalica.newman(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
84
  return result.scores
85
 
86
 
87
- ALGORITHMS = {
 
 
 
 
 
 
 
 
 
 
88
  "Counting": counting,
89
  "Average Win Rate": average_win_rate,
90
  "Bradley-Terry (1952)": bradley_terry,
@@ -97,19 +138,22 @@ ALGORITHMS = {
97
 
98
  def largest_strongly_connected_component(df_pairs: pd.DataFrame) -> set[str]:
99
  G = nx.from_pandas_edgelist(df_pairs, source="left", target="right", create_using=nx.DiGraph)
100
- H = nx.from_pandas_edgelist(df_pairs[df_pairs["winner"] == "tie"], source="right", target="left",
101
- create_using=nx.DiGraph)
 
 
 
 
102
  F = nx.compose(G, H)
103
  largest = max(nx.strongly_connected_components(F), key=len)
104
- return cast(set[str], largest)
105
 
106
 
107
- def estimate(df_pairs: pd.DataFrame,
108
- algorithm: Callable[[ # type: ignore[type-var]
109
- "pd.Series[str]", "pd.Series[str]", "pd.Series[Winner]", dict[str, int]],
110
- "pd.Series[float]",
111
- ],
112
- index: dict[str, int]) -> pd.DataFrame:
113
  scores = algorithm(df_pairs["left"], df_pairs["right"], df_pairs["winner"], index)
114
 
115
  df_result = pd.DataFrame(data={"score": scores}, index=index)
@@ -118,13 +162,12 @@ def estimate(df_pairs: pd.DataFrame,
118
  return df_result
119
 
120
 
121
- def bootstrap(df_pairs: pd.DataFrame,
122
- algorithm: Callable[[ # type: ignore[type-var]
123
- "pd.Series[str]", "pd.Series[str]", "pd.Series[Winner]", dict[str, int]],
124
- "pd.Series[float]",
125
- ],
126
- index: dict[str, int],
127
- rounds: int) -> pd.DataFrame:
128
  scores: list[pd.Series[float]] = [] # assuming model names are strings
129
 
130
  for r in range(rounds):
@@ -136,11 +179,16 @@ def bootstrap(df_pairs: pd.DataFrame,
136
 
137
  df_bootstrap = pd.DataFrame(scores, columns=index)
138
 
139
- ratings = df_bootstrap.quantile(.5)
140
 
141
- ci = df_bootstrap.apply(lambda row: (
142
- row.quantile(.025).item(), row.quantile(.975).item(),
143
- ), axis=0, result_type="reduce")
 
 
 
 
 
144
 
145
  df_result = pd.DataFrame({"score": ratings, "ci": ci})
146
  df_result.index.name = "item"
@@ -149,11 +197,11 @@ def bootstrap(df_pairs: pd.DataFrame,
149
 
150
 
151
  def handler(
152
- file: BinaryIO,
153
- algorithm: str,
154
- filtered: bool,
155
- truncated: bool,
156
- rounds: int,
157
  ) -> tuple[pd.DataFrame, Figure]:
158
  if file is None:
159
  raise gr.Error("File must be uploaded")
@@ -162,27 +210,31 @@ def handler(
162
  raise gr.Error(f"Unknown algorithm: {algorithm}")
163
 
164
  try:
165
- df_pairs = pd.read_csv(file.name, dtype=str)
166
  except ValueError as e:
167
  raise gr.Error(f"Parsing error: {e}") from e
168
 
169
  if not pd.Series(["left", "right", "winner"]).isin(df_pairs.columns).all():
170
  raise gr.Error("Columns must exist: left, right, winner")
171
 
172
- if not df_pairs["winner"].isin(pd.Series(["left", "right", "tie"])).all():
173
  raise gr.Error("Allowed winner values: left, right, tie")
174
 
175
  df_pairs = df_pairs[["left", "right", "winner"]]
176
- df_pairs["winner"] = df_pairs["winner"].map(
177
- {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw},
 
 
 
 
178
  )
179
 
180
- df_pairs = df_pairs.dropna(axis=0)
181
 
182
  if filtered:
183
  largest = largest_strongly_connected_component(df_pairs)
184
-
185
- df_pairs = df_pairs.drop(df_pairs[~(df_pairs["left"].isin(largest) & df_pairs["right"].isin(largest))].index)
186
 
187
  *_, index = evalica.indexing(xs=df_pairs["left"], ys=df_pairs["right"])
188
 
@@ -191,11 +243,18 @@ def handler(
191
  else:
192
  df_result = estimate(df_pairs, ALGORITHMS[algorithm], index)
193
 
194
- df_result["pairs"] = pd.Series(0, dtype=int, index=index).add(
195
- df_pairs.groupby("left")["left"].count(), fill_value=0,
196
- ).add(
197
- df_pairs.groupby("right")["right"].count(), fill_value=0,
198
- ).astype(int)
 
 
 
 
 
 
 
199
 
200
  df_result["rank"] = df_result["score"].rank(na_option="bottom", ascending=False).astype(int)
201
 
@@ -204,7 +263,7 @@ def handler(
204
  df_result = df_result.reset_index()
205
 
206
  if truncated:
207
- df_result = pd.concat((df_result.head(5), df_result.tail(5)), copy=False)
208
  df_result = df_result[~df_result.index.duplicated(keep="last")]
209
 
210
  pairwise = evalica.pairwise_scores(df_result["score"].to_numpy())
@@ -224,8 +283,64 @@ def handler(
224
  return df_result, fig
225
 
226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  def main() -> None:
228
- iface = gr.Interface(
229
  fn=handler,
230
  inputs=[
231
  gr.File(
@@ -233,7 +348,7 @@ def main() -> None:
233
  label="Comparisons",
234
  ),
235
  gr.Dropdown(
236
- choices=cast(list[str], ALGORITHMS),
237
  value="Bradley-Terry (1952)",
238
  label="Algorithm",
239
  ),
@@ -241,15 +356,14 @@ def main() -> None:
241
  value=False,
242
  label="Largest SCC",
243
  info="Bradley-Terry, Eigenvector, and Newman algorithms require the comparison graph "
244
- "to be strongly-connected. "
245
- "This option keeps only the largest strongly-connected component (SCC) of the input graph. "
246
- "Some items might be missing as a result of this filtering.",
247
  ),
248
  gr.Checkbox(
249
  value=False,
250
  label="Truncate Output",
251
- info="Perform the entire computation but output only five head and five tail items, "
252
- "avoiding overlap.",
253
  ),
254
  gr.Number(
255
  value=0,
@@ -307,6 +421,13 @@ As the output, this tool provides a table with items, their estimated scores, an
307
  analytics_enabled=False,
308
  )
309
 
 
 
 
 
 
 
 
310
  iface.launch()
311
 
312
 
 
1
  #!/usr/bin/env python3
2
 
3
+ from __future__ import annotations
4
+
5
  # Copyright 2023 Dmitry Ustalov
6
  #
7
  # Licensed under the Apache License, Version 2.0 (the "License");
 
19
  __author__ = "Dmitry Ustalov"
20
  __license__ = "Apache 2.0"
21
 
22
+ from typing import TYPE_CHECKING, Protocol, cast
 
23
 
24
  import evalica
25
  import gradio as gr
 
28
  import pandas as pd
29
  import plotly.express as px
30
  from evalica import Winner
31
+
32
+ if TYPE_CHECKING:
33
+ from plotly.graph_objects import Figure
34
 
35
  TOLERANCE, LIMIT = 1e-6, 100
36
 
 
45
  return fig
46
 
47
 
48
def counting(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.counting`` result.

    ``xs``, ``ys`` and ``ws`` are forwarded positionally; ``index`` is passed by keyword.
    """
    return evalica.counting(xs, ys, ws, index=index).scores
56
 
57
 
58
def average_win_rate(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.average_win_rate`` result.

    ``xs``, ``ys`` and ``ws`` are forwarded positionally; ``index`` is passed by keyword.
    """
    return evalica.average_win_rate(xs, ys, ws, index=index).scores
66
 
67
 
68
def bradley_terry(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.bradley_terry`` result.

    The module-level ``TOLERANCE`` and ``LIMIT`` constants are passed as the
    ``tolerance`` and ``limit`` keyword arguments.
    """
    return evalica.bradley_terry(
        xs,
        ys,
        ws,
        index=index,
        tolerance=TOLERANCE,
        limit=LIMIT,
    ).scores
76
 
77
 
78
def elo(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.elo`` result.

    ``xs``, ``ys`` and ``ws`` are forwarded positionally; ``index`` is passed by keyword.
    """
    return evalica.elo(xs, ys, ws, index=index).scores
86
 
87
 
88
def eigen(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.eigen`` result.

    The module-level ``TOLERANCE`` and ``LIMIT`` constants are passed as the
    ``tolerance`` and ``limit`` keyword arguments.
    """
    return evalica.eigen(
        xs,
        ys,
        ws,
        index=index,
        tolerance=TOLERANCE,
        limit=LIMIT,
    ).scores
96
 
97
 
98
def pagerank(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.pagerank`` result.

    The module-level ``TOLERANCE`` and ``LIMIT`` constants are passed as the
    ``tolerance`` and ``limit`` keyword arguments.
    """
    return evalica.pagerank(
        xs,
        ys,
        ws,
        index=index,
        tolerance=TOLERANCE,
        limit=LIMIT,
    ).scores
106
 
107
 
108
def newman(
    xs: pd.Series[str],
    ys: pd.Series[str],
    ws: pd.Series[Winner],
    index: pd.Index,
) -> pd.Series[float]:
    """Return the ``scores`` attribute of the ``evalica.newman`` result.

    The module-level ``TOLERANCE`` and ``LIMIT`` constants are passed as the
    ``tolerance`` and ``limit`` keyword arguments.
    """
    return evalica.newman(
        xs,
        ys,
        ws,
        index=index,
        tolerance=TOLERANCE,
        limit=LIMIT,
    ).scores
116
 
117
 
118
class CallableAlgorithm(Protocol):
    """Structural type of the ranking callables stored as values of ``ALGORITHMS``."""

    def __call__(
        self,
        xs: pd.Series[str],
        ys: pd.Series[str],
        ws: pd.Series[Winner],
        index: pd.Index,
    ) -> pd.Series[float]:
        """Return scores computed from the left items, right items, winners and item index."""
        ...
126
+
127
+
128
+ ALGORITHMS: dict[str, CallableAlgorithm] = {
129
  "Counting": counting,
130
  "Average Win Rate": average_win_rate,
131
  "Bradley-Terry (1952)": bradley_terry,
 
138
 
139
def largest_strongly_connected_component(df_pairs: pd.DataFrame) -> set[str]:
    """Return the items of the largest strongly connected component of the comparison graph.

    Every row contributes a directed edge ``left -> right``; a tied row additionally
    contributes the reverse edge ``right -> left``.

    The ``winner`` column may hold ``Winner`` members or the raw ``"tie"`` strings.
    ``handler`` maps the column to ``Winner`` before calling this function, so comparing
    against ``"tie"`` alone never matched and ties silently lost their reverse edges.

    Args:
        df_pairs: Frame with ``left``, ``right`` and ``winner`` columns.

    Returns:
        The node set of the largest component, or an empty set when there are no rows.
    """
    # Element-wise test so that both Winner members and plain strings are recognised
    # regardless of the column dtype.
    is_tie = df_pairs["winner"].map(lambda w: w == Winner.Draw or w == "tie").astype(bool)

    # NOTE(review): edges always run left -> right no matter which side won; only ties
    # are made bidirectional. Confirm that this is the intended graph construction.
    forward = nx.from_pandas_edgelist(df_pairs, source="left", target="right", create_using=nx.DiGraph)
    reverse_ties = nx.from_pandas_edgelist(
        df_pairs[is_tie],
        source="right",
        target="left",
        create_using=nx.DiGraph,
    )
    combined = nx.compose(forward, reverse_ties)

    # default= avoids a bare ValueError from max() when the graph has no nodes
    largest = max(nx.strongly_connected_components(combined), key=len, default=set())
    return cast("set[str]", largest)
150
 
151
 
152
+ def estimate(
153
+ df_pairs: pd.DataFrame,
154
+ algorithm: CallableAlgorithm,
155
+ index: pd.Index,
156
+ ) -> pd.DataFrame:
 
157
  scores = algorithm(df_pairs["left"], df_pairs["right"], df_pairs["winner"], index)
158
 
159
  df_result = pd.DataFrame(data={"score": scores}, index=index)
 
162
  return df_result
163
 
164
 
165
+ def bootstrap(
166
+ df_pairs: pd.DataFrame,
167
+ algorithm: CallableAlgorithm,
168
+ index: pd.Index,
169
+ rounds: int,
170
+ ) -> pd.DataFrame:
 
171
  scores: list[pd.Series[float]] = [] # assuming model names are strings
172
 
173
  for r in range(rounds):
 
179
 
180
  df_bootstrap = pd.DataFrame(scores, columns=index)
181
 
182
+ ratings = df_bootstrap.quantile(0.5)
183
 
184
+ ci = df_bootstrap.apply(
185
+ lambda row: (
186
+ row.quantile(0.025).item(),
187
+ row.quantile(0.975).item(),
188
+ ),
189
+ axis=0,
190
+ result_type="reduce",
191
+ )
192
 
193
  df_result = pd.DataFrame({"score": ratings, "ci": ci})
194
  df_result.index.name = "item"
 
197
 
198
 
199
  def handler(
200
+ file: str | None,
201
+ algorithm: str,
202
+ filtered: bool,
203
+ truncated: bool,
204
+ rounds: int,
205
  ) -> tuple[pd.DataFrame, Figure]:
206
  if file is None:
207
  raise gr.Error("File must be uploaded")
 
210
  raise gr.Error(f"Unknown algorithm: {algorithm}")
211
 
212
  try:
213
+ df_pairs = pd.read_csv(file, dtype=str)
214
  except ValueError as e:
215
  raise gr.Error(f"Parsing error: {e}") from e
216
 
217
  if not pd.Series(["left", "right", "winner"]).isin(df_pairs.columns).all():
218
  raise gr.Error("Columns must exist: left, right, winner")
219
 
220
+ if not df_pairs["winner"].str.lower().isin(pd.Series(["left", "right", "tie"])).all():
221
  raise gr.Error("Allowed winner values: left, right, tie")
222
 
223
  df_pairs = df_pairs[["left", "right", "winner"]]
224
+ df_pairs["winner"] = (
225
+ df_pairs["winner"]
226
+ .str.lower()
227
+ .map(
228
+ {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw},
229
+ )
230
  )
231
 
232
+ df_pairs = df_pairs.loc[df_pairs.notna().all(axis=1)]
233
 
234
  if filtered:
235
  largest = largest_strongly_connected_component(df_pairs)
236
+ mask = df_pairs["left"].isin(largest) & df_pairs["right"].isin(largest)
237
+ df_pairs = df_pairs.loc[mask]
238
 
239
  *_, index = evalica.indexing(xs=df_pairs["left"], ys=df_pairs["right"])
240
 
 
243
  else:
244
  df_result = estimate(df_pairs, ALGORITHMS[algorithm], index)
245
 
246
+ df_result["pairs"] = (
247
+ pd.Series(0, dtype=int, index=index)
248
+ .add(
249
+ df_pairs.groupby("left")["left"].count(),
250
+ fill_value=0,
251
+ )
252
+ .add(
253
+ df_pairs.groupby("right")["right"].count(),
254
+ fill_value=0,
255
+ )
256
+ .astype(int)
257
+ )
258
 
259
  df_result["rank"] = df_result["score"].rank(na_option="bottom", ascending=False).astype(int)
260
 
 
263
  df_result = df_result.reset_index()
264
 
265
  if truncated:
266
+ df_result = pd.concat((df_result.head(5), df_result.tail(5)))
267
  df_result = df_result[~df_result.index.duplicated(keep="last")]
268
 
269
  pairwise = evalica.pairwise_scores(df_result["score"].to_numpy())
 
283
  return df_result, fig
284
 
285
 
286
def alpha_handler(file: str | None, distance: str) -> pd.DataFrame:
    """Compute Krippendorff's alpha for an uploaded, header-less ratings matrix.

    Args:
        file: Path of the uploaded ratings file, or ``None`` when nothing was uploaded.
        distance: Name of the distance metric forwarded to ``evalica.alpha``.

    Returns:
        A frame with a "Metric" and a "Value" column holding alpha, the observed
        disagreement and the expected disagreement.

    Raises:
        gr.Error: If no file was uploaded, the file cannot be parsed or is empty,
            or ``evalica.alpha`` fails.
    """
    if file is None:
        raise gr.Error("File must be uploaded")

    # The upload widget accepts both .csv and .tsv files; pick the delimiter from the
    # extension so that tab-separated uploads are not read as a single comma-less column.
    separator = "\t" if file.lower().endswith(".tsv") else ","

    try:
        df_ratings = pd.read_csv(file, sep=separator, header=None, dtype=str)
    except ValueError as e:
        raise gr.Error(f"Parsing error: {e}") from e

    if df_ratings.empty:
        raise gr.Error("The file is empty")

    try:
        result = evalica.alpha(df_ratings, distance=distance)  # type: ignore[arg-type]
    except evalica.InsufficientRatingsError as e:
        raise gr.Error("Insufficient ratings: no units have at least 2 ratings") from e
    except evalica.UnknownDistanceError as e:
        raise gr.Error(f"Unknown distance: {e}") from e
    except Exception as e:
        # UI boundary: surface any remaining failure as a user-visible error, keeping the cause
        raise gr.Error(f"Computation error: {e}") from e

    return pd.DataFrame(
        {
            "Metric": ["Alpha", "Observed Disagreement", "Expected Disagreement"],
            "Value": [result.alpha, result.observed, result.expected],
        },
    )
313
+
314
+
315
def alpha_interface() -> gr.Interface:
    """Build the Gradio interface that wires ``alpha_handler`` to its inputs and output.

    The interface takes an uploaded ratings file and a distance metric choice, and
    shows the resulting metrics in a two-column table.
    """
    ratings_file = gr.File(
        file_types=[".csv", ".tsv"],
        label="Ratings Matrix (CSV without header)",
    )
    distance_choice = gr.Dropdown(
        choices=["nominal", "ordinal", "interval", "ratio"],
        value="nominal",
        label="Distance Metric",
        info="Nominal for categorical, ordinal for ordered categories, interval/ratio for numeric scales",
    )
    reliability_table = gr.Dataframe(
        headers=["Metric", "Value"],
        label="Inter-Rater Reliability",
    )

    return gr.Interface(
        fn=alpha_handler,
        inputs=[ratings_file, distance_choice],
        outputs=[reliability_table],
        title="Krippendorff's Alpha",
        analytics_enabled=False,
        flagging_mode="never",
    )
340
+
341
+
342
  def main() -> None:
343
+ pairwise_iface = gr.Interface(
344
  fn=handler,
345
  inputs=[
346
  gr.File(
 
348
  label="Comparisons",
349
  ),
350
  gr.Dropdown(
351
+ choices=list(ALGORITHMS),
352
  value="Bradley-Terry (1952)",
353
  label="Algorithm",
354
  ),
 
356
  value=False,
357
  label="Largest SCC",
358
  info="Bradley-Terry, Eigenvector, and Newman algorithms require the comparison graph "
359
+ "to be strongly-connected. "
360
+ "This option keeps only the largest strongly-connected component (SCC) of the input graph. "
361
+ "Some items might be missing as a result of this filtering.",
362
  ),
363
  gr.Checkbox(
364
  value=False,
365
  label="Truncate Output",
366
+ info="Perform the entire computation but output only five head and five tail items, avoiding overlap.",
 
367
  ),
368
  gr.Number(
369
  value=0,
 
421
  analytics_enabled=False,
422
  )
423
 
424
+ iface = gr.TabbedInterface(
425
+ [pairwise_iface, alpha_interface()],
426
+ ["Pairwise Ranking", "Krippendorff's Alpha"],
427
+ title="Evalica",
428
+ analytics_enabled=False,
429
+ )
430
+
431
  iface.launch()
432
 
433
 
mypy.ini CHANGED
@@ -1,4 +1,3 @@
1
  [mypy]
2
  ignore_missing_imports = True
3
- plugins = numpy.typing.mypy_plugin
4
  strict = True
 
1
  [mypy]
2
  ignore_missing_imports = True
 
3
  strict = True
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- evalica[gradio]
2
  networkx
3
  plotly
 
1
+ evalica[gradio] == 0.4.0rc2
2
  networkx
3
  plotly
ruff.toml CHANGED
@@ -1,5 +1,5 @@
1
  line-length = 120
2
- target-version = "py311"
3
 
4
  [lint]
5
  select = ["ALL"]
 
1
  line-length = 120
2
+ target-version = "py314"
3
 
4
  [lint]
5
  select = ["ALL"]