Spaces:
Running
Running
add bothbad state
Browse files
app.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import dotenv
|
| 2 |
import evalica
|
| 3 |
import gitlab
|
|
@@ -380,6 +384,7 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 380 |
"Conversation Efficiency Index",
|
| 381 |
"Model Consistency Score",
|
| 382 |
"Average Win Rate",
|
|
|
|
| 383 |
"Bradley-Terry Coefficient",
|
| 384 |
"Eigenvector Centrality Value",
|
| 385 |
"Newman Modularity Score",
|
|
@@ -387,29 +392,6 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 387 |
]
|
| 388 |
)
|
| 389 |
|
| 390 |
-
# map vote to winner
|
| 391 |
-
vote_df["winner"] = vote_df["winner"].map(
|
| 392 |
-
{
|
| 393 |
-
"left": evalica.Winner.X,
|
| 394 |
-
"right": evalica.Winner.Y,
|
| 395 |
-
"tie": evalica.Winner.Draw,
|
| 396 |
-
}
|
| 397 |
-
)
|
| 398 |
-
|
| 399 |
-
# Calculate scores using various metrics
|
| 400 |
-
avr_result = evalica.average_win_rate(
|
| 401 |
-
vote_df["left"], vote_df["right"], vote_df["winner"]
|
| 402 |
-
)
|
| 403 |
-
bt_result = evalica.bradley_terry(
|
| 404 |
-
vote_df["left"], vote_df["right"], vote_df["winner"]
|
| 405 |
-
)
|
| 406 |
-
newman_result = evalica.newman(vote_df["left"], vote_df["right"], vote_df["winner"])
|
| 407 |
-
eigen_result = evalica.eigen(vote_df["left"], vote_df["right"], vote_df["winner"])
|
| 408 |
-
elo_result = evalica.elo(vote_df["left"], vote_df["right"], vote_df["winner"])
|
| 409 |
-
pagerank_result = evalica.pagerank(
|
| 410 |
-
vote_df["left"], vote_df["right"], vote_df["winner"]
|
| 411 |
-
)
|
| 412 |
-
|
| 413 |
# Load conversation data from the Hugging Face repository
|
| 414 |
conversation_data = load_content_from_hf("SE-Arena/conversations")
|
| 415 |
conversation_df = pd.DataFrame(conversation_data)
|
|
@@ -427,51 +409,89 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 427 |
left_model = row["left"]
|
| 428 |
right_model = row["right"]
|
| 429 |
is_self_match = left_model == right_model
|
| 430 |
-
|
| 431 |
# Initialize dictionaries for models if they don't exist yet
|
| 432 |
for model in [left_model, right_model]:
|
| 433 |
if model not in model_stats:
|
| 434 |
model_stats[model] = {
|
| 435 |
-
"cei_sum": 0,
|
| 436 |
-
"cei_max": 0,
|
| 437 |
-
"self_matches": 0,
|
| 438 |
-
"self_draws": 0
|
| 439 |
}
|
| 440 |
-
|
| 441 |
# Handle self-matches (same model on both sides)
|
| 442 |
if is_self_match:
|
| 443 |
model_stats[left_model]["self_matches"] += 1
|
| 444 |
-
if row["winner"] == "tie":
|
| 445 |
model_stats[left_model]["self_draws"] += 1
|
| 446 |
continue
|
| 447 |
-
|
| 448 |
# Determine scores based on winner for competitive matches
|
| 449 |
match row["winner"]:
|
| 450 |
-
case "left":
|
| 451 |
left_score = 1
|
| 452 |
right_score = -1
|
| 453 |
-
case "right":
|
| 454 |
left_score = -1
|
| 455 |
right_score = 1
|
| 456 |
-
case "tie":
|
| 457 |
-
left_score = 0.3
|
| 458 |
-
right_score = 0.3
|
| 459 |
-
|
|
|
|
|
|
|
|
|
|
| 460 |
# Count rounds for each side
|
| 461 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
| 462 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
| 463 |
-
|
| 464 |
# Update CEI metrics
|
| 465 |
model_stats[left_model]["cei_max"] += 1 / left_round
|
| 466 |
model_stats[right_model]["cei_max"] += 1 / right_round
|
| 467 |
model_stats[left_model]["cei_sum"] += left_score / left_round
|
| 468 |
model_stats[right_model]["cei_sum"] += right_score / right_round
|
| 469 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
# Calculate CEI results
|
| 471 |
cei_result = {}
|
| 472 |
for model in elo_result.scores.index:
|
| 473 |
if model in model_stats and model_stats[model]["cei_max"] > 0:
|
| 474 |
-
cei_result[model] = round(
|
|
|
|
|
|
|
| 475 |
else:
|
| 476 |
cei_result[model] = "N/A"
|
| 477 |
cei_result = pd.Series(cei_result)
|
|
@@ -480,7 +500,9 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 480 |
mcs_result = {}
|
| 481 |
for model in elo_result.scores.index:
|
| 482 |
if model in model_stats and model_stats[model]["self_matches"] > 0:
|
| 483 |
-
mcs_result[model] = round(
|
|
|
|
|
|
|
| 484 |
else:
|
| 485 |
mcs_result[model] = "N/A"
|
| 486 |
mcs_result = pd.Series(mcs_result)
|
|
@@ -934,10 +956,10 @@ with gr.Blocks() as app:
|
|
| 934 |
# Feedback panel, initially hidden
|
| 935 |
with gr.Row(visible=False) as vote_panel:
|
| 936 |
feedback = gr.Radio(
|
| 937 |
-
choices=["Model A", "Model B", "Tie"],
|
| 938 |
label="Which model do you prefer?",
|
| 939 |
-
value="Tie",
|
| 940 |
-
interactive=False,
|
| 941 |
)
|
| 942 |
submit_feedback_btn = gr.Button("Submit Feedback", interactive=False)
|
| 943 |
|
|
@@ -1160,8 +1182,10 @@ with gr.Blocks() as app:
|
|
| 1160 |
winner_model = "left"
|
| 1161 |
case "Model B":
|
| 1162 |
winner_model = "right"
|
| 1163 |
-
case "Tie":
|
| 1164 |
winner_model = "tie"
|
|
|
|
|
|
|
| 1165 |
|
| 1166 |
# Create feedback entry
|
| 1167 |
vote_entry = {
|
|
@@ -1220,7 +1244,7 @@ with gr.Blocks() as app:
|
|
| 1220 |
value="Submit", interactive=True, visible=True
|
| 1221 |
), # [9] Reset send_first button
|
| 1222 |
gr.update(
|
| 1223 |
-
value="Tie", interactive=True
|
| 1224 |
), # [10] Reset feedback radio selection
|
| 1225 |
get_leaderboard_data(vote_entry), # [11] Updated leaderboard data
|
| 1226 |
gr.update(
|
|
|
|
| 1 |
+
# References for model evaluation metrics:
|
| 2 |
+
# - Chatbot Arena: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH
|
| 3 |
+
# - Evalica: https://github.com/dustalov/evalica/blob/master/Chatbot-Arena.ipynb
|
| 4 |
+
|
| 5 |
import dotenv
|
| 6 |
import evalica
|
| 7 |
import gitlab
|
|
|
|
| 384 |
"Conversation Efficiency Index",
|
| 385 |
"Model Consistency Score",
|
| 386 |
"Average Win Rate",
|
| 387 |
+
"Average Failure Rate",
|
| 388 |
"Bradley-Terry Coefficient",
|
| 389 |
"Eigenvector Centrality Value",
|
| 390 |
"Newman Modularity Score",
|
|
|
|
| 392 |
]
|
| 393 |
)
|
| 394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
# Load conversation data from the Hugging Face repository
|
| 396 |
conversation_data = load_content_from_hf("SE-Arena/conversations")
|
| 397 |
conversation_df = pd.DataFrame(conversation_data)
|
|
|
|
| 409 |
left_model = row["left"]
|
| 410 |
right_model = row["right"]
|
| 411 |
is_self_match = left_model == right_model
|
| 412 |
+
|
| 413 |
# Initialize dictionaries for models if they don't exist yet
|
| 414 |
for model in [left_model, right_model]:
|
| 415 |
if model not in model_stats:
|
| 416 |
model_stats[model] = {
|
| 417 |
+
"cei_sum": 0, # Sum of per-round scores
|
| 418 |
+
"cei_max": 0, # Sum of per-round maximums
|
| 419 |
+
"self_matches": 0, # Count of self-matches
|
| 420 |
+
"self_draws": 0, # Count of draws in self-matches
|
| 421 |
}
|
| 422 |
+
|
| 423 |
# Handle self-matches (same model on both sides)
|
| 424 |
if is_self_match:
|
| 425 |
model_stats[left_model]["self_matches"] += 1
|
| 426 |
+
if row["winner"] == "both_bad" or row["winner"] == "tie":
|
| 427 |
model_stats[left_model]["self_draws"] += 1
|
| 428 |
continue
|
| 429 |
+
|
| 430 |
# Determine scores based on winner for competitive matches
|
| 431 |
match row["winner"]:
|
| 432 |
+
case "left":
|
| 433 |
left_score = 1
|
| 434 |
right_score = -1
|
| 435 |
+
case "right":
|
| 436 |
left_score = -1
|
| 437 |
right_score = 1
|
| 438 |
+
case "tie":
|
| 439 |
+
left_score = 0.3
|
| 440 |
+
right_score = 0.3
|
| 441 |
+
case "both_bad":
|
| 442 |
+
left_score = -0.3
|
| 443 |
+
right_score = -0.3
|
| 444 |
+
|
| 445 |
# Count rounds for each side
|
| 446 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
| 447 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
| 448 |
+
|
| 449 |
# Update CEI metrics
|
| 450 |
model_stats[left_model]["cei_max"] += 1 / left_round
|
| 451 |
model_stats[right_model]["cei_max"] += 1 / right_round
|
| 452 |
model_stats[left_model]["cei_sum"] += left_score / left_round
|
| 453 |
model_stats[right_model]["cei_sum"] += right_score / right_round
|
| 454 |
|
| 455 |
+
# map vote to winner
|
| 456 |
+
vote_df["winner"] = vote_df["winner"].map(
|
| 457 |
+
{
|
| 458 |
+
"left": evalica.Winner.X,
|
| 459 |
+
"right": evalica.Winner.Y,
|
| 460 |
+
"tie": evalica.Winner.Draw,
|
| 461 |
+
"both_bad": evalica.Winner.Draw,
|
| 462 |
+
}
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
# Calculate scores using various metrics
|
| 466 |
+
avr_result = evalica.average_win_rate(
|
| 467 |
+
vote_df["left"],
|
| 468 |
+
vote_df["right"],
|
| 469 |
+
vote_df["winner"],
|
| 470 |
+
tie_weight=0, # Chatbot Arena excludes ties
|
| 471 |
+
)
|
| 472 |
+
bt_result = evalica.bradley_terry(
|
| 473 |
+
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
|
| 474 |
+
)
|
| 475 |
+
newman_result = evalica.newman(
|
| 476 |
+
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
|
| 477 |
+
)
|
| 478 |
+
eigen_result = evalica.eigen(
|
| 479 |
+
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
|
| 480 |
+
)
|
| 481 |
+
elo_result = evalica.elo(
|
| 482 |
+
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
|
| 483 |
+
)
|
| 484 |
+
pagerank_result = evalica.pagerank(
|
| 485 |
+
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
# Calculate CEI results
|
| 489 |
cei_result = {}
|
| 490 |
for model in elo_result.scores.index:
|
| 491 |
if model in model_stats and model_stats[model]["cei_max"] > 0:
|
| 492 |
+
cei_result[model] = round(
|
| 493 |
+
model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2
|
| 494 |
+
)
|
| 495 |
else:
|
| 496 |
cei_result[model] = "N/A"
|
| 497 |
cei_result = pd.Series(cei_result)
|
|
|
|
| 500 |
mcs_result = {}
|
| 501 |
for model in elo_result.scores.index:
|
| 502 |
if model in model_stats and model_stats[model]["self_matches"] > 0:
|
| 503 |
+
mcs_result[model] = round(
|
| 504 |
+
model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2
|
| 505 |
+
)
|
| 506 |
else:
|
| 507 |
mcs_result[model] = "N/A"
|
| 508 |
mcs_result = pd.Series(mcs_result)
|
|
|
|
| 956 |
# Feedback panel, initially hidden
|
| 957 |
with gr.Row(visible=False) as vote_panel:
|
| 958 |
feedback = gr.Radio(
|
| 959 |
+
choices=["Model A", "Model B", "Tie", "Tie (Both Bad)"],
|
| 960 |
label="Which model do you prefer?",
|
| 961 |
+
value="Tie",
|
| 962 |
+
interactive=False,
|
| 963 |
)
|
| 964 |
submit_feedback_btn = gr.Button("Submit Feedback", interactive=False)
|
| 965 |
|
|
|
|
| 1182 |
winner_model = "left"
|
| 1183 |
case "Model B":
|
| 1184 |
winner_model = "right"
|
| 1185 |
+
case "Tie":
|
| 1186 |
winner_model = "tie"
|
| 1187 |
+
case _:
|
| 1188 |
+
winner_model = "both_bad"
|
| 1189 |
|
| 1190 |
# Create feedback entry
|
| 1191 |
vote_entry = {
|
|
|
|
| 1244 |
value="Submit", interactive=True, visible=True
|
| 1245 |
), # [9] Reset send_first button
|
| 1246 |
gr.update(
|
| 1247 |
+
value="Tie", interactive=True
|
| 1248 |
), # [10] Reset feedback radio selection
|
| 1249 |
get_leaderboard_data(vote_entry), # [11] Updated leaderboard data
|
| 1250 |
gr.update(
|