Spaces:
Running
Running
Jude Khouja
committed on
Commit
·
6226c1b
1
Parent(s):
36ce9ab
Change description and change color of baseline scores
Browse files
- data_loader.py +2 -2
- tabs/leaderboard.py +5 -4
- utils.py +31 -0
data_loader.py
CHANGED
|
@@ -269,8 +269,8 @@ HEADER_CONTENT = (
|
|
| 269 |
</div>
|
| 270 |
|
| 271 |
<div class="description">
|
| 272 |
-
LingOly-TOO (L2) is a challenging reasoning benchmark designed to
|
| 273 |
-
|
| 274 |
<div class="highlight-question">
|
| 275 |
"How do top LLMs reason on unseen linguistic questions?"
|
| 276 |
</div>
|
|
|
|
| 269 |
</div>
|
| 270 |
|
| 271 |
<div class="description">
|
| 272 |
+
LingOly-TOO (L2) is a challenging linguistics reasoning benchmark designed to counteract answering without reasoning (e.g. by guessing or memorizing answers).
|
| 273 |
+
We permute <b>Ling</b>uistics <b>Oly</b>mpiad problems with <b>T</b>emplates and <b>O</b>rthographic <b>O</b>bfuscations. By rewriting (obfuscating) parts of questions and answers, the chance of benchmark leakage in training data is minimized.
|
| 274 |
<div class="highlight-question">
|
| 275 |
"How do top LLMs reason on unseen linguistic questions?"
|
| 276 |
</div>
|
tabs/leaderboard.py
CHANGED
|
@@ -3,13 +3,14 @@ from data_loader import METHODOLOGY
|
|
| 3 |
from utils import (
|
| 4 |
get_rank_badge,
|
| 5 |
get_score_bar,
|
|
|
|
| 6 |
get_type_badge,
|
| 7 |
)
|
| 8 |
|
| 9 |
def filter_leaderboard(df, sort_by):
|
| 10 |
filtered_df = df.copy()
|
| 11 |
|
| 12 |
-
if sort_by == "Score
|
| 13 |
filtered_df = filtered_df.sort_values(by="Obfuscated score", ascending=False)
|
| 14 |
else:
|
| 15 |
filtered_df = filtered_df.sort_values(by="Baseline score", ascending=False)
|
|
@@ -129,7 +130,7 @@ def filter_leaderboard(df, sort_by):
|
|
| 129 |
<td class="vendor-cell">{row['Provider']}</td>
|
| 130 |
<td>{get_type_badge(row['Type'])}</td>
|
| 131 |
<td class="score-cell">{get_score_bar(row['Obfuscated score'])}</td>
|
| 132 |
-
<td class="score-cell">{
|
| 133 |
</tr>
|
| 134 |
"""
|
| 135 |
|
|
@@ -143,8 +144,8 @@ def create_leaderboard_tab(df, HEADER_CONTENT, CARDS):
|
|
| 143 |
with gr.Row(equal_height=True):
|
| 144 |
with gr.Column(scale=0.4):
|
| 145 |
sort_by = gr.Dropdown(
|
| 146 |
-
choices=["Score
|
| 147 |
-
value="Score
|
| 148 |
label="Sort by",
|
| 149 |
)
|
| 150 |
|
|
|
|
| 3 |
from utils import (
|
| 4 |
get_rank_badge,
|
| 5 |
get_score_bar,
|
| 6 |
+
get_score_bar_secondary,
|
| 7 |
get_type_badge,
|
| 8 |
)
|
| 9 |
|
| 10 |
def filter_leaderboard(df, sort_by):
|
| 11 |
filtered_df = df.copy()
|
| 12 |
|
| 13 |
+
if sort_by == "Score on obfuscated questions":
|
| 14 |
filtered_df = filtered_df.sort_values(by="Obfuscated score", ascending=False)
|
| 15 |
else:
|
| 16 |
filtered_df = filtered_df.sort_values(by="Baseline score", ascending=False)
|
|
|
|
| 130 |
<td class="vendor-cell">{row['Provider']}</td>
|
| 131 |
<td>{get_type_badge(row['Type'])}</td>
|
| 132 |
<td class="score-cell">{get_score_bar(row['Obfuscated score'])}</td>
|
| 133 |
+
<td class="score-cell">{get_score_bar_secondary(row['Baseline score'])}</td>
|
| 134 |
</tr>
|
| 135 |
"""
|
| 136 |
|
|
|
|
| 144 |
with gr.Row(equal_height=True):
|
| 145 |
with gr.Column(scale=0.4):
|
| 146 |
sort_by = gr.Dropdown(
|
| 147 |
+
choices=["Score on obfuscated questions", "Score on all questions"],
|
| 148 |
+
value="Score on obfuscated questions",
|
| 149 |
label="Sort by",
|
| 150 |
)
|
| 151 |
|
utils.py
CHANGED
|
@@ -67,6 +67,37 @@ def get_score_bar(score):
|
|
| 67 |
">{width:.1f}</span>
|
| 68 |
</div>
|
| 69 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def get_chart_colors():
|
| 71 |
# if is_dark_theme():
|
| 72 |
# return {
|
|
|
|
| 67 |
">{width:.1f}</span>
|
| 68 |
</div>
|
| 69 |
"""
|
| 70 |
+
|
| 71 |
+
def get_score_bar_secondary(score):
|
| 72 |
+
"""Generate HTML for score bar with gradient styling"""
|
| 73 |
+
width = score * 100
|
| 74 |
+
return f"""
|
| 75 |
+
<div style="display: flex; align-items: center; gap: 12px; width: 100%;">
|
| 76 |
+
<div style="
|
| 77 |
+
flex-grow: 1;
|
| 78 |
+
height: 8px;
|
| 79 |
+
background: var(--score-bg, rgba(255, 255, 255, 0.1));
|
| 80 |
+
border-radius: 4px;
|
| 81 |
+
overflow: hidden;
|
| 82 |
+
max-width: 200px;
|
| 83 |
+
">
|
| 84 |
+
<div style="
|
| 85 |
+
width: {width}%;
|
| 86 |
+
height: 100%;
|
| 87 |
+
background: linear-gradient(90deg, var(--accent-gray, #1f2937), var(--accent-gray-light, #9ca3af));
|
| 88 |
+
border-radius: 4px;
|
| 89 |
+
transition: width 0.3s ease;
|
| 90 |
+
"></div>
|
| 91 |
+
</div>
|
| 92 |
+
<span style="
|
| 93 |
+
font-family: 'SF Mono', monospace;
|
| 94 |
+
font-weight: 600;
|
| 95 |
+
color: var(--text-primary, #ffffff);
|
| 96 |
+
min-width: 60px;
|
| 97 |
+
">{width:.1f}</span>
|
| 98 |
+
</div>
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
def get_chart_colors():
|
| 102 |
# if is_dark_theme():
|
| 103 |
# return {
|