Spaces:
Sleeping
Sleeping
Alex
committed on
Commit
·
ea6e048
1
Parent(s):
6ec1619
error
Browse files
- app.py +42 -33
- leaderboard_data.json +10 -10
app.py
CHANGED
|
@@ -11,16 +11,22 @@ DEFAULT_MODEL_NAME = "example/model"
|
|
| 11 |
|
| 12 |
# --------------- Data models ---------------
|
| 13 |
class Metrics(BaseModel):
|
| 14 |
-
readability:
|
| 15 |
-
relevance:
|
| 16 |
-
explanation_clarity:
|
| 17 |
-
problem_identification:
|
| 18 |
-
actionability:
|
| 19 |
-
completeness:
|
| 20 |
-
specificity:
|
| 21 |
-
contextual_adequacy:
|
| 22 |
-
consistency:
|
| 23 |
-
brevity:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class LeaderboardEntry(BaseModel):
|
|
@@ -85,16 +91,16 @@ def submit_model(
|
|
| 85 |
llm_pass_1: float,
|
| 86 |
llm_pass_5: float,
|
| 87 |
llm_pass_10: float,
|
| 88 |
-
readability:
|
| 89 |
-
relevance:
|
| 90 |
-
explanation_clarity:
|
| 91 |
-
problem_identification:
|
| 92 |
-
actionability:
|
| 93 |
-
completeness:
|
| 94 |
-
specificity:
|
| 95 |
-
contextual_adequacy:
|
| 96 |
-
consistency:
|
| 97 |
-
brevity:
|
| 98 |
):
|
| 99 |
"""Validate and append a new model entry to the leaderboard."""
|
| 100 |
try:
|
|
@@ -133,9 +139,12 @@ def submit_model(
|
|
| 133 |
with gr.Blocks(title="Custom LLM Leaderboard") as demo:
|
| 134 |
gr.Markdown("""# 🏆 LLM Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
leaderboard_df = gr.Dataframe(
|
| 137 |
-
headers=list(
|
| 138 |
-
value=
|
| 139 |
label="Current Leaderboard",
|
| 140 |
interactive=False,
|
| 141 |
)
|
|
@@ -150,18 +159,18 @@ with gr.Blocks(title="Custom LLM Leaderboard") as demo:
|
|
| 150 |
pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
|
| 151 |
pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)
|
| 152 |
|
| 153 |
-
gr.Markdown("### Multi-metric subjective scores (0
|
| 154 |
with gr.Row():
|
| 155 |
-
readability_inp = gr.Slider(minimum=0
|
| 156 |
-
relevance_inp = gr.Slider(minimum=0
|
| 157 |
-
explanation_inp = gr.Slider(minimum=0
|
| 158 |
-
problem_inp = gr.Slider(minimum=0
|
| 159 |
-
actionability_inp = gr.Slider(minimum=0
|
| 160 |
-
completeness_inp = gr.Slider(minimum=0
|
| 161 |
-
specificity_inp = gr.Slider(minimum=0
|
| 162 |
-
contextual_inp = gr.Slider(minimum=0
|
| 163 |
-
consistency_inp = gr.Slider(minimum=0
|
| 164 |
-
brevity_inp = gr.Slider(minimum=0
|
| 165 |
|
| 166 |
submit_btn = gr.Button("Submit")
|
| 167 |
status_markdown = gr.Markdown("")
|
|
|
|
| 11 |
|
| 12 |
# --------------- Data models ---------------
|
| 13 |
class Metrics(BaseModel):
|
| 14 |
+
readability: int
|
| 15 |
+
relevance: int
|
| 16 |
+
explanation_clarity: int = Field(alias="explanation_clarity")
|
| 17 |
+
problem_identification: int
|
| 18 |
+
actionability: int
|
| 19 |
+
completeness: int
|
| 20 |
+
specificity: int
|
| 21 |
+
contextual_adequacy: int
|
| 22 |
+
consistency: int
|
| 23 |
+
brevity: int
|
| 24 |
+
|
| 25 |
+
@field_validator("readability", "relevance", "explanation_clarity", "problem_identification", "actionability", "completeness", "specificity", "contextual_adequacy", "consistency", "brevity")
|
| 26 |
+
def metric_range(cls, v: int):
|
| 27 |
+
if not 0 <= v <= 10:
|
| 28 |
+
raise ValueError("Multi-metrics should be between 0 and 10")
|
| 29 |
+
return v
|
| 30 |
|
| 31 |
|
| 32 |
class LeaderboardEntry(BaseModel):
|
|
|
|
| 91 |
llm_pass_1: float,
|
| 92 |
llm_pass_5: float,
|
| 93 |
llm_pass_10: float,
|
| 94 |
+
readability: int,
|
| 95 |
+
relevance: int,
|
| 96 |
+
explanation_clarity: int,
|
| 97 |
+
problem_identification: int,
|
| 98 |
+
actionability: int,
|
| 99 |
+
completeness: int,
|
| 100 |
+
specificity: int,
|
| 101 |
+
contextual_adequacy: int,
|
| 102 |
+
consistency: int,
|
| 103 |
+
brevity: int,
|
| 104 |
):
|
| 105 |
"""Validate and append a new model entry to the leaderboard."""
|
| 106 |
try:
|
|
|
|
| 139 |
with gr.Blocks(title="Custom LLM Leaderboard") as demo:
|
| 140 |
gr.Markdown("""# 🏆 LLM Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
|
| 141 |
|
| 142 |
+
# Initialize table data
|
| 143 |
+
initial_data = _table_data()
|
| 144 |
+
|
| 145 |
leaderboard_df = gr.Dataframe(
|
| 146 |
+
headers=list(initial_data[0].keys()) if initial_data else ["Model", "BLEU", "Pass@1", "Pass@5", "Pass@10", "Readability", "Relevance", "Explanation Clarity", "Problem Identification", "Actionability", "Completeness", "Specificity", "Contextual Adequacy", "Consistency", "Brevity"],
|
| 147 |
+
value=initial_data,
|
| 148 |
label="Current Leaderboard",
|
| 149 |
interactive=False,
|
| 150 |
)
|
|
|
|
| 159 |
pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
|
| 160 |
pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)
|
| 161 |
|
| 162 |
+
gr.Markdown("### Multi-metric subjective scores (0 – 10)")
|
| 163 |
with gr.Row():
|
| 164 |
+
readability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Readability")
|
| 165 |
+
relevance_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Relevance")
|
| 166 |
+
explanation_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Explanation Clarity")
|
| 167 |
+
problem_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Problem Identification")
|
| 168 |
+
actionability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Actionability")
|
| 169 |
+
completeness_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Completeness")
|
| 170 |
+
specificity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Specificity")
|
| 171 |
+
contextual_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Contextual Adequacy")
|
| 172 |
+
consistency_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Consistency")
|
| 173 |
+
brevity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Brevity")
|
| 174 |
|
| 175 |
submit_btn = gr.Button("Submit")
|
| 176 |
status_markdown = gr.Markdown("")
|
leaderboard_data.json
CHANGED
|
@@ -7,16 +7,16 @@
|
|
| 7 |
"llm_pass_5": 0.5,
|
| 8 |
"llm_pass_10": 0.5,
|
| 9 |
"metrics": {
|
| 10 |
-
"readability":
|
| 11 |
-
"relevance":
|
| 12 |
-
"explanation_clarity":
|
| 13 |
-
"problem_identification":
|
| 14 |
-
"actionability":
|
| 15 |
-
"completeness":
|
| 16 |
-
"specificity":
|
| 17 |
-
"contextual_adequacy":
|
| 18 |
-
"consistency":
|
| 19 |
-
"brevity":
|
| 20 |
}
|
| 21 |
}
|
| 22 |
]
|
|
|
|
| 7 |
"llm_pass_5": 0.5,
|
| 8 |
"llm_pass_10": 0.5,
|
| 9 |
"metrics": {
|
| 10 |
+
"readability": 5,
|
| 11 |
+
"relevance": 5,
|
| 12 |
+
"explanation_clarity": 5,
|
| 13 |
+
"problem_identification": 5,
|
| 14 |
+
"actionability": 5,
|
| 15 |
+
"completeness": 5,
|
| 16 |
+
"specificity": 5,
|
| 17 |
+
"contextual_adequacy": 5,
|
| 18 |
+
"consistency": 5,
|
| 19 |
+
"brevity": 5
|
| 20 |
}
|
| 21 |
}
|
| 22 |
]
|