Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -113,18 +113,27 @@ def evaluate_models(file_path, api_key, prompt_col, selected_models=None, progre
|
|
| 113 |
combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
|
| 114 |
benchmark_df.to_csv(combined_benchmark_path, index=False)
|
| 115 |
|
| 116 |
-
# Create visualizations
|
| 117 |
progress(0.95, desc="Creating visualizations...")
|
| 118 |
radar_chart_path = create_radar_chart(all_results)
|
| 119 |
bar_chart_path = create_bar_chart(all_results)
|
| 120 |
|
| 121 |
progress(1.0, desc="Evaluation complete!")
|
| 122 |
|
| 123 |
-
# Sort results by combined score
|
| 124 |
sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
|
| 125 |
|
| 126 |
return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
def create_gradio_interface():
|
| 129 |
with gr.Blocks(title="LLM Evaluation Tool") as app:
|
| 130 |
gr.Markdown("# LLM Evaluation Tool")
|
|
@@ -188,11 +197,9 @@ def create_gradio_interface():
|
|
| 188 |
with gr.Row():
|
| 189 |
gr.Markdown("### Leaderboard Details")
|
| 190 |
gr.Markdown("""
|
| 191 |
-
- **Креативность**: Оригинальность и инновационность ответов
|
| 192 |
-
-
|
| 193 |
-
-
|
| 194 |
-
- **Стабильность**: Насколько хорошо модель сохраняет смысл и контекст запроса
|
| 195 |
-
- **Общий балл**: Среднее значение всех показателей
|
| 196 |
""")
|
| 197 |
|
| 198 |
return app
|
|
|
|
| 113 |
combined_benchmark_path = f'results/benchmark_results_{timestamp}.csv'
|
| 114 |
benchmark_df.to_csv(combined_benchmark_path, index=False)
|
| 115 |
|
|
|
|
| 116 |
progress(0.95, desc="Creating visualizations...")
|
| 117 |
radar_chart_path = create_radar_chart(all_results)
|
| 118 |
bar_chart_path = create_bar_chart(all_results)
|
| 119 |
|
| 120 |
progress(1.0, desc="Evaluation complete!")
|
| 121 |
|
|
|
|
| 122 |
sorted_results = benchmark_df.sort_values(by='combined_score', ascending=False)
|
| 123 |
|
| 124 |
return sorted_results, radar_chart_path, bar_chart_path, combined_benchmark_path
|
| 125 |
|
| 126 |
+
|
| 127 |
+
def get_leaderboard_data():
    """Return the static leaderboard rows.

    Each row is a list of four strings:
    [model name, creativity score (0-10), stability coefficient (0-1),
    combined score (0-1)].

    NOTE(review): "Owen" may be a typo for "Qwen", and Owen/TinyLlama share
    the exact same stability value (0.945682458281517) — looks copy-pasted;
    verify against the original benchmark output before changing.
    """
    leaderboard_rows = (
        ("Vikhr", "7.75", "0.9363600260019302", "0.860"),
        ("Llama3", "7.30", "0.9410231244564057", "0.827"),
        ("Mistral", "6.95", "0.9459488660097122", "0.807"),
        ("Owen", "6.93", "0.945682458281517", "0.800"),
        ("TinyLlama", "1.12", "0.945682458281517", "0.573"),
    )
    # Callers (the Gradio table) expect mutable list rows, not tuples.
    return [list(row) for row in leaderboard_rows]
|
| 135 |
+
|
| 136 |
+
|
| 137 |
def create_gradio_interface():
|
| 138 |
with gr.Blocks(title="LLM Evaluation Tool") as app:
|
| 139 |
gr.Markdown("# LLM Evaluation Tool")
|
|
|
|
| 197 |
with gr.Row():
|
| 198 |
gr.Markdown("### Leaderboard Details")
|
| 199 |
gr.Markdown("""
|
| 200 |
+
- **Креативность**: Оригинальность и инновационность ответов (шкала до 10)
|
| 201 |
+
- **Стабильность**: Коэффициент стабильности модели (0-1)
|
| 202 |
+
- **Общий балл**: Средний комбинированный показатель производительности (0-1)
|
|
|
|
|
|
|
| 203 |
""")
|
| 204 |
|
| 205 |
return app
|