Spaces:
Sleeping
Sleeping
Moved overall results. Showing README.
Browse files
README.md
CHANGED
|
@@ -1,13 +1,17 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: 🤔
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: bsd-3-clause
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Verbal Reasoning Challenge
|
| 3 |
emoji: 🤔
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.15.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: bsd-3-clause
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# PhD Knowledge Not Required: A Reasoning Challenge for Large Language Models
|
| 14 |
+
|
| 15 |
+
This application presents the results of several models that we have
|
| 16 |
+
evaluated on the verbal reasoning challenge. The overall results are below.
|
| 17 |
+
Use the tabs above to explore the results in more detail.
|
app.py
CHANGED
|
@@ -23,7 +23,7 @@ import pandas as pd
|
|
| 23 |
import numpy as np
|
| 24 |
from metrics import load_results, accuracy_by_model_and_time
|
| 25 |
import metrics
|
| 26 |
-
|
| 27 |
|
| 28 |
def get_model_response(prompt_id, model_name):
|
| 29 |
query = f"""
|
|
@@ -226,14 +226,24 @@ def all_challenges_view():
|
|
| 226 |
|
| 227 |
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
def create_interface():
|
| 231 |
with gr.Blocks() as demo:
|
| 232 |
with gr.Tabs():
|
|
|
|
|
|
|
| 233 |
with gr.TabItem("All Challenges"):
|
| 234 |
all_challenges_view()
|
| 235 |
-
with gr.TabItem("Accuracy by Model"):
|
| 236 |
-
gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
|
| 237 |
with gr.TabItem("Accuracy Over Time"):
|
| 238 |
summary_view()
|
| 239 |
with gr.TabItem("DeepSeek R1 Analysis"):
|
|
|
|
| 23 |
import numpy as np
|
| 24 |
from metrics import load_results, accuracy_by_model_and_time
|
| 25 |
import metrics
|
| 26 |
+
from pathlib import Path
|
| 27 |
|
| 28 |
def get_model_response(prompt_id, model_name):
|
| 29 |
query = f"""
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
|
| 229 |
+
def overview_view():
|
| 230 |
+
with gr.Blocks(fill_height=True):
|
| 231 |
+
with gr.Row():
|
| 232 |
+
readme_text = Path("README.md").read_text()
|
| 233 |
+
# Drop the YAML front matter: keep only the text after the second "---".
|
| 234 |
+
readme_text = readme_text.split("---")[2]
|
| 235 |
+
gr.Markdown(readme_text)
|
| 236 |
+
with gr.Row():
|
| 237 |
+
gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
|
| 238 |
+
|
| 239 |
|
| 240 |
def create_interface():
|
| 241 |
with gr.Blocks() as demo:
|
| 242 |
with gr.Tabs():
|
| 243 |
+
with gr.TabItem("Overview"):
|
| 244 |
+
overview_view()
|
| 245 |
with gr.TabItem("All Challenges"):
|
| 246 |
all_challenges_view()
|
|
|
|
|
|
|
| 247 |
with gr.TabItem("Accuracy Over Time"):
|
| 248 |
summary_view()
|
| 249 |
with gr.TabItem("DeepSeek R1 Analysis"):
|