Spaces:
Sleeping
Sleeping
Moved overall results. Showing README.
Browse files
README.md
CHANGED
|
@@ -1,13 +1,17 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: 🤔
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: bsd-3-clause
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Verbal Reasoning Challenge
|
| 3 |
emoji: 🤔
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: blue
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.15.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: bsd-3-clause
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# PhD Knowledge Not Required: A Reasoning Challenge for Large Language Models
|
| 14 |
+
|
| 15 |
+
This application presents the results of several models that we have
|
| 16 |
+
evaluated on the verbal reasoning challenge. The overall results are below.
|
| 17 |
+
Use the tabs above to explore the results in more detail.
|
app.py
CHANGED
|
@@ -23,7 +23,7 @@ import pandas as pd
|
|
| 23 |
import numpy as np
|
| 24 |
from metrics import load_results, accuracy_by_model_and_time
|
| 25 |
import metrics
|
| 26 |
-
|
| 27 |
|
| 28 |
def get_model_response(prompt_id, model_name):
|
| 29 |
query = f"""
|
|
@@ -226,14 +226,24 @@ def all_challenges_view():
|
|
| 226 |
|
| 227 |
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
def create_interface():
|
| 231 |
with gr.Blocks() as demo:
|
| 232 |
with gr.Tabs():
|
|
|
|
|
|
|
| 233 |
with gr.TabItem("All Challenges"):
|
| 234 |
all_challenges_view()
|
| 235 |
-
with gr.TabItem("Accuracy by Model"):
|
| 236 |
-
gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
|
| 237 |
with gr.TabItem("Accuracy Over Time"):
|
| 238 |
summary_view()
|
| 239 |
with gr.TabItem("DeepSeek R1 Analysis"):
|
|
|
|
| 23 |
import numpy as np
|
| 24 |
from metrics import load_results, accuracy_by_model_and_time
|
| 25 |
import metrics
|
| 26 |
+
from pathlib import Path
|
| 27 |
|
| 28 |
def get_model_response(prompt_id, model_name):
|
| 29 |
query = f"""
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
|
| 229 |
+
def overview_view():
|
| 230 |
+
with gr.Blocks(fill_height=True):
|
| 231 |
+
with gr.Row():
|
| 232 |
+
readme_text = Path("README.md").read_text()
|
| 233 |
+
# Drop the YAML front matter: keep only the text after the second "---".
|
| 234 |
+
readme_text = readme_text.split("---")[2]
|
| 235 |
+
gr.Markdown(readme_text)
|
| 236 |
+
with gr.Row():
|
| 237 |
+
gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
|
| 238 |
+
|
| 239 |
|
| 240 |
def create_interface():
|
| 241 |
with gr.Blocks() as demo:
|
| 242 |
with gr.Tabs():
|
| 243 |
+
with gr.TabItem("Overview"):
|
| 244 |
+
overview_view()
|
| 245 |
with gr.TabItem("All Challenges"):
|
| 246 |
all_challenges_view()
|
|
|
|
|
|
|
| 247 |
with gr.TabItem("Accuracy Over Time"):
|
| 248 |
summary_view()
|
| 249 |
with gr.TabItem("DeepSeek R1 Analysis"):
|