Spaces:

KeenWoo
/

AD_Multimodal_Chatbot

Sleeping

App Files Community

KeenWoo committed on Aug 22

Commit

b008ff6

verified ·

1 Parent(s): 11de8fc

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -6

app.py CHANGED Viewed

@@ -665,7 +665,79 @@ def run_nlu_test(test_title: str):
     print("--- TEST COMPLETE ---\n")
     return status, comparison_data
 # In app.py, inside the Gradio Callbacks section for debugging
 def test_save_file():
@@ -741,17 +813,20 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CSS) as demo:
     with gr.Tab("Testing"):
         gr.Markdown("## NLU Context Detection Tests")
-        gr.Markdown("Select a test case from `conversation_test_fixtures.jsonl` to run it through the NLU classifier and see the results.")
         with gr.Row():
-            test_case_dropdown = gr.Dropdown(label="Select Test Case", scale=3)
-            run_test_btn = gr.Button("Load & Run Test", variant="primary", scale=1)
-        test_status_md = gr.Markdown("### Please select and run a test case.")
         test_results_df = gr.DataFrame(
             label="Test Results Comparison",
-            headers=["Category", "Expected", "Actual", "Result"],
             interactive=False
         )
     with gr.Tab("Settings"):
         with gr.Group():
             gr.Markdown("## Conversation & Persona Settings")
@@ -823,6 +898,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CSS) as demo:
     test_save_btn.click(fn=test_save_file, inputs=None, outputs=[test_status])
     check_save_btn.click(fn=check_test_file, inputs=None, outputs=[test_status])
     # --- ADD WIRING FOR THE TESTING TAB ---
     demo.load(load_test_fixtures, outputs=[test_case_dropdown])
     run_test_btn.click(
@@ -830,7 +906,10 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CSS) as demo:
         inputs=[test_case_dropdown],
         outputs=[test_status_md, test_results_df]
     )
 # --- Startup Logic ---
 def pre_load_indexes():
     global personal_vectorstore

     print("--- TEST COMPLETE ---\n")
     return status, comparison_data
+# add the new function that will run when the "Run All Tests" button is clicked.
+def run_all_nlu_tests():
+    """Runs all test fixtures in a batch and provides a summary."""
+    if not test_fixtures:
+        load_test_fixtures()
+        if not test_fixtures:
+            return "## Batch Test Summary: No test fixtures found. Please ensure `conversation_test_fixtures.jsonl` is present.", []
+    print("\n--- RUNNING ALL NLU TESTS ---")
+    behavior_options = CONFIG.get("behavior_tags", [])
+    emotion_options = CONFIG.get("emotion_tags", [])
+    topic_options = CONFIG.get("topic_tags", [])
+    context_options = CONFIG.get("context_tags", [])
+    total_tests = len(test_fixtures)
+    passed_tests = 0
+    all_results_data = []
+    for fixture in test_fixtures:
+        user_query = fixture["turns"][0]["text"]
+        expected_results = fixture["expected"]
+        actual_results_raw = detect_tags_from_query(
+            user_query,
+            behavior_options=behavior_options,
+            emotion_options=emotion_options,
+            topic_options=topic_options,
+            context_options=context_options
+        )
+        actual_results = {
+            "emotion": [actual_results_raw.get("detected_emotion")],
+            "behaviors": actual_results_raw.get("detected_behaviors", []),
+            "topic_tags": [actual_results_raw.get("detected_topic")],
+            "context_tags": actual_results_raw.get("detected_contexts", [])
+        }
+        pass_count = 0
+        total_count = 0
+        all_keys = set(expected_results.keys())
+        for key in sorted(list(all_keys)):
+            expected_set = set(expected_results.get(key, []))
+            if not expected_set: continue
+            total_count += 1
+            actual_set = set(a for a in actual_results.get(key, []) if a and a != "None")
+            # Flexible pass logic: passes if there is any overlap
+            is_pass = len(expected_set.intersection(actual_set)) > 0
+            if is_pass:
+                pass_count += 1
+        # A test case passes if all its expected categories pass
+        if total_count > 0 and pass_count == total_count:
+            passed_tests += 1
+            overall_result = "✅ Pass"
+        else:
+            overall_result = "❌ Fail"
+        all_results_data.append([
+            fixture["title"],
+            overall_result,
+            f"{pass_count} / {total_count}"
+        ])
+    pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
+    summary_md = f"## Batch Test Summary: {passed_tests} / {total_tests} Tests Passed ({pass_rate:.1f}%)"
+    print(f"--- BATCH TEST COMPLETE: {summary_md} ---")
+    return summary_md, all_results_data
 # In app.py, inside the Gradio Callbacks section for debugging
 def test_save_file():
     with gr.Tab("Testing"):
         gr.Markdown("## NLU Context Detection Tests")
+        gr.Markdown("Select a single test case to run, or run the entire batch of fixtures to get a summary of the NLU's performance.")
+        batch_summary_md = gr.Markdown("### Batch Test Summary: Not yet run.") # <-- ADD THIS
         with gr.Row():
+            test_case_dropdown = gr.Dropdown(label="Select Single Test Case", scale=2)
+            run_test_btn = gr.Button("Run Single Test", variant="secondary", scale=1)
+            run_all_btn = gr.Button("Run All Tests", variant="primary", scale=1) # <-- ADD THIS
+        test_status_md = gr.Markdown("### Test Results")
         test_results_df = gr.DataFrame(
             label="Test Results Comparison",
+            # UPDATE these headers for the batch summary
+            headers=["Test Case Title", "Overall Result", "Categories Passed"],
             interactive=False
         )
     with gr.Tab("Settings"):
         with gr.Group():
             gr.Markdown("## Conversation & Persona Settings")
     test_save_btn.click(fn=test_save_file, inputs=None, outputs=[test_status])
     check_save_btn.click(fn=check_test_file, inputs=None, outputs=[test_status])
     # --- ADD WIRING FOR THE TESTING TAB ---
     demo.load(load_test_fixtures, outputs=[test_case_dropdown])
     run_test_btn.click(
         inputs=[test_case_dropdown],
         outputs=[test_status_md, test_results_df]
     )
+    # --- ADD THE LINE BELOW ---
+    run_all_btn.click(fn=run_all_nlu_tests, outputs=[batch_summary_md, test_results_df])
 # --- Startup Logic ---
 def pre_load_indexes():
     global personal_vectorstore