ui: move Test & Debug tab after Full Evaluation
Browse files
Test & Debug is rarely used, so Full Evaluation is now the first tab.
Tab order:
- Tab 1: Full Evaluation (primary functionality)
- Tab 2: Test & Debug (debugging/diagnostics)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -696,59 +696,7 @@ with gr.Blocks() as demo:
|
|
| 696 |
)
|
| 697 |
|
| 698 |
with gr.Tabs():
|
| 699 |
-
# Tab 1:
|
| 700 |
-
with gr.Tab("🔍 Test & Debug"):
|
| 701 |
-
gr.Markdown("""
|
| 702 |
-
**Test Mode:** Run the agent on a single question and see detailed diagnostics.
|
| 703 |
-
|
| 704 |
-
This mode shows:
|
| 705 |
-
- API key status
|
| 706 |
-
- Execution plan
|
| 707 |
-
- Tools selected and executed
|
| 708 |
-
- Evidence collected
|
| 709 |
-
- Errors encountered
|
| 710 |
-
- Final answer
|
| 711 |
-
""")
|
| 712 |
-
|
| 713 |
-
test_question_input = gr.Textbox(
|
| 714 |
-
label="Enter Test Question",
|
| 715 |
-
placeholder="e.g., What is the capital of France?",
|
| 716 |
-
lines=3,
|
| 717 |
-
)
|
| 718 |
-
|
| 719 |
-
with gr.Row():
|
| 720 |
-
llm_provider_dropdown = gr.Dropdown(
|
| 721 |
-
label="LLM Provider",
|
| 722 |
-
choices=["Gemini", "HuggingFace", "Groq", "Claude"],
|
| 723 |
-
value="HuggingFace",
|
| 724 |
-
info="Select which LLM to use for this test",
|
| 725 |
-
)
|
| 726 |
-
|
| 727 |
-
test_button = gr.Button("Run Test", variant="primary")
|
| 728 |
-
|
| 729 |
-
with gr.Row():
|
| 730 |
-
with gr.Column(scale=1):
|
| 731 |
-
test_answer_output = gr.Textbox(
|
| 732 |
-
label="Answer", lines=3, interactive=False
|
| 733 |
-
)
|
| 734 |
-
test_api_status = gr.Textbox(
|
| 735 |
-
label="API Keys Status", lines=5, interactive=False
|
| 736 |
-
)
|
| 737 |
-
with gr.Column(scale=2):
|
| 738 |
-
test_diagnostics_output = gr.Textbox(
|
| 739 |
-
label="Execution Diagnostics", lines=20, interactive=False
|
| 740 |
-
)
|
| 741 |
-
|
| 742 |
-
test_button.click(
|
| 743 |
-
fn=test_single_question,
|
| 744 |
-
inputs=[
|
| 745 |
-
test_question_input,
|
| 746 |
-
llm_provider_dropdown,
|
| 747 |
-
],
|
| 748 |
-
outputs=[test_answer_output, test_diagnostics_output, test_api_status],
|
| 749 |
-
)
|
| 750 |
-
|
| 751 |
-
# Tab 2: Full Evaluation (existing functionality)
|
| 752 |
with gr.Tab("📊 Full Evaluation"):
|
| 753 |
gr.Markdown(
|
| 754 |
"""
|
|
@@ -812,6 +760,58 @@ with gr.Blocks() as demo:
|
|
| 812 |
outputs=[status_output, results_table, export_output],
|
| 813 |
)
|
| 814 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 815 |
if __name__ == "__main__":
|
| 816 |
print("\n" + "-" * 30 + " App Starting " + "-" * 30)
|
| 817 |
# Check for SPACE_HOST and SPACE_ID at startup for information
|
|
|
|
| 696 |
)
|
| 697 |
|
| 698 |
with gr.Tabs():
|
| 699 |
+
# Tab 1: Full Evaluation (primary functionality)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
with gr.Tab("📊 Full Evaluation"):
|
| 701 |
gr.Markdown(
|
| 702 |
"""
|
|
|
|
| 760 |
outputs=[status_output, results_table, export_output],
|
| 761 |
)
|
| 762 |
|
| 763 |
+
# Tab 2: Test Single Question (debugging/diagnostics)
|
| 764 |
+
with gr.Tab("🔍 Test & Debug"):
|
| 765 |
+
gr.Markdown("""
|
| 766 |
+
**Test Mode:** Run the agent on a single question and see detailed diagnostics.
|
| 767 |
+
|
| 768 |
+
This mode shows:
|
| 769 |
+
- API key status
|
| 770 |
+
- Execution plan
|
| 771 |
+
- Tools selected and executed
|
| 772 |
+
- Evidence collected
|
| 773 |
+
- Errors encountered
|
| 774 |
+
- Final answer
|
| 775 |
+
""")
|
| 776 |
+
|
| 777 |
+
test_question_input = gr.Textbox(
|
| 778 |
+
label="Enter Test Question",
|
| 779 |
+
placeholder="e.g., What is the capital of France?",
|
| 780 |
+
lines=3,
|
| 781 |
+
)
|
| 782 |
+
|
| 783 |
+
with gr.Row():
|
| 784 |
+
llm_provider_dropdown = gr.Dropdown(
|
| 785 |
+
label="LLM Provider",
|
| 786 |
+
choices=["Gemini", "HuggingFace", "Groq", "Claude"],
|
| 787 |
+
value="HuggingFace",
|
| 788 |
+
info="Select which LLM to use for this test",
|
| 789 |
+
)
|
| 790 |
+
|
| 791 |
+
test_button = gr.Button("Run Test", variant="primary")
|
| 792 |
+
|
| 793 |
+
with gr.Row():
|
| 794 |
+
with gr.Column(scale=1):
|
| 795 |
+
test_answer_output = gr.Textbox(
|
| 796 |
+
label="Answer", lines=3, interactive=False
|
| 797 |
+
)
|
| 798 |
+
test_api_status = gr.Textbox(
|
| 799 |
+
label="API Keys Status", lines=5, interactive=False
|
| 800 |
+
)
|
| 801 |
+
with gr.Column(scale=2):
|
| 802 |
+
test_diagnostics_output = gr.Textbox(
|
| 803 |
+
label="Execution Diagnostics", lines=20, interactive=False
|
| 804 |
+
)
|
| 805 |
+
|
| 806 |
+
test_button.click(
|
| 807 |
+
fn=test_single_question,
|
| 808 |
+
inputs=[
|
| 809 |
+
test_question_input,
|
| 810 |
+
llm_provider_dropdown,
|
| 811 |
+
],
|
| 812 |
+
outputs=[test_answer_output, test_diagnostics_output, test_api_status],
|
| 813 |
+
)
|
| 814 |
+
|
| 815 |
if __name__ == "__main__":
|
| 816 |
print("\n" + "-" * 30 + " App Starting " + "-" * 30)
|
| 817 |
# Check for SPACE_HOST and SPACE_ID at startup for information
|