UI Control for Question Limit
Browse files- CHANGELOG.md +33 -0
- app.py +22 -8
CHANGELOG.md
CHANGED
|
@@ -437,6 +437,39 @@ No "results" array exists with per-question correctness. API tells us "1/3 corre
|
|
| 437 |
- ⏳ Verify exact match comparison works correctly
|
| 438 |
- ⏳ Check performance with dataset caching
|
| 439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
### Created Files
|
| 441 |
|
| 442 |
- src/utils/ground_truth.py
|
|
|
|
| 437 |
- ⏳ Verify exact match comparison works correctly
|
| 438 |
- ⏳ Check performance with dataset caching
|
| 439 |
|
| 440 |
+
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 441 |
+
|
| 442 |
+
**Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
|
| 443 |
+
|
| 444 |
+
**Solution:** Add UI number input for question limit in Full Evaluation tab.
|
| 445 |
+
|
| 446 |
+
**Modified Files:**
|
| 447 |
+
|
| 448 |
+
- **app.py** (~15 lines modified)
|
| 449 |
+
- Added `eval_question_limit` number input in Full Evaluation tab (lines 608-615)
|
| 450 |
+
- Range: 0-165 (0 = process all questions)
|
| 451 |
+
- Default: 0 (process all)
|
| 452 |
+
- Info: "Limit questions for testing (0 = process all)"
|
| 453 |
+
- Updated `run_and_submit_all()` function signature (line 285)
|
| 454 |
+
- Added `question_limit: int = 0` parameter
|
| 455 |
+
- Added docstring documenting parameter
|
| 456 |
+
- Updated `run_button.click()` to pass UI value (line 629)
|
| 457 |
+
- Updated question limiting logic (lines 345-351)
|
| 458 |
+
- Priority: UI value > .env value
|
| 459 |
+
- Falls back to .env if UI value is 0
|
| 460 |
+
|
| 461 |
+
**Benefits:**
|
| 462 |
+
|
| 463 |
+
- ✅ **Cloud testing:** Change question limit directly in HF Spaces UI
|
| 464 |
+
- ✅ **No file editing:** No need to modify .env in cloud environment
|
| 465 |
+
- ✅ **Instant adjustment:** Test with 3, 6, 10, or 20 questions without rebuild
|
| 466 |
+
- ✅ **Local override:** UI value overrides .env for flexibility
|
| 467 |
+
- ✅ **Production safety:** Default 0 processes all questions for full evaluation
|
| 468 |
+
|
| 469 |
+
**Verification:**
|
| 470 |
+
|
| 471 |
+
- ⏳ Testing with different UI question limits pending
|
| 472 |
+
|
| 473 |
### Created Files
|
| 474 |
|
| 475 |
- src/utils/ground_truth.py
|
app.py
CHANGED
|
@@ -282,11 +282,17 @@ def process_single_question(agent, item, index, total):
|
|
| 282 |
|
| 283 |
|
| 284 |
def run_and_submit_all(
|
| 285 |
-
llm_provider: str, enable_fallback: bool, profile: gr.OAuthProfile | None = None
|
| 286 |
):
|
| 287 |
"""
|
| 288 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 289 |
and displays the results.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
"""
|
| 291 |
# Start execution timer
|
| 292 |
start_time = time.time()
|
|
@@ -335,13 +341,13 @@ def run_and_submit_all(
|
|
| 335 |
print("Fetched questions list is empty.")
|
| 336 |
return "Fetched questions list is empty or invalid format.", None, ""
|
| 337 |
|
| 338 |
-
# Apply question limit if configured (DEBUG_QUESTION_LIMIT from .env)
|
| 339 |
-
question_limit = int(os.getenv("DEBUG_QUESTION_LIMIT", "0"))
|
| 340 |
-
if question_limit > 0:
|
| 341 |
-
questions_data = questions_data[:question_limit]
|
| 342 |
-
logger.warning(f"DEBUG MODE: Limited to first {question_limit} questions")
|
| 343 |
print(
|
| 344 |
-
f"DEBUG MODE: Processing only {question_limit} questions (set DEBUG_QUESTION_LIMIT=0 to process all)"
|
| 345 |
)
|
| 346 |
|
| 347 |
print(f"Processing {len(questions_data)} questions.")
|
|
@@ -605,6 +611,14 @@ with gr.Blocks() as demo:
|
|
| 605 |
value=True,
|
| 606 |
info="Recommended: Enable fallback for production evaluation",
|
| 607 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
|
| 609 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 610 |
|
|
@@ -618,7 +632,7 @@ with gr.Blocks() as demo:
|
|
| 618 |
|
| 619 |
run_button.click(
|
| 620 |
fn=run_and_submit_all,
|
| 621 |
-
inputs=[eval_llm_provider_dropdown, eval_enable_fallback_checkbox],
|
| 622 |
outputs=[status_output, results_table, export_output],
|
| 623 |
)
|
| 624 |
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
def run_and_submit_all(
|
| 285 |
+
llm_provider: str, enable_fallback: bool, question_limit: int = 0, profile: gr.OAuthProfile | None = None
|
| 286 |
):
|
| 287 |
"""
|
| 288 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 289 |
and displays the results.
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
llm_provider: LLM provider to use
|
| 293 |
+
enable_fallback: Whether to enable fallback to other providers
|
| 294 |
+
question_limit: Limit number of questions (0 = process all)
|
| 295 |
+
profile: OAuth profile for HF login
|
| 296 |
"""
|
| 297 |
# Start execution timer
|
| 298 |
start_time = time.time()
|
|
|
|
| 341 |
print("Fetched questions list is empty.")
|
| 342 |
return "Fetched questions list is empty or invalid format.", None, ""
|
| 343 |
|
| 344 |
+
# Apply question limit if configured (from UI or .env)
|
| 345 |
+
limit = int(question_limit) if question_limit > 0 else int(os.getenv("DEBUG_QUESTION_LIMIT", "0"))
|
| 346 |
+
if limit > 0:
|
| 347 |
+
questions_data = questions_data[:limit]
|
| 348 |
+
logger.warning(f"DEBUG MODE: Limited to first {limit} questions")
|
| 349 |
print(
|
| 350 |
+
f"DEBUG MODE: Processing only {limit} questions (set to 0 to process all)"
|
| 351 |
)
|
| 352 |
|
| 353 |
print(f"Processing {len(questions_data)} questions.")
|
|
|
|
| 611 |
value=True,
|
| 612 |
info="Recommended: Enable fallback for production evaluation",
|
| 613 |
)
|
| 614 |
+
eval_question_limit = gr.Number(
|
| 615 |
+
label="Question Limit (Debug)",
|
| 616 |
+
value=0,
|
| 617 |
+
precision=0,
|
| 618 |
+
minimum=0,
|
| 619 |
+
maximum=165,
|
| 620 |
+
info="Limit questions for testing (0 = process all)",
|
| 621 |
+
)
|
| 622 |
|
| 623 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 624 |
|
|
|
|
| 632 |
|
| 633 |
run_button.click(
|
| 634 |
fn=run_and_submit_all,
|
| 635 |
+
inputs=[eval_llm_provider_dropdown, eval_enable_fallback_checkbox, eval_question_limit],
|
| 636 |
outputs=[status_output, results_table, export_output],
|
| 637 |
)
|
| 638 |
|