mangubee committed on
Commit
65a5dc6
·
1 Parent(s): 9fb23b8

UI Control for Question Limit

Browse files
Files changed (2) hide show
  1. CHANGELOG.md +33 -0
  2. app.py +22 -8
CHANGELOG.md CHANGED
@@ -437,6 +437,39 @@ No "results" array exists with per-question correctness. API tells us "1/3 corre
437
  - ⏳ Verify exact match comparison works correctly
438
  - ⏳ Check performance with dataset caching
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  ### Created Files
441
 
442
  - src/utils/ground_truth.py
 
437
  - ⏳ Verify exact match comparison works correctly
438
  - ⏳ Check performance with dataset caching
439
 
440
+ ### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
441
+
442
+ **Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
443
+
444
+ **Solution:** Add UI number input for question limit in Full Evaluation tab.
445
+
446
+ **Modified Files:**
447
+
448
+ - **app.py** (+22 / −8 lines, per diffstat)
449
+ - Added `eval_question_limit` number input in Full Evaluation tab (lines 608-615)
450
+ - Range: 0-165 (0 = process all questions)
451
+ - Default: 0 (process all)
452
+ - Info: "Limit questions for testing (0 = process all)"
453
+ - Updated `run_and_submit_all()` function signature (line 285)
454
+ - Added `question_limit: int = 0` parameter
455
+ - Added docstring documenting parameter
456
+ - Updated `run_button.click()` to pass UI value (line 629)
457
+ - Updated question limiting logic (lines 345-351)
458
+ - Priority: UI value > .env value
459
+ - Falls back to .env if UI value is 0
460
+
461
+ **Benefits:**
462
+
463
+ - ✅ **Cloud testing:** Change question limit directly in HF Spaces UI
464
+ - ✅ **No file editing:** No need to modify .env in cloud environment
465
+ - ✅ **Instant adjustment:** Test with 3, 6, 10, or 20 questions without rebuild
466
+ - ✅ **Local override:** UI value overrides .env for flexibility
467
+ - ✅ **Production safety:** Default 0 processes all questions for full evaluation
468
+
469
+ **Verification:**
470
+
471
+ - ⏳ Testing with different UI question limits pending
472
+
473
  ### Created Files
474
 
475
  - src/utils/ground_truth.py
app.py CHANGED
@@ -282,11 +282,17 @@ def process_single_question(agent, item, index, total):
282
 
283
 
284
  def run_and_submit_all(
285
- llm_provider: str, enable_fallback: bool, profile: gr.OAuthProfile | None = None
286
  ):
287
  """
288
  Fetches all questions, runs the BasicAgent on them, submits all answers,
289
  and displays the results.
 
 
 
 
 
 
290
  """
291
  # Start execution timer
292
  start_time = time.time()
@@ -335,13 +341,13 @@ def run_and_submit_all(
335
  print("Fetched questions list is empty.")
336
  return "Fetched questions list is empty or invalid format.", None, ""
337
 
338
- # Apply debug limit if configured
339
- debug_limit = int(os.getenv("DEBUG_QUESTION_LIMIT", "0"))
340
- if debug_limit > 0:
341
- questions_data = questions_data[:debug_limit]
342
- logger.warning(f"DEBUG MODE: Limited to first {debug_limit} questions")
343
  print(
344
- f"DEBUG MODE: Processing only {debug_limit} questions (set DEBUG_QUESTION_LIMIT=0 to disable)"
345
  )
346
 
347
  print(f"Processing {len(questions_data)} questions.")
@@ -605,6 +611,14 @@ with gr.Blocks() as demo:
605
  value=True,
606
  info="Recommended: Enable fallback for production evaluation",
607
  )
 
 
 
 
 
 
 
 
608
 
609
  run_button = gr.Button("Run Evaluation & Submit All Answers")
610
 
@@ -618,7 +632,7 @@ with gr.Blocks() as demo:
618
 
619
  run_button.click(
620
  fn=run_and_submit_all,
621
- inputs=[eval_llm_provider_dropdown, eval_enable_fallback_checkbox],
622
  outputs=[status_output, results_table, export_output],
623
  )
624
 
 
282
 
283
 
284
  def run_and_submit_all(
285
+ llm_provider: str, enable_fallback: bool, question_limit: int = 0, profile: gr.OAuthProfile | None = None
286
  ):
287
  """
288
  Fetches all questions, runs the BasicAgent on them, submits all answers,
289
  and displays the results.
290
+
291
+ Args:
292
+ llm_provider: LLM provider to use
293
+ enable_fallback: Whether to enable fallback to other providers
294
+ question_limit: Limit number of questions (0 = process all)
295
+ profile: OAuth profile for HF login
296
  """
297
  # Start execution timer
298
  start_time = time.time()
 
341
  print("Fetched questions list is empty.")
342
  return "Fetched questions list is empty or invalid format.", None, ""
343
 
344
+ # Apply question limit if configured (from UI or .env)
345
+ limit = int(question_limit) if question_limit > 0 else int(os.getenv("DEBUG_QUESTION_LIMIT", "0"))
346
+ if limit > 0:
347
+ questions_data = questions_data[:limit]
348
+ logger.warning(f"DEBUG MODE: Limited to first {limit} questions")
349
  print(
350
+ f"DEBUG MODE: Processing only {limit} questions (set to 0 to process all)"
351
  )
352
 
353
  print(f"Processing {len(questions_data)} questions.")
 
611
  value=True,
612
  info="Recommended: Enable fallback for production evaluation",
613
  )
614
+ eval_question_limit = gr.Number(
615
+ label="Question Limit (Debug)",
616
+ value=0,
617
+ precision=0,
618
+ minimum=0,
619
+ maximum=165,
620
+ info="Limit questions for testing (0 = process all)",
621
+ )
622
 
623
  run_button = gr.Button("Run Evaluation & Submit All Answers")
624
 
 
632
 
633
  run_button.click(
634
  fn=run_and_submit_all,
635
+ inputs=[eval_llm_provider_dropdown, eval_enable_fallback_checkbox, eval_question_limit],
636
  outputs=[status_output, results_table, export_output],
637
  )
638