UI Control for Question Limit
Browse files- CHANGELOG.md +33 -0
- app.py +22 -8
CHANGELOG.md
CHANGED
|
@@ -437,6 +437,39 @@ No "results" array exists with per-question correctness. API tells us "1/3 corre
|
|
| 437 |
- ⏳ Verify exact match comparison works correctly
|
| 438 |
- ⏳ Check performance with dataset caching
|
| 439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
### Created Files
|
| 441 |
|
| 442 |
- src/utils/ground_truth.py
|
|
|
|
| 437 |
- ⏳ Verify exact match comparison works correctly
|
| 438 |
- ⏳ Check performance with dataset caching
|
| 439 |
|
| 440 |
+
### [FEATURE: UI Control for Question Limit - Cloud Testing Support]
|
| 441 |
+
|
| 442 |
+
**Problem:** DEBUG_QUESTION_LIMIT in .env requires file editing to change. In HF Spaces cloud, users can't easily modify .env for testing different question counts.
|
| 443 |
+
|
| 444 |
+
**Solution:** Add UI number input for question limit in Full Evaluation tab.
|
| 445 |
+
|
| 446 |
+
**Modified Files:**
|
| 447 |
+
|
| 448 |
+
- **app.py** (~15 lines modified)
|
| 449 |
+
- Added `eval_question_limit` number input in Full Evaluation tab (lines 608-615)
|
| 450 |
+
- Range: 0-165 (0 = process all questions)
|
| 451 |
+
- Default: 0 (process all)
|
| 452 |
+
- Info: "Limit questions for testing (0 = process all)"
|
| 453 |
+
- Updated `run_and_submit_all()` function signature (line 285)
|
| 454 |
+
- Added `question_limit: int = 0` parameter
|
| 455 |
+
- Added docstring documenting parameter
|
| 456 |
+
- Updated `run_button.click()` to pass UI value (line 629)
|
| 457 |
+
- Updated question limiting logic (lines 345-351)
|
| 458 |
+
- Priority: UI value > .env value
|
| 459 |
+
- Falls back to .env if UI value is 0
|
| 460 |
+
|
| 461 |
+
**Benefits:**
|
| 462 |
+
|
| 463 |
+
- ✅ **Cloud testing:** Change question limit directly in HF Spaces UI
|
| 464 |
+
- ✅ **No file editing:** No need to modify .env in cloud environment
|
| 465 |
+
- ✅ **Instant adjustment:** Test with 3, 6, 10, or 20 questions without rebuild
|
| 466 |
+
- ✅ **Local override:** UI value overrides .env for flexibility
|
| 467 |
+
- ✅ **Production safety:** Default 0 processes all questions for full evaluation
|
| 468 |
+
|
| 469 |
+
**Verification:**
|
| 470 |
+
|
| 471 |
+
- ⏳ Testing with different UI question limits pending
|
| 472 |
+
|
| 473 |
### Created Files
|
| 474 |
|
| 475 |
- src/utils/ground_truth.py
|
app.py
CHANGED
|
@@ -282,11 +282,17 @@ def process_single_question(agent, item, index, total):
|
|
| 282 |
|
| 283 |
|
| 284 |
def run_and_submit_all(
|
| 285 |
-
llm_provider: str, enable_fallback: bool, profile: gr.OAuthProfile | None = None
|
| 286 |
):
|
| 287 |
"""
|
| 288 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 289 |
and displays the results.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
"""
|
| 291 |
# Start execution timer
|
| 292 |
start_time = time.time()
|
|
@@ -335,13 +341,13 @@ def run_and_submit_all(
|
|
| 335 |
print("Fetched questions list is empty.")
|
| 336 |
return "Fetched questions list is empty or invalid format.", None, ""
|
| 337 |
|
| 338 |
-
# Apply question limit if configured (DEBUG_QUESTION_LIMIT from .env)
|
| 339 |
-
question_limit = int(os.getenv("DEBUG_QUESTION_LIMIT", "0"))
|
| 340 |
-
if question_limit > 0:
|
| 341 |
-
questions_data = questions_data[:question_limit]
|
| 342 |
-
logger.warning(f"DEBUG MODE: Limited to first {question_limit} questions")
|
| 343 |
print(
|
| 344 |
-
f"DEBUG MODE: Processing only {question_limit} questions (set DEBUG_QUESTION_LIMIT=0 to process all)"
|
| 345 |
)
|
| 346 |
|
| 347 |
print(f"Processing {len(questions_data)} questions.")
|
|
@@ -605,6 +611,14 @@ with gr.Blocks() as demo:
|
|
| 605 |
value=True,
|
| 606 |
info="Recommended: Enable fallback for production evaluation",
|
| 607 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
|
| 609 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 610 |
|
|
@@ -618,7 +632,7 @@ with gr.Blocks() as demo:
|
|
| 618 |
|
| 619 |
run_button.click(
|
| 620 |
fn=run_and_submit_all,
|
| 621 |
-
inputs=[eval_llm_provider_dropdown, eval_enable_fallback_checkbox],
|
| 622 |
outputs=[status_output, results_table, export_output],
|
| 623 |
)
|
| 624 |
|
|
|
|
| 282 |
|
| 283 |
|
| 284 |
def run_and_submit_all(
|
| 285 |
+
llm_provider: str, enable_fallback: bool, question_limit: int = 0, profile: gr.OAuthProfile | None = None
|
| 286 |
):
|
| 287 |
"""
|
| 288 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 289 |
and displays the results.
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
llm_provider: LLM provider to use
|
| 293 |
+
enable_fallback: Whether to enable fallback to other providers
|
| 294 |
+
question_limit: Limit number of questions (0 = process all)
|
| 295 |
+
profile: OAuth profile for HF login
|
| 296 |
"""
|
| 297 |
# Start execution timer
|
| 298 |
start_time = time.time()
|
|
|
|
| 341 |
print("Fetched questions list is empty.")
|
| 342 |
return "Fetched questions list is empty or invalid format.", None, ""
|
| 343 |
|
| 344 |
+
# Apply question limit if configured (from UI or .env)
|
| 345 |
+
limit = int(question_limit) if question_limit > 0 else int(os.getenv("DEBUG_QUESTION_LIMIT", "0"))
|
| 346 |
+
if limit > 0:
|
| 347 |
+
questions_data = questions_data[:limit]
|
| 348 |
+
logger.warning(f"DEBUG MODE: Limited to first {limit} questions")
|
| 349 |
print(
|
| 350 |
+
f"DEBUG MODE: Processing only {limit} questions (set to 0 to process all)"
|
| 351 |
)
|
| 352 |
|
| 353 |
print(f"Processing {len(questions_data)} questions.")
|
|
|
|
| 611 |
value=True,
|
| 612 |
info="Recommended: Enable fallback for production evaluation",
|
| 613 |
)
|
| 614 |
+
eval_question_limit = gr.Number(
|
| 615 |
+
label="Question Limit (Debug)",
|
| 616 |
+
value=0,
|
| 617 |
+
precision=0,
|
| 618 |
+
minimum=0,
|
| 619 |
+
maximum=165,
|
| 620 |
+
info="Limit questions for testing (0 = process all)",
|
| 621 |
+
)
|
| 622 |
|
| 623 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 624 |
|
|
|
|
| 632 |
|
| 633 |
run_button.click(
|
| 634 |
fn=run_and_submit_all,
|
| 635 |
+
inputs=[eval_llm_provider_dropdown, eval_enable_fallback_checkbox, eval_question_limit],
|
| 636 |
outputs=[status_output, results_table, export_output],
|
| 637 |
)
|
| 638 |
|