Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1421,34 +1421,49 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1421 |
print(agent_code)
|
| 1422 |
|
| 1423 |
# 2. Fetch Questions
|
|
|
|
|
|
|
|
|
|
| 1424 |
print(f"Fetching questions from: {questions_url}")
|
|
|
|
| 1425 |
try:
|
| 1426 |
response = requests.get(questions_url, timeout=15)
|
| 1427 |
response.raise_for_status()
|
| 1428 |
questions_data = response.json()
|
| 1429 |
if not questions_data:
|
| 1430 |
-
|
| 1431 |
-
|
| 1432 |
-
print(f"Fetched {len(questions_data)} questions.")
|
|
|
|
| 1433 |
except requests.exceptions.RequestException as e:
|
| 1434 |
-
print(f"Error fetching questions: {e}")
|
| 1435 |
return f"Error fetching questions: {e}", None
|
| 1436 |
except requests.exceptions.JSONDecodeError as e:
|
| 1437 |
-
|
| 1438 |
-
|
| 1439 |
-
|
| 1440 |
except Exception as e:
|
| 1441 |
-
print(f"An unexpected error occurred fetching questions: {e}")
|
| 1442 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 1443 |
|
| 1444 |
# 3. Run your Agent
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1445 |
results_log = []
|
| 1446 |
answers_payload = []
|
| 1447 |
-
|
| 1448 |
-
for item in questions_data:
|
| 1449 |
-
|
|
|
|
|
|
|
|
|
|
| 1450 |
task_id = item.get("task_id")
|
| 1451 |
question_text = item.get("question")
|
|
|
|
| 1452 |
|
| 1453 |
# Initialize file variables for the current question
|
| 1454 |
local_file_path = None
|
|
@@ -1461,29 +1476,31 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1461 |
# Extract the original file name to preserve the extension
|
| 1462 |
original_filename = file_path_from_api.split('/')[-1]
|
| 1463 |
|
| 1464 |
-
#
|
| 1465 |
local_file_path = original_filename
|
| 1466 |
|
| 1467 |
print(f"π₯ Downloading file for task {task_id}...")
|
| 1468 |
print(f" URL: {file_download_url}")
|
|
|
|
| 1469 |
print(f" Saving to: {local_file_path}")
|
| 1470 |
|
| 1471 |
try:
|
| 1472 |
file_response = requests.get(file_download_url, timeout=15)
|
| 1473 |
file_response.raise_for_status()
|
| 1474 |
|
|
|
|
| 1475 |
with open(local_file_path, 'wb') as f:
|
| 1476 |
f.write(file_response.content)
|
| 1477 |
|
| 1478 |
file_size = os.path.getsize(local_file_path)
|
| 1479 |
print(f"β
Downloaded file: {original_filename} ({file_size} bytes)")
|
| 1480 |
|
| 1481 |
-
#
|
| 1482 |
if not os.path.exists(local_file_path):
|
| 1483 |
print(f"β οΈ Warning: File saved but cannot be found at {local_file_path}")
|
| 1484 |
local_file_path = None
|
| 1485 |
else:
|
| 1486 |
-
print(f"β File accessible at: {os.path.abspath(local_file_path)}")
|
| 1487 |
|
| 1488 |
except requests.exceptions.RequestException as e:
|
| 1489 |
error_message = f"[FILE DOWNLOAD ERROR: Could not fetch file: {e}]"
|
|
@@ -1495,34 +1512,93 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1495 |
local_file_path = None
|
| 1496 |
|
| 1497 |
if not task_id or question_text is None:
|
| 1498 |
-
print(f"Skipping item with missing task_id or question: {item}")
|
| 1499 |
continue
|
| 1500 |
|
| 1501 |
try:
|
| 1502 |
# Pass file_path to agent
|
| 1503 |
submitted_answer = agent(question_text, local_file_path)
|
| 1504 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 1505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1506 |
except Exception as e:
|
| 1507 |
-
|
| 1508 |
-
|
| 1509 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1510 |
|
| 1511 |
if not answers_payload:
|
| 1512 |
-
print("Agent did not produce any answers to submit.")
|
| 1513 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 1514 |
|
| 1515 |
# 4. Prepare Submission
|
| 1516 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 1517 |
-
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 1518 |
-
print(status_update)
|
| 1519 |
|
| 1520 |
# 5. Submit
|
| 1521 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1522 |
try:
|
|
|
|
| 1523 |
response = requests.post(submit_url, json=submission_data, timeout=60)
|
|
|
|
|
|
|
| 1524 |
response.raise_for_status()
|
| 1525 |
result_data = response.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1526 |
final_status = (
|
| 1527 |
f"Submission Successful!\n"
|
| 1528 |
f"User: {result_data.get('username')}\n"
|
|
@@ -1530,9 +1606,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1530 |
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 1531 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 1532 |
)
|
| 1533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1534 |
results_df = pd.DataFrame(results_log)
|
| 1535 |
return final_status, results_df
|
|
|
|
| 1536 |
except requests.exceptions.HTTPError as e:
|
| 1537 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 1538 |
try:
|
|
@@ -1541,22 +1622,42 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 1541 |
except requests.exceptions.JSONDecodeError:
|
| 1542 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 1543 |
status_message = f"Submission Failed: {error_detail}"
|
|
|
|
|
|
|
|
|
|
| 1544 |
print(status_message)
|
|
|
|
| 1545 |
results_df = pd.DataFrame(results_log)
|
| 1546 |
return status_message, results_df
|
|
|
|
| 1547 |
except requests.exceptions.Timeout:
|
| 1548 |
status_message = "Submission Failed: The request timed out."
|
|
|
|
|
|
|
|
|
|
| 1549 |
print(status_message)
|
|
|
|
| 1550 |
results_df = pd.DataFrame(results_log)
|
| 1551 |
return status_message, results_df
|
|
|
|
| 1552 |
except requests.exceptions.RequestException as e:
|
| 1553 |
status_message = f"Submission Failed: Network error - {e}"
|
|
|
|
|
|
|
|
|
|
| 1554 |
print(status_message)
|
|
|
|
| 1555 |
results_df = pd.DataFrame(results_log)
|
| 1556 |
return status_message, results_df
|
|
|
|
| 1557 |
except Exception as e:
|
| 1558 |
status_message = f"An unexpected error occurred during submission: {e}"
|
|
|
|
|
|
|
|
|
|
| 1559 |
print(status_message)
|
|
|
|
|
|
|
| 1560 |
results_df = pd.DataFrame(results_log)
|
| 1561 |
return status_message, results_df
|
| 1562 |
|
|
|
|
| 1421 |
print(agent_code)
|
| 1422 |
|
| 1423 |
# 2. Fetch Questions
|
| 1424 |
+
print(f"\n{'='*70}")
|
| 1425 |
+
print(f"π₯ FETCHING QUESTIONS")
|
| 1426 |
+
print(f"{'='*70}")
|
| 1427 |
print(f"Fetching questions from: {questions_url}")
|
| 1428 |
+
|
| 1429 |
try:
|
| 1430 |
response = requests.get(questions_url, timeout=15)
|
| 1431 |
response.raise_for_status()
|
| 1432 |
questions_data = response.json()
|
| 1433 |
if not questions_data:
|
| 1434 |
+
print("Fetched questions list is empty.")
|
| 1435 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 1436 |
+
print(f"β
Fetched {len(questions_data)} questions.")
|
| 1437 |
+
print(f"{'='*70}\n")
|
| 1438 |
except requests.exceptions.RequestException as e:
|
| 1439 |
+
print(f"β Error fetching questions: {e}")
|
| 1440 |
return f"Error fetching questions: {e}", None
|
| 1441 |
except requests.exceptions.JSONDecodeError as e:
|
| 1442 |
+
print(f"β Error decoding JSON response from questions endpoint: {e}")
|
| 1443 |
+
print(f"Response text: {response.text[:500]}")
|
| 1444 |
+
return f"Error decoding server response for questions: {e}", None
|
| 1445 |
except Exception as e:
|
| 1446 |
+
print(f"β An unexpected error occurred fetching questions: {e}")
|
| 1447 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 1448 |
|
| 1449 |
# 3. Run your Agent
|
| 1450 |
+
print(f"\n{'='*70}")
|
| 1451 |
+
print(f"π STARTING EVALUATION")
|
| 1452 |
+
print(f"{'='*70}")
|
| 1453 |
+
print(f"Total questions to process: {len(questions_data)}")
|
| 1454 |
+
print(f"{'='*70}\n")
|
| 1455 |
+
|
| 1456 |
results_log = []
|
| 1457 |
answers_payload = []
|
| 1458 |
+
|
| 1459 |
+
for idx, item in enumerate(questions_data, 1):
|
| 1460 |
+
print(f"\n{'='*70}")
|
| 1461 |
+
print(f"π PROCESSING QUESTION {idx}/{len(questions_data)}")
|
| 1462 |
+
print(f"{'='*70}")
|
| 1463 |
+
|
| 1464 |
task_id = item.get("task_id")
|
| 1465 |
question_text = item.get("question")
|
| 1466 |
+
correct_answer = item.get("answer", "N/A") # Get correct answer from API
|
| 1467 |
|
| 1468 |
# Initialize file variables for the current question
|
| 1469 |
local_file_path = None
|
|
|
|
| 1476 |
# Extract the original file name to preserve the extension
|
| 1477 |
original_filename = file_path_from_api.split('/')[-1]
|
| 1478 |
|
| 1479 |
+
# Save to current directory instead of /tmp
|
| 1480 |
local_file_path = original_filename
|
| 1481 |
|
| 1482 |
print(f"π₯ Downloading file for task {task_id}...")
|
| 1483 |
print(f" URL: {file_download_url}")
|
| 1484 |
+
print(f" Original filename: {original_filename}")
|
| 1485 |
print(f" Saving to: {local_file_path}")
|
| 1486 |
|
| 1487 |
try:
|
| 1488 |
file_response = requests.get(file_download_url, timeout=15)
|
| 1489 |
file_response.raise_for_status()
|
| 1490 |
|
| 1491 |
+
# Save the raw bytes content to the local file path
|
| 1492 |
with open(local_file_path, 'wb') as f:
|
| 1493 |
f.write(file_response.content)
|
| 1494 |
|
| 1495 |
file_size = os.path.getsize(local_file_path)
|
| 1496 |
print(f"β
Downloaded file: {original_filename} ({file_size} bytes)")
|
| 1497 |
|
| 1498 |
+
# Verify file exists and is readable
|
| 1499 |
if not os.path.exists(local_file_path):
|
| 1500 |
print(f"β οΈ Warning: File saved but cannot be found at {local_file_path}")
|
| 1501 |
local_file_path = None
|
| 1502 |
else:
|
| 1503 |
+
print(f"β File accessible at: {os.path.abspath(local_file_path)}")
|
| 1504 |
|
| 1505 |
except requests.exceptions.RequestException as e:
|
| 1506 |
error_message = f"[FILE DOWNLOAD ERROR: Could not fetch file: {e}]"
|
|
|
|
| 1512 |
local_file_path = None
|
| 1513 |
|
| 1514 |
if not task_id or question_text is None:
|
| 1515 |
+
print(f"β οΈ Skipping item with missing task_id or question: {item}")
|
| 1516 |
continue
|
| 1517 |
|
| 1518 |
try:
|
| 1519 |
# Pass file_path to agent
|
| 1520 |
submitted_answer = agent(question_text, local_file_path)
|
| 1521 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 1522 |
+
|
| 1523 |
+
# Check if answer is correct
|
| 1524 |
+
is_correct = submitted_answer.strip().lower() == correct_answer.strip().lower()
|
| 1525 |
+
correctness = "β
CORRECT" if is_correct else "β WRONG"
|
| 1526 |
+
|
| 1527 |
+
# Log with correctness indicator
|
| 1528 |
+
print(f"\n{correctness} - Task {task_id}")
|
| 1529 |
+
print(f" Submitted: '{submitted_answer}'")
|
| 1530 |
+
print(f" Expected: '{correct_answer}'")
|
| 1531 |
+
|
| 1532 |
+
results_log.append({
|
| 1533 |
+
"Task ID": task_id,
|
| 1534 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 1535 |
+
"Submitted Answer": submitted_answer,
|
| 1536 |
+
"Correct Answer": correct_answer,
|
| 1537 |
+
"Status": "β
" if is_correct else "β"
|
| 1538 |
+
})
|
| 1539 |
+
|
| 1540 |
+
print(f"β
Question {idx}/{len(questions_data)} completed")
|
| 1541 |
+
|
| 1542 |
except Exception as e:
|
| 1543 |
+
print(f"β Error running agent on task {task_id}: {e}")
|
| 1544 |
+
print(traceback.format_exc())
|
| 1545 |
+
results_log.append({
|
| 1546 |
+
"Task ID": task_id,
|
| 1547 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 1548 |
+
"Submitted Answer": f"AGENT ERROR: {e}",
|
| 1549 |
+
"Correct Answer": correct_answer,
|
| 1550 |
+
"Status": "β"
|
| 1551 |
+
})
|
| 1552 |
+
# Continue with other questions even if one fails
|
| 1553 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": f"ERROR: {str(e)[:100]}"})
|
| 1554 |
+
|
| 1555 |
+
# Summary after all questions processed
|
| 1556 |
+
print(f"\n{'='*70}")
|
| 1557 |
+
print(f"β
ALL QUESTIONS PROCESSED")
|
| 1558 |
+
print(f"{'='*70}")
|
| 1559 |
+
print(f"Total answers collected: {len(answers_payload)}")
|
| 1560 |
+
|
| 1561 |
+
# Calculate pre-submission accuracy
|
| 1562 |
+
correct_count = sum(1 for log in results_log if log.get("Status") == "β
")
|
| 1563 |
+
total_count = len(results_log)
|
| 1564 |
+
accuracy = (correct_count / total_count * 100) if total_count > 0 else 0
|
| 1565 |
+
|
| 1566 |
+
print(f"\n{'='*70}")
|
| 1567 |
+
print(f"π PRE-SUBMISSION SUMMARY")
|
| 1568 |
+
print(f"{'='*70}")
|
| 1569 |
+
print(f"Correct: {correct_count}/{total_count} ({accuracy:.1f}%)")
|
| 1570 |
+
print(f"{'='*70}\n")
|
| 1571 |
|
| 1572 |
if not answers_payload:
|
| 1573 |
+
print("β οΈ Agent did not produce any answers to submit.")
|
| 1574 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 1575 |
|
| 1576 |
# 4. Prepare Submission
|
| 1577 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
|
|
|
|
|
|
| 1578 |
|
| 1579 |
# 5. Submit
|
| 1580 |
+
print(f"\n{'='*70}")
|
| 1581 |
+
print(f"π€ SUBMITTING TO API")
|
| 1582 |
+
print(f"{'='*70}")
|
| 1583 |
+
print(f"URL: {submit_url}")
|
| 1584 |
+
print(f"Username: {username}")
|
| 1585 |
+
print(f"Answers to submit: {len(answers_payload)}")
|
| 1586 |
+
print(f"{'='*70}\n")
|
| 1587 |
+
|
| 1588 |
try:
|
| 1589 |
+
print("β³ Sending POST request...")
|
| 1590 |
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 1591 |
+
print(f"β
Got response: Status {response.status_code}")
|
| 1592 |
+
|
| 1593 |
response.raise_for_status()
|
| 1594 |
result_data = response.json()
|
| 1595 |
+
|
| 1596 |
+
print(f"\n{'='*70}")
|
| 1597 |
+
print(f"π SUBMISSION RESULTS")
|
| 1598 |
+
print(f"{'='*70}")
|
| 1599 |
+
print(f"Response data: {result_data}")
|
| 1600 |
+
print(f"{'='*70}\n")
|
| 1601 |
+
|
| 1602 |
final_status = (
|
| 1603 |
f"Submission Successful!\n"
|
| 1604 |
f"User: {result_data.get('username')}\n"
|
|
|
|
| 1606 |
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 1607 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 1608 |
)
|
| 1609 |
+
|
| 1610 |
+
print(final_status)
|
| 1611 |
+
print("="*70)
|
| 1612 |
+
print("β
Submission successful.")
|
| 1613 |
+
|
| 1614 |
results_df = pd.DataFrame(results_log)
|
| 1615 |
return final_status, results_df
|
| 1616 |
+
|
| 1617 |
except requests.exceptions.HTTPError as e:
|
| 1618 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 1619 |
try:
|
|
|
|
| 1622 |
except requests.exceptions.JSONDecodeError:
|
| 1623 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 1624 |
status_message = f"Submission Failed: {error_detail}"
|
| 1625 |
+
print(f"\n{'='*70}")
|
| 1626 |
+
print(f"β SUBMISSION FAILED")
|
| 1627 |
+
print(f"{'='*70}")
|
| 1628 |
print(status_message)
|
| 1629 |
+
print(f"{'='*70}\n")
|
| 1630 |
results_df = pd.DataFrame(results_log)
|
| 1631 |
return status_message, results_df
|
| 1632 |
+
|
| 1633 |
except requests.exceptions.Timeout:
|
| 1634 |
status_message = "Submission Failed: The request timed out."
|
| 1635 |
+
print(f"\n{'='*70}")
|
| 1636 |
+
print(f"β SUBMISSION FAILED")
|
| 1637 |
+
print(f"{'='*70}")
|
| 1638 |
print(status_message)
|
| 1639 |
+
print(f"{'='*70}\n")
|
| 1640 |
results_df = pd.DataFrame(results_log)
|
| 1641 |
return status_message, results_df
|
| 1642 |
+
|
| 1643 |
except requests.exceptions.RequestException as e:
|
| 1644 |
status_message = f"Submission Failed: Network error - {e}"
|
| 1645 |
+
print(f"\n{'='*70}")
|
| 1646 |
+
print(f"β SUBMISSION FAILED")
|
| 1647 |
+
print(f"{'='*70}")
|
| 1648 |
print(status_message)
|
| 1649 |
+
print(f"{'='*70}\n")
|
| 1650 |
results_df = pd.DataFrame(results_log)
|
| 1651 |
return status_message, results_df
|
| 1652 |
+
|
| 1653 |
except Exception as e:
|
| 1654 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 1655 |
+
print(f"\n{'='*70}")
|
| 1656 |
+
print(f"β SUBMISSION FAILED")
|
| 1657 |
+
print(f"{'='*70}")
|
| 1658 |
print(status_message)
|
| 1659 |
+
print(traceback.format_exc())
|
| 1660 |
+
print(f"{'='*70}\n")
|
| 1661 |
results_df = pd.DataFrame(results_log)
|
| 1662 |
return status_message, results_df
|
| 1663 |
|