Spaces:
Sleeping
Sleeping
GAIA Developer
Claude
committed on
Commit
·
520f8ca
1
Parent(s):
b58a59f
🔧 Fix web interface accuracy by removing redundant answer extraction
Browse files
Fixed critical issue where solve_question() output was being double-processed,
causing accuracy to drop from 90% to 30%. The solve_question method already
returns clean, processed answers, so removed redundant _extract_answer() call.
Also fixed import paths to ensure GAIASolver initializes properly.
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- app/app.py +16 -16
app/app.py
CHANGED
|
@@ -17,6 +17,7 @@ from pathlib import Path
|
|
| 17 |
|
| 18 |
# Add current directory to Python path to find main modules
|
| 19 |
sys.path.insert(0, '/home/user/app')
|
|
|
|
| 20 |
|
| 21 |
# --- Constants ---
|
| 22 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
@@ -138,21 +139,19 @@ class AdvancedGAIAAgent:
|
|
| 138 |
"question": question,
|
| 139 |
"file_name": ""
|
| 140 |
}
|
| 141 |
-
|
| 142 |
-
answer = self.
|
| 143 |
elif self.solver == "refactored":
|
| 144 |
# For refactored architecture
|
| 145 |
try:
|
| 146 |
from main_refactored import main as refactored_main
|
| 147 |
-
|
| 148 |
-
answer = self._extract_answer(result)
|
| 149 |
except Exception as e:
|
| 150 |
print(f"Refactored solver error: {e}")
|
| 151 |
answer = f"Refactored solver error: {e}"
|
| 152 |
elif hasattr(self.solver, '__call__'):
|
| 153 |
# Generic callable solver
|
| 154 |
-
|
| 155 |
-
answer = self._extract_answer(result)
|
| 156 |
else:
|
| 157 |
# Last resort
|
| 158 |
answer = "Unable to process question with current solver"
|
|
@@ -260,9 +259,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 260 |
"Our Answer": submitted_answer[:50] + "..." if len(str(submitted_answer)) > 50 else submitted_answer,
|
| 261 |
"Expected Answer": correct_answer,
|
| 262 |
"Result": f"{validation_result['icon']} {validation_result['status']}",
|
| 263 |
-
"
|
| 264 |
-
"
|
| 265 |
-
"Time (s)": f"{question_time:.2f}"
|
| 266 |
})
|
| 267 |
print(f"✅ Completed in {question_time:.2f}s - {validation_result['icon']} {validation_result['status']}")
|
| 268 |
|
|
@@ -274,9 +272,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 274 |
"Our Answer": f"ERROR: {e}",
|
| 275 |
"Expected Answer": correct_answers.get(task_id, {}).get('answer', 'Not available'),
|
| 276 |
"Result": "❌ ERROR",
|
| 277 |
-
"
|
| 278 |
-
"
|
| 279 |
-
"Time (s)": "Error"
|
| 280 |
})
|
| 281 |
|
| 282 |
total_time = time.time() - start_time
|
|
@@ -289,12 +286,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 289 |
|
| 290 |
for result in results_log:
|
| 291 |
try:
|
| 292 |
-
score = float(result.get('
|
| 293 |
total_score += score
|
| 294 |
validated_count += 1
|
| 295 |
if score >= 1.0:
|
| 296 |
correct_count += 1
|
| 297 |
-
except ValueError:
|
| 298 |
pass
|
| 299 |
|
| 300 |
local_accuracy = (total_score / validated_count * 100) if validated_count > 0 else 0
|
|
@@ -306,7 +303,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 306 |
|
| 307 |
if not answers_payload:
|
| 308 |
print("❌ Agent did not produce any answers to submit.")
|
| 309 |
-
|
|
|
|
| 310 |
|
| 311 |
# 4. Prepare Submission
|
| 312 |
submission_data = {
|
|
@@ -347,7 +345,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 347 |
f"- Features: Enhanced reasoning, 42 specialized tools, domain expertise"
|
| 348 |
)
|
| 349 |
print("✅ Submission successful.")
|
| 350 |
-
|
|
|
|
|
|
|
| 351 |
return final_status, results_df
|
| 352 |
|
| 353 |
except requests.exceptions.HTTPError as e:
|
|
|
|
| 17 |
|
| 18 |
# Add current directory to Python path to find main modules
|
| 19 |
sys.path.insert(0, '/home/user/app')
|
| 20 |
+
sys.path.insert(0, '/home/user')
|
| 21 |
|
| 22 |
# --- Constants ---
|
| 23 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
| 139 |
"question": question,
|
| 140 |
"file_name": ""
|
| 141 |
}
|
| 142 |
+
# solve_question already returns a clean, processed answer string
|
| 143 |
+
answer = self.solver.solve_question(question_data)
|
| 144 |
elif self.solver == "refactored":
|
| 145 |
# For refactored architecture
|
| 146 |
try:
|
| 147 |
from main_refactored import main as refactored_main
|
| 148 |
+
answer = refactored_main(question)
|
|
|
|
| 149 |
except Exception as e:
|
| 150 |
print(f"Refactored solver error: {e}")
|
| 151 |
answer = f"Refactored solver error: {e}"
|
| 152 |
elif hasattr(self.solver, '__call__'):
|
| 153 |
# Generic callable solver
|
| 154 |
+
answer = self.solver(question)
|
|
|
|
| 155 |
else:
|
| 156 |
# Last resort
|
| 157 |
answer = "Unable to process question with current solver"
|
|
|
|
| 259 |
"Our Answer": submitted_answer[:50] + "..." if len(str(submitted_answer)) > 50 else submitted_answer,
|
| 260 |
"Expected Answer": correct_answer,
|
| 261 |
"Result": f"{validation_result['icon']} {validation_result['status']}",
|
| 262 |
+
"Time (s)": f"{question_time:.2f}",
|
| 263 |
+
"_score": validation_result['score'] # Keep for calculation but don't display
|
|
|
|
| 264 |
})
|
| 265 |
print(f"✅ Completed in {question_time:.2f}s - {validation_result['icon']} {validation_result['status']}")
|
| 266 |
|
|
|
|
| 272 |
"Our Answer": f"ERROR: {e}",
|
| 273 |
"Expected Answer": correct_answers.get(task_id, {}).get('answer', 'Not available'),
|
| 274 |
"Result": "❌ ERROR",
|
| 275 |
+
"Time (s)": "Error",
|
| 276 |
+
"_score": 0.0 # Keep for calculation but don't display
|
|
|
|
| 277 |
})
|
| 278 |
|
| 279 |
total_time = time.time() - start_time
|
|
|
|
| 286 |
|
| 287 |
for result in results_log:
|
| 288 |
try:
|
| 289 |
+
score = float(result.get('_score', 0.0))
|
| 290 |
total_score += score
|
| 291 |
validated_count += 1
|
| 292 |
if score >= 1.0:
|
| 293 |
correct_count += 1
|
| 294 |
+
except (ValueError, TypeError):
|
| 295 |
pass
|
| 296 |
|
| 297 |
local_accuracy = (total_score / validated_count * 100) if validated_count > 0 else 0
|
|
|
|
| 303 |
|
| 304 |
if not answers_payload:
|
| 305 |
print("❌ Agent did not produce any answers to submit.")
|
| 306 |
+
display_results = [{k: v for k, v in result.items() if not k.startswith('_')} for result in results_log]
|
| 307 |
+
return "Agent did not produce any answers to submit.", pd.DataFrame(display_results)
|
| 308 |
|
| 309 |
# 4. Prepare Submission
|
| 310 |
submission_data = {
|
|
|
|
| 345 |
f"- Features: Enhanced reasoning, 42 specialized tools, domain expertise"
|
| 346 |
)
|
| 347 |
print("✅ Submission successful.")
|
| 348 |
+
# Create DataFrame excluding hidden score field
|
| 349 |
+
display_results = [{k: v for k, v in result.items() if not k.startswith('_')} for result in results_log]
|
| 350 |
+
results_df = pd.DataFrame(display_results)
|
| 351 |
return final_status, results_df
|
| 352 |
|
| 353 |
except requests.exceptions.HTTPError as e:
|