# Cuzz-Hugg / app.py
# cousintiz
# qf
# 1dbba7c
import os
import re
import traceback

import gradio as gr
import pandas as pd
import requests
from smolagents import CodeAgent, InferenceClientModel, OpenAIModel
# --- Constants ---
# Base URL of the scoring service; "/questions" and "/submit" are appended to
# it where the requests are made.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Instructions given to the agent, either through the ``system_prompt``
# keyword or prepended to each task. The examples use a real arrow character;
# the previous text carried mis-decoded bytes in its place.
GAIA_SYSTEM_PROMPT = """You are solving GAIA level 1 questions with extreme precision.
CRITICAL RULES:
1. Return ONLY the final answer - no explanations, no context, no preamble
2. For numbers: just the number (no units unless explicitly requested)
3. For strings: just the answer (no articles like "the" or "a")
4. For lists: format as "item1, item2, item3" (no quotes, no brackets)
STRATEGY:
- Use web search liberally - search multiple times with different keywords
- Visit actual webpages to get complete information
- Cross-reference multiple sources
- Think step-by-step but output only the final answer
- If you find relevant info but not the complete answer, search again with more specific terms
NEVER output:
- "FINAL ANSWER:"
- "The answer is:"
- Explanations or reasoning
- "No information found" (keep searching!)
Examples of correct outputs:
Question: "How many studio albums?" → Answer: "7"
Question: "What is the capital?" → Answer: "Paris"
Question: "List the winners" → Answer: "John, Mary, Bob"
"""
class SmolGaiaAgent:
    """Question-answering agent aimed at GAIA Level 1 tasks.

    Wraps a smolagents ``CodeAgent`` driven by an ``OpenAIModel`` and
    post-processes the agent's output so that only the bare answer string
    is handed back to the caller.

    Attributes:
        model: The ``OpenAIModel`` instance given to the agent.
        agent: The underlying ``CodeAgent``.
        use_task_prefix: True when ``CodeAgent`` rejected the
            ``system_prompt`` keyword, in which case the prompt is prepended
            to every task instead.
    """

    # Lead-in phrases removed (case-insensitively) from the start of an
    # answer. "the answer is:" must precede "the answer is" so the colon is
    # consumed together with the phrase.
    _ANSWER_PREFIXES = (
        "final answer:",
        "the final answer is:",
        "answer:",
        "the answer is:",
        "the answer is",
        "result:",
        "solution:",
        "output:",
    )

    # Markers after which the actual answer is expected ("... is: X").
    # Applied in this order; matching ignores case.
    _LEAD_IN_MARKERS = (
        re.compile(r" is:", re.IGNORECASE),
        re.compile(r" are:", re.IGNORECASE),
    )

    def __init__(self, model_id: str = "gpt-4.1", max_steps: int = 12):
        """Create the model and the agent.

        Args:
            model_id: Model identifier passed to ``OpenAIModel``.
            max_steps: Step budget passed to ``CodeAgent``.
        """
        print("Initializing Premium SmolGaiaAgent...")
        # The key is read from the environment and passed through as-is
        # (``None`` when OPENAI_API_KEY is unset).
        self.model = OpenAIModel(
            model_id=model_id,
            api_key=os.getenv("OPENAI_API_KEY"),
        )

        agent_options = {
            "tools": [],
            "add_base_tools": True,
            "model": self.model,
            "max_steps": max_steps,
        }
        # Prefer handing the prompt over as a system prompt; if this
        # CodeAgent signature rejects the keyword, fall back to prefixing
        # the prompt to each task in __call__.
        try:
            self.agent = CodeAgent(
                system_prompt=GAIA_SYSTEM_PROMPT, **agent_options
            )
        except TypeError as e:
            print(f"system_prompt not supported, using task prefix: {e}")
            self.agent = CodeAgent(**agent_options)
            self.use_task_prefix = True
        else:
            print("Agent initialized with system_prompt parameter")
            self.use_task_prefix = False

    def __call__(self, question: str) -> str:
        """Run the agent on one question and return the cleaned answer.

        Args:
            question: The question text.

        Returns:
            The cleaned answer, or the literal string
            ``"Error processing question"`` when the run raised.
        """
        print(f"[Premium Agent] Question: {question[:80]}...")

        if self.use_task_prefix:
            task = f"{GAIA_SYSTEM_PROMPT}\n\nTask: {question}"
        else:
            task = question

        try:
            raw_answer = self.agent.run(task)
            answer = self.aggressive_clean_answer(str(raw_answer).strip())
        except Exception as e:
            # Deliberate best-effort boundary: one failing question must not
            # abort the whole evaluation run.
            print(f"[Premium Agent] Error: {e}")
            traceback.print_exc()
            return "Error processing question"

        print(f"[Premium Agent] Final Answer: {answer}")
        return answer

    def aggressive_clean_answer(self, answer: str) -> str:
        """Reduce a verbose agent reply to just the answer text.

        Steps, in order: strip known lead-in prefixes (repeatedly, so
        stacked prefixes all go), strip one pair of surrounding quotes, keep
        only the text after the last " is:" / " are:" marker, drop one
        trailing period, drop a leading "The " when followed by a
        capitalised word, and drop a leading "a " / "an ".

        Args:
            answer: The raw answer string.

        Returns:
            The cleaned answer; may be empty.
        """
        original = answer
        answer = answer.strip()

        # Restart the scan after every hit so that e.g.
        # "Answer: Final answer: 7" is fully unwrapped.
        prefix_found = True
        while prefix_found:
            prefix_found = False
            for prefix in self._ANSWER_PREFIXES:
                # Compare a slice of the original text rather than indexing
                # into a lowercased copy, whose length can differ.
                if answer[:len(prefix)].lower() == prefix:
                    answer = answer[len(prefix):].strip()
                    prefix_found = True
                    break

        # Remove surrounding quotes
        if answer[:1] in ('"', "'") and answer.endswith(answer[0]):
            answer = answer[1:-1].strip()

        # Keep only what follows the last " is:" / " are:" marker.
        for marker in self._LEAD_IN_MARKERS:
            pieces = marker.split(answer)
            if len(pieces) > 1:
                answer = pieces[-1].strip()

        # Drop a sentence-ending period. A string that ends in "." has no
        # fractional digits after it, so decimals such as "3.14" are never
        # affected, while "7." correctly becomes "7".
        if answer.endswith("."):
            answer = answer[:-1].strip()

        # Drop a leading "The " when the next word is capitalised.
        if answer.startswith("The ") and len(answer) > 4:
            words = answer.split()
            if len(words) > 1 and words[1][0].isupper():
                answer = answer[4:].strip()

        # Remove "a " or "an " from the beginning
        lowered = answer.lower()
        if lowered.startswith("a "):
            answer = answer[2:].strip()
        elif lowered.startswith("an "):
            answer = answer[3:].strip()

        print(f"[Cleaning] Original: '{original}' → Cleaned: '{answer}'")
        return answer
def _score_banner(score: object) -> str:
    """Return the closing banner line for *score*, or "" if it is not numeric."""
    try:
        value = float(score)
    except (TypeError, ValueError):
        # The response carried no usable score (e.g. the 'N/A' default).
        # The submission itself succeeded, so just omit the banner.
        return ""
    if value >= 80:
        return "🏆 EXCELLENT!"
    if value >= 50:
        return "👍 Good job!"
    return "💪 Keep improving!"


def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch every question, answer each with the agent, and submit the answers.

    Args:
        profile: The logged-in user's profile, or None when nobody is
            logged in.

    Returns:
        A ``(status_message, results)`` pair. ``results`` is a DataFrame
        built from the per-question log, or None when the run stopped
        before any question was processed (no login, agent construction
        failed, or the questions could not be fetched).
    """
    if profile is None:
        return "Please Login to Hugging Face with the button.", None
    try:
        username = profile.username
    except AttributeError:
        return "Please Login to Hugging Face with the button.", None
    print(f"User logged in: {username}")

    space_id = os.getenv("SPACE_ID")
    if not space_id:
        # The submitted code link is built from SPACE_ID; make the gap
        # visible in the logs rather than silently sending ".../None/...".
        print("⚠ SPACE_ID is not set; the submitted agent_code link will be invalid.")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    print("\n" + "=" * 70)
    print("INITIALIZING PREMIUM AGENT")
    print("=" * 70)
    try:
        agent = SmolGaiaAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        traceback.print_exc()
        return f"Error initializing agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    # 2. Fetch Questions
    print(f"\nFetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty or invalid format.", None
        print(f"✓ Fetched {len(questions_data)} questions.")
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # 3. Run Agent with detailed progress tracking
    results_log = []
    answers_payload = []
    total = len(questions_data)
    print("\n" + "=" * 70)
    print(f"PROCESSING {total} QUESTIONS")
    print("=" * 70 + "\n")

    for idx, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print("⚠ Skipping item with missing task_id or question")
            continue

        print(f"\n{'=' * 70}")
        print(f"QUESTION {idx}/{total}")
        print(f"Task ID: {task_id}")
        print(f"Question: {question_text[:100]}...")
        print("=" * 70)

        try:
            submitted_answer = agent(question_text)
        except Exception as e:
            # Failed questions are logged for the results table but are
            # not included in the submitted payload.
            print(f"✗ Error processing question: {e}")
            traceback.print_exc()
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": f"AGENT ERROR: {e}",
            })
            continue

        answers_payload.append(
            {"task_id": task_id, "submitted_answer": submitted_answer}
        )
        results_log.append({
            "Task ID": task_id,
            "Question": question_text,
            "Submitted Answer": submitted_answer,
        })
        print(f"✓ Answer recorded: {submitted_answer}")

    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Submit
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }
    print("\n" + "=" * 70)
    print(f"SUBMITTING {len(answers_payload)} ANSWERS")
    print("=" * 70)

    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
    except Exception as e:
        print(f"✗ Submission error: {e}")
        return f"Submission Failed: {e}", pd.DataFrame(results_log)

    score = result_data.get('score', 'N/A')
    correct = result_data.get('correct_count', '?')
    total_attempted = result_data.get('total_attempted', '?')
    # The banner is computed by a helper that tolerates a non-numeric score,
    # so a successful submission is never reported as a failure.
    final_status = (
        f"🎉 Submission Successful!\n"
        f"User: {result_data.get('username')}\n"
        f"Overall Score: {score}% ({correct}/{total_attempted} correct)\n"
        f"Message: {result_data.get('message', 'No message received.')}\n\n"
        f"{_score_banner(score)}"
    )
    print(f"\n✓ Submission successful! Score: {score}%")
    return final_status, pd.DataFrame(results_log)
# --- Build Gradio Interface ---
# The page is a heading, a description, a login button, a run button and two
# outputs (status text and results table) fed by run_and_submit_all.
with gr.Blocks() as demo:
    gr.Markdown("# 🏆 Premium Agent - Optimized for Maximum Accuracy")
    # The model line reflects what SmolGaiaAgent actually instantiates
    # (gpt-4.1 through OpenAIModel). Blank lines separate each section
    # header from the preceding list so Markdown does not fold the header
    # into the last bullet.
    gr.Markdown(
        """
        **Current Configuration:**

        - 🧠 Model: gpt-4.1 (OpenAI API)
        - 🔄 Max Steps: 12 (thorough reasoning)
        - 🧹 Enhanced answer cleaning
        - 📊 Detailed progress logging

        **Target Performance:**

        - ⏱️ Time: ~20-25 minutes for 20 questions
        - 🎯 Target Score: 60-80% (realistic for Level 1)
        - 🏆 Stretch Goal: 80%+ with optimal configuration

        **To Reach 100%:**

        Getting 100% on GAIA Level 1 is extremely difficult. The benchmark shows:

        - GPT-4 achieves ~70-80%
        - Claude 3.5 achieves ~75-85%
        - Human experts achieve ~90-95%

        For the best possible score:

        1. ✅ Use this premium configuration (12 steps, gpt-4.1)
        2. 🔍 Manually review failed questions and add custom logic
        3. 🛠️ Create specialized tools for specific question types
        4. 🧪 Test and iterate on difficult questions
        """
    )
    gr.LoginButton()
    run_button = gr.Button("🚀 Run Premium Evaluation & Submit")
    status_output = gr.Textbox(
        label="Run Status / Submission Result", lines=7, interactive=False
    )
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    # No explicit inputs: the callback's only parameter is the OAuth profile.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table],
    )
if __name__ == "__main__":
    # Print a startup banner with the Space's URLs (when the environment
    # provides them), then launch the Gradio app.
    print("\n" + "=" * 70)
    print("PREMIUM AGENT STARTING")
    print("=" * 70)
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")
    if space_host:
        # NOTE(review): SPACE_HOST is expected to already be the full
        # hostname (ending in ".hf.space"); append the suffix only when it
        # is missing so the logged URL is not doubled.
        if space_host.endswith(".hf.space"):
            runtime_host = space_host
        else:
            runtime_host = f"{space_host}.hf.space"
        print(f"✓ Runtime URL: https://{runtime_host}")
    if space_id:
        print(f"✓ Repo URL: https://huggingface.co/spaces/{space_id}/tree/main")
    print("=" * 70 + "\n")
    demo.launch(debug=True, share=False)