Cuzz-Hugg

Sleeping

cousintiz

1dbba7c 5 months ago

12.2 kB

	import os
	import gradio as gr
	import requests
	import pandas as pd

	from smolagents import CodeAgent, InferenceClientModel, OpenAIModel


	# --- Constants ---
	# Base URL of the scoring service; run_and_submit_all appends "/questions"
	# and "/submit" to it.
	DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

	# Answer-format instructions for the agent. SmolGaiaAgent passes this text as
	# the CodeAgent `system_prompt`, or prepends it to each task when that keyword
	# is rejected.
	GAIA_SYSTEM_PROMPT = """You are solving GAIA level 1 questions with extreme precision.

	CRITICAL RULES:
	1. Return ONLY the final answer - no explanations, no context, no preamble
	2. For numbers: just the number (no units unless explicitly requested)
	3. For strings: just the answer (no articles like "the" or "a")
	4. For lists: format as "item1, item2, item3" (no quotes, no brackets)

	STRATEGY:
	- Use web search liberally - search multiple times with different keywords
	- Visit actual webpages to get complete information
	- Cross-reference multiple sources
	- Think step-by-step but output only the final answer
	- If you find relevant info but not the complete answer, search again with more specific terms

	NEVER output:
	- "FINAL ANSWER:"
	- "The answer is:"
	- Explanations or reasoning
	- "No information found" (keep searching!)

	Examples of correct outputs:
	Question: "How many studio albums?" → Answer: "7"
	Question: "What is the capital?" → Answer: "Paris"
	Question: "List the winners" → Answer: "John, Mary, Bob"
	"""


	class SmolGaiaAgent:
	    """
	    Callable wrapper around a smolagents ``CodeAgent`` for GAIA Level 1 questions.

	    The agent is built on an ``OpenAIModel`` and the smolagents base tool set
	    (``add_base_tools=True``). Calling the instance with a question runs the
	    agent and returns a cleaned, answer-only string.
	    """

	    # Lead-in phrases stripped from the start of an answer. Entries are
	    # lower-case and checked in this order; the ":" variants come before
	    # their bare forms so the longer phrase is removed first.
	    _ANSWER_PREFIXES = (
	        "final answer:",
	        "the final answer is:",
	        "answer:",
	        "the answer is:",
	        "the answer is",
	        "result:",
	        "solution:",
	        "output:",
	    )

	    def __init__(self, model_id: str = "gpt-4.1", max_steps: int = 12):
	        """
	        Build the model and the underlying ``CodeAgent``.

	        Args:
	            model_id: Model identifier handed to ``OpenAIModel``.
	            max_steps: Step budget handed to ``CodeAgent``.
	        """
	        print("Initializing Premium SmolGaiaAgent...")

	        api_key = os.getenv("OPENAI_API_KEY")
	        if not api_key:
	            # Do not abort here: just make the likely cause of later
	            # per-question failures visible in the logs.
	            print("WARNING: OPENAI_API_KEY is not set; model calls will likely fail.")

	        self.model = OpenAIModel(
	            model_id=model_id,
	            api_key=api_key,
	        )

	        agent_kwargs = dict(
	            tools=[],
	            add_base_tools=True,
	            model=self.model,
	            max_steps=max_steps,
	        )

	        # Some CodeAgent versions reject the `system_prompt` keyword with a
	        # TypeError. In that case the prompt is prepended to every task.
	        try:
	            self.agent = CodeAgent(system_prompt=GAIA_SYSTEM_PROMPT, **agent_kwargs)
	            print("Agent initialized with system_prompt parameter")
	            self.use_task_prefix = False
	        except TypeError as e:
	            print(f"system_prompt not supported, using task prefix: {e}")
	            self.agent = CodeAgent(**agent_kwargs)
	            self.use_task_prefix = True

	    def __call__(self, question: str) -> str:
	        """
	        Run the agent on one question and return the cleaned answer.

	        Never raises: any exception from the agent run or the cleaning step is
	        logged with a traceback and the literal "Error processing question"
	        is returned instead.
	        """
	        import traceback

	        print(f"[Premium Agent] Question: {question[:80]}...")

	        if self.use_task_prefix:
	            task = f"{GAIA_SYSTEM_PROMPT}\n\nTask: {question}"
	        else:
	            task = question

	        try:
	            answer = str(self.agent.run(task)).strip()
	            answer = self.aggressive_clean_answer(answer)
	            print(f"[Premium Agent] Final Answer: {answer}")
	            return answer
	        except Exception as e:
	            print(f"[Premium Agent] Error: {e}")
	            traceback.print_exc()
	            return "Error processing question"

	    def aggressive_clean_answer(self, answer: str) -> str:
	        """
	        Reduce a model response to the bare answer text.

	        Steps, in order:
	          1. strip known lead-in phrases (case-insensitive);
	          2. strip one pair of matching surrounding quotes;
	          3. keep only the text after the last " is:" and then the last
	             " are:" (case-insensitive);
	          4. strip one trailing period;
	          5. strip a leading "The " when the next word is capitalized;
	          6. strip a leading "a " / "an ".

	        Args:
	            answer: Raw response text.

	        Returns:
	            The cleaned answer; may be empty.
	        """
	        import re

	        original = answer

	        for prefix in self._ANSWER_PREFIXES:
	            # Compare a lower-cased slice of the same length so the amount
	            # removed always matches what was actually compared.
	            if answer[:len(prefix)].lower() == prefix:
	                answer = answer[len(prefix):].strip()

	        if (answer.startswith('"') and answer.endswith('"')) or \
	                (answer.startswith("'") and answer.endswith("'")):
	            answer = answer[1:-1].strip()

	        # The split must be case-insensitive as well, otherwise "... IS: 7"
	        # is detected but never extracted. Requiring the leading space keeps
	        # the split from cutting inside words such as "thesis:".
	        for marker in (" is:", " are:"):
	            pieces = re.split(re.escape(marker), answer, flags=re.IGNORECASE)
	            if len(pieces) > 1:
	                answer = pieces[-1].strip()

	        # A trailing "." is sentence punctuation even after a digit ("42."),
	        # and a bare "." must not be indexed at [-2].
	        if answer.endswith("."):
	            answer = answer[:-1].strip()

	        words = answer.split()
	        if answer.startswith("The ") and len(words) > 1 and words[1][0].isupper():
	            answer = answer[4:].strip()

	        lowered = answer.lower()
	        if lowered.startswith("a "):
	            answer = answer[2:].strip()
	        elif lowered.startswith("an "):
	            answer = answer[3:].strip()

	        print(f"[Cleaning] Original: '{original}' → Cleaned: '{answer}'")
	        return answer


	def _score_banner(score) -> str:
	    """Return the closing line for a numeric score, or "" when the score is not a number."""
	    try:
	        value = float(score)
	    except (TypeError, ValueError):
	        return ""
	    if value >= 80:
	        return "🏆 EXCELLENT!"
	    if value >= 50:
	        return "👍 Good job!"
	    return "💪 Keep improving!"


	def run_and_submit_all(profile: gr.OAuthProfile | None):
	    """
	    Fetch all questions, run the Premium Agent on each, and submit the answers.

	    Args:
	        profile: The logged-in user's profile, or None when nobody is logged in.

	    Returns:
	        A ``(status_message, results)`` pair. ``results`` is a DataFrame with
	        one row per processed question, or None when the run stops before any
	        question is processed.
	    """
	    import traceback

	    space_id = os.getenv("SPACE_ID")

	    if profile is None:
	        return "Please Login to Hugging Face with the button.", None

	    try:
	        username = profile.username
	        print(f"User logged in: {username}")
	    except AttributeError:
	        return "Please Login to Hugging Face with the button.", None

	    api_url = DEFAULT_API_URL
	    questions_url = f"{api_url}/questions"
	    submit_url = f"{api_url}/submit"
	    rule = "=" * 70

	    # 1. Instantiate Agent
	    print("\n" + rule)
	    print("INITIALIZING PREMIUM AGENT")
	    print(rule)
	    try:
	        agent = SmolGaiaAgent()
	    except Exception as e:
	        print(f"Error instantiating agent: {e}")
	        traceback.print_exc()
	        return f"Error initializing agent: {e}", None

	    # Built unconditionally, so it contains "None" when SPACE_ID is unset.
	    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

	    # 2. Fetch Questions
	    print(f"\nFetching questions from: {questions_url}")
	    try:
	        response = requests.get(questions_url, timeout=15)
	        response.raise_for_status()
	        questions_data = response.json()
	    except Exception as e:
	        return f"Error fetching questions: {e}", None

	    if not questions_data or not isinstance(questions_data, list):
	        return "Fetched questions list is empty or invalid format.", None
	    print(f"✓ Fetched {len(questions_data)} questions.")

	    # 3. Run Agent with detailed progress tracking
	    results_log = []
	    answers_payload = []
	    total = len(questions_data)

	    print("\n" + rule)
	    print(f"PROCESSING {total} QUESTIONS")
	    print(rule + "\n")

	    for idx, item in enumerate(questions_data, 1):
	        # A malformed entry is skipped the same way as one with missing fields.
	        if not isinstance(item, dict):
	            print("⚠ Skipping item with missing task_id or question")
	            continue

	        task_id = item.get("task_id")
	        question_text = item.get("question")

	        if not task_id or question_text is None:
	            print("⚠ Skipping item with missing task_id or question")
	            continue

	        print(f"\n{'='*70}")
	        print(f"QUESTION {idx}/{total}")
	        print(f"Task ID: {task_id}")
	        print(f"Question: {question_text[:100]}...")
	        print(rule)

	        try:
	            submitted_answer = agent(question_text)
	            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
	            results_log.append({
	                "Task ID": task_id,
	                "Question": question_text,
	                "Submitted Answer": submitted_answer
	            })
	            print(f"✓ Answer recorded: {submitted_answer}")
	        except Exception as e:
	            print(f"✗ Error processing question: {e}")
	            traceback.print_exc()
	            results_log.append({
	                "Task ID": task_id,
	                "Question": question_text,
	                "Submitted Answer": f"AGENT ERROR: {e}"
	            })

	    results_df = pd.DataFrame(results_log)

	    if not answers_payload:
	        return "Agent did not produce any answers to submit.", results_df

	    # 4. Submit
	    submission_data = {
	        "username": username.strip(),
	        "agent_code": agent_code,
	        "answers": answers_payload
	    }

	    print("\n" + rule)
	    print(f"SUBMITTING {len(answers_payload)} ANSWERS")
	    print(rule)

	    # Only the network round-trip is inside the try: a problem while
	    # formatting the result must not be reported as a failed submission.
	    try:
	        response = requests.post(submit_url, json=submission_data, timeout=60)
	        response.raise_for_status()
	        result_data = response.json()
	    except Exception as e:
	        print(f"✗ Submission error: {e}")
	        return f"Submission Failed: {e}", results_df

	    if not isinstance(result_data, dict):
	        result_data = {}

	    score = result_data.get('score', 'N/A')
	    correct = result_data.get('correct_count', '?')
	    total_attempted = result_data.get('total_attempted', '?')

	    # The banner is empty when the score is missing or non-numeric;
	    # rstrip() then drops the dangling blank lines.
	    final_status = (
	        f"🎉 Submission Successful!\n"
	        f"User: {result_data.get('username')}\n"
	        f"Overall Score: {score}% ({correct}/{total_attempted} correct)\n"
	        f"Message: {result_data.get('message', 'No message received.')}\n\n"
	        f"{_score_banner(score)}"
	    ).rstrip()
	    print(f"\n✓ Submission successful! Score: {score}%")
	    return final_status, results_df


	# --- Build Gradio Interface ---
	# The description below must match what SmolGaiaAgent actually builds:
	# an OpenAIModel with model_id "gpt-4.1" and max_steps=12.
	with gr.Blocks() as demo:
	    gr.Markdown("# 🏆 Premium Agent - Optimized for Maximum Accuracy")
	    gr.Markdown(
	        """
	Current Configuration:
	- 🧠 Model: gpt-4.1 (via the OpenAI API)
	- 🔄 Max Steps: 12 (thorough reasoning)
	- 🧹 Enhanced answer cleaning
	- 📊 Detailed progress logging

	Target Performance:
	- ⏱️ Time: ~20-25 minutes for 20 questions
	- 🎯 Target Score: 60-80% (realistic for Level 1)
	- 🏆 Stretch Goal: 80%+ with optimal configuration

	To Reach 100%:
	Getting 100% on GAIA Level 1 is extremely difficult. The benchmark shows:
	- GPT-4 achieves ~70-80%
	- Claude 3.5 achieves ~75-85%
	- Human experts achieve ~90-95%

	For the best possible score:
	1. ✅ Use this premium configuration (12 steps, gpt-4.1)
	2. 🔍 Manually review failed questions and add custom logic
	3. 🛠️ Create specialized tools for specific question types
	4. 🧪 Test and iterate on difficult questions
	"""
	    )

	    gr.LoginButton()
	    run_button = gr.Button("🚀 Run Premium Evaluation & Submit")
	    status_output = gr.Textbox(label="Run Status / Submission Result", lines=7, interactive=False)
	    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

	    # No `inputs` are wired: per the Gradio OAuth docs, the `profile`
	    # argument is injected from the handler's gr.OAuthProfile annotation.
	    run_button.click(
	        fn=run_and_submit_all,
	        outputs=[status_output, results_table]
	    )

	if __name__ == "__main__":
	    # Script entry point: print a startup banner, echo the Space environment
	    # variables that are set, then start the Gradio app.
	    divider = "=" * 70
	    space_host = os.getenv("SPACE_HOST")
	    space_id = os.getenv("SPACE_ID")

	    print("\n" + divider, "PREMIUM AGENT STARTING", divider, sep="\n")

	    if space_host:
	        print(f"✓ Runtime URL: https://{space_host}.hf.space")
	    if space_id:
	        print(f"✓ Repo URL: https://huggingface.co/spaces/{space_id}/tree/main")

	    print(divider + "\n")
	    demo.launch(debug=True, share=False)