"""
GAIA Agent v5 - with vision and audio transcription.
Target: 40%+ (8+/20)
"""
import os
import re
import io
import time
import base64
import traceback
import gradio as gr
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import Optional, Tuple, List, Dict, Any
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
GROQ_API = "https://api.groq.com/openai/v1/chat/completions"
GROQ_AUDIO_API = "https://api.groq.com/openai/v1/audio/transcriptions"
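# Groq's endpoints are OpenAI-compatible, so the payloads below follow the OpenAI chat/transcription schemas.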
# Models to try in order of preference
GROQ_MODELS = [
"llama-3.3-70b-versatile",
"llama-3.1-70b-versatile",
"mixtral-8x7b-32768",
]
GROQ_VISION_MODEL = "llama-3.2-90b-vision-preview"
GROQ_AUDIO_MODEL = "whisper-large-v3"
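# Vision and audio use dedicated models; GROQ_MODELS is the fallback chain for text reasoning.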
# ==========================================
# VISION & AUDIO TOOLS
# ==========================================
def transcribe_audio(audio_bytes: bytes, groq_key: str, filename: str = "audio.mp3") -> str:
"""Transcribe audio using Groq Whisper API."""
if not groq_key or not audio_bytes:
return ""
try:
print(f" 🎀 Transcribing audio ({len(audio_bytes)/1024:.1f} KB)...")
files = {
'file': (filename, audio_bytes, 'audio/mpeg'),
'model': (None, GROQ_AUDIO_MODEL),
}
resp = requests.post(
GROQ_AUDIO_API,
headers={"Authorization": f"Bearer {groq_key}"},
files=files,
timeout=60,
)
if resp.status_code == 200:
result = resp.json()
text = result.get("text", "")
print(f" βœ… Transcribed: {text[:100]}...")
return text
else:
print(f" ⚠️ Audio transcription failed: {resp.status_code} - {resp.text[:200]}")
return ""
except Exception as e:
print(f" ⚠️ Audio transcription error: {e}")
return ""
def analyze_image(image_bytes: bytes, question: str, groq_key: str) -> str:
"""Analyze image using Groq Vision API."""
if not groq_key or not image_bytes:
return ""
try:
print(f" πŸ–ΌοΈ Analyzing image ({len(image_bytes)/1024:.1f} KB)...")
# Convert to base64
image_b64 = base64.b64encode(image_bytes).decode('utf-8')
# Detect image type
if image_bytes[:8] == b'\x89PNG\r\n\x1a\n':
mime_type = "image/png"
elif image_bytes[:2] == b'\xff\xd8':
mime_type = "image/jpeg"
elif image_bytes[:6] in (b'GIF87a', b'GIF89a'):
mime_type = "image/gif"
else:
mime_type = "image/png" # default
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": f"Look at this image and answer the question precisely. Give ONLY the answer, no explanation.\n\nQuestion: {question}"
},
{
"type": "image_url",
"image_url": {
"url": f"data:{mime_type};base64,{image_b64}"
}
}
]
}
]
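        # The image goes inline as a base64 data URL, the OpenAI-style format vision endpoints accept for image_url parts.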
resp = requests.post(
GROQ_API,
headers={
"Authorization": f"Bearer {groq_key}",
"Content-Type": "application/json"
},
json={
"model": GROQ_VISION_MODEL,
"messages": messages,
"temperature": 0.1,
"max_tokens": 300,
},
timeout=60,
)
if resp.status_code == 200:
result = resp.json()
answer = result.get("choices", [{}])[0].get("message", {}).get("content", "")
print(f" βœ… Vision response: {answer[:100]}...")
return answer
else:
print(f" ⚠️ Vision failed: {resp.status_code} - {resp.text[:200]}")
return ""
except Exception as e:
print(f" ⚠️ Vision error: {e}")
return ""
# ==========================================
# TOOLS
# ==========================================
def fetch_webpage(url: str, timeout: int = 15) -> str:
"""Fetch and extract text from a webpage."""
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
}
resp = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
# Remove unwanted elements
for el in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe", "form"]):
el.extract()
# Try to get main content first
main_content = soup.find("main") or soup.find("article") or soup.find("div", {"class": re.compile(r"content|main|article", re.I)})
if main_content:
text = main_content.get_text("\n", strip=True)
else:
text = soup.get_text("\n", strip=True)
lines = [l.strip() for l in text.splitlines() if l.strip() and len(l.strip()) > 2]
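        # Cap the extracted text so it fits comfortably in the LLM context.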
return "\n".join(lines)[:10000]
except Exception as e:
print(f" ⚠️ Webpage fetch error: {e}")
return ""
def fetch_youtube_transcript(url: str) -> str:
"""Fetch YouTube video transcript with multiple fallback methods."""
try:
from youtube_transcript_api import YouTubeTranscriptApi
# Extract video ID
patterns = [
r"(?:v=|/v/|youtu\.be/|embed/|shorts/)([a-zA-Z0-9_-]{11})",
r"^([a-zA-Z0-9_-]{11})$"
]
vid = None
for pattern in patterns:
match = re.search(pattern, url)
if match:
vid = match.group(1)
break
        if not vid:
            print(f"  ⚠️ Could not extract video ID from: {url}")
            return ""
        print(f"  📺 Video ID: {vid}")
# Create API instance (new API style)
ytt_api = YouTubeTranscriptApi()
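        # fetch() and list() are instance methods in youtube-transcript-api 1.x, replacing the older static helpers.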
# Try multiple language options
lang_options = [
("en",),
("en", "en-US", "en-GB"),
("it", "it-IT"),
("en", "it", "fr", "de", "es", "pt"),
]
for langs in lang_options:
try:
transcript = ytt_api.fetch(vid, languages=langs)
# transcript is a FetchedTranscript object, iterate to get snippets
text = " ".join([snippet.text for snippet in transcript])
if text:
print(f" βœ“ Got transcript ({len(text)} chars, langs: {langs})")
return text[:8000]
except Exception as e:
continue
# Try listing all transcripts and fetching any available
try:
transcript_list = ytt_api.list(vid)
# Try manually created first
for t in transcript_list:
if not t.is_generated:
try:
fetched = t.fetch()
text = " ".join([snippet.text for snippet in fetched])
if text:
print(f" βœ“ Got manual transcript ({len(text)} chars)")
return text[:8000]
except:
pass
# Then auto-generated
for t in transcript_list:
if t.is_generated:
try:
fetched = t.fetch()
text = " ".join([snippet.text for snippet in fetched])
if text:
print(f" βœ“ Got auto transcript ({len(text)} chars)")
return text[:8000]
except:
pass
# Try translated
for t in transcript_list:
try:
translated = t.translate('en')
fetched = translated.fetch()
text = " ".join([snippet.text for snippet in fetched])
if text:
print(f" βœ“ Got translated transcript ({len(text)} chars)")
return text[:8000]
except:
pass
except Exception as e:
print(f" ⚠️ Transcript list error: {e}")
return ""
except ImportError:
print(" ⚠️ youtube_transcript_api not installed")
return ""
except Exception as e:
print(f" ⚠️ YouTube error: {e}")
return ""
def fetch_task_file(task_id: str) -> Tuple[str, str, Optional[bytes]]:
"""Fetch and parse attached file for a task. Returns (content_str, file_type, raw_bytes_for_media)."""
try:
url = f"{DEFAULT_API_URL}/files/{task_id}"
resp = requests.get(url, timeout=30)
if resp.status_code == 404:
return "", "none", None
if resp.status_code != 200:
print(f" ⚠️ File fetch failed: {resp.status_code}")
return "", "none", None
ct = resp.headers.get("Content-Type", "").lower()
cd = resp.headers.get("Content-Disposition", "")
# Extract filename
filename = ""
if "filename=" in cd:
filename = cd.split("filename=")[-1].strip('" ')
ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
print(f" πŸ“Ž File: {filename or 'unknown'}, type: {ct[:50]}")
# Text/Code files
if any(t in ct for t in ["text/", "json", "javascript", "python"]) or ext in ["txt", "csv", "json", "py", "md", "js", "html"]:
text = resp.text
# CSV parsing
if ext == "csv" or "csv" in ct:
try:
df = pd.read_csv(io.StringIO(text))
summary = f"CSV file with {len(df)} rows and columns: {list(df.columns)}\n"
summary += f"Data:\n{df.to_string()}"
return summary[:8000], "csv", None
except Exception as e:
print(f" ⚠️ CSV parse error: {e}")
# Python code
if ext == "py":
return f"Python code:\n```python\n{text[:6000]}\n```", "python", None
return text[:8000], "text", None
# Excel files
if "spreadsheet" in ct or "excel" in ct or ext in ["xlsx", "xls"]:
try:
df = pd.read_excel(io.BytesIO(resp.content), engine="openpyxl")
summary = f"Excel file with {len(df)} rows and columns: {list(df.columns)}\n"
summary += f"Data:\n{df.to_string()}"
return summary[:8000], "excel", None
except Exception as e:
print(f" ⚠️ Excel parse error: {e}")
try:
df = pd.read_excel(io.BytesIO(resp.content))
summary = f"Excel file with {len(df)} rows and columns: {list(df.columns)}\n"
summary += f"Data:\n{df.to_string()}"
return summary[:8000], "excel", None
except:
return "Excel file (could not parse)", "excel", None
# PDF files
if "pdf" in ct or ext == "pdf":
try:
import PyPDF2
reader = PyPDF2.PdfReader(io.BytesIO(resp.content))
text_parts = []
for i, page in enumerate(reader.pages):
page_text = page.extract_text() or ""
if page_text:
text_parts.append(f"--- Page {i+1} ---\n{page_text}")
text = "\n".join(text_parts)
return text[:8000] if text else "PDF (no extractable text)", "pdf", None
except ImportError:
print(" ⚠️ PyPDF2 not installed")
return "PDF file (PyPDF2 not available)", "pdf", None
except Exception as e:
print(f" ⚠️ PDF parse error: {e}")
return "PDF file (parse error)", "pdf", None
# Audio files - return raw bytes for transcription
if "audio" in ct or ext in ["mp3", "wav", "m4a", "ogg", "flac"]:
size_kb = len(resp.content) / 1024
print(f" 🎡 Audio file detected ({size_kb:.1f} KB) - will transcribe")
return f"Audio file ({ext or 'unknown'}, {size_kb:.1f} KB)", "audio", resp.content
# Image files - return raw bytes for vision analysis
if "image" in ct or ext in ["png", "jpg", "jpeg", "gif", "webp", "bmp"]:
size_kb = len(resp.content) / 1024
print(f" πŸ–ΌοΈ Image file detected ({size_kb:.1f} KB) - will analyze")
return f"Image file ({ext or 'unknown'}, {size_kb:.1f} KB)", "image", resp.content
# Try to decode as text
try:
text = resp.content.decode("utf-8")
return text[:8000], "text", None
        except UnicodeDecodeError:
            try:
                text = resp.content.decode("latin-1")
                return text[:8000], "text", None
            except Exception:
return f"Binary file ({ct or 'unknown type'}, {len(resp.content)} bytes)", "binary", None
except requests.exceptions.Timeout:
print(" ⚠️ File fetch timeout")
return "", "none", None
except Exception as e:
print(f" ⚠️ File fetch error: {e}")
return "", "none", None
def web_search(query: str, max_results: int = 5) -> List[Dict[str, str]]:
"""Search the web and return results."""
results = []
    # Try the ddgs package first (duckduckgo-search was renamed to ddgs); fall back to the old package below
try:
from ddgs import DDGS
ddgs = DDGS()
for r in ddgs.text(query, max_results=max_results):
results.append({
"title": r.get("title", ""),
"body": r.get("body", ""),
"href": r.get("href", "")
})
if results:
print(f" πŸ” ddgs found {len(results)} results")
return results
except ImportError:
pass
except Exception as e:
print(f" ⚠️ ddgs error: {e}")
# Fallback: try duckduckgo-search package
try:
from duckduckgo_search import DDGS
with DDGS() as ddgs:
for r in ddgs.text(query, max_results=max_results):
results.append({
"title": r.get("title", ""),
"body": r.get("body", ""),
"href": r.get("href", "")
})
if results:
print(f" πŸ” DDG found {len(results)} results")
return results
except ImportError:
print(" ⚠️ duckduckgo-search not installed")
except Exception as e:
print(f" ⚠️ DDG error: {e}")
return results
def search_wikipedia(query: str) -> str:
"""Search Wikipedia and return article content."""
try:
headers = {
"User-Agent": "GAIAAgent/1.0 (https://huggingface.co/spaces; contact@example.com)"
}
# Search for article
search_url = "https://en.wikipedia.org/w/api.php"
params = {
"action": "query",
"list": "search",
"srsearch": query,
"format": "json",
"srlimit": 3
}
resp = requests.get(search_url, params=params, headers=headers, timeout=10)
if resp.status_code != 200:
print(f" ⚠️ Wikipedia search HTTP {resp.status_code}")
return ""
data = resp.json()
results = data.get("query", {}).get("search", [])
if not results:
return ""
# Get the first article
title = results[0]["title"]
# Fetch article content using REST API
encoded_title = requests.utils.quote(title.replace(' ', '_'))
content_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{encoded_title}"
resp = requests.get(content_url, headers=headers, timeout=10)
if resp.status_code == 200:
article = resp.json()
extract = article.get("extract", "")
if extract:
print(f" πŸ“– Wikipedia: {title}")
return f"Wikipedia - {title}:\n{extract}"
return ""
except requests.exceptions.Timeout:
print(f" ⚠️ Wikipedia timeout")
return ""
except Exception as e:
print(f" ⚠️ Wikipedia error: {e}")
return ""
# ==========================================
# GROQ LLM
# ==========================================
def ask_groq(messages: List[Dict], groq_key: str, max_tokens: int = 400, temperature: float = 0.1, model: Optional[str] = None) -> str:
"""Send request to Groq API with retries and model fallback."""
if not groq_key:
print(" ❌ GROQ_API_KEY is empty!")
return ""
# Use specified model or try all models in order
models_to_try = [model] if model else GROQ_MODELS
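    # Up to two attempts per model; a 404 (unknown model) skips ahead to the next model in the list.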
for model_name in models_to_try:
for attempt in range(2): # 2 attempts per model
try:
resp = requests.post(
GROQ_API,
headers={
"Authorization": f"Bearer {groq_key}",
"Content-Type": "application/json"
},
json={
"model": model_name,
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens,
},
timeout=60,
)
if resp.status_code == 200:
result = resp.json()
content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
if content:
print(f" πŸ“ [{model_name}] Response: {content[:80]}...")
return content.strip()
else:
print(f" ⚠️ [{model_name}] Empty content")
elif resp.status_code == 429:
wait_time = 10 * (attempt + 1)
print(f" ⏳ [{model_name}] Rate limited, waiting {wait_time}s...")
time.sleep(wait_time)
elif resp.status_code == 401:
print(f" ❌ Groq API key invalid!")
return ""
elif resp.status_code == 404:
print(f" ⚠️ Model {model_name} not found, trying next...")
break # Try next model
else:
print(f" ⚠️ [{model_name}] HTTP {resp.status_code}: {resp.text[:200]}")
time.sleep(3)
except requests.exceptions.Timeout:
print(f" ⚠️ [{model_name}] Timeout (attempt {attempt + 1}/2)")
time.sleep(5)
except Exception as e:
print(f" ⚠️ [{model_name}] Error: {type(e).__name__}: {e}")
time.sleep(3)
print(" ❌ All Groq attempts failed")
return ""
# ==========================================
# TEXT PROCESSING
# ==========================================
def preprocess_question(question: str) -> str:
"""Handle reversed or scrambled text."""
stripped = question.strip()
# Check for reversed text
reversed_text = stripped[::-1]
# Keywords that indicate proper English text
keywords = ["answer", "what", "who", "how", "find", "list", "which", "where",
"when", "the", "is", "are", "was", "were", "has", "have", "this",
"that", "from", "with", "about", "question", "video", "image",
"write", "opposite", "sentence", "if", "you", "understand"]
orig_score = sum(1 for w in keywords if w in stripped.lower())
rev_score = sum(1 for w in keywords if w in reversed_text.lower())
print(f" πŸ“Š Text analysis: orig_keywords={orig_score}, rev_keywords={rev_score}")
# If reversed text has more keywords, use it
if rev_score > orig_score + 1 and len(stripped) > 20:
print(f" πŸ”„ Detected reversed text!")
print(f" πŸ“ Reversed: {reversed_text[:100]}...")
return reversed_text
# Also check if text starts with punctuation (common in reversed text)
if stripped and stripped[0] in '.!?,;:' and rev_score >= orig_score:
print(f" πŸ”„ Text starts with punctuation, trying reversed")
print(f" πŸ“ Reversed: {reversed_text[:100]}...")
return reversed_text
return stripped
def clean_answer(raw: str) -> str:
"""Extract and clean the final answer from LLM response."""
if not raw:
return ""
answer = raw.strip()
# Take first non-empty line
for line in answer.split("\n"):
line = line.strip()
if line and not line.startswith("#"):
answer = line
break
# Remove common prefixes (case-insensitive)
prefixes = [
"the answer is:", "the answer is", "answer:", "answer is:",
"final answer:", "final answer is:", "the final answer is:",
"the correct answer is:", "the correct answer is",
"result:", "the result is:",
"based on my analysis,", "based on my analysis",
"based on the", "according to",
"sure,", "here is", "here's", "i found that"
]
# Apply prefix removal iteratively
changed = True
max_iterations = 10
iterations = 0
while changed and iterations < max_iterations:
changed = False
iterations += 1
answer_lower = answer.lower()
for prefix in prefixes:
if answer_lower.startswith(prefix):
answer = answer[len(prefix):].strip()
changed = True
break
    # Remove trailing punctuation (period, comma, etc.) but preserve decimal
    # numbers like "3.14": stripping stops at the first trailing digit.
    answer = answer.rstrip('.,:;! ')
# Clean up formatting
answer = answer.replace("**", "").strip('"\'`')
return answer.strip()
def is_valid_answer(answer: str) -> bool:
"""Check if an answer is valid (not a refusal or error)."""
if not answer or len(answer.strip()) < 1:
return False
# If answer is too long, it's probably not a direct answer
if len(answer) > 150:
print(f" ⚠️ Answer too long ({len(answer)} chars), likely not a direct answer")
return False
# Check for refusal phrases at the START of the answer
refusal_starts = [
"no image", "no information", "no transcript", "no data",
"i do not", "i don't", "i cannot", "i can't", "i am not able",
"unable to", "cannot determine", "not able to",
"without access", "i'm not sure", "i am unable",
"there is no", "there's no", "no file", "no video"
]
answer_lower = answer.lower().strip()
for phrase in refusal_starts:
if answer_lower.startswith(phrase):
print(f" ⚠️ Answer starts with refusal: '{phrase}'")
return False
invalid_phrases = [
"i don't know", "i dont know", "i do not know",
"n/a", "error",
"i cannot", "i can't", "i cant",
"not available", "no answer", "unable to",
"i'm not sure", "im not sure", "i am not sure",
"no image", "cannot determine", "insufficient information",
"not provided", "cannot access", "i'm unable", "i am unable",
"not able to", "i am not able", "however,", "based on typical",
"without access", "no transcript", "no information"
]
return not any(phrase in answer_lower for phrase in invalid_phrases)
# ==========================================
# MAIN SOLVER
# ==========================================
SYSTEM_PROMPT = """Answer the question with ONLY the final answer. No explanation.
Format:
- Numbers: just the number (e.g., 5)
- Names: just the name (e.g., John Smith)
- Words: just the word (e.g., right)
- Lists: comma-separated (e.g., a, b, c)
IMPORTANT:
- If counting items from a list or table, count carefully and give the exact number
- If asked for opposite of a word, give that opposite word
- Always give your best answer, never refuse"""
def is_simple_question(question: str) -> bool:
"""Check if question is simple enough to answer without web search."""
q_lower = question.lower()
# Simple questions about opposites, basic facts, math
simple_patterns = [
"opposite of", "antonym of", "what is the opposite",
"write the opposite", "2+2", "2 + 2",
]
return any(p in q_lower for p in simple_patterns) and len(question) < 200
def solve_question(question: str, task_id: str, groq_key: str) -> str:
"""Main function to solve a GAIA question."""
print(f"\n[Q]: {question[:150]}{'...' if len(question) > 150 else ''}")
# Preprocess the question
processed_q = preprocess_question(question)
context_parts = []
# Check if it's a simple question that doesn't need web search
if is_simple_question(processed_q):
print(" ⚑ Simple question detected, answering directly")
answer_raw = ask_groq([
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"Answer this directly: {processed_q}"}
], groq_key, max_tokens=50, temperature=0.0)
answer = clean_answer(answer_raw) if answer_raw else ""
if answer and is_valid_answer(answer):
print(f" βœ… Direct answer: {answer}")
return answer
# 1. Check for attached files
file_content, file_type, file_bytes = fetch_task_file(task_id)
if file_content and file_type != "none":
# Handle images with Vision API
if file_type == "image" and file_bytes:
print(f" πŸ–ΌοΈ Analyzing image with Vision API...")
vision_answer = analyze_image(file_bytes, processed_q, groq_key)
if vision_answer and is_valid_answer(clean_answer(vision_answer)):
# If vision gives a good answer, use it directly
answer = clean_answer(vision_answer)
print(f" βœ… Vision answer: {answer}")
return answer
elif vision_answer:
# Add vision analysis to context
context_parts.append(f"[IMAGE ANALYSIS]:\n{vision_answer}")
# Handle audio with Transcription API
elif file_type == "audio" and file_bytes:
print(f" 🎡 Transcribing audio with Whisper...")
transcript = transcribe_audio(file_bytes, groq_key)
if transcript:
context_parts.append(f"[AUDIO TRANSCRIPTION]:\n{transcript}")
print(f" βœ… Got audio transcript ({len(transcript)} chars)")
else:
context_parts.append(f"[NOTE: Audio file attached but transcription failed.]")
# Normal files
else:
context_parts.append(f"[ATTACHED FILE - {file_type.upper()}]:\n{file_content}")
print(f" πŸ“ Got {file_type} file ({len(file_content)} chars)")
# 2. Process YouTube URLs
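    # Match watch/shorts/youtu.be links; trailing punctuation is trimmed from each URL inside the loop.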
yt_urls = re.findall(r'https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/shorts/)[^\s\)\]]+', processed_q)
for yt_url in yt_urls[:2]: # Limit to 2 videos
clean_url = yt_url.rstrip('.,;:')
print(f" 🎬 Fetching transcript: {clean_url}")
transcript = fetch_youtube_transcript(clean_url)
if transcript:
context_parts.append(f"[YOUTUBE VIDEO TRANSCRIPT]:\n{transcript}")
else:
# Try to search for information about this video
vid_match = re.search(r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})', clean_url)
if vid_match:
vid_id = vid_match.group(1)
print(f" πŸ” No transcript, searching for video info: {vid_id}")
video_results = web_search(f"youtube {vid_id} video content summary", max_results=3)
if video_results:
snippets = "\n".join([f"β€’ {r.get('title', '')}: {r.get('body', '')}" for r in video_results])
context_parts.append(f"[YOUTUBE VIDEO INFO (no transcript available)]:\nVideo URL: {clean_url}\nSearch results about this video:\n{snippets}")
else:
context_parts.append(f"[YOUTUBE VIDEO]: {clean_url} - No transcript or info available.")
else:
context_parts.append(f"[YOUTUBE VIDEO]: {clean_url} - Could not process.")
# 3. Process other URLs
other_urls = re.findall(r'https?://[^\s\)\]]+', processed_q)
other_urls = [u.rstrip('.,;:') for u in other_urls
if "youtube.com" not in u and "youtu.be" not in u]
for url in other_urls[:2]: # Limit to 2 URLs
print(f" 🌐 Fetching page: {url[:60]}...")
page_content = fetch_webpage(url)
if page_content:
context_parts.append(f"[WEBPAGE: {url}]:\n{page_content}")
# 4. Web search for additional context
# Skip search if we have good file data (Excel/CSV with actual data)
should_search = True
if file_type in ["excel", "csv"] and len(file_content) > 500:
should_search = False # We have data to analyze
print(" ⏭️ Skipping search - using file data")
if should_search and not yt_urls:
# Generate search query
        search_query = processed_q[:200]
        # Try to extract key search terms with the LLM
        search_terms = ask_groq([
            {"role": "system", "content": "Extract the key search terms from this question. Output ONLY the search query (3-8 words), nothing else."},
            {"role": "user", "content": processed_q[:400]}
        ], groq_key, max_tokens=30, temperature=0.0)
        if search_terms and 3 < len(search_terms) < 100:
            search_query = search_terms
print(f" πŸ” Searching: '{search_query[:50]}'")
# Try web search
results = web_search(search_query, max_results=5)
if results:
# Add search snippets - these are often the most useful
snippets = "\n".join([f"β€’ {r.get('title', '')}: {r.get('body', '')}" for r in results])
context_parts.append(f"[SEARCH RESULTS]:\n{snippets}")
# Fetch Wikipedia page if in results (most reliable)
wiki_fetched = False
for r in results:
href = r.get("href", "")
if "wikipedia.org" in href and not wiki_fetched:
page = fetch_webpage(href)
if page and len(page) > 500:
context_parts.append(f"[WIKIPEDIA PAGE]:\n{page[:6000]}")
wiki_fetched = True
print(f" πŸ“– Fetched Wikipedia: {href[:50]}")
break
# If no Wikipedia, fetch first non-wiki result
if not wiki_fetched:
for r in results[:2]:
href = r.get("href", "")
if href and "youtube" not in href:
page = fetch_webpage(href)
if page and len(page) > 300:
context_parts.append(f"[WEB PAGE]:\n{page[:4000]}")
print(f" 🌐 Fetched: {href[:50]}")
break
# Also try direct Wikipedia search
wiki_content = search_wikipedia(search_query)
if wiki_content and "[WIKIPEDIA PAGE]" not in str(context_parts):
context_parts.append(f"[WIKIPEDIA]:\n{wiki_content}")
# 5. Build context and query LLM
context = "\n\n".join(context_parts) if context_parts else ""
# Truncate context if too long
if len(context) > 12000:
context = context[:12000] + "\n[...truncated]"
# Check if this is a counting/analysis question
is_counting_q = any(w in processed_q.lower() for w in ['how many', 'count', 'number of', 'total'])
is_list_q = any(w in processed_q.lower() for w in ['list', 'name all', 'what are'])
# First attempt with context - use 2-step for complex questions
if context and (is_counting_q or is_list_q):
# Step 1: Extract relevant data
extract_prompt = f"""From this context, extract ONLY the specific information needed to answer the question.
Context: {context[:8000]}
Question: {processed_q}
List the relevant facts (be brief):"""
extracted = ask_groq([
{"role": "user", "content": extract_prompt}
], groq_key, max_tokens=500, temperature=0.0)
if extracted:
print(f" πŸ“‹ Extracted: {extracted[:150]}...")
# Step 2: Answer based on extracted info
answer_raw = ask_groq([
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"Based on these facts:\n{extracted}\n\nQuestion: {processed_q}\n\nFinal answer (just the answer, nothing else):"}
], groq_key, max_tokens=100, temperature=0.0)
else:
answer_raw = ""
elif context:
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {processed_q}\n\nAnswer:"}
]
answer_raw = ask_groq(messages, groq_key, max_tokens=100, temperature=0.1)
else:
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"Question: {processed_q}\n\nAnswer:"}
]
answer_raw = ask_groq(messages, groq_key, max_tokens=100, temperature=0.1)
answer = clean_answer(answer_raw) if answer_raw else ""
print(f" πŸ“€ Raw: '{answer_raw[:100] if answer_raw else '[empty]'}' -> Clean: '{answer}'")
# If answer isn't valid, try again with more forceful prompt
if not is_valid_answer(answer):
print(f" ⚠️ First attempt invalid: '{answer}', retrying...")
# More forceful prompt
retry_messages = [
{"role": "system", "content": "Give ONLY the answer. One word or number if possible."},
{"role": "user", "content": f"{processed_q}"}
]
answer_raw = ask_groq(retry_messages, groq_key, max_tokens=50, temperature=0.2)
answer = clean_answer(answer_raw) if answer_raw else ""
print(f" πŸ“€ Retry: '{answer}'")
# If still not valid, try one more time with knowledge-based approach
if not is_valid_answer(answer):
print(f" ⚠️ Second attempt invalid: '{answer}', trying knowledge-based...")
retry_messages = [
{"role": "system", "content": "Give ONLY the answer, nothing else. Best guess if unsure."},
{"role": "user", "content": processed_q}
]
answer_raw = ask_groq(retry_messages, groq_key, max_tokens=50, temperature=0.5)
answer = clean_answer(answer_raw) if answer_raw else ""
print(f" πŸ“€ Third try raw: '{answer_raw[:100] if answer_raw else '[empty]'}' -> Clean: '{answer}'")
# If still no valid answer but we have some text, extract first meaningful chunk
if not answer or len(answer.strip()) == 0 or not is_valid_answer(answer):
if answer_raw and len(answer_raw.strip()) > 0:
# Try to extract just the answer part
lines = answer_raw.strip().split('\n')
for line in lines:
line = line.strip()
if line and len(line) < 100 and not any(x in line.lower() for x in ['cannot', "don't know", 'unable', 'no image']):
answer = clean_answer(line)
print(f" πŸ”„ Extracted from response: '{answer}'")
break
# Absolute final fallback
if not answer or len(answer.strip()) == 0:
answer = "unknown"
print(f" ❌ No answer found, defaulting to 'unknown'")
print(f" βœ… Final Answer: {answer}")
return answer
# ==========================================
# GRADIO INTERFACE
# ==========================================
def run_and_submit_all(profile: gr.OAuthProfile | None):
"""Run the agent on all questions and submit answers."""
space_id = os.getenv("SPACE_ID", "")
    if not profile:
        return "Please log in with Hugging Face to continue.", None
username = profile.username
groq_key = os.getenv("GROQ_API_KEY", "")
    if not groq_key:
        return "❌ GROQ_API_KEY is not set! Add the key in the Space settings.", None
print(f"\n{'='*60}")
print(f"πŸ‘€ User: {username}")
print(f"πŸ€– Agent: GAIA Agent v5")
print(f"πŸ”‘ API Key: {groq_key[:8]}...{groq_key[-4:]}")
print(f"{'='*60}")
# Test Groq API connectivity first
print("\nπŸ” Testing Groq API connectivity...")
test_response = ask_groq(
[{"role": "user", "content": "Say 'OK' and nothing else."}],
groq_key, max_tokens=10, temperature=0.0
)
if not test_response:
return "❌ Groq API test failed! Check your API key and try again.", None
print(f"βœ… Groq API test passed: '{test_response}'")
# Fetch questions
try:
resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
resp.raise_for_status()
questions = resp.json()
except Exception as e:
return f"❌ Errore nel recupero delle domande: {e}", None
print(f"\nπŸ“‹ {len(questions)} domande da processare\n")
results = []
answers = []
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
for i, item in enumerate(questions):
task_id = item.get("task_id", "")
q = item.get("question")
if not task_id or q is None:
print(f"[{i+1}] Skipping invalid item")
continue
print(f"\n{'─'*60}")
print(f"[{i+1}/{len(questions)}] Task: {task_id[:20]}...")
try:
answer = solve_question(q, task_id, groq_key)
except Exception as e:
print(f" πŸ’₯ Exception: {e}")
traceback.print_exc()
answer = "I don't know"
answers.append({
"task_id": task_id,
"submitted_answer": answer
})
results.append({
"Task ID": task_id[:20] + "...",
"Question": q[:80] + ("..." if len(q) > 80 else ""),
"Answer": answer
})
# Rate limit protection - increase delay between questions
time.sleep(2.5)
if not answers:
return "❌ Nessuna risposta generata.", pd.DataFrame(results)
# Submit answers
print(f"\n{'='*60}")
print(f"πŸ“€ Submitting {len(answers)} answers...")
try:
submit_resp = requests.post(
f"{DEFAULT_API_URL}/submit",
json={
"username": username,
"agent_code": agent_code,
"answers": answers
},
timeout=60,
)
submit_resp.raise_for_status()
result = submit_resp.json()
score = result.get('score', 'N/A')
correct = result.get('correct_count', '?')
total = result.get('total_attempted', '?')
message = result.get('message', '')
status = f"""βœ… Completato!
πŸ‘€ {result.get('username')}
πŸ† {score}% ({correct}/{total})
πŸ“ {message}"""
print(f"\n{status}")
return status, pd.DataFrame(results)
except Exception as e:
error_msg = f"❌ Errore nell'invio: {e}"
print(error_msg)
return error_msg, pd.DataFrame(results)
def create_demo():
"""Build and return the Gradio interface."""
with gr.Blocks(title="GAIA Agent v5") as demo:
gr.Markdown("""# πŸš€ GAIA Agent v5
**Full-featured agent with Vision & Audio!**
- 🧠 Groq Llama 3.3 70B for reasoning
- πŸ‘οΈ Llama 3.2 Vision for image analysis
- 🎀 Whisper for audio transcription
- πŸ” Smart web search + Wikipedia
- πŸ“Ί YouTube transcript extraction
- πŸ“ File parsing (CSV, Excel, PDF, Python)
""")
gr.LoginButton()
        run_button = gr.Button("🔥 Run Evaluation", variant="primary", size="lg")
status_output = gr.Textbox(
label="Risultato",
lines=6,
interactive=False
)
results_table = gr.DataFrame(
label="Risposte",
wrap=True
)
run_button.click(
fn=run_and_submit_all,
outputs=[status_output, results_table]
)
return demo
if __name__ == "__main__":
demo = create_demo()
demo.queue(default_concurrency_limit=1).launch(debug=True, share=False)