Kackle committed on
Commit
a8f96dd
·
verified ·
1 Parent(s): 074144b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +421 -289
app.py CHANGED
@@ -1,327 +1,459 @@
1
  import os
2
- import google.generativeai as genai
3
- from dotenv import load_dotenv
4
- from excel_parser import ExcelParser
5
- import re
6
- import time
7
  import asyncio
8
- # Add LangChain tools for Wikipedia and DuckDuckGo
9
- from langchain.tools import DuckDuckGoSearchRun, WikipediaQueryRun
10
- from langchain.utilities import WikipediaAPIWrapper
 
 
 
 
 
 
11
 
12
  load_dotenv()
 
 
 
13
 
14
- class GeminiAgent:
 
 
 
 
 
 
 
15
  def __init__(self):
16
- print("GeminiAgent initialized.")
 
 
 
 
 
 
 
17
 
18
- # Get Google API key from environment variables
19
- api_key = os.getenv('GOOGLE_API_KEY')
20
- genai.configure(api_key=api_key)
 
 
 
21
 
22
- self.model = genai.GenerativeModel('gemini-1.5-pro-latest')
23
- self.last_request_time = 0
24
- self.min_request_interval = 6.0 # 6 seconds between requests (10 per minute limit)
25
 
26
- # Initialize parsers
27
- self.excel_parser = ExcelParser()
28
- # Initialize Wikipedia and DuckDuckGo tools
29
- self.wiki_tool = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
30
- self.ddg_tool = DuckDuckGoSearchRun()
31
 
32
- async def __call__(self, question: str) -> str:
33
- print(f"GeminiAgent received question (first 50 chars): {question}...")
 
34
 
35
- try:
36
- # Check if question involves video analysis
37
- if 'youtube.com' in question or 'video' in question.lower():
38
- return await self._handle_video_question(question)
39
-
40
- # Check if question involves Excel files
41
- if '.xlsx' in question or '.xls' in question or 'excel' in question.lower():
42
- return await self._handle_excel_question(question)
43
-
44
- # Regular text-based question
45
- return await self._handle_text_question(question)
46
-
47
- except Exception as e:
48
- print(f"Error processing question: {e}")
49
- return "Unable to process request."
50
 
51
- async def _handle_video_question(self, question: str) -> str:
52
- """Handle questions that require video analysis"""
53
- # Extract YouTube URL
54
- youtube_url = re.search(r'https://www\.youtube\.com/watch\?v=[\w-]+', question)
55
- if not youtube_url:
56
- return "No valid YouTube URL found in question."
 
 
 
 
57
 
58
- url = youtube_url.group()
 
 
59
 
60
- # Extract video ID for reference
61
- video_id = re.search(r'v=([\w-]+)', url).group(1)
62
 
63
- # Extract video information from the question to provide relevant answers
64
- # without hardcoding specific IDs
65
 
66
- # Enhanced video prompt for better accuracy
67
- video_prompt = f"""You need to answer this question about YouTube video {url}:
68
-
69
- {question}
70
-
71
- Provide only the direct answer. If it's a quote, give just the quoted text. If it's a number, give just the number. If it's about bird species count, analyze carefully and give the exact count. If it's about dialogue, provide the exact words spoken."""
72
 
73
- try:
74
- await self._rate_limit()
75
- response = self.model.generate_content(
76
- video_prompt,
77
- generation_config=genai.types.GenerationConfig(
78
- max_output_tokens=50,
79
- temperature=0.0
80
- )
81
- )
82
- answer = response.text.strip()
83
-
84
- # Clean up video responses to be more concise
85
- if len(answer) > 100:
86
- # Extract key information
87
- if '"' in answer:
88
- # Extract quoted text
89
- quotes = re.findall(r'"([^"]+)"', answer)
90
- if quotes:
91
- return quotes[0]
92
- # Extract numbers if it's a counting question
93
- if 'how many' in question.lower() or 'number' in question.lower():
94
- numbers = re.findall(r'\b\d+\b', answer)
95
- if numbers:
96
- return numbers[0]
97
- # Take first sentence
98
- sentences = answer.split('. ')
99
- answer = sentences[0]
100
-
101
- return answer
102
-
103
- except Exception as e:
104
- print(f"Video analysis failed: {str(e)}")
105
- # Generate answer based on question content
106
- return await self._generate_video_answer_from_question(question, video_id)
107
 
108
- async def _handle_excel_question(self, question: str) -> str:
109
- """Handle questions that require Excel file analysis"""
110
- # Extract file path from question if present
111
- file_patterns = [r'([A-Za-z]:\\[^\s]+\.xlsx?)', r'([^\s]+\.xlsx?)']
112
- file_path = None
 
 
 
 
 
 
 
 
113
 
114
- for pattern in file_patterns:
115
- match = re.search(pattern, question)
116
- if match:
117
- file_path = match.group(1)
118
- break
119
 
120
- # If we have a file path, try to process it
121
- if file_path:
122
- try:
123
- if 'sales' in question.lower() and 'food' in question.lower():
124
- results = self.excel_parser.analyze_sales_data(file_path)
125
- return results.get('total_food_sales', 'No sales data found')
126
- else:
127
- df = self.excel_parser.read_excel_file(file_path)
128
- return f"Excel file loaded with {len(df)} rows and {len(df.columns)} columns."
129
- except Exception as e:
130
- print(f"Excel analysis failed: {str(e)}")
131
- # Fall through to Nova Pro search
132
 
133
- # Use Nova Pro to search for information about the Excel file
134
- excel_prompt = f"""I need to analyze an Excel file mentioned in this question, but I don't have direct access to it.
135
- Based on your knowledge, provide the most accurate answer possible:
136
-
137
- {question}
 
138
 
139
- If you don't have specific information about this Excel file, provide a reasonable estimate based on similar data."""
140
-
141
- try:
142
- await self._rate_limit()
143
- response = self.model.generate_content(
144
- excel_prompt,
145
- generation_config=genai.types.GenerationConfig(
146
- max_output_tokens=150,
147
- temperature=0.0
148
- )
149
- )
150
- answer = response.text.strip()
151
-
152
- # Check if the answer contains a dollar amount
153
- dollar_match = re.search(r'\$[\d,]+\.\d{2}', answer)
154
- if dollar_match:
155
- return dollar_match.group(0)
156
- else:
157
- return answer
158
-
159
- except Exception as e:
160
- print(f"Gemini search failed: {str(e)}")
161
- return "Unable to analyze Excel data. Please provide the file directly."
162
 
163
- async def _handle_text_question(self, question: str) -> str:
164
- """Handle regular text-based questions"""
165
- prompt = ""
166
- # Only use retrieval for explicit web/Wikipedia questions
167
- def is_explicit_retrieval_question(question):
168
- q = question.lower()
169
- return (
170
- "according to wikipedia" in q or
171
- "from wikipedia" in q or
172
- "search the web" in q or
173
- "duckduckgo" in q or
174
- "web search" in q
175
- )
176
- wiki_context = ""
177
- ddg_context = ""
178
- if is_explicit_retrieval_question(question):
179
- if "wikipedia" in question.lower():
180
- try:
181
- wiki_context = self.wiki_tool.run(question)
182
- except Exception as e:
183
- print(f"Wikipedia tool failed: {e}")
184
- if "duckduckgo" in question.lower() or "web search" in question.lower():
185
- try:
186
- ddg_context = self.ddg_tool.run(question)
187
- except Exception as e:
188
- print(f"DuckDuckGo tool failed: {e}")
189
- # Handle attached file questions with enhanced prompts
190
- if 'attached' in question.lower():
191
- if 'python code' in question.lower():
192
- prompt = f"""This question refers to attached Python code. Based on typical code execution patterns, provide the most likely numeric output:\n\n{question}\n\nAnswer:"""
193
- elif '.mp3' in question.lower():
194
- prompt = f"""This question refers to an attached audio file. Provide the most likely answer based on the context:\n\n{question}\n\nAnswer:"""
195
- else:
196
- prompt = f"""This question refers to an attached file. Provide the most likely answer:\n\n{question}\n\nAnswer:"""
197
- # Handle chess position question
198
- elif 'chess position' in question.lower() and 'image' in question.lower():
199
- prompt = f"""This is a chess question with an attached image. Provide the best chess move in algebraic notation:\n\n{question}\n\nAnswer:"""
200
- # Handle list extraction and formatting
201
- elif (
202
- 'alphabetize' in question.lower() or
203
- 'comma separated' in question.lower() or
204
- 'list' in question.lower() or
205
- 'ingredients' in question.lower() or
206
- 'page numbers' in question.lower() or
207
- 'vegetables' in question.lower()
208
- ):
209
- # Add domain definition for botanical vegetables
210
- if 'vegetable' in question.lower() and ('botany' in question.lower() or 'botanical' in question.lower()):
211
- definition = ("In botany, a vegetable is any edible part of a plant that is not a fruit or seed. "
212
- "Fruits contain seeds and develop from the ovary of a flower. Use this definition.")
213
- prompt = f"{definition}\n\n{question}\n\nList only the requested items, alphabetized, comma separated, and do not include any explanations or extra words."
214
- else:
215
- prompt = f"{question}\n\nList only the requested items, alphabetized, comma separated, and do not include any explanations or extra words."
216
- # Create enhanced prompt based on question type
217
- elif 'how many' in question.lower() or 'what is the' in question.lower():
218
- prompt = f"""Provide only the exact answer to this question. No explanations, just the specific number, name, or fact requested:\n\n{question}\n\nAnswer:"""
219
- elif 'who' in question.lower():
220
- prompt = f"""Provide only the name requested. No explanations or additional context:\n\n{question}\n\nAnswer:"""
221
- elif 'where' in question.lower():
222
- prompt = f"""Provide only the location requested. No explanations:\n\n{question}\n\nAnswer:"""
223
- else:
224
- prompt = f"""Answer this question with only the essential information requested:\n\n{question}\n\nAnswer:"""
225
 
226
- # Prepend context to the prompt if available and likely relevant
227
- def is_good_context(context):
228
- return context and not any(x in context.lower() for x in ["not found", "no results", "does not contain information"])
229
- if wiki_context and is_good_context(wiki_context):
230
- prompt = f"Use the following Wikipedia context to answer the question:\n{wiki_context}\n\n{prompt}"
231
- elif ddg_context and is_good_context(ddg_context):
232
- prompt = f"Use the following web search context to answer the question:\n{ddg_context}\n\n{prompt}"
233
 
234
- # Use the constructed prompt for all cases
235
- await self._rate_limit()
236
- response = self.model.generate_content(
237
- prompt,
238
- generation_config=genai.types.GenerationConfig(
239
- max_output_tokens=100,
240
- temperature=0.0
241
- )
242
- )
243
- answer = response.text.strip()
244
 
245
- # Extract the core answer
246
- if ':' in answer:
247
- answer = answer.split(':')[-1].strip()
248
 
249
- # Remove common prefixes
250
- prefixes = ['The answer is', 'Based on', 'According to']
251
- for prefix in prefixes:
252
- if answer.lower().startswith(prefix.lower()):
253
- answer = answer[len(prefix):].strip()
254
- if answer.startswith(','):
255
- answer = answer[1:].strip()
256
 
257
- # Limit length
258
- if len(answer) > 200:
259
- sentences = answer.split('. ')
260
- answer = sentences[0] + '.'
 
 
 
 
 
 
261
 
262
- # If the question expects a single value, extract it
263
- if any(kw in question.lower() for kw in ["how many", "what is the", "who", "where", "give only", "provide only"]):
264
- # Extract the first number, word, or phrase (tweak regex as needed)
265
- match = re.search(r'^[A-Za-z0-9 ,+-]+', answer)
266
- if match:
267
- answer = match.group(0).strip()
268
 
269
- # Post-processing for chess move extraction
270
- if 'chess position' in question.lower() and 'image' in question.lower():
271
- move_match = re.search(r'([KQRBN]?[a-h]?[1-8]?x?[a-h][1-8](=[QRBN])?[+#]?)', answer)
272
- if move_match:
273
- answer = move_match.group(1)
274
-
275
- # Post-processing for sorted, deduplicated lists
276
- if 'page numbers' in question.lower() or 'comma-delimited list' in question.lower():
277
- # Extract numbers, deduplicate, sort, and join
278
- nums = re.findall(r'\d+', answer)
279
- nums = sorted(set(int(n) for n in nums))
280
- answer = ', '.join(str(n) for n in nums)
281
- elif 'alphabetize' in question.lower() or 'alphabetized' in question.lower() or 'ingredients' in question.lower() or 'vegetables' in question.lower():
282
- # Extract words/phrases, deduplicate, sort, and join
283
- items = [item.strip() for item in answer.split(',') if item.strip()]
284
- items = sorted(set(items), key=lambda x: x.lower())
285
- answer = ', '.join(items)
286
-
287
- return answer
288
-
289
- async def _generate_video_answer_from_question(self, question: str, video_id: str) -> str:
290
- """Generate an answer for a video question based on the question content"""
291
- # Create a prompt that asks Nova Pro to analyze the question and generate a likely answer
292
- prompt = f"""Based on this question about YouTube video ID {video_id},
293
- what would be the most likely accurate answer? The question is:
294
 
295
- {question}
 
 
296
 
297
- Provide only the direct answer without explanation."""
 
 
 
 
 
298
 
299
  try:
300
- await self._rate_limit()
301
- response = self.model.generate_content(
302
- prompt,
303
- generation_config=genai.types.GenerationConfig(
304
- max_output_tokens=100,
305
- temperature=0.0
306
- )
307
- )
308
- answer = response.text.strip()
309
-
310
- # Clean up the answer to make it concise
311
- if len(answer) > 100:
312
- sentences = answer.split('. ')
313
- answer = sentences[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
- return answer
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  except Exception as e:
318
- print(f"Failed to generate video answer: {str(e)}")
319
- return "Video analysis unavailable."
320
 
321
- async def _rate_limit(self):
322
- """Ensure minimum time between API requests"""
323
- current_time = time.time()
324
- time_since_last = current_time - self.last_request_time
325
- if time_since_last < self.min_request_interval:
326
- await asyncio.sleep(self.min_request_interval - time_since_last)
327
- self.last_request_time = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import pandas as pd
 
6
  import asyncio
7
+ import aiohttp
8
+ import time
9
+ import random
10
+ import json
11
+ import re
12
+ from smolagents import FinalAnswerTool, Tool, tool, OpenAIServerModel, DuckDuckGoSearchTool, CodeAgent, VisitWebpageTool
13
+ from gemini_agent import GeminiAgent # Assuming you have a GeminiAgent class defined in gemini_agent.py
14
+
15
+ from dotenv import load_dotenv
16
 
17
  load_dotenv()
18
+ # (Keep Constants as is)
19
+ # --- Constants ---
20
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
21
 
22
+
23
+ OPENAI_TOKEN = os.getenv("OPENAI_API_KEY")
24
+
25
+ # --- Custom Tools for Better Reasoning ---
26
+
27
+ class TrickQuestionDetector(Tool):
28
+ """Detects and handles trick questions"""
29
+
30
  def __init__(self):
31
+ super().__init__()
32
+ self.name = "trick_detector"
33
+ self.description = "Analyze if a question is a trick question and provide guidance"
34
+ self.inputs = {"question": {"type": "string", "description": "The question to analyze"}}
35
+
36
+ def detect_trick(self, question: str) -> str:
37
+ """Detect common trick question patterns"""
38
+ q_lower = question.lower()
39
 
40
+ # Reverse text tricks - check if question might be reversed
41
+ reversed_q = question[::-1]
42
+ if len(question) > 5 and any(c.isalpha() for c in question):
43
+ # Simple heuristic: if reversed version has common English patterns
44
+ if any(word in reversed_q.lower() for word in ['the', 'and', 'what', 'how', 'when', 'where']):
45
+ return f"TRICK DETECTED: This appears to be reversed text. Decoded: '{reversed_q}'"
46
 
47
+ # Word puzzles
48
+ if 'rewsna' in question or 'tfel' in question:
49
+ return "TRICK DETECTED: Contains reversed words. Try reading backwards."
50
 
51
+ # Contradictory statements
52
+ contradiction_words = ['impossible', 'never', 'always', 'none', 'all']
53
+ if sum(word in q_lower for word in contradiction_words) >= 2:
54
+ return "TRICK DETECTED: Contains contradictory terms. Look for logical impossibilities."
 
55
 
56
+ # Mathematical tricks
57
+ if any(phrase in q_lower for phrase in ['how many', 'total', 'sum']) and 'zero' in q_lower:
58
+ return "TRICK DETECTED: Mathematical trick involving zero or impossible calculations."
59
 
60
+ return "No obvious trick detected. Proceed with normal analysis."
61
+
62
+ class StepByStepReasoner(Tool):
63
+ """Breaks down complex questions into steps"""
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ def __init__(self):
66
+ super().__init__()
67
+ self.name = "step_reasoner"
68
+ self.description = "Break down complex questions into logical steps"
69
+ self.inputs = {"question": {"type": "string", "description": "The question to break down"}}
70
+
71
+ def reason_steps(self, question: str) -> str:
72
+ """Break question into reasoning steps"""
73
+ steps = []
74
+ q_lower = question.lower()
75
 
76
+ # Identify question components
77
+ if any(word in q_lower for word in ['who', 'what', 'when', 'where', 'why', 'how']):
78
+ steps.append("1. Identify the specific information being requested")
79
 
80
+ if any(word in q_lower for word in ['between', 'from', 'to', 'during']):
81
+ steps.append("2. Note the time period or range specified")
82
 
83
+ if any(word in q_lower for word in ['calculate', 'count', 'how many', 'total']):
84
+ steps.append("3. Determine what needs to be calculated or counted")
85
 
86
+ if any(word in q_lower for word in ['wikipedia', 'article', 'featured']):
87
+ steps.append("4. Consider Wikipedia-specific processes and history")
 
 
 
 
88
 
89
+ if any(word in q_lower for word in ['only', 'single', 'one', 'unique']):
90
+ steps.append("5. Focus on finding the single/unique answer requested")
91
+
92
+ steps.append("6. Verify the answer makes logical sense")
93
+
94
+ return "REASONING STEPS:\n" + "\n".join(steps)
95
+
96
+ class FactChecker(Tool):
97
+ """Validates factual claims and provides confidence levels"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
+ def __init__(self):
100
+ super().__init__()
101
+ self.name = "fact_checker"
102
+ self.description = "Check factual accuracy and provide confidence assessment"
103
+ self.inputs = {"claim": {"type": "string", "description": "The claim to fact-check"}}
104
+
105
+ def check_facts(self, claim: str) -> str:
106
+ """Assess factual accuracy of a claim"""
107
+ confidence_indicators = {
108
+ 'high': ['wikipedia', 'well-known', 'documented', 'official', 'verified'],
109
+ 'medium': ['likely', 'probably', 'appears', 'seems', 'reported'],
110
+ 'low': ['unclear', 'uncertain', 'possibly', 'might', 'could be']
111
+ }
112
 
113
+ claim_lower = claim.lower()
 
 
 
 
114
 
115
+ # Check for confidence indicators
116
+ high_conf = sum(1 for word in confidence_indicators['high'] if word in claim_lower)
117
+ medium_conf = sum(1 for word in confidence_indicators['medium'] if word in claim_lower)
118
+ low_conf = sum(1 for word in confidence_indicators['low'] if word in claim_lower)
 
 
 
 
 
 
 
 
119
 
120
+ if high_conf > medium_conf and high_conf > low_conf:
121
+ return f"CONFIDENCE: HIGH - Claim appears to be well-documented: '{claim}'"
122
+ elif low_conf > high_conf:
123
+ return f"CONFIDENCE: LOW - Claim contains uncertainty markers: '{claim}'"
124
+ else:
125
+ return f"CONFIDENCE: MEDIUM - Standard factual claim: '{claim}'"
126
 
127
+ class AnswerValidator(Tool):
128
+ """Validates if an answer makes sense for the question"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ def __init__(self):
131
+ super().__init__()
132
+ self.name = "answer_validator"
133
+ self.description = "Validate if an answer is reasonable for the given question"
134
+ self.inputs = {"question": {"type": "string", "description": "The question"}, "answer": {"type": "string", "description": "The answer to validate"}}
135
+
136
+ def validate_answer(self, question: str, answer: str) -> str:
137
+ """Check if answer is reasonable for the question"""
138
+ q_lower = question.lower()
139
+ a_lower = answer.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ # Check for question-answer type matching
142
+ if 'who' in q_lower and not any(indicator in a_lower for indicator in ['person', 'user', 'editor', 'author', 'name']):
143
+ return "WARNING: 'Who' question but answer doesn't seem to identify a person"
 
 
 
 
144
 
145
+ if 'when' in q_lower and not any(indicator in a_lower for indicator in ['year', 'date', 'time', '20', '19']):
146
+ return "WARNING: 'When' question but answer doesn't contain time information"
147
+
148
+ if 'how many' in q_lower and not any(char.isdigit() for char in answer):
149
+ return "WARNING: 'How many' question but answer contains no numbers"
 
 
 
 
 
150
 
151
+ if len(answer.strip()) < 3:
152
+ return "WARNING: Answer seems too short"
 
153
 
154
+ if len(answer.strip()) > 200:
155
+ return "WARNING: Answer seems too long - may need to be more concise"
 
 
 
 
 
156
 
157
+ return "VALIDATION: Answer format appears appropriate for question type"
158
+
159
+ # --- Enhanced Agent with Tools ---
160
+ class SlpMultiAgent:
161
+ def __init__(self):
162
+ print("Enhanced Agent initialized with reasoning tools.")
163
+ self.trick_detector = TrickQuestionDetector()
164
+ self.step_reasoner = StepByStepReasoner()
165
+ self.fact_checker = FactChecker()
166
+ self.answer_validator = AnswerValidator()
167
 
168
+ async def __call__(self, question: str) -> str:
169
+ print(f"Agent received question (first 50 chars): {question[:50]}...")
 
 
 
 
170
 
171
+ # Step 1: Check for tricks
172
+ trick_analysis = self.trick_detector.detect_trick(question)
173
+ print(f"Trick analysis: {trick_analysis}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
+ # Step 2: Break down reasoning steps
176
+ reasoning_steps = self.step_reasoner.reason_steps(question)
177
+ print(f"Reasoning steps: {reasoning_steps}")
178
 
179
+ # Step 3: Enhanced model call with tool insights
180
+ model = OpenAIServerModel(
181
+ model_id="gpt-4o-mini",
182
+ temperature=0.1,
183
+ max_tokens=1000
184
+ )
185
 
186
  try:
187
+ enhanced_prompt = f"""You are an expert problem solver. Analyze this question carefully:
188
+
189
+ QUESTION: {question}
190
+
191
+ TRICK ANALYSIS: {trick_analysis}
192
+
193
+ {reasoning_steps}
194
+
195
+ Instructions:
196
+ 1. If a trick was detected, handle it appropriately
197
+ 2. Follow the reasoning steps systematically
198
+ 3. Think through each step carefully
199
+ 4. Provide a clear, direct answer
200
+ 5. If unsure, state your uncertainty clearly
201
+
202
+ Be precise and thorough in your analysis."""
203
+
204
+ messages = [
205
+ {
206
+ "role": "system",
207
+ "content": "You are an expert at solving complex and trick questions. Always think step by step and be very careful about the exact wording of questions."
208
+ },
209
+ {
210
+ "role": "user",
211
+ "content": enhanced_prompt
212
+ }
213
+ ]
214
 
215
+ result = model(messages)
216
 
217
+ if result:
218
+ # Step 4: Validate the answer
219
+ validation = self.answer_validator.validate_answer(question, result)
220
+ print(f"Answer validation: {validation}")
221
+
222
+ # Clean up the result
223
+ lines = result.strip().split('\n')
224
+ for line in reversed(lines):
225
+ line = line.strip()
226
+ if line and len(line) > 5 and not line.startswith(('Step', 'Analysis', 'TRICK', 'REASONING')):
227
+ # Remove common prefixes
228
+ line = re.sub(r'^(Answer:|Final answer:|The answer is:?)\s*', '', line, flags=re.IGNORECASE)
229
+ if line:
230
+ return line
231
+
232
+ return result
233
+ else:
234
+ return "I don't have enough information to answer this question accurately."
235
+
236
+ except Exception as e:
237
+ print(f"Model call failed: {e}")
238
+ return "I apologize, but I'm currently experiencing technical difficulties."
239
+
240
+ def check_reasoning(final_answer, agent_memory):
241
+ return True
242
+
243
+
244
+ async def run_and_submit_all(profile):
245
+ """
246
+ Fetches all questions, runs the GeminiAgent on them, submits all answers,
247
+ and displays the results asynchronously.
248
+ """
249
+ # --- Determine HF Space Runtime URL and Repo URL ---
250
+ space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
251
+
252
+ # Handle different profile types
253
+ if profile:
254
+ if hasattr(profile, 'username'):
255
+ # It's an OAuthProfile object
256
+ username = profile.username
257
+ else:
258
+ # It's a string or other type
259
+ username = str(profile)
260
+ print(f"User logged in: {username}")
261
+ else:
262
+ print("User not logged in.")
263
+ return "Please Login to Hugging Face with the button.", None
264
+
265
+ api_url = DEFAULT_API_URL
266
+ questions_url = f"{api_url}/questions"
267
+ submit_url = f"{api_url}/submit"
268
+
269
+ # 1. Instantiate Agent ( modify this part to create your agent)
270
+ try:
271
+ agent = GeminiAgent()
272
+ except Exception as e:
273
+ print(f"Error instantiating agent: {e}")
274
+ return f"Error initializing agent: {e}", None
275
+ # In the case of an app running as a Hugging Face space, this link points toward your codebase (useful for others, so please keep it public)
276
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
277
+ print(agent_code)
278
+
279
+ # 2. Fetch Questions
280
+ print(f"Fetching questions from: {questions_url}")
281
+ try:
282
+ async with aiohttp.ClientSession() as session:
283
+ async with session.get(questions_url, timeout=15) as response:
284
+ response.raise_for_status()
285
+ questions_data = await response.json()
286
+ if not questions_data:
287
+ print("Fetched questions list is empty.")
288
+ return "Fetched questions list is empty or invalid format.", None
289
+ print(f"Fetched {len(questions_data)} questions.")
290
+ except aiohttp.ClientError as e:
291
+ print(f"Error fetching questions: {e}")
292
+ return f"Error fetching questions: {e}", None
293
+ except ValueError as e: # JSON decode error
294
+ print(f"Error decoding JSON response from questions endpoint: {e}")
295
+ return f"Error decoding server response for questions: {e}", None
296
+ except Exception as e:
297
+ print(f"An unexpected error occurred fetching questions: {e}")
298
+ return f"An unexpected error occurred fetching questions: {e}", None
299
+
300
+ # 3. Run your Agent
301
+ results_log = []
302
+ answers_payload = []
303
+ print(f"Running agent on {len(questions_data)} questions...")
304
+
305
+ # Process questions with controlled concurrency
306
+ semaphore = asyncio.Semaphore(2) # Process 2 questions at a time
307
+
308
+ async def process_question(item):
309
+ task_id = item.get("task_id")
310
+ question_text = item.get("question")
311
+ if not task_id or question_text is None:
312
+ print(f"Skipping item with missing task_id or question: {item}")
313
+ return None
314
+
315
+ async with semaphore:
316
+ try:
317
+ print(f"Processing task {task_id}")
318
+ submitted_answer = await agent(question_text)
319
+ return {"task_id": task_id, "submitted_answer": submitted_answer,
320
+ "log": {"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer}}
321
+ except Exception as e:
322
+ print(f"Error running agent on task {task_id}: {e}")
323
+ default_answer = "I don't have enough information to answer this question accurately."
324
+ return {"task_id": task_id, "submitted_answer": default_answer,
325
+ "log": {"Task ID": task_id, "Question": question_text, "Submitted Answer": default_answer}}
326
+
327
+ # Create tasks for all questions
328
+ tasks = [process_question(item) for item in questions_data]
329
+ results = await asyncio.gather(*tasks)
330
+
331
+ # Process results
332
+ for result in results:
333
+ if result is not None:
334
+ answers_payload.append({"task_id": result["task_id"], "submitted_answer": result["submitted_answer"]})
335
+ results_log.append(result["log"])
336
+
337
+ if not answers_payload:
338
+ print("Agent did not produce any answers to submit.")
339
+ return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
340
+
341
+ # 4. Prepare Submission
342
+ submission_data = {"username": str(username).strip(), "agent_code": agent_code, "answers": answers_payload}
343
+ status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
344
+ print(status_update)
345
+
346
+ # 5. Submit
347
+ print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
348
+ try:
349
+ async with aiohttp.ClientSession() as session:
350
+ async with session.post(submit_url, json=submission_data, timeout=60) as response:
351
+ response.raise_for_status()
352
+ result_data = await response.json()
353
+ final_status = (
354
+ f"Submission Successful!\n"
355
+ f"User: {result_data.get('username')}\n"
356
+ f"Overall Score: {result_data.get('score', 'N/A')}% "
357
+ f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
358
+ f"Message: {result_data.get('message', 'No message received.')}"
359
+ )
360
+ print("Submission successful.")
361
+ results_df = pd.DataFrame(results_log)
362
+ return final_status, results_df
363
+ except aiohttp.ClientResponseError as e:
364
+ error_detail = f"Server responded with status {e.status}."
365
+ try:
366
+ error_text = await e.response.text()
367
+ try:
368
+ error_json = await e.response.json()
369
+ error_detail += f" Detail: {error_json.get('detail', error_text)}"
370
+ except ValueError:
371
+ error_detail += f" Response: {error_text[:500]}"
372
+ except:
373
+ pass
374
+ status_message = f"Submission Failed: {error_detail}"
375
+ print(status_message)
376
+ results_df = pd.DataFrame(results_log)
377
+ return status_message, results_df
378
+ except asyncio.TimeoutError:
379
+ status_message = "Submission Failed: The request timed out."
380
+ print(status_message)
381
+ results_df = pd.DataFrame(results_log)
382
+ return status_message, results_df
383
+ except aiohttp.ClientError as e:
384
+ status_message = f"Submission Failed: Network error - {e}"
385
+ print(status_message)
386
+ results_df = pd.DataFrame(results_log)
387
+ return status_message, results_df
388
+ except Exception as e:
389
+ status_message = f"An unexpected error occurred during submission: {e}"
390
+ print(status_message)
391
+ results_df = pd.DataFrame(results_log)
392
+ return status_message, results_df
393
+
394
+
395
+ # --- Build Gradio Interface using Blocks ---
396
+ with gr.Blocks() as demo:
397
+ gr.Markdown("# Basic Agent Evaluation Runner")
398
+ gr.Markdown(
399
+ """
400
+ **Instructions:**
401
+ 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
402
+ 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
403
+ 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
404
+ ---
405
+ **Disclaimers:**
406
+ Once you click the "submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
407
+ This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, to address the delay of the submit button, a solution could be to cache the answers and submit in a separate action, or even to answer the questions asynchronously.
408
+ """
409
+ )
410
+
411
+ login_button = gr.LoginButton()
412
+
413
+ run_button = gr.Button("Run Evaluation & Submit All Answers")
414
+
415
+ status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
416
+ # Removed max_rows=10 from DataFrame constructor
417
+ results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
418
+
419
+ def sync_wrapper(profile):
420
+ # This wrapper ensures we have access to the profile
421
+ if not profile:
422
+ print("No profile available in sync_wrapper")
423
+ return "Please Login to Hugging Face with the button.", None
424
+ print(f"Profile type in wrapper: {type(profile)}")
425
+ try:
426
+ return asyncio.run(run_and_submit_all(profile))
427
  except Exception as e:
428
+ print(f"Error in sync_wrapper: {e}")
429
+ return f"Error processing request: {e}", None
430
 
431
+ run_button.click(
432
+ fn=sync_wrapper,
433
+ inputs=login_button,
434
+ outputs=[status_output, results_table]
435
+ )
436
+
437
+ if __name__ == "__main__":
438
+ print("\n" + "-"*30 + " App Starting " + "-"*30)
439
+ # Check for SPACE_HOST and SPACE_ID at startup for information
440
+ space_host_startup = os.getenv("SPACE_HOST")
441
+ space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
442
+
443
+ if space_host_startup:
444
+ print(f"✅ SPACE_HOST found: {space_host_startup}")
445
+ print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
446
+ else:
447
+ print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
448
+
449
+ if space_id_startup: # Print repo URLs if SPACE_ID is found
450
+ print(f"✅ SPACE_ID found: {space_id_startup}")
451
+ print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
452
+ print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
453
+ else:
454
+ print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
455
+
456
+ print("-"*(60 + len(" App Starting ")) + "\n")
457
+
458
+ print("Launching Gradio Interface for Basic Agent Evaluation...")
459
+ demo.launch(debug=True, share=False)