Final_Assignment_AGENT_GAIA

Sleeping

App Files Files Community

Isateles committed on May 30, 2025

Commit

a7b80a9

1 Parent(s): 394d24e

Update GAIA agent: fixed answer extraction

Browse files

Files changed (1) hide show

app.py +62 -131

app.py CHANGED Viewed

@@ -71,49 +71,44 @@ def setup_llm():
 def extract_final_answer(response_text: str) -> str:
-    """Extract answer aligned with GAIA scoring rules"""
     # Look for FINAL ANSWER pattern
     match = re.search(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", response_text, re.IGNORECASE | re.DOTALL)
     if not match:
-        # Fallback: look for answer at the end of response
-        lines = response_text.strip().split('\n')
-        if lines:
-            # Check if last line looks like an answer
-            last_line = lines[-1].strip()
-            if len(last_line) < 100 and not last_line.startswith(('I', 'The', 'To', 'Based')):
-                answer = last_line
-            else:
-                logger.warning("No FINAL ANSWER found")
-                return ""
-        else:
-            return ""
-    else:
-        answer = match.group(1).strip()
-    # Remove any trailing punctuation that's not part of the answer
-    answer = answer.rstrip('.')
     # Clean for GAIA scoring
-    # 1. Handle numbers with more precision
     if re.match(r'^[\d\s.,\-+e]+$', answer):
-        # Remove all formatting
         cleaned = answer.replace(',', '').replace(' ', '')
         try:
-            # Try to parse as float
             num = float(cleaned)
-            # Return integer if whole number, otherwise keep precision
-            if num.is_integer():
-                return str(int(num))
-            else:
-                # Keep original precision, don't round
-                return str(num)
         except:
             pass
-    # 2. Handle percentages (remove % sign)
     if answer.endswith('%'):
         answer = answer[:-1].strip()
         try:
@@ -122,43 +117,30 @@ def extract_final_answer(response_text: str) -> str:
         except:
             pass
-    # 3. Lists: clean and standardize
-    if ',' in answer or ' and ' in answer.lower():
-        # Split on commas and 'and'
-        parts = re.split(r',|\s+and\s+', answer)
-        cleaned_parts = []
-        for part in parts:
-            part = part.strip()
-            if not part:
-                continue
-            # Try to parse as number
-            try:
-                num = float(part.replace('$', '').replace('%', '').replace(',', ''))
-                cleaned_parts.append(str(int(num)) if num.is_integer() else str(num))
-            except:
-                # Remove articles from strings
-                words = part.split()
-                if words and words[0].lower() in ['the', 'a', 'an']:
-                    cleaned_parts.append(' '.join(words[1:]))
-                else:
-                    cleaned_parts.append(part)
-        return ', '.join(cleaned_parts)
-    # 4. Yes/No answers
     if answer.lower() in ['yes', 'no']:
         return answer.lower()
-    # 5. Single words/phrases: remove articles
     words = answer.split()
     if words and words[0].lower() in ['the', 'a', 'an']:
         return ' '.join(words[1:])
     return answer
 class GAIAAgent:
     """GAIA RAG Agent using LlamaIndex AgentWorkflow"""
@@ -197,119 +179,68 @@ class GAIAAgent:
         import warnings
         warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*Event loop is closed.*")
         try:
-            # Create new event loop for async operations
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             try:
                 async def run_agent():
-                    # Track what happened during execution
-                    tool_calls = []
-                    response_chunks = []
                     try:
-                        # Start the agent workflow
                         handler = self.agent.run(user_msg=question)
-                        # IMPORTANT: Process events WITHOUT consuming them
-                        # We need to collect BOTH tool usage AND response content
-                        from llama_index.core.agent.workflow import ToolCallResult
-                        # Stream events and collect information
-                        async for event in handler.stream_events():
-                            # Log tool usage
-                            if isinstance(event, ToolCallResult):
-                                tool_info = f"{event.tool_name}: {str(event.result)[:100]}..."
-                                tool_calls.append(tool_info)
-                                logger.info(f"Tool used: {tool_info}")
-                            # Also collect any text responses
-                            # Different event types might have content in different attributes
-                            if hasattr(event, 'delta'):
-                                response_chunks.append(str(event.delta))
-                            elif hasattr(event, 'content'):
-                                response_chunks.append(str(event.content))
-                            elif hasattr(event, 'response'):
-                                response_chunks.append(str(event.response))
-                        # Get the final result after streaming
                         result = await handler
-                        # Extract the final response text
-                        # Priority: accumulated chunks > result.response > str(result)
-                        if response_chunks:
-                            response_text = ''.join(response_chunks)
-                        elif hasattr(result, 'response'):
-                            response_text = str(result.response)
                         else:
                             response_text = str(result)
-                        # Log what tools were used for debugging
-                        if tool_calls:
-                            logger.info(f"Tools used in this query: {', '.join(set(tool_calls))}")
-                        # CRITICAL: Check if we got a meaningful response
-                        # This prevents infinite loops
-                        if not response_text or len(response_text.strip()) < 10:
-                            logger.warning("Got empty or too short response from agent")
-                            # Return a fallback response
-                            return "FINAL ANSWER: Unable to determine answer"
                         return response_text
-                    except asyncio.TimeoutError:
-                        # Prevent infinite waiting
-                        logger.error("Agent timeout - preventing infinite loop")
-                        return "FINAL ANSWER: Request timeout"
                     except Exception as e:
                         logger.error(f"Agent execution error: {e}")
-                        # Return structured error response
-                        return f"FINAL ANSWER: Error occurred"
-                # Run with timeout to prevent infinite loops
                 response_text = loop.run_until_complete(
-                    asyncio.wait_for(run_agent(), timeout=120)  # 2 minute timeout
                 )
                 # Extract clean answer
                 clean_answer = extract_final_answer(response_text)
-                # VALIDATION: Ensure we have a valid answer
-                if not clean_answer:
-                    logger.warning("No answer extracted, using fallback")
-                    # Try to extract any number or short phrase from response
-                    # This prevents returning empty string to GAIA
-                    numbers = re.findall(r'\b\d+\.?\d*\b', response_text)
-                    if numbers:
-                        clean_answer = numbers[-1]  # Use last number found
-                    else:
-                        # Look for any short phrase that could be an answer
-                        sentences = response_text.split('.')
-                        for sent in reversed(sentences):
-                            sent = sent.strip()
-                            if 0 < len(sent) < 50 and not sent.startswith(('I', 'The', 'To')):
-                                clean_answer = sent
-                                break
                 logger.info(f"Full response preview: {response_text[:200]}...")
                 logger.info(f"Extracted answer: '{clean_answer}'")
                 return clean_answer
             finally:
-                # Always close the loop
                 loop.close()
         except Exception as e:
             logger.error(f"Error processing question: {e}")
-            # Never return empty string to GAIA - always return something
-            return "0"  # Safe fallback for math questions
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """Run GAIA evaluation following course template structure"""

 def extract_final_answer(response_text: str) -> str:
+    """Extract answer aligned with GAIA scoring rules - FIXED VERSION"""
+    # First, remove any "assistant:" prefix that might have been added
+    response_text = re.sub(r'^assistant:\s*', '', response_text, flags=re.IGNORECASE)
     # Look for FINAL ANSWER pattern
     match = re.search(r"FINAL ANSWER:\s*(.+?)(?:\n|$)", response_text, re.IGNORECASE | re.DOTALL)
     if not match:
+        logger.warning("No FINAL ANSWER found in response")
+        return ""
+    answer = match.group(1).strip()
+    # CRITICAL: Stop processing if we hit "assistant:" or any reasoning text
+    if 'assistant:' in answer:
+        answer = answer.split('assistant:')[0].strip()
+    # Remove any trailing explanatory text (usually starts with lowercase after answer)
+    sentences = answer.split('.')
+    if len(sentences) > 1:
+        # Check if second sentence starts with lowercase (indicates explanation)
+        first_sentence = sentences[0].strip()
+        if first_sentence and (not sentences[1].strip() or sentences[1].strip()[0].islower()):
+            answer = first_sentence
     # Clean for GAIA scoring
+    # 1. Handle pure numbers
     if re.match(r'^[\d\s.,\-+e]+$', answer):
         cleaned = answer.replace(',', '').replace(' ', '')
         try:
             num = float(cleaned)
+            return str(int(num)) if num.is_integer() else str(num)
         except:
             pass
+    # 2. Handle percentages
     if answer.endswith('%'):
         answer = answer[:-1].strip()
         try:
         except:
             pass
+    # 3. Handle yes/no
     if answer.lower() in ['yes', 'no']:
         return answer.lower()
+    # 4. Handle lists
+    if ',' in answer:
+        items = [item.strip() for item in answer.split(',')]
+        cleaned_items = []
+        for item in items:
+            # Remove articles
+            words = item.split()
+            if words and words[0].lower() in ['the', 'a', 'an']:
+                cleaned_items.append(' '.join(words[1:]))
+            else:
+                cleaned_items.append(item)
+        return ', '.join(cleaned_items)
+    # 5. Single answer - remove articles
     words = answer.split()
     if words and words[0].lower() in ['the', 'a', 'an']:
         return ' '.join(words[1:])
     return answer
 class GAIAAgent:
     """GAIA RAG Agent using LlamaIndex AgentWorkflow"""
         import warnings
         warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*Event loop is closed.*")
         try:
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             try:
                 async def run_agent():
                     try:
                         handler = self.agent.run(user_msg=question)
+                        # Wait for the result
                         result = await handler
+                        # Extract response text more carefully
+                        response_text = ""
+                        # Try different ways to get the response
+                        if hasattr(result, 'response'):
+                            if hasattr(result.response, 'message'):
+                                if hasattr(result.response.message, 'content'):
+                                    response_text = result.response.message.content
+                                else:
+                                    response_text = str(result.response.message)
+                            else:
+                                response_text = str(result.response)
+                        elif hasattr(result, 'content'):
+                            response_text = result.content
+                        elif hasattr(result, 'output'):
+                            response_text = result.output
                         else:
                             response_text = str(result)
+                        # Clean up any streaming artifacts
+                        response_text = re.sub(r'assistant:\s*', '', response_text, flags=re.IGNORECASE)
                         return response_text
                     except Exception as e:
                         logger.error(f"Agent execution error: {e}")
+                        import traceback
+                        logger.error(traceback.format_exc())
+                        return "FINAL ANSWER: "
                 response_text = loop.run_until_complete(
+                    asyncio.wait_for(run_agent(), timeout=60)
                 )
                 # Extract clean answer
                 clean_answer = extract_final_answer(response_text)
                 logger.info(f"Full response preview: {response_text[:200]}...")
                 logger.info(f"Extracted answer: '{clean_answer}'")
                 return clean_answer
             finally:
                 loop.close()
         except Exception as e:
             logger.error(f"Error processing question: {e}")
+            return ""
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """Run GAIA evaluation following course template structure"""