Paperbag committed on
Commit
d20527e
·
1 Parent(s): 6000e5d

feat: Implement audio analysis tools, enhance agent reasoning with a multi-step ReAct loop, and add local submission backup.

Browse files
Files changed (3) hide show
  1. agent.py +76 -30
  2. app.py +9 -0
  3. requirements.txt +1 -0
agent.py CHANGED
@@ -16,11 +16,21 @@ from langchain_groq import ChatGroq
16
  from langchain_community.document_loaders.image import UnstructuredImageLoader
17
  from langchain_community.document_loaders import WebBaseLoader
18
  import base64
 
19
  try:
20
  import cv2
21
  except ImportError:
22
  cv2 = None
23
 
 
 
 
 
 
 
 
 
 
24
  load_dotenv()
25
 
26
  # Base Hugging Face LLM used by the chat wrapper
@@ -124,6 +134,23 @@ def analyze_image(image_path: str, question: str) -> str:
124
  except Exception as e:
125
  return f"Error analyzing image: {str(e)}"
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  @tool
128
  def analyze_video(video_path: str, question: str) -> str:
129
  """
@@ -172,7 +199,17 @@ def analyze_video(video_path: str, question: str) -> str:
172
  # 2. Compile the context for the agent
173
  video_context = "\n".join(extracted_descriptions)
174
 
175
- return f"Video Summary based on extracted frames:\n{video_context}"
 
 
 
 
 
 
 
 
 
 
176
  except Exception as e:
177
  return f"Error analyzing video: {str(e)}"
178
 
@@ -281,7 +318,7 @@ def restart_required(state: AgentState) -> AgentState:
281
  # return {"messages": messages + [response]}
282
 
283
  # Augment the LLM with tools
284
- tools = [web_search, wiki_search, analyze_image, analyze_video, read_url, run_python_script, read_document]
285
  tools_by_name = {tool.name: tool for tool in tools}
286
  model_with_tools = model.bind_tools(tools)
287
 
@@ -297,7 +334,7 @@ def answer_message(state: AgentState) -> AgentState:
297
  TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
298
 
299
  CRITICAL RULES FOR SEARCH & TOOLS:
300
- 1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_video) to answer the question based on the file content.
301
  2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
302
  3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
303
  4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
@@ -316,33 +353,42 @@ def answer_message(state: AgentState) -> AgentState:
316
  """)]
317
  messages = prompt + messages
318
 
319
- # First pass: let model decide whether to call web_search
320
- ai_msg = model_with_tools.invoke(messages)
321
- messages.append(ai_msg)
322
-
323
- # If the model didn't request any tools, its content is already the answer
324
- tool_calls = getattr(ai_msg, "tool_calls", None) or []
325
- if not tool_calls:
326
- print(f"Final response: {ai_msg}")
327
- return {"messages": messages}
328
-
329
- # Execute requested tools and append their text output into the conversation
330
- for tool_call in tool_calls:
331
- name = tool_call["name"]
332
- args = tool_call["args"]
333
- tool = tools_by_name[name]
334
- tool_result = tool.invoke(args) # this is a plain string from web_search
335
- messages.append(HumanMessage(content=f"Tool result ({name}):\n{tool_result}"))
336
-
337
- # Second pass: force a plain-text final answer (no tool calls expected)
338
- final_instruction = HumanMessage(
339
- content=(
340
- "Using the tool results above, provide the FINAL numeric/text answer now. "
341
- "Do not call any tools. Provide exactly what was asked."
342
- )
343
- )
344
- messages.append(final_instruction)
345
- draft_response = model.invoke(messages)
 
 
 
 
 
 
 
 
 
346
 
347
  # Third pass: strict GAIA formatting extraction
348
  formatting_sys = SystemMessage(
 
16
  from langchain_community.document_loaders.image import UnstructuredImageLoader
17
  from langchain_community.document_loaders import WebBaseLoader
18
  import base64
19
+
20
  try:
21
  import cv2
22
  except ImportError:
23
  cv2 = None
24
 
25
+ whisper_model = None
26
+ def get_whisper():
27
+ global whisper_model
28
+ if whisper_model is None:
29
+ import whisper
30
+ # Lazy load the smallest, fastest model
31
+ whisper_model = whisper.load_model("base")
32
+ return whisper_model
33
+
34
  load_dotenv()
35
 
36
  # Base Hugging Face LLM used by the chat wrapper
 
134
  except Exception as e:
135
  return f"Error analyzing image: {str(e)}"
136
 
137
+ @tool
138
+ def analyze_audio(audio_path: str, question: str) -> str:
139
+ """
140
+ Transcribes an audio file (.mp3, .wav, .m4a) to answer questions about what is spoken.
141
+
142
+ Args:
143
+ audio_path: The local path to the audio file.
144
+ question: The specific question to ask.
145
+ """
146
+ try:
147
+ model = get_whisper()
148
+ result = model.transcribe(audio_path)
149
+ transcript = result["text"]
150
+ return f"Audio Transcript:\n{transcript}"
151
+ except Exception as e:
152
+ return f"Error analyzing audio: {str(e)}. Tip: this requires 'ffmpeg' to be installed on your system."
153
+
154
  @tool
155
  def analyze_video(video_path: str, question: str) -> str:
156
  """
 
199
  # 2. Compile the context for the agent
200
  video_context = "\n".join(extracted_descriptions)
201
 
202
+ # 3. Transcribe audio if possible
203
+ try:
204
+ whisper_mod = get_whisper()
205
+ trans_result = whisper_mod.transcribe(video_path)
206
+ transcript = trans_result.get("text", "")
207
+ if transcript.strip():
208
+ video_context += f"\n\nVideo Audio Transcript:\n{transcript}"
209
+ except Exception as e:
210
+ video_context += f"\n\n(No audio transcript generated: {e})"
211
+
212
+ return f"Video Summary based on extracted frames and audio:\n{video_context}"
213
  except Exception as e:
214
  return f"Error analyzing video: {str(e)}"
215
 
 
318
  # return {"messages": messages + [response]}
319
 
320
  # Augment the LLM with tools
321
+ tools = [web_search, wiki_search, analyze_image, analyze_audio, analyze_video, read_url, run_python_script, read_document]
322
  tools_by_name = {tool.name: tool for tool in tools}
323
  model_with_tools = model.bind_tools(tools)
324
 
 
334
  TODAY'S EXACT DATE is {current_date}. Keep this in mind for all time-sensitive queries.
335
 
336
  CRITICAL RULES FOR SEARCH & TOOLS:
337
+ 1. If a file is attached, use the appropriate tool (run_python_script, read_document, analyze_image, analyze_audio, analyze_video) to answer the question based on the file content.
338
  2. Use run_python_script freely to process data (pandas), read complex documents (.xlsx, .pdf), or do heavy math calculations.
339
  3. When using tools like web_search or wiki_search, do not blindly search the entire question. Extract the core entities.
340
  4. If the first search result doesn't contain the answer, THINK step-by-step, refine your search query (e.g., use synonyms, or search for broader concepts), and search again.
 
353
  """)]
354
  messages = prompt + messages
355
 
356
+ # Multi-step ReAct Loop (Up to 8 reasoning steps)
357
+ max_steps = 8
358
+ draft_response = None
359
+
360
+ for step in range(max_steps):
361
+ print(f"--- ReAct Step {step + 1} ---")
362
+ ai_msg = model_with_tools.invoke(messages)
363
+ messages.append(ai_msg)
364
+
365
+ # Check if the model requested tools
366
+ tool_calls = getattr(ai_msg, "tool_calls", None) or []
367
+ if not tool_calls:
368
+ # Model decided it has enough info to answer
369
+ draft_response = ai_msg
370
+ print(f"Model found answer or stopped tools: {ai_msg.content}")
371
+ break
372
+
373
+ # Execute requested tools and append their text output into the conversation
374
+ for tool_call in tool_calls:
375
+ name = tool_call["name"]
376
+ args = tool_call["args"]
377
+ print(f"Calling tool: {name} with args: {args}")
378
+ try:
379
+ tool = tools_by_name[name]
380
+ tool_result = tool.invoke(args)
381
+ except Exception as e:
382
+ tool_result = f"Error executing tool {name}: {str(e)}"
383
+
384
+ messages.append(HumanMessage(content=f"Tool result ({name}):\n{tool_result}"))
385
+
386
+ # If we exhausted all steps without an answer, force a draft response
387
+ if draft_response is None:
388
+ print("Max reasoning steps reached. Forcing answer extraction.")
389
+ forced_msg = HumanMessage(content="You have reached the maximum reasoning steps. Please provide your best final answer based on the current context without any more tool calls.")
390
+ messages.append(forced_msg)
391
+ draft_response = model.invoke(messages)
392
 
393
  # Third pass: strict GAIA formatting extraction
394
  formatting_sys = SystemMessage(
app.py CHANGED
@@ -159,6 +159,15 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
159
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
160
  print(status_update)
161
 
 
 
 
 
 
 
 
 
 
162
  # 5. Submit
163
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
164
  try:
 
159
  status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
160
  print(status_update)
161
 
162
+ # Backup locally just in case the HF submission server 500 crashes
163
+ import json
164
+ try:
165
+ with open("backup_submission.json", "w") as f:
166
+ json.dump(submission_data, f)
167
+ print("Answers backed up to backup_submission.json successfully.")
168
+ except Exception as e:
169
+ print(f"Could not backup answers: {e}")
170
+
171
  # 5. Submit
172
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
173
  try:
requirements.txt CHANGED
@@ -24,3 +24,4 @@ unstructured[all-docs]
24
  opencv-python
25
  beautifulsoup4
26
  PyPDF2
 
 
24
  opencv-python
25
  beautifulsoup4
26
  PyPDF2
27
+ openai-whisper