Spaces:

bchander
/

agents_course

Sleeping

App Files Files Community

Bhanu-Chander-ABB committed on Jun 12, 2025

Commit

8e63348

1 Parent(s): 85a86de

process_attachment tool

Browse files

Files changed (2) hide show

app.py +97 -5
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import os
 import gradio as gr
 import requests
 import pandas as pd
 import datetime
 from langchain.tools import tool
@@ -264,12 +267,89 @@ def python_executor(code: str) -> str:
         return str(result)
     except Exception as e:
         return f"error: {e}"
 ##-- Tool Discovery ---
 # Use @tool for each function.
 # Use get_all_tools() to auto-discover all decorated tools.
 # tools_list = get_all_tools()
 tools_list = [
     search_tool,
     get_weather,
     calculator,
@@ -300,11 +380,22 @@ You have access to a set of tools that you can use to answer the question:
 {tool_descriptions}
 You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
-If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
-If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
-If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
 # system_prompt = f"""
@@ -373,8 +464,9 @@ agent = initialize_agent(
     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
     agent_kwargs={"system_message": system_prompt},
     verbose=True,
-    max_iterations=10, # Increase as needed
-    max_execution_time=3000, # Increase as needed
     handle_parsing_errors=True
 )

 import os
 import gradio as gr
 import requests
+import tempfile
+import mimetypes
+import base64
 import pandas as pd
 import datetime
 from langchain.tools import tool
         return str(result)
     except Exception as e:
         return f"error: {e}"
+# --- TOOL 15: Attachment Processing Tool ---
+@tool
+def process_attachment(file_bytes: bytes, filename: str) -> str:
+    """
+    Processes an input attachment (audio, image, or video) and returns extracted text or a summary suitable for LLM input.
+    - For audio: transcribes to text using Whisper.
+    - For image: encodes as base64 and returns a prompt for LLMs that support image input.
+    - For video: extracts audio, transcribes, and returns the transcript.
+    - For unsupported types: returns an error message.
+    """
+    # Detect file type
+    mime_type, _ = mimetypes.guess_type(filename)
+    if not mime_type:
+        return "error: Could not determine file type. Skip the file"
+    # Handle audio files
+    if mime_type.startswith("audio"):
+        api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
+        headers = {"Authorization": f"Bearer {HF_ACCESS_KEY}"}
+        files = {"file": (filename, file_bytes)}
+        try:
+            resp = requests.post(api_url, headers=headers, files=files, timeout=60)
+            resp.raise_for_status()
+            data = resp.json()
+            transcript = data.get("text", "")
+            if transcript:
+                return f"Transcript of the audio: {transcript}"
+            else:
+                return "error: No transcript returned."
+        except Exception as e:
+            return f"error: {e}"
+    # Handle image files
+    elif mime_type.startswith("image"):
+        image_b64 = base64.b64encode(file_bytes).decode()
+        return f"Attached image (base64): {image_b64}"
+    # Handle video files (extract audio, then transcribe)
+    elif mime_type.startswith("video"):
+        try:
+            # Save video to temp file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=filename.split('.')[-1]) as tmp_video:
+                tmp_video.write(file_bytes)
+                tmp_video.flush()
+                video_path = tmp_video.name
+            # Extract audio using ffmpeg (requires ffmpeg installed)
+            audio_path = video_path + ".wav"
+            import subprocess
+            subprocess.run([
+                "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", audio_path
+            ], check=True)
+            # Read audio bytes
+            with open(audio_path, "rb") as f:
+                audio_bytes = f.read()
+            # Transcribe audio
+            api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
+            headers = {"Authorization": f"Bearer {HF_ACCESS_KEY}"}
+            files = {"file": ("audio.wav", audio_bytes)}
+            resp = requests.post(api_url, headers=headers, files=files, timeout=120)
+            resp.raise_for_status()
+            data = resp.json()
+            transcript = data.get("text", "")
+            if transcript:
+                return f"Transcript of the video audio: {transcript}"
+            else:
+                return "error: No transcript returned from video audio."
+        except Exception as e:
+            return f"error: {e}"
+    else:
+        return "error: Unsupported file type. Please skip the file usage."
 ##-- Tool Discovery ---
 # Use @tool for each function.
 # Use get_all_tools() to auto-discover all decorated tools.
 # tools_list = get_all_tools()
 tools_list = [
+    process_attachment,
     search_tool,
     get_weather,
     calculator,
 {tool_descriptions}
+If there is a file (image, audio, or video) attached to the question, you should use the process_attachment tool to process it.
+For audio or video attachments, the process_attachment tool will transcribe the audio and return the transcript, which you can use to answer the question.
+For image attachments, the process_attachment tool will return a base64 encoded string of the image. You can use this encoded information to provide answer.
 You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
+Example of a valid answer:
+If your response to a question is "The capital of France is Paris", you should return "Paris" as your final answer.
+If your response to a question is "The population of France is 67 million", you should return "67" as your final answer.
+If your response to a question is "4 studio albums were published by Mercedes Sosa between 2000 and 2009", you should return "4" as your final answer.
+Further instructions:
+- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
+- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
+- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
 # system_prompt = f"""
     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
     agent_kwargs={"system_message": system_prompt},
     verbose=True,
+    max_iterations=20, # Increase as needed
+    max_execution_time=4000, # Increase as needed
+    early_stopping_method="generate",
     handle_parsing_errors=True
 )

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ langchain-huggingface
 langchain-community
 transformers
 langchain-openai
-beautifulsoup4

 langchain-community
 transformers
 langchain-openai
+beautifulsoup4
+mimetype