Spaces:
Sleeping
Sleeping
Commit
·
8e63348
1
Parent(s):
85a86de
process_attachment tool
Browse files- app.py +97 -5
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
| 1 |
import os
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import datetime
|
| 6 |
from langchain.tools import tool
|
|
@@ -264,12 +267,89 @@ def python_executor(code: str) -> str:
|
|
| 264 |
return str(result)
|
| 265 |
except Exception as e:
|
| 266 |
return f"error: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
|
|
|
| 268 |
##-- Tool Discovery ---
|
| 269 |
# Use @tool for each function.
|
| 270 |
# Use get_all_tools() to auto-discover all decorated tools.
|
| 271 |
# tools_list = get_all_tools()
|
| 272 |
tools_list = [
|
|
|
|
| 273 |
search_tool,
|
| 274 |
get_weather,
|
| 275 |
calculator,
|
|
@@ -300,11 +380,22 @@ You have access to a set of tools that you can use to answer the question:
|
|
| 300 |
|
| 301 |
{tool_descriptions}
|
| 302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
|
| 304 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
If
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
"""
|
| 309 |
|
| 310 |
# system_prompt = f"""
|
|
@@ -373,8 +464,9 @@ agent = initialize_agent(
|
|
| 373 |
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
| 374 |
agent_kwargs={"system_message": system_prompt},
|
| 375 |
verbose=True,
|
| 376 |
-
max_iterations=
|
| 377 |
-
max_execution_time=
|
|
|
|
| 378 |
handle_parsing_errors=True
|
| 379 |
)
|
| 380 |
|
|
|
|
| 1 |
import os
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
+
import tempfile
|
| 5 |
+
import mimetypes
|
| 6 |
+
import base64
|
| 7 |
import pandas as pd
|
| 8 |
import datetime
|
| 9 |
from langchain.tools import tool
|
|
|
|
| 267 |
return str(result)
|
| 268 |
except Exception as e:
|
| 269 |
return f"error: {e}"
|
| 270 |
+
|
| 271 |
+
# --- TOOL 15: Attachment Processing Tool ---
|
| 272 |
+
def _transcribe_audio(audio_bytes: bytes, upload_name: str, timeout: int) -> str:
    """Send audio bytes to the hosted Whisper endpoint and return the transcript.

    Args:
        audio_bytes: Raw content of the audio file.
        upload_name: File name reported in the upload.
        timeout: Request timeout in seconds.

    Returns:
        The value of the ``"text"`` field of the JSON response, or ``""`` when
        the field is absent.

    Raises:
        Whatever ``requests`` raises on transport errors or non-2xx responses;
        the caller converts these into an ``"error: ..."`` string.
    """
    api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
    headers = {"Authorization": f"Bearer {HF_ACCESS_KEY}"}
    # NOTE(review): the upload is sent as multipart form data, exactly as the
    # original code did. Confirm against the Inference API documentation that
    # the endpoint accepts multipart rather than a raw binary request body.
    files = {"file": (upload_name, audio_bytes)}
    resp = requests.post(api_url, headers=headers, files=files, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("text", "")


def _extract_audio_track(file_bytes: bytes, filename: str) -> bytes:
    """Extract the audio track of a video as 16 kHz mono PCM WAV bytes.

    The video is written to a temporary file, converted with ``ffmpeg``
    (which must be installed and on PATH), and both temporary files are
    removed before returning, whether or not the conversion succeeded.

    Args:
        file_bytes: Raw content of the video file.
        filename: Original file name; only its extension is used.

    Returns:
        The content of the WAV file produced by ffmpeg.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If a temporary file cannot be written or read.
    """
    import subprocess

    # splitext keeps the leading dot (".mp4") and yields "" when the name has
    # no extension, unlike filename.split('.')[-1].
    _, extension = os.path.splitext(filename)
    video_path = None
    audio_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_video:
            tmp_video.write(file_bytes)
            video_path = tmp_video.name
        # The file is closed at this point so ffmpeg can open it on any platform.
        audio_path = video_path + ".wav"
        subprocess.run(
            [
                "ffmpeg", "-nostdin", "-y", "-i", video_path,
                "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
                audio_path,
            ],
            check=True,
            capture_output=True,
        )
        with open(audio_path, "rb") as audio_file:
            return audio_file.read()
    finally:
        # delete=False means nothing cleans these up automatically.
        for path in (video_path, audio_path):
            if path and os.path.exists(path):
                os.remove(path)


@tool
def process_attachment(file_bytes: bytes, filename: str) -> str:
    """
    Processes an input attachment (audio, image, or video) and returns extracted text or a summary suitable for LLM input.
    - For audio: transcribes to text using Whisper.
    - For image: encodes as base64 and returns a prompt for LLMs that support image input.
    - For video: extracts audio, transcribes, and returns the transcript.
    - For unsupported types: returns an error message.

    Args:
        file_bytes: Raw content of the attachment.
        filename: Name of the attachment; its extension determines the file type.

    Returns:
        The transcript or base64 payload prefixed with a short label, or a
        string starting with "error:" when the attachment cannot be processed.
    """
    # Detect file type from the file name only; the content is not inspected.
    mime_type, _ = mimetypes.guess_type(filename)
    if not mime_type:
        return "error: Could not determine file type. Skip the file"

    # Handle audio files
    if mime_type.startswith("audio"):
        # Broad catch is deliberate: a tool must hand the agent an error
        # string instead of raising into the agent loop.
        try:
            transcript = _transcribe_audio(file_bytes, filename, timeout=60)
        except Exception as e:
            return f"error: {e}"
        if transcript:
            return f"Transcript of the audio: {transcript}"
        return "error: No transcript returned."

    # Handle image files
    if mime_type.startswith("image"):
        image_b64 = base64.b64encode(file_bytes).decode()
        return f"Attached image (base64): {image_b64}"

    # Handle video files (extract audio, then transcribe)
    if mime_type.startswith("video"):
        try:
            audio_bytes = _extract_audio_track(file_bytes, filename)
            transcript = _transcribe_audio(audio_bytes, "audio.wav", timeout=120)
        except Exception as e:
            return f"error: {e}"
        if transcript:
            return f"Transcript of the video audio: {transcript}"
        return "error: No transcript returned from video audio."

    return "error: Unsupported file type. Please skip the file usage."
|
| 345 |
|
| 346 |
+
|
| 347 |
##-- Tool Discovery ---
|
| 348 |
# Use @tool for each function.
|
| 349 |
# Use get_all_tools() to auto-discover all decorated tools.
|
| 350 |
# tools_list = get_all_tools()
|
| 351 |
tools_list = [
|
| 352 |
+
process_attachment,
|
| 353 |
search_tool,
|
| 354 |
get_weather,
|
| 355 |
calculator,
|
|
|
|
| 380 |
|
| 381 |
{tool_descriptions}
|
| 382 |
|
| 383 |
+
If there is a file (image, audio, or video) attached to the question, you should use the process_attachment tool to process it.
|
| 384 |
+
For audio or video attachments, the process_attachment tool will transcribe the audio and return the transcript, which you can use to answer the question.
|
| 385 |
+
For image attachments, the process_attachment tool will return a base64 encoded string of the image. You can use this encoded information to provide answer.
|
| 386 |
+
|
| 387 |
You must use the tools only if necessary, and you must not use multiple tools in a single call. You should not use a tool if you know the exact answer and can answer by yourself. Don't hallucinate.
|
| 388 |
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you don't have a valid answer, just return "no_answer".
|
| 389 |
+
|
| 390 |
+
Example of a valid answer:
|
| 391 |
+
If your response to a question is "The capital of France is Paris", you should return "Paris" as your final answer.
|
| 392 |
+
If your response to a question is "The population of France is 67 million", you should return "67" as your final answer.
|
| 393 |
+
If your response to a question is "4 studio albums were published by Mercedes Sosa between 2000 and 2009", you should return "4" as your final answer.
|
| 394 |
+
|
| 395 |
+
Further instructions:
|
| 396 |
+
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
| 397 |
+
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 398 |
+
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
| 399 |
"""
|
| 400 |
|
| 401 |
# system_prompt = f"""
|
|
|
|
| 464 |
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
| 465 |
agent_kwargs={"system_message": system_prompt},
|
| 466 |
verbose=True,
|
| 467 |
+
max_iterations=20, # Increase as needed
|
| 468 |
+
max_execution_time=4000, # Increase as needed
|
| 469 |
+
early_stopping_method="generate",
|
| 470 |
handle_parsing_errors=True
|
| 471 |
)
|
| 472 |
|
requirements.txt
CHANGED
|
@@ -8,4 +8,5 @@ langchain-huggingface
|
|
| 8 |
langchain-community
|
| 9 |
transformers
|
| 10 |
langchain-openai
|
| 11 |
-
beautifulsoup4
|
|
|
|
|
|
| 8 |
langchain-community
|
| 9 |
transformers
|
| 10 |
langchain-openai
|
| 11 |
+
beautifulsoup4
|
| 12 |
+
mimetype
|