chore: text-to-speech for final answer (slow)
Browse files- app.py +59 -4
- tools/rag_transformers.py +55 -0
app.py
CHANGED
|
@@ -2,10 +2,13 @@ import os
|
|
| 2 |
import base64
|
| 3 |
import math
|
| 4 |
import pytz
|
|
|
|
| 5 |
import yaml
|
| 6 |
import pycountry
|
| 7 |
import subprocess
|
| 8 |
import sys
|
|
|
|
|
|
|
| 9 |
|
| 10 |
from tools.final_answer import FinalAnswerTool
|
| 11 |
from tools.visit_webpage import VisitWebpageTool
|
|
@@ -21,6 +24,7 @@ from datetime import datetime
|
|
| 21 |
from skimage import io
|
| 22 |
from PIL import Image
|
| 23 |
from typing import Optional, Tuple
|
|
|
|
| 24 |
|
| 25 |
from opentelemetry.sdk.trace import TracerProvider
|
| 26 |
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
|
|
@@ -190,7 +194,47 @@ def browser_automation(original_user_query:str)->str:
|
|
| 190 |
print("vision_web_browser.py: ", result.stderr)
|
| 191 |
return result.stdout
|
| 192 |
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
def initialize_langfuse_opentelemetry_instrumentation():
|
| 195 |
LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
|
| 196 |
LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY")
|
|
@@ -204,6 +248,8 @@ def initialize_langfuse_opentelemetry_instrumentation():
|
|
| 204 |
|
| 205 |
SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
|
| 206 |
|
|
|
|
|
|
|
| 207 |
initialize_langfuse_opentelemetry_instrumentation()
|
| 208 |
|
| 209 |
# load tools from /tools/
|
|
@@ -232,7 +278,15 @@ image_generation_tool_fast = Tool.from_space(
|
|
| 232 |
)
|
| 233 |
|
| 234 |
|
| 235 |
-
ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
with open("prompts.yaml", 'r') as stream:
|
| 238 |
prompt_templates = yaml.safe_load(stream)
|
|
@@ -260,13 +314,14 @@ agent = CodeAgent(
|
|
| 260 |
max_steps=20, # 15 is good for a light manager, too much when there is no need of a manager
|
| 261 |
verbosity_level=2,
|
| 262 |
grammar=None,
|
| 263 |
-
planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager)
|
| 264 |
name="Alfredo",
|
| 265 |
description="CEO",
|
| 266 |
prompt_templates=prompt_templates,
|
| 267 |
# executor_type="e2b", # security, could also be "docker" (set keys)
|
| 268 |
# sandbox=E2BSandbox() (or E2BExecutor?),
|
| 269 |
-
# step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and
|
|
|
|
| 270 |
additional_authorized_imports=[
|
| 271 |
"geopandas",
|
| 272 |
"plotly",
|
|
|
|
| 2 |
import base64
|
| 3 |
import math
|
| 4 |
import pytz
|
| 5 |
+
import torch
|
| 6 |
import yaml
|
| 7 |
import pycountry
|
| 8 |
import subprocess
|
| 9 |
import sys
|
| 10 |
+
import numpy as np
|
| 11 |
+
import sounddevice as sd
|
| 12 |
|
| 13 |
from tools.final_answer import FinalAnswerTool
|
| 14 |
from tools.visit_webpage import VisitWebpageTool
|
|
|
|
| 24 |
from skimage import io
|
| 25 |
from PIL import Image
|
| 26 |
from typing import Optional, Tuple
|
| 27 |
+
from IPython.display import Audio, display
|
| 28 |
|
| 29 |
from opentelemetry.sdk.trace import TracerProvider
|
| 30 |
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
|
|
|
|
| 194 |
print("vision_web_browser.py: ", result.stderr)
|
| 195 |
return result.stdout
|
| 196 |
|
| 197 |
+
# Build the Bark text-to-speech pipeline once, at import time.
# Runs on GPU index 0 when CUDA is available, otherwise on the CPU.
# NOTE(review): `pipeline` is not imported in the visible part of this file;
# presumably it is `transformers.pipeline` imported elsewhere - confirm.
_cuda_ready = torch.cuda.is_available()
print(f"torch.cuda.is_available(): {_cuda_ready}")
text_to_speech_pipe = pipeline(
    "text-to-speech",
    model="suno/bark-small",
    device=0 if _cuda_ready else "cpu",
)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def speech_to_text(final_answer_text, agent_memory):
    """Speak the agent's final answer aloud (best effort) and accept the answer.

    Despite its name, this function performs text-to-speech: it feeds the
    final answer to ``text_to_speech_pipe`` and plays the resulting waveform
    through ``sounddevice``. The name is kept because the agent registers it
    under this name in ``final_answer_checks``.

    The audio output is a side effect only. Any failure while synthesising or
    playing the audio is printed and swallowed, so that a missing audio
    device or a model error never turns into a failed final-answer check.
    NOTE(review): an exception escaping a final-answer check presumably
    rejects the answer - confirm against the smolagents version in use.

    Args:
        final_answer_text: The final answer; it is interpolated into the
            spoken prompt, so any object with a string form is accepted.
        agent_memory: Unused. Present because the function is registered as a
            final-answer check, which is called with two arguments.

    Returns:
        Always ``True``.
    """
    text = f"[clears throat] {final_answer_text}"

    # Stage 1: synthesis. Guarded separately so the log says which stage broke.
    try:
        output = text_to_speech_pipe(text)
        # display(Audio(output["audio"], rate=output["sampling_rate"]))  # notebook
        audio = np.array(output["audio"], dtype=np.float32)
        sampling_rate = output["sampling_rate"]
    except Exception as e:  # best-effort side effect: never fail the check
        print(f"Error generating speech: {e}")
        return True

    print("Original audio shape:", audio.shape)

    if audio.ndim == 1:
        print("Mono audio... should be fine. You can check if your device expects stereo.")
    elif audio.ndim != 2 or audio.shape[1] not in (1, 2):
        # Either not a 2-D array, or the second axis is not a plausible
        # channel count (1 or 2): drop singleton axes before playback.
        audio = np.squeeze(audio)
        print("Squeezed audio shape:", audio.shape)

    # Stage 2: playback. Blocks until the clip has finished playing.
    try:
        sd.play(audio, sampling_rate)
        sd.wait()
    except Exception as e:  # e.g. no output device available
        print(f"Error playing audio: {e}")

    return True
|
| 236 |
+
|
| 237 |
+
|
| 238 |
def initialize_langfuse_opentelemetry_instrumentation():
|
| 239 |
LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
|
| 240 |
LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY")
|
|
|
|
| 248 |
|
| 249 |
SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
|
| 250 |
|
| 251 |
+
|
| 252 |
+
# telemetry
|
| 253 |
initialize_langfuse_opentelemetry_instrumentation()
|
| 254 |
|
| 255 |
# load tools from /tools/
|
|
|
|
| 278 |
)
|
| 279 |
|
| 280 |
|
| 281 |
+
# ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
# Model driving the top-level ("CEO") agent, served from a dedicated
# Hugging Face inference endpoint.
# Alternative model_id noted by the author: "meta-llama/Llama-3.3-70B-Instruct".
# Per the author's note the endpoint is the same as Qwen/Qwen2.5-Coder-32B-Instruct.
ceo_model = HfApiModel(
    model_id="https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud",
    temperature=0.5,
    max_tokens=2096,  # 8096 for manager
    custom_role_conversions=None,
)
|
| 290 |
|
| 291 |
with open("prompts.yaml", 'r') as stream:
|
| 292 |
prompt_templates = yaml.safe_load(stream)
|
|
|
|
| 314 |
max_steps=20, # 15 is good for a light manager, too much when there is no need of a manager
|
| 315 |
verbosity_level=2,
|
| 316 |
grammar=None,
|
| 317 |
+
# planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager) # test for crashing issues.
|
| 318 |
name="Alfredo",
|
| 319 |
description="CEO",
|
| 320 |
prompt_templates=prompt_templates,
|
| 321 |
# executor_type="e2b", # security, could also be "docker" (set keys)
|
| 322 |
# sandbox=E2BSandbox() (or E2BExecutor?),
|
| 323 |
+
# step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and manage it with alfred
|
| 324 |
+
final_answer_checks=[speech_to_text],
|
| 325 |
additional_authorized_imports=[
|
| 326 |
"geopandas",
|
| 327 |
"plotly",
|
tools/rag_transformers.py
CHANGED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datasets
|
| 2 |
+
from langchain.docstore.document import Document
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
from langchain_community.retrievers import BM25Retriever
|
| 5 |
+
from smolagents import Tool
|
| 6 |
+
|
| 7 |
+
# Load the Hugging Face documentation dataset (train split) and keep only the
# rows whose source path starts with "huggingface/transformers".
knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train").filter(
    lambda record: record["source"].startswith("huggingface/transformers")
)

# Wrap every remaining row in a langchain Document; the metadata "source" is
# the second "/"-separated component of the row's source path.
source_docs = [
    Document(
        page_content=record["text"],
        metadata={"source": record["source"].split("/")[1]},
    )
    for record in knowledge_base
]

# Split the documents into overlapping chunks, trying the separators in the
# order given (paragraph, line, sentence, word, then anything).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    strip_whitespace=True,
    add_start_index=True,
)
docs_processed = text_splitter.split_documents(source_docs)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class TransformersRetrieverTool(Tool):
    """Agent tool that looks up transformers documentation chunks for a query.

    The retriever is a ``BM25Retriever`` built from the documents passed to
    the constructor; ``forward`` formats whatever it returns as one string.
    """

    name = "transformers_retriever"
    # NOTE(review): the description advertises "semantic search" while the
    # retriever below is BM25 - confirm the wording is intended.
    description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        """Build the BM25 retriever over ``docs``; ``kwargs`` go to ``Tool``."""
        super().__init__(**kwargs)
        # k=10 presumably caps the number of documents returned per query.
        self.retriever = BM25Retriever.from_documents(docs, k=10)

    def forward(self, query: str) -> str:
        """Return the retrieved documents for ``query`` as one formatted string.

        Each hit is introduced by a ``===== Document <index> =====`` banner,
        numbered from 0 in the order the retriever returned them.
        """
        assert isinstance(query, str), "Your search query must be a string"

        hits = self.retriever.invoke(query)
        sections = []
        for position, hit in enumerate(hits):
            sections.append(f"\n\n===== Document {str(position)} =====\n" + hit.page_content)
        return "\nRetrieved documents:\n" + "".join(sections)
|
| 54 |
+
|
| 55 |
+
# Module-level tool instance built over the chunked transformers docs.
retriever_tool = TransformersRetrieverTool(docs=docs_processed)
|