Spaces:
Sleeping
Sleeping
audio
Browse files
app.py
CHANGED
|
@@ -20,8 +20,8 @@ from state import AgentState
|
|
| 20 |
# --- Constants ---
|
| 21 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 22 |
|
| 23 |
-
from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools
|
| 24 |
-
tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool])
|
| 25 |
|
| 26 |
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.0)
|
| 27 |
|
|
@@ -45,14 +45,14 @@ def plan_node(state: AgentState) -> AgentState:
|
|
| 45 |
# 2) Build a fresh SystemMessage explaining exactly one dict key
|
| 46 |
system_msg = SystemMessage(
|
| 47 |
content=(
|
| 48 |
-
"You
|
|
|
|
|
|
|
| 49 |
" • web_search_query: <search terms>\n"
|
| 50 |
" • ocr_path: <path to an image file>\n"
|
| 51 |
-
" • excel_path: <path to a .xlsx file
|
| 52 |
-
" •
|
| 53 |
-
"
|
| 54 |
-
"Example: {'web_search_query':'Mercedes Sosa discography'}\n"
|
| 55 |
-
"Respond with only that Python dict literal—no extra text or explanation."
|
| 56 |
)
|
| 57 |
)
|
| 58 |
human_msg = HumanMessage(content=user_input)
|
|
@@ -73,6 +73,7 @@ def plan_node(state: AgentState) -> AgentState:
|
|
| 73 |
"ocr_path",
|
| 74 |
"excel_path",
|
| 75 |
"excel_sheet_name",
|
|
|
|
| 76 |
"final_answer"
|
| 77 |
}
|
| 78 |
for k, v in parsed.items():
|
|
@@ -110,7 +111,11 @@ def finalize_node(state: AgentState) -> AgentState:
|
|
| 110 |
combined += f"OCR_RESULT: {orc}\n"
|
| 111 |
if exr := state.get("excel_result"):
|
| 112 |
combined += f"EXCEL_RESULT: {exr}\n"
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
llm_response = llm([SystemMessage(content=combined)])
|
| 116 |
return {"final_answer": llm_response.content.strip()}
|
|
@@ -178,11 +183,12 @@ def respond_to_input(user_input: str) -> str:
|
|
| 178 |
system_msg = SystemMessage(
|
| 179 |
content=(
|
| 180 |
"You are an agent that decides whether to call a tool or answer the user directly. "
|
| 181 |
-
"The user
|
| 182 |
"If you need to call a tool, set exactly one key from the following in a Python dict: "
|
| 183 |
" • web_search_query: <search terms>\n"
|
| 184 |
" • ocr_path: <path to an image file>\n"
|
| 185 |
" • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
|
|
|
|
| 186 |
"Do not include any extra text or markdown—only return a valid Python dict literal."
|
| 187 |
)
|
| 188 |
)
|
|
|
|
| 20 |
# --- Constants ---
|
| 21 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 22 |
|
| 23 |
+
from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools, audio_transcriber_tool
|
| 24 |
+
tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool, audio_transcriber_tool])
|
| 25 |
|
| 26 |
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.0)
|
| 27 |
|
|
|
|
| 45 |
# 2) Build a fresh SystemMessage explaining exactly one dict key
|
| 46 |
system_msg = SystemMessage(
|
| 47 |
content=(
|
| 48 |
+
"You are an agent that decides whether to call a tool or answer the user directly. "
|
| 49 |
+
"The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
|
| 50 |
+
"If you need to call a tool, set exactly one key from the following in a Python dict: "
|
| 51 |
" • web_search_query: <search terms>\n"
|
| 52 |
" • ocr_path: <path to an image file>\n"
|
| 53 |
+
" • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
|
| 54 |
+
" • audio_path: <path to an audio file>\n"
|
| 55 |
+
"Do not include any extra text or markdown—only return a valid Python dict literal."
|
|
|
|
|
|
|
| 56 |
)
|
| 57 |
)
|
| 58 |
human_msg = HumanMessage(content=user_input)
|
|
|
|
| 73 |
"ocr_path",
|
| 74 |
"excel_path",
|
| 75 |
"excel_sheet_name",
|
| 76 |
+
"audio_path",
|
| 77 |
"final_answer"
|
| 78 |
}
|
| 79 |
for k, v in parsed.items():
|
|
|
|
| 111 |
combined += f"OCR_RESULT: {orc}\n"
|
| 112 |
if exr := state.get("excel_result"):
|
| 113 |
combined += f"EXCEL_RESULT: {exr}\n"
|
| 114 |
+
# Check for both possible transcript keys
|
| 115 |
+
audio_transcript = state.get("audio_transcript") or state.get("transcript")
|
| 116 |
+
if audio_transcript:
|
| 117 |
+
combined += f"AUDIO_TRANSCRIPT: {audio_transcript}\n"
|
| 118 |
+
combined += "Based on the above, provide ONLY the final answer. Do not include any explanation or extra text."
|
| 119 |
|
| 120 |
llm_response = llm([SystemMessage(content=combined)])
|
| 121 |
return {"final_answer": llm_response.content.strip()}
|
|
|
|
| 183 |
system_msg = SystemMessage(
|
| 184 |
content=(
|
| 185 |
"You are an agent that decides whether to call a tool or answer the user directly. "
|
| 186 |
+
"The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
|
| 187 |
"If you need to call a tool, set exactly one key from the following in a Python dict: "
|
| 188 |
" • web_search_query: <search terms>\n"
|
| 189 |
" • ocr_path: <path to an image file>\n"
|
| 190 |
" • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
|
| 191 |
+
" • audio_path: <path to an audio file>\n"
|
| 192 |
"Do not include any extra text or markdown—only return a valid Python dict literal."
|
| 193 |
)
|
| 194 |
)
|
requirements.txt
CHANGED
|
@@ -8,3 +8,5 @@ openai
|
|
| 8 |
pandas
|
| 9 |
langchain_openai
|
| 10 |
langchain_community
|
|
|
|
|
|
|
|
|
| 8 |
pandas
|
| 9 |
langchain_openai
|
| 10 |
langchain_community
|
| 11 |
+
pydub
|
| 12 |
+
openai-whisper
|
state.py
CHANGED
|
@@ -12,4 +12,7 @@ class AgentState(TypedDict, total=False):
|
|
| 12 |
ocr_result: str
|
| 13 |
excel_result: str
|
| 14 |
final_answer: str
|
| 15 |
-
user_input: str
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
ocr_result: str
|
| 13 |
excel_result: str
|
| 14 |
final_answer: str
|
| 15 |
+
user_input: str
|
| 16 |
+
audio_path: str
|
| 17 |
+
transcript: str
|
| 18 |
+
audio_transcript: str
|
tools.py
CHANGED
|
@@ -79,4 +79,62 @@ def run_tools(state: AgentState, tool_out: AgentState) -> AgentState:
|
|
| 79 |
This node should be wired as its own graph node, not as a transition function.
|
| 80 |
"""
|
| 81 |
new_state = {**state, **tool_out}
|
| 82 |
-
return new_state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
This node should be wired as its own graph node, not as a transition function.
|
| 80 |
"""
|
| 81 |
new_state = {**state, **tool_out}
|
| 82 |
+
return new_state
|
| 83 |
+
|
| 84 |
+
import os
import tempfile

import whisper
from pydub import AudioSegment
from pydub.utils import make_chunks

# Loaded eagerly at import time so the first transcription call is fast.
# NOTE(review): this downloads/loads the model even when no audio is ever
# transcribed — consider lazy loading if import-time cost matters.
_whisper_model = whisper.load_model("base")

# A *local* Whisper model has no per-request size limit (the ~25 MB cap
# applies only to OpenAI's hosted API). We still chunk very large files so a
# single transcribe() call does not decode the whole waveform at once.
_CHUNK_THRESHOLD_BYTES = 25 * 1024 * 1024
_CHUNK_LENGTH_MS = 120 * 1000  # split large files into 2-minute chunks


def audio_transcriber_tool(state: AgentState) -> AgentState:
    """
    LangGraph tool for transcribing audio via a local Whisper model.

    Expects:
        state["audio_path"]: path to a .wav/.mp3/.m4a file.

    Returns:
        {
            "audio_path": None,
            "transcript": "<full transcribed text>"
        }
        On transcription failure, "transcript" carries an error string so the
        graph keeps running. If no valid audio_path is found, returns {} to
        signal "no-op."
    """
    path = state.get("audio_path", "")
    if not path or not os.path.exists(path):
        return {}

    try:
        if os.path.getsize(path) <= _CHUNK_THRESHOLD_BYTES:
            # Small file: transcribe in one shot.
            result = _whisper_model.transcribe(path)
            text = result["text"].strip()
        else:
            # Large file: split into fixed-length chunks and transcribe each.
            audio = AudioSegment.from_file(path)
            chunks = make_chunks(audio, _CHUNK_LENGTH_MS)

            transcripts = []
            for chunk in chunks:
                # Use a unique temp file (not a predictable name in the CWD)
                # and guarantee cleanup even if transcribe() raises.
                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
                try:
                    tmp.close()
                    chunk.export(tmp.name, format="wav")
                    res = _whisper_model.transcribe(tmp.name)
                    transcripts.append(res["text"].strip())
                finally:
                    os.remove(tmp.name)
            text = "\n".join(transcripts)

    except Exception as e:
        # Surface the failure to the downstream LLM instead of crashing the
        # graph node; best-effort behavior is intentional here.
        text = f"Error during transcription: {e}"

    return {
        "audio_path": None,
        "transcript": text
    }