Spaces:
Sleeping
Sleeping
audio
Browse files
app.py
CHANGED
|
@@ -20,8 +20,8 @@ from state import AgentState
|
|
| 20 |
# --- Constants ---
|
| 21 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 22 |
|
| 23 |
-
from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools
|
| 24 |
-
tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool])
|
| 25 |
|
| 26 |
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.0)
|
| 27 |
|
|
@@ -45,14 +45,14 @@ def plan_node(state: AgentState) -> AgentState:
|
|
| 45 |
# 2) Build a fresh SystemMessage explaining exactly one dict key
|
| 46 |
system_msg = SystemMessage(
|
| 47 |
content=(
|
| 48 |
-
"You
|
|
|
|
|
|
|
| 49 |
" • web_search_query: <search terms>\n"
|
| 50 |
" • ocr_path: <path to an image file>\n"
|
| 51 |
-
" • excel_path: <path to a .xlsx file
|
| 52 |
-
" •
|
| 53 |
-
"
|
| 54 |
-
"Example: {'web_search_query':'Mercedes Sosa discography'}\n"
|
| 55 |
-
"Respond with only that Python dict literal—no extra text or explanation."
|
| 56 |
)
|
| 57 |
)
|
| 58 |
human_msg = HumanMessage(content=user_input)
|
|
@@ -73,6 +73,7 @@ def plan_node(state: AgentState) -> AgentState:
|
|
| 73 |
"ocr_path",
|
| 74 |
"excel_path",
|
| 75 |
"excel_sheet_name",
|
|
|
|
| 76 |
"final_answer"
|
| 77 |
}
|
| 78 |
for k, v in parsed.items():
|
|
@@ -110,7 +111,11 @@ def finalize_node(state: AgentState) -> AgentState:
|
|
| 110 |
combined += f"OCR_RESULT: {orc}\n"
|
| 111 |
if exr := state.get("excel_result"):
|
| 112 |
combined += f"EXCEL_RESULT: {exr}\n"
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
llm_response = llm([SystemMessage(content=combined)])
|
| 116 |
return {"final_answer": llm_response.content.strip()}
|
|
@@ -178,11 +183,12 @@ def respond_to_input(user_input: str) -> str:
|
|
| 178 |
system_msg = SystemMessage(
|
| 179 |
content=(
|
| 180 |
"You are an agent that decides whether to call a tool or answer the user directly. "
|
| 181 |
-
"The user
|
| 182 |
"If you need to call a tool, set exactly one key from the following in a Python dict: "
|
| 183 |
" • web_search_query: <search terms>\n"
|
| 184 |
" • ocr_path: <path to an image file>\n"
|
| 185 |
" • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
|
|
|
|
| 186 |
"Do not include any extra text or markdown—only return a valid Python dict literal."
|
| 187 |
)
|
| 188 |
)
|
|
|
|
| 20 |
# --- Constants ---
|
| 21 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 22 |
|
| 23 |
+
from tools import ocr_image_tool, parse_excel_tool, web_search_tool, run_tools, audio_transcriber_tool
|
| 24 |
+
tool_node = ToolNode([ocr_image_tool, parse_excel_tool, web_search_tool, audio_transcriber_tool])
|
| 25 |
|
| 26 |
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0.0)
|
| 27 |
|
|
|
|
| 45 |
# 2) Build a fresh SystemMessage explaining exactly one dict key
|
| 46 |
system_msg = SystemMessage(
|
| 47 |
content=(
|
| 48 |
+
"You are an agent that decides whether to call a tool or answer the user directly. "
|
| 49 |
+
"The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
|
| 50 |
+
"If you need to call a tool, set exactly one key from the following in a Python dict: "
|
| 51 |
" • web_search_query: <search terms>\n"
|
| 52 |
" • ocr_path: <path to an image file>\n"
|
| 53 |
+
" • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
|
| 54 |
+
" • audio_path: <path to an audio file>\n"
|
| 55 |
+
"Do not include any extra text or markdown—only return a valid Python dict literal."
|
|
|
|
|
|
|
| 56 |
)
|
| 57 |
)
|
| 58 |
human_msg = HumanMessage(content=user_input)
|
|
|
|
| 73 |
"ocr_path",
|
| 74 |
"excel_path",
|
| 75 |
"excel_sheet_name",
|
| 76 |
+
"audio_path",
|
| 77 |
"final_answer"
|
| 78 |
}
|
| 79 |
for k, v in parsed.items():
|
|
|
|
| 111 |
combined += f"OCR_RESULT: {orc}\n"
|
| 112 |
if exr := state.get("excel_result"):
|
| 113 |
combined += f"EXCEL_RESULT: {exr}\n"
|
| 114 |
+
# Check for both possible transcript keys
|
| 115 |
+
audio_transcript = state.get("audio_transcript") or state.get("transcript")
|
| 116 |
+
if audio_transcript:
|
| 117 |
+
combined += f"AUDIO_TRANSCRIPT: {audio_transcript}\n"
|
| 118 |
+
combined += "Based on the above, provide ONLY the final answer. Do not include any explanation or extra text."
|
| 119 |
|
| 120 |
llm_response = llm([SystemMessage(content=combined)])
|
| 121 |
return {"final_answer": llm_response.content.strip()}
|
|
|
|
| 183 |
system_msg = SystemMessage(
|
| 184 |
content=(
|
| 185 |
"You are an agent that decides whether to call a tool or answer the user directly. "
|
| 186 |
+
"The user's question is below. If the answer can be given directly, return {'final_answer': <your answer>}."
|
| 187 |
"If you need to call a tool, set exactly one key from the following in a Python dict: "
|
| 188 |
" • web_search_query: <search terms>\n"
|
| 189 |
" • ocr_path: <path to an image file>\n"
|
| 190 |
" • excel_path: <path to a .xlsx file>, excel_sheet_name: <sheet name>.\n"
|
| 191 |
+
" • audio_path: <path to an audio file>\n"
|
| 192 |
"Do not include any extra text or markdown—only return a valid Python dict literal."
|
| 193 |
)
|
| 194 |
)
|
requirements.txt
CHANGED
|
@@ -8,3 +8,5 @@ openai
|
|
| 8 |
pandas
|
| 9 |
langchain_openai
|
| 10 |
langchain_community
|
|
|
|
|
|
|
|
|
| 8 |
pandas
|
| 9 |
langchain_openai
|
| 10 |
langchain_community
|
| 11 |
+
pydub
|
| 12 |
+
openai-whisper
|
state.py
CHANGED
|
@@ -12,4 +12,7 @@ class AgentState(TypedDict, total=False):
|
|
| 12 |
ocr_result: str
|
| 13 |
excel_result: str
|
| 14 |
final_answer: str
|
| 15 |
-
user_input: str
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
ocr_result: str
|
| 13 |
excel_result: str
|
| 14 |
final_answer: str
|
| 15 |
+
user_input: str
|
| 16 |
+
audio_path: str
|
| 17 |
+
transcript: str
|
| 18 |
+
audio_transcript: str
|
tools.py
CHANGED
|
@@ -79,4 +79,62 @@ def run_tools(state: AgentState, tool_out: AgentState) -> AgentState:
|
|
| 79 |
This node should be wired as its own graph node, not as a transition function.
|
| 80 |
"""
|
| 81 |
new_state = {**state, **tool_out}
|
| 82 |
-
return new_state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
This node should be wired as its own graph node, not as a transition function.
|
| 80 |
"""
|
| 81 |
new_state = {**state, **tool_out}
|
| 82 |
+
return new_state
|
| 83 |
+
|
| 84 |
+
import os
import tempfile

import whisper
from pydub import AudioSegment
from pydub.utils import make_chunks

# Loaded eagerly at import time so the first transcription call is fast.
# NOTE(review): this downloads/loads the model even when no audio is ever
# transcribed — consider lazy loading if import-time cost matters.
_whisper_model = whisper.load_model("base")

# A *local* Whisper model has no per-request size limit (the ~25 MB cap
# applies only to OpenAI's hosted API). We still chunk very large files so a
# single transcribe() call does not decode the whole waveform at once.
_CHUNK_THRESHOLD_BYTES = 25 * 1024 * 1024
_CHUNK_LENGTH_MS = 120 * 1000  # split large files into 2-minute chunks


def audio_transcriber_tool(state: AgentState) -> AgentState:
    """
    LangGraph tool for transcribing audio via a local Whisper model.

    Expects:
        state["audio_path"]: path to a .wav/.mp3/.m4a file.

    Returns:
        {
            "audio_path": None,
            "transcript": "<full transcribed text>"
        }
        On transcription failure, "transcript" carries an error string so the
        graph keeps running. If no valid audio_path is found, returns {} to
        signal "no-op."
    """
    path = state.get("audio_path", "")
    if not path or not os.path.exists(path):
        return {}

    try:
        if os.path.getsize(path) <= _CHUNK_THRESHOLD_BYTES:
            # Small file: transcribe in one shot.
            result = _whisper_model.transcribe(path)
            text = result["text"].strip()
        else:
            # Large file: split into fixed-length chunks and transcribe each.
            audio = AudioSegment.from_file(path)
            chunks = make_chunks(audio, _CHUNK_LENGTH_MS)

            transcripts = []
            for chunk in chunks:
                # Use a unique temp file (not a predictable name in the CWD)
                # and guarantee cleanup even if transcribe() raises.
                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
                try:
                    tmp.close()
                    chunk.export(tmp.name, format="wav")
                    res = _whisper_model.transcribe(tmp.name)
                    transcripts.append(res["text"].strip())
                finally:
                    os.remove(tmp.name)
            text = "\n".join(transcripts)

    except Exception as e:
        # Surface the failure to the downstream LLM instead of crashing the
        # graph node; best-effort behavior is intentional here.
        text = f"Error during transcription: {e}"

    return {
        "audio_path": None,
        "transcript": text
    }