YigitSekerci committed on
Commit
5e87361
·
1 Parent(s): f4438c7

try new agent style

Browse files
src/agent.py CHANGED
@@ -1,14 +1,62 @@
 
 
1
  from dotenv import load_dotenv
2
- from functools import partial
3
-
4
  from langchain_mcp_adapters.client import MultiServerMCPClient
5
- from langgraph.graph import StateGraph, END, START
6
 
7
- from .state import AgentState, InputState, OutputState
8
- from .nodes.chat import chat_node, chat_node_router
9
- from .nodes.planner import planner_node
10
- from .nodes.processor import processor_node
11
- from .nodes.validator import validator_node, validator_node_router
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  class AudioAgent:
14
  def __init__(
@@ -24,105 +72,30 @@ class AudioAgent:
24
  self._client = MultiServerMCPClient({
25
  "audio-tools": {"url": self.server_url, "transport": "sse"}
26
  })
 
 
27
 
28
- @property
29
- def is_initialized(self) -> bool:
30
- return self.graph is not None
31
-
32
- async def _build_graph(self) -> None:
33
- """Build the LangGraph workflow."""
34
-
35
- _graph = StateGraph(
36
- AgentState,
37
- input=InputState,
38
- output=OutputState
39
- )
40
 
41
- _graph.add_node("chat", chat_node)
42
- _graph.add_conditional_edges(
43
- "chat",
44
- chat_node_router,
45
- {
46
- "planner": "planner",
47
- "end": END
48
- }
49
  )
50
 
51
- _graph.add_node("planner", planner_node)
52
- _graph.add_edge("planner", "audio_processor")
53
-
54
- processor_node_with_tools = partial(processor_node, tools=self.tools)
55
- _graph.add_node("audio_processor", processor_node_with_tools)
56
- # TODO: add validator edge to here
57
- _graph.add_edge("audio_processor", "chat")
58
-
59
- _graph.add_node("validator", validator_node)
60
- _graph.add_conditional_edges(
61
- "validator",
62
- validator_node_router,
63
- {
64
- "chat": "chat",
65
- "planner": "planner"
66
- }
67
- )
68
-
69
- _graph.add_edge(START, "chat")
70
- _graph.add_edge("chat", END)
71
- self.graph = _graph.compile()
72
-
73
- async def initialize(self) -> None:
74
- """Initialize the LangGraph workflow with audio tools."""
75
- if self.is_initialized:
76
- return
77
-
78
- self.tools = await self._client.get_tools()
79
- if not self.tools:
80
- raise RuntimeError("No tools available from MCP server")
81
 
82
- await self._build_graph()
 
 
 
83
 
84
- def _extract_audio_paths(self, user_message: str) -> tuple[str, list[str]]:
85
- """Extract audio file paths from user message and return cleaned message."""
86
- audio_files = []
87
- lines = user_message.split('\n')
88
- clean_lines = []
89
-
90
- for line in lines:
91
- if line.strip().startswith('Audio file:'):
92
- # Extract the file path
93
- audio_path = line.replace('Audio file:', '').strip()
94
- audio_files.append(audio_path)
95
- else:
96
- clean_lines.append(line)
97
-
98
- clean_message = '\n'.join(clean_lines).strip()
99
- return clean_message, audio_files
100
-
101
- async def chat(self, user_message: str):
102
- """Stream chat responses with node information."""
103
- if not self.is_initialized:
104
- await self.initialize()
105
-
106
- # Extract audio file paths from the message
107
- clean_message, audio_files = self._extract_audio_paths(user_message)
108
-
109
- # Set up initial state
110
- initial_state = {
111
- "user_input": clean_message,
112
- "input_audio_files": audio_files,
113
- "steps_details": [],
114
- "plan": "",
115
- "final_response": "",
116
- "requires_processing": False,
117
- "validator_feedback": "",
118
- "output_audio_files": []
119
- }
120
-
121
- # Stream the graph execution
122
- return await self.graph.ainvoke(initial_state, stream_mode="values")
123
-
124
- def draw_graph(self) -> None:
125
- """Draw the graph to a file."""
126
- graph_image = self.graph.get_graph().draw_mermaid_png()
127
- with open("graph.png", "wb") as f:
128
- f.write(graph_image)
 
1
+ from langgraph.prebuilt import create_react_agent
2
+ from pydantic import BaseModel, Field
3
  from dotenv import load_dotenv
 
 
4
  from langchain_mcp_adapters.client import MultiServerMCPClient
 
5
 
6
class AgentOutput(BaseModel):
    """Structured response schema the ReAct agent must produce."""

    # Human-readable answer shown to the user.
    final_response: str = Field(default="", description="The final response to the user.")
    # Paths/URLs of audio files produced during processing.
    # default_factory ensures each instance gets its own list instead of
    # sharing one mutable default object.
    output_audio_files: list[str] = Field(default_factory=list, description="The output audio files.")
9
+
10
# System prompt for the ReAct audio agent: restricts conversation to the
# audio domain and prescribes an analyze -> plan -> execute -> validate ->
# respond workflow. (Runtime string: content reproduced unchanged.)
system_prompt = """You are an expert Audio Processing Assistant with specialized capabilities in audio manipulation, analysis, and editing. Your primary purpose is to help users with audio-related tasks and provide knowledgeable assistance in the audio domain.

## Core Behavior Guidelines:

### Conversation Scope:
- ONLY engage in conversations related to audio processing, audio editing, sound engineering, music production, audio analysis, audio formats, and related audio technologies
- If a user asks about topics outside the audio domain, politely decline and redirect them back to audio-related assistance
- Be conversational, friendly, and helpful when discussing audio topics
- Share your expertise about audio concepts, techniques, and best practices when relevant

### Audio Processing Workflow:
When a user requests audio processing and provides input files, follow this structured approach:

1. **ANALYSIS PHASE:**
- Analyze the user's request to understand their goals
- Examine the provided input audio files if available
- Identify what audio processing operations are needed

2. **PLANNING PHASE:**
- Create a clear, step-by-step plan for the audio processing task
- Explain your plan to the user before execution
- Ensure the plan addresses their specific requirements

3. **EXECUTION PHASE:**
- Use the available audio tools to implement your plan
- Process the audio files according to the planned steps
- Handle any errors or unexpected results gracefully

4. **VALIDATION PHASE:**
- Verify that the processed audio meets the user's requirements
- Check the quality and correctness of the output
- Test that the processing achieved the desired results

5. **RESPONSE PHASE:**
- Provide a clear summary of what was accomplished
- Include the output audio files in your response
- Offer additional suggestions or next steps if relevant

## Available Context:
- You have access to input_audio_files when provided by the user
- You can generate output_audio_files through your audio processing tools
- Use your tools effectively to analyze, edit, convert, and manipulate audio

## Response Format:
- Always provide helpful, accurate information about audio topics
- When processing audio, be transparent about your process and results
- Include relevant technical details when appropriate
- Maintain a professional yet approachable tone

Remember: Stay focused on audio-related assistance and use your specialized tools to help users achieve their audio processing goals efficiently and effectively."""
60
 
61
  class AudioAgent:
62
  def __init__(
 
72
  self._client = MultiServerMCPClient({
73
  "audio-tools": {"url": self.server_url, "transport": "sse"}
74
  })
75
+
76
+ self.agent = None
77
 
78
async def build_agent(self):
    """Fetch the MCP audio tools and build a ReAct agent around them.

    Returns:
        A compiled LangGraph ReAct agent configured with the audio tools,
        the audio-domain system prompt, and structured output.

    Raises:
        RuntimeError: If the MCP server exposes no tools (an agent without
            tools could never process audio, so fail fast).
    """
    tools = await self._client.get_tools()
    if not tools:
        raise RuntimeError("No tools available from MCP server")

    # NOTE(review): model is hard-coded here; presumably it should use
    # the constructor's model_name argument instead -- confirm intent.
    agent = create_react_agent(
        model="gpt-4.1",
        tools=tools,
        prompt=system_prompt,
        response_format=AgentOutput,
    )
    return agent
89
+
90
async def run_agent(self, user_input: str, input_audio_files: list[str]):
    """Run one user request through the ReAct agent, building it lazily.

    Args:
        user_input: The user's request text.
        input_audio_files: URLs/paths of uploaded audio files (may be empty).

    Returns:
        The agent result dict. When the agent produced a structured
        response, the parsed fields are flattened in as "final_response"
        and "output_audio_files" -- the keys the UI layer reads via
        result.get("final_response") / result.get("output_audio_files").
    """
    if self.agent is None:
        # Build once and cache; later calls reuse the same agent.
        self.agent = await self.build_agent()

    input_context = f"""
User Request: {user_input}
Input Audio Files: {', '.join(input_audio_files) if input_audio_files else 'None'}
"""

    result = await self.agent.ainvoke(
        {"messages": [{"role": "user", "content": input_context}]}
    )

    # create_react_agent(response_format=...) returns the parsed output under
    # "structured_response", not as top-level keys. Flatten it so callers that
    # expect "final_response" / "output_audio_files" keep working.
    structured = result.get("structured_response")
    if structured is not None:
        return {
            **result,
            "final_response": structured.final_response,
            "output_audio_files": structured.output_audio_files,
        }
    return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/nodes/agent.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from functools import partial
3
+
4
+ from langchain_mcp_adapters.client import MultiServerMCPClient
5
+ from langgraph.graph import StateGraph, END, START
6
+
7
+ from .state import AgentState, InputState, OutputState
8
+ from .chat import chat_node, chat_node_router
9
+ from .planner import planner_node
10
+ from .processor import processor_node
11
+ from .validator import validator_node, validator_node_router
12
+
13
class AudioAgent:
    """Graph-based audio agent: chat -> planner -> processor (-> validator) workflow."""

    def __init__(
        self,
        model_name: str = "gpt-4o",
        server_url: str = "https://agents-mcp-hackathon-audioeditor.hf.space/gradio_api/mcp/sse",
    ):
        load_dotenv()
        self.model_name = model_name
        self.server_url = server_url
        self.graph = None
        # Populated by initialize(); declared here so attribute access on an
        # un-initialized instance fails predictably instead of AttributeError.
        self.tools = None

        self._client = MultiServerMCPClient({
            "audio-tools": {"url": self.server_url, "transport": "sse"}
        })

    @property
    def is_initialized(self) -> bool:
        # The compiled graph is only assigned at the end of _build_graph().
        return self.graph is not None

    async def _build_graph(self) -> None:
        """Build the LangGraph workflow."""

        _graph = StateGraph(
            AgentState,
            input=InputState,
            output=OutputState
        )

        _graph.add_node("chat", chat_node)
        # chat either hands off to the planner or ends the turn; the router
        # decides, so no unconditional chat -> END edge is added (one would
        # fan out to END on every turn in parallel with the planner branch).
        _graph.add_conditional_edges(
            "chat",
            chat_node_router,
            {
                "planner": "planner",
                "end": END
            }
        )

        _graph.add_node("planner", planner_node)
        _graph.add_edge("planner", "audio_processor")

        # Bind the MCP tools into the processor node at graph-build time.
        processor_node_with_tools = partial(processor_node, tools=self.tools)
        _graph.add_node("audio_processor", processor_node_with_tools)
        # TODO: route audio_processor -> validator here instead of straight
        # back to chat; until then the validator node below is unreachable.
        _graph.add_edge("audio_processor", "chat")

        _graph.add_node("validator", validator_node)
        _graph.add_conditional_edges(
            "validator",
            validator_node_router,
            {
                "chat": "chat",
                "planner": "planner"
            }
        )

        _graph.add_edge(START, "chat")
        self.graph = _graph.compile()

    async def initialize(self) -> None:
        """Initialize the LangGraph workflow with audio tools."""
        if self.is_initialized:
            return

        self.tools = await self._client.get_tools()
        if not self.tools:
            raise RuntimeError("No tools available from MCP server")

        await self._build_graph()

    def _extract_audio_paths(self, user_message: str) -> tuple[str, list[str]]:
        """Extract audio file paths from user message and return cleaned message."""
        audio_files = []
        lines = user_message.split('\n')
        clean_lines = []

        for line in lines:
            if line.strip().startswith('Audio file:'):
                # Strip only the leading marker; removeprefix (unlike
                # replace()) cannot corrupt a path that itself happens to
                # contain the literal text "Audio file:".
                audio_path = line.strip().removeprefix('Audio file:').strip()
                audio_files.append(audio_path)
            else:
                clean_lines.append(line)

        clean_message = '\n'.join(clean_lines).strip()
        return clean_message, audio_files

    async def chat(self, user_message: str):
        """Run one user turn through the graph and return the final state."""
        if not self.is_initialized:
            await self.initialize()

        # Extract audio file paths from the message
        clean_message, audio_files = self._extract_audio_paths(user_message)

        # Set up initial state
        initial_state = {
            "user_input": clean_message,
            "input_audio_files": audio_files,
            "steps_details": [],
            "plan": "",
            "final_response": "",
            "requires_processing": False,
            "validator_feedback": "",
            "output_audio_files": []
        }

        # Run the graph to completion and return the final state values.
        return await self.graph.ainvoke(initial_state, stream_mode="values")

    def draw_graph(self) -> None:
        """Draw the graph to a file."""
        graph_image = self.graph.get_graph().draw_mermaid_png()
        with open("graph.png", "wb") as f:
            f.write(graph_image)
src/nodes/chat.py CHANGED
@@ -1,7 +1,7 @@
1
  from langchain_openai import ChatOpenAI
2
  from langchain_core.prompts import ChatPromptTemplate
3
  from langchain_core.runnables import RunnableParallel
4
- from src.state import AgentState, ChatInputState, ChatOutputState
5
  from operator import itemgetter
6
 
7
  def chat_node(state: ChatInputState) -> ChatOutputState:
 
1
  from langchain_openai import ChatOpenAI
2
  from langchain_core.prompts import ChatPromptTemplate
3
  from langchain_core.runnables import RunnableParallel
4
+ from nodes.state import AgentState, ChatInputState, ChatOutputState
5
  from operator import itemgetter
6
 
7
  def chat_node(state: ChatInputState) -> ChatOutputState:
src/nodes/planner.py CHANGED
@@ -1,7 +1,7 @@
1
  from langchain_openai import ChatOpenAI
2
  from langchain_core.prompts import ChatPromptTemplate
3
  from langchain_core.runnables import RunnableParallel
4
- from src.state import AgentState, PlannerInputState, PlannerOutputState
5
  from operator import itemgetter
6
 
7
  def planner_node(state: PlannerInputState) -> PlannerOutputState:
 
1
  from langchain_openai import ChatOpenAI
2
  from langchain_core.prompts import ChatPromptTemplate
3
  from langchain_core.runnables import RunnableParallel
4
+ from nodes.state import AgentState, PlannerInputState, PlannerOutputState
5
  from operator import itemgetter
6
 
7
  def planner_node(state: PlannerInputState) -> PlannerOutputState:
src/nodes/processor.py CHANGED
@@ -1,4 +1,4 @@
1
- from src.state import ProcessorInputState, ProcessorOutputState
2
  from langgraph.prebuilt import create_react_agent
3
  from pydantic import BaseModel, Field
4
 
 
1
+ from nodes.state import ProcessorInputState, ProcessorOutputState
2
  from langgraph.prebuilt import create_react_agent
3
  from pydantic import BaseModel, Field
4
 
src/{state.py → nodes/state.py} RENAMED
File without changes
src/nodes/validator.py CHANGED
@@ -1,6 +1,6 @@
1
  from langchain_openai import ChatOpenAI
2
  from langchain_core.prompts import ChatPromptTemplate
3
- from src.state import AgentState, ValidatorInputState, ValidatorOutputState
4
  from operator import itemgetter
5
  from langchain_core.runnables import RunnableParallel
6
 
 
1
  from langchain_openai import ChatOpenAI
2
  from langchain_core.prompts import ChatPromptTemplate
3
+ from nodes.state import AgentState, ValidatorInputState, ValidatorOutputState
4
  from operator import itemgetter
5
  from langchain_core.runnables import RunnableParallel
6
 
src/ui.py CHANGED
@@ -33,31 +33,42 @@ def user_input(user_message, audio_files, history):
33
 
34
  audio_file_urls.append(get_share_url(file_path))
35
 
 
36
  if audio_file_urls:
37
- audio_list = "\n".join([f"Audio file: {url}" for url in audio_file_urls])
38
- combined_message = f"{user_message}\n\n{audio_list}" if user_message.strip() else audio_list
39
  else:
40
  combined_message = user_message
41
 
42
  history.append({"role": "user", "content": combined_message})
43
- return "", [], history
44
 
45
- async def bot_response(history):
46
  """
47
- Generate bot response using the simple chat method
48
  """
49
  if not history or history[-1]["role"] != "user":
50
  return history
51
 
 
52
  user_message = history[-1]["content"]
53
 
54
- try:
55
- # Initialize agent if not already done
56
- if not agent.is_initialized:
57
- await agent.initialize()
 
 
 
 
58
 
59
- # Get the response from the agent
60
- result = await agent.chat(user_message)
 
 
 
 
 
61
 
62
  # Extract the final response and audio files from the result
63
  final_response = result.get("final_response", "")
@@ -80,14 +91,14 @@ async def bot_response(history):
80
 
81
  return history
82
 
83
- def bot_response_sync(history):
84
  """
85
  Synchronous wrapper for the async bot response
86
  """
87
  loop = asyncio.new_event_loop()
88
  asyncio.set_event_loop(loop)
89
  try:
90
- return loop.run_until_complete(bot_response(history))
91
  finally:
92
  loop.close()
93
 
@@ -110,6 +121,9 @@ def create_interface():
110
  **Supported formats**: MP3, WAV, M4A, FLAC, AAC, OGG
111
  """)
112
 
 
 
 
113
  with gr.Row():
114
  with gr.Column(scale=2):
115
  chatbot = gr.Chatbot(
@@ -184,35 +198,39 @@ def create_interface():
184
 
185
  # Handle user input and bot response
186
  def handle_submit(message, files, history):
187
- return user_input(message, files, history)
 
188
 
189
  msg.submit(
190
  handle_submit,
191
  [msg, audio_files, chatbot],
192
- [msg, audio_files, chatbot],
193
  queue=False
194
  ).then(
195
  bot_response_sync,
196
- chatbot,
197
  chatbot
198
  )
199
 
200
  send_btn.click(
201
  handle_submit,
202
  [msg, audio_files, chatbot],
203
- [msg, audio_files, chatbot],
204
  queue=False
205
  ).then(
206
  bot_response_sync,
207
- chatbot,
208
  chatbot
209
  )
210
 
211
  # Clear chat
 
 
 
212
  clear_btn.click(
213
- lambda: ([], []),
214
  None,
215
- [chatbot, audio_files],
216
  queue=False
217
  )
218
 
 
33
 
34
  audio_file_urls.append(get_share_url(file_path))
35
 
36
+ # For display purposes, show what audio files were uploaded
37
  if audio_file_urls:
38
+ audio_list = "\n".join([f"🎵 Uploaded: {url.split('/')[-1]}" for url in audio_file_urls])
39
+ combined_message = f"{user_message}\n\n{audio_list}" if user_message.strip() else f"Process uploaded audio files:\n{audio_list}"
40
  else:
41
  combined_message = user_message
42
 
43
  history.append({"role": "user", "content": combined_message})
44
+ return "", [], history, audio_file_urls
45
 
46
+ async def bot_response(history, audio_file_urls):
47
  """
48
+ Generate bot response using the test agent
49
  """
50
  if not history or history[-1]["role"] != "user":
51
  return history
52
 
53
+ # Get the actual user message (without the audio file display text)
54
  user_message = history[-1]["content"]
55
 
56
+ # Clean the user message by removing the uploaded file display text
57
+ if "🎵 Uploaded:" in user_message:
58
+ lines = user_message.split('\n')
59
+ clean_lines = []
60
+ for line in lines:
61
+ if not line.strip().startswith('🎵 Uploaded:'):
62
+ clean_lines.append(line)
63
+ user_message = '\n'.join(clean_lines).strip()
64
 
65
+ # If message is empty after cleaning, provide default message
66
+ if not user_message:
67
+ user_message = "Please process these audio files"
68
+
69
+ try:
70
+ # Use the test agent's run_agent method with separate parameters
71
+ result = await agent.run_agent(user_message, audio_file_urls or [])
72
 
73
  # Extract the final response and audio files from the result
74
  final_response = result.get("final_response", "")
 
91
 
92
  return history
93
 
94
def bot_response_sync(history, audio_file_urls):
    """
    Synchronous wrapper for the async bot response.

    Uses asyncio.run(), which creates a fresh event loop, runs the coroutine,
    and always tears the loop down again. The previous manual
    new_event_loop()/set_event_loop()/close() sequence left a *closed* loop
    installed as the thread's current event loop after each call.
    """
    return asyncio.run(bot_response(history, audio_file_urls))
104
 
 
121
  **Supported formats**: MP3, WAV, M4A, FLAC, AAC, OGG
122
  """)
123
 
124
+ # Hidden state to store audio file URLs
125
+ audio_urls_state = gr.State([])
126
+
127
  with gr.Row():
128
  with gr.Column(scale=2):
129
  chatbot = gr.Chatbot(
 
198
 
199
  # Handle user input and bot response
200
def handle_submit(message, files, history):
    # Delegate straight to user_input: it already returns the
    # (message, files, history, audio_urls) 4-tuple Gradio expects,
    # so there is nothing to unpack and repack here.
    return user_input(message, files, history)
203
 
204
  msg.submit(
205
  handle_submit,
206
  [msg, audio_files, chatbot],
207
+ [msg, audio_files, chatbot, audio_urls_state],
208
  queue=False
209
  ).then(
210
  bot_response_sync,
211
+ [chatbot, audio_urls_state],
212
  chatbot
213
  )
214
 
215
  send_btn.click(
216
  handle_submit,
217
  [msg, audio_files, chatbot],
218
+ [msg, audio_files, chatbot, audio_urls_state],
219
  queue=False
220
  ).then(
221
  bot_response_sync,
222
+ [chatbot, audio_urls_state],
223
  chatbot
224
  )
225
 
226
  # Clear chat
227
def clear_chat():
    """Reset the chatbot history, uploaded files, and stored audio URLs."""
    empty_history, empty_uploads, empty_urls = [], [], []
    return empty_history, empty_uploads, empty_urls
229
+
230
  clear_btn.click(
231
+ clear_chat,
232
  None,
233
+ [chatbot, audio_files, audio_urls_state],
234
  queue=False
235
  )
236