Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import os | |
| import shutil | |
| from gradio_client import Client, handle_file # handle_file might be used by the agent | |
| # Use InferenceClientModel instead of HfApiModel | |
| from smolagents import Tool, CodeAgent, InferenceClientModel, ToolCollection # Tool is needed for subclassing | |
| import uuid | |
| import httpx # Often a dependency for HTTP clients, good to have | |
| from tenacity import retry, stop_after_attempt, wait_exponential | |
| from huggingface_hub import list_spaces | |
| from PIL import Image # For potential image manipulation by the agent | |
| import traceback # For more detailed error logging if needed | |
# Predefined Hugging Face Spaces that are wrapped as agent tools at startup.
# Every entry provides:
#   repo_id     - the Space to connect to
#   name        - the tool name exposed to the agent
#   description - the tool description shown to the agent
#   api_name    - the Space API endpoint to call
spaces = [
    {
        "repo_id": "black-forest-labs/FLUX.1-schnell",
        "name": "image_generator_flux_schnell",
        "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
        "api_name": "/infer",
    },
    {
        "repo_id": "Remsky/Kokoro-TTS-Zero",
        "name": "text_to_speech_kokoro",
        "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
        "api_name": "/generate_speech_from_ui",
    },
    {
        "repo_id": "opendatalab/MinerU",
        "name": "pdf_text_extraction_mineru",
        "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
        "api_name": "/to_pdf",
    },
]
# Create tools from predefined Spaces with retry logic.
#
# The helper is defined once (not once per loop iteration) and is wrapped in a
# tenacity retry so that a transient failure while connecting to a Space does
# not immediately drop the tool. `reraise=True` makes the last underlying
# exception surface to the caller instead of tenacity's RetryError, so the
# failure message printed below still names the real cause.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    reraise=True,
)
def create_tool_with_retry(repo_id, name, description, api_name):
    """Build a smolagents tool from a Hugging Face Space.

    Args:
        repo_id: Id of the Space to wrap.
        name: Tool name to expose to the agent.
        description: Tool description to expose to the agent.
        api_name: Space API endpoint to call (may be None).

    Returns:
        The tool returned by `Tool.from_space`.

    Raises:
        Exception: whatever `Tool.from_space` raised on the final attempt.
    """
    print(f"Attempting to create tool: '{name}' from space: {repo_id} with api_name: {api_name}")
    new_tool = Tool.from_space(repo_id, name=name, description=description, api_name=api_name)
    # Sanity check only: a mismatch is reported but the tool is still returned.
    if not hasattr(new_tool, 'name') or new_tool.name != name:
        print(f"WARNING: Tool '{name}' from space {repo_id} might have a name mismatch or missing name attribute after creation. Actual name: {getattr(new_tool, 'name', 'MISSING')}")
    return new_tool


tools = []
for space_info in spaces:
    repo_id = space_info['repo_id']
    # Fall back to a name/description derived from the repo id when omitted.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')
    try:
        tool_instance = create_tool_with_retry(repo_id, name, description, api_name)
    except Exception as e:
        # Best effort: a Space that cannot be loaded must not prevent the app
        # from starting with the remaining tools.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
    else:
        tools.append(tool_instance)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")
| # --- Refactored HuggingFaceSpaceSearcherTool --- | |
class HuggingFaceSpaceSearcherTool(Tool):
    """Agent tool that searches the Hugging Face Hub for Spaces matching a query.

    The result is a human-readable text listing (id, description, likes, last
    modification) followed by instructions telling the agent how to turn one
    of the Spaces into a tool.
    """

    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query for Hugging Face Spaces."
        },
        "top_k": {
            "type": "integer",
            "description": "The number of top results to return (default is 3).",
            "nullable": True
        }
    }
    output_type = "string"

    def forward(self, query: str, top_k: int = 3) -> str:
        """Search Spaces by `query` and describe the `top_k` most liked ones.

        Args:
            query: Search string passed to `list_spaces`.
            top_k: Maximum number of results; None means 3 and values below 1
                are raised to 1.

        Returns:
            A formatted listing of the Spaces found, a "no results" message,
            or an "Error searching Spaces: ..." message if anything fails.
            Errors are returned as text rather than raised.
        """
        try:
            # A zero or negative limit would produce an empty or invalid search,
            # so always ask for at least one result.
            actual_top_k = 3 if top_k is None else max(1, top_k)
            print(f"Searching spaces with query: {query}, top_k: {actual_top_k}")
            spaces_found = list(list_spaces(search=query, full=True, limit=actual_top_k, sort="likes", direction=-1))
            if not spaces_found:
                return "No Spaces found for your query."

            parts = ["Found the following Spaces (sorted by likes):\n"]
            for position, space_data in enumerate(spaces_found, start=1):
                # NOTE(review): newer huggingface_hub releases appear to expose
                # `card_data` / `last_modified` next to the legacy camelCase
                # names; both spellings are tried so either works - confirm
                # against the installed version.
                card = getattr(space_data, 'card_data', None) or getattr(space_data, 'cardData', None)
                description = None
                if card and 'description' in card:
                    description = card['description']
                # Fall back to the title, then to a placeholder, when the card
                # has no (or an empty) description.
                description = (
                    description
                    or getattr(space_data, 'title', None)
                    or "No description provided."
                )
                last_modified = getattr(space_data, 'last_modified', None)
                if last_modified is None:
                    last_modified = getattr(space_data, 'lastModified', 'N/A')
                parts.append(
                    f"{position}. ID: {space_data.id}\n"
                    f" Description: {description}\n"
                    f" Likes: {getattr(space_data, 'likes', 'N/A')}\n"
                    f" Last Modified: {last_modified}\n\n"
                )
            parts.append(
                "\nTo use one of these, you **MUST** first try creating a tool using "
                "`Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name')`. "
                "Then call that new tool: `result = custom_tool_name(argument_name=value)`. "
                "The arguments depend on the specific Space. If `Tool.from_space` fails, "
                "the Space might not have a compatible public API for this method."
            )
            return "".join(parts)
        except Exception as e:
            # Tool boundary: report the failure to the agent as text so it can
            # react, instead of aborting the whole agent step.
            print(f"Error searching Spaces: {str(e)}")
            return f"Error searching Spaces: {str(e)}"
# Make the Space search tool available next to the predefined Space tools.
space_search_tool = HuggingFaceSpaceSearcherTool()
tools.append(space_search_tool)

# Language model that drives the agent.
model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")

# Code-writing agent: it gets every tool collected above plus the smolagents
# base tools, and its generated code may import the modules listed here.
agent = CodeAgent(
    tools=tools,
    model=model,
    additional_authorized_imports=[
        'PIL',
        'Pillow',
        'os',
        'sys',
        'numpy',
        'huggingface_hub',
        'gradio_client',
        'uuid',
    ],
    add_base_tools=True,
)
# Instructions prepended to every user prompt before it is handed to the agent.
#
# Step 6 and the examples tell the agent to finish with `final_answer(...)`.
# The previous wording asked for a top-level `return` and told the agent to
# ignore the resulting "ReturnException".
# NOTE(review): as far as I can tell from the smolagents docs, a run only ends
# when `final_answer()` is called and a top-level `return` is reported as an
# execution error - confirm against the installed smolagents version.
AGENT_INSTRUCTIONS = """You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.
Follow these steps:
1. **Understand the Request:** Carefully analyze the user's prompt. Identify the core task and any specific requirements or inputs.
2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
* For the 'huggingface_space_searcher' tool, call it with direct keyword arguments like: `huggingface_space_searcher(query="your search term", top_k=3)`. The `query` is mandatory. `top_k` is optional and defaults to 3 if not provided.
3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool as described above. The search results will explicitly tell you to use `Tool.from_space()`.
4. **Select and Instantiate a Space Tool (CRITICAL PRIORITY):** From the search results, choose the most promising Space. **You MUST attempt to use this Space by creating a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_and_descriptive_tool_name')`. DO NOT use `gradio_client.Client()` directly unless `Tool.from_space()` explicitly fails for that Space.**
* If `Tool.from_space()` succeeds, you now have a new tool. Call this new tool with the appropriate arguments for that Space (e.g., `newly_created_tool(prompt="some prompt")`).
* If `Tool.from_space()` fails (e.g., raises an exception), print a message saying it failed and then you may consider trying the next Space from your search results using `Tool.from_space()` again, or falling back to a predefined tool if appropriate. Only consider `gradio_client.Client()` as an absolute last resort if all other methods fail and you have a very specific understanding of the Space's raw API.
5. **Execute the Tool:** Call the tool (predefined, or dynamically created via `Tool.from_space()`) with the necessary arguments.
* **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None. Pass these file paths as arguments to tools that require them.
* **Imports in Generated Code:** If your code block for execution uses modules like `os` or `uuid`, **you must include the import statements (e.g., `import os`, `import uuid`) within that specific code block.**
6. **Output Management & Concluding the Task:**
* When you have the final result (e.g., a text string, a filepath from a tool), conclude the task by calling `final_answer(my_result_variable)`.
* Do NOT use a top-level `return` statement in your code blocks. `return` is only valid inside a function; at the top level it raises an error (you may see "ReturnException" in system logs) and the result is lost.
* **If the result is a file, pass the filepath string itself to `final_answer()`** (not a sentence describing it), so the system can display the file to the user.
7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.
Example of the **CORRECT AND PREFERRED** way to use a discovered Space:
```python
# User prompt: "Find a space that can make an image of a cat and use it."
#
# Step 1: Search for the space
# search_results = huggingface_space_searcher(query="text to image cat", top_k=1)
# print(search_results) # Assume 'someuser/cat-image-generator' is found.
#
# Step 2: Try to create a tool from the discovered space using Tool.from_space()
# try:
# cat_tool = Tool.from_space(repo_id="someuser/cat-image-generator", name="cat_image_generator_tool")
# # Now use the newly created tool. Arguments depend on the Space's API.
# # Let's assume it takes a 'prompt'.
# image_filepath = cat_tool(prompt="A fluffy siamese cat, cyberpunk style")
# final_answer(image_filepath) # Pass the filepath directly. This is the final result for this task.
# except Exception as e:
# print(f"Failed to create or use tool from Space 'someuser/cat-image-generator' using Tool.from_space(): {e}")
# # If Tool.from_space() fails, DO NOT immediately try gradio_client.Client().
# # Instead, consider another space or a predefined tool.
# # print("Could not use the discovered space via Tool.from_space(). Trying a fallback...") (then try another step)
```
Example of using a predefined tool that returns a filepath:
```python
# User prompt: "Generate an image of a happy robot."
# (Assuming 'image_generator_flux_schnell' is a predefined tool)
#
# image_filepath = image_generator_flux_schnell(prompt="A happy robot coding on a laptop, cyberpunk style")
# final_answer(image_filepath) # Pass the filepath string directly. This is the final result for this task.
```
Always ensure your generated Python code is complete and directly callable.
You have access to `PIL.Image` (as `Image`), `os`, `sys`, `numpy`, `huggingface_hub`, `gradio_client`, `uuid`. Remember to import them if you use them in a code block.
"""
| # Gradio interface function | |
# Output components in the order the click/submit handlers expect them.
_OUTPUT_KEYS = ("image", "file", "path", "audio", "model3d", "text")

# File extension -> output component able to preview that kind of file.
_PREVIEW_KEY_BY_EXTENSION = {
    '.png': "image", '.jpg': "image", '.jpeg': "image", '.gif': "image", '.webp': "image",
    '.mp3': "audio", '.wav': "audio", '.ogg': "audio", '.flac': "audio",
    '.glb': "model3d",
}


def _collect_file_inputs(**candidate_paths):
    """Return only the provided (truthy) file paths, as strings keyed by name."""
    return {key: str(path) for key, path in candidate_paths.items() if path}


def _build_output_updates(result):
    """Turn the agent result into one Gradio update per output component."""
    updates = {key: gr.update(value=None, visible=False) for key in _OUTPUT_KEYS}
    updates["text"] = gr.update(value=None, visible=True)

    if result is None:
        updates["text"] = gr.update(value="Agent returned no result (None).", visible=True)
    elif not isinstance(result, str):
        updates["text"] = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)
    elif not os.path.isfile(result):
        # Plain text answer.
        updates["text"] = gr.update(value=result, visible=True)
    else:
        # The answer is a path to an existing file: always offer it for
        # download, and preview it when the extension is a known media type.
        file_path = result
        updates["file"] = gr.update(value=file_path, visible=True)
        updates["path"] = gr.update(value=file_path, visible=True)
        ext = os.path.splitext(file_path.lower())[1]
        preview_key = _PREVIEW_KEY_BY_EXTENSION.get(ext)
        if preview_key is not None:
            updates[preview_key] = gr.update(value=file_path, visible=True)
        else:
            updates["text"] = gr.update(value=f"Output is a file: {os.path.basename(file_path)}. Download it.", visible=True)
    return tuple(updates[key] for key in _OUTPUT_KEYS)


def gradio_interface(user_prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on the user's prompt and map its answer to the UI outputs.

    Args:
        user_prompt: Text typed by the user.
        input_image_path, input_audio_path, input_video_path,
        input_3d_model_path, input_file_path: Optional uploaded file paths.
            Those that are provided are passed to the agent under these same
            names.
        progress: Gradio progress tracker.

    Returns:
        A 6-tuple of values for (image, file, path, audio, model3d, text).
        On any exception the first five are cleared and hidden, and the text
        output carries the error message; nothing is raised to Gradio.
    """
    try:
        progress(0, desc="Initializing...")
        print("Progress: 0% - Initializing...")
        full_prompt_with_instructions = f"{AGENT_INSTRUCTIONS}\n\nUSER PROMPT: {user_prompt}"

        file_inputs = _collect_file_inputs(
            input_image_path=input_image_path,
            input_audio_path=input_audio_path,
            input_video_path=input_video_path,
            input_3d_model_path=input_3d_model_path,
            input_file_path=input_file_path,
        )
        print(f"Passing file inputs to the agent: {file_inputs}")

        progress(0.2, desc="Agent processing request...")
        # `additional_args` is the documented smolagents way to expose extra
        # variables to the agent's generated code. It replaces the earlier
        # attempt to patch the interpreter's globals through attributes the
        # agent may not have, which also reset those globals to {} when no
        # snapshot had been taken. None is passed when nothing was uploaded.
        result = agent.run(full_prompt_with_instructions, additional_args=file_inputs or None)

        output_updates = _build_output_updates(result)
        progress(1, desc="Done!")
        print("Progress: 100% - Done!")
        return output_updates
    except Exception as e:
        # UI boundary: show the error in the text box instead of crashing the
        # request, and hide the other outputs so stale results do not linger.
        error_msg = f"An error occurred: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        progress(1, desc="Error occurred.")
        hidden = tuple(gr.update(value=None, visible=False) for _ in range(5))
        return hidden + (gr.update(value=error_msg, visible=True),)
# Create the Gradio app: a prompt box, optional file uploads, one output
# component per result kind, and a set of example prompts.
# NOTE(review): the emoji in the title and on the button were mojibake in the
# source ("π€", "π "); they are restored here as 🤖 and 🚀 - confirm these are
# the intended glyphs.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 🤖 Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks...")

    with gr.Row():
        prompt_input = gr.Textbox(label="Enter your prompt", placeholder="e.g., 'Generate an image of a futuristic city'", lines=3, elem_id="user_prompt_textbox")

    with gr.Accordion("Optional File Inputs", open=False):
        # gr.Group gives a visual separation between the input groups.
        with gr.Group():
            with gr.Row():
                input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image_upload")
                input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio_upload")
        with gr.Group():
            with gr.Row():
                input_video = gr.Video(label="Video Input", sources=["upload"], elem_id="input_video_upload")
                input_model3d = gr.Model3D(label="3D Model Input", elem_id="input_model3d_upload")
        with gr.Group():
            with gr.Row():
                input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file_upload")

    submit_button = gr.Button("🚀 Generate", variant="primary", elem_id="submit_button_generate")

    gr.Markdown("### Outputs:")
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True, elem_id="output_image_display")
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True, elem_id="output_audio_display")
    with gr.Row():
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, elem_id="output_model3d_display")
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=20, elem_id="output_text_log")
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False, elem_id="output_file_download")
        path_output = gr.Textbox(label="Output File Path", interactive=False, visible=False, elem_id="output_file_path_text")

    # Inputs/outputs shared by the button click and the textbox submit events.
    # The output order must match the tuple returned by gradio_interface.
    event_inputs = [prompt_input, input_image, input_audio, input_video, input_model3d, input_file]
    event_outputs = [image_output, file_output, path_output, audio_output, model3d_output, text_output]

    # Clicking the button and pressing Enter in the prompt box do the same thing.
    for register_event in (submit_button.click, prompt_input.submit):
        register_event(
            fn=gradio_interface,
            inputs=event_inputs,
            outputs=event_outputs,
        )

    # One row per example, in the same column order as event_inputs.
    example_rows = [
        ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
        ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
        ["Search for a Hugging Face Space that can perform image captioning. Caption the following image.", "Wizard Oasis.webp", None, None, None, None],
        ["I have an image of a robot. Make this image Ghibli style.", "Happy Robot Coding.webp", None, None, None, None],
        ["Generate an EDM jazz song about a futuristic city.", None, None, None, None, None],
        ["Generate audio of a dog barking.", None, None, None, None, None],
    ]
    # Keep only examples whose referenced files are actually present.
    # NOTE(review): presumably Gradio fails while preparing an example whose
    # file is missing - confirm; either way such an example could not work.
    available_examples = [
        row for row in example_rows
        if all(cell is None or os.path.isfile(cell) for cell in row[1:])
    ]
    gr.Examples(
        examples=available_examples,
        inputs=event_inputs,
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file first or ensure the named file exists in the Space's root)",
    )

if __name__ == "__main__":
    app.launch(debug=True)