# NOTE: The lines that previously appeared here ("Spaces:", "Runtime error",
# file size, commit-hash gutter, and line-number gutter) were Hugging Face
# Spaces file-viewer chrome captured by the page extraction, not part of the
# source file itself.
import gradio as gr
import os
import shutil
from gradio_client import Client, handle_file # handle_file might be used by the agent
# Use InferenceClientModel instead of HfApiModel
from smolagents import Tool, CodeAgent, InferenceClientModel, ToolCollection # Tool is needed for subclassing
import uuid
import httpx # Often a dependency for HTTP clients, good to have
from tenacity import retry, stop_after_attempt, wait_exponential
from huggingface_hub import list_spaces
from PIL import Image # For potential image manipulation by the agent
import traceback # For more detailed error logging if needed
# Define initial tools from Spaces
# Predefined Hugging Face Spaces exposed to the agent as tools. Each entry
# names the Space repo, the tool name the agent will call, a human-readable
# description, and the Gradio API endpoint to invoke on that Space.
spaces = [
    {
        "repo_id": "black-forest-labs/FLUX.1-schnell",
        "name": "image_generator_flux_schnell",
        "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
        "api_name": "/infer",
    },
    {
        "repo_id": "Remsky/Kokoro-TTS-Zero",
        "name": "text_to_speech_kokoro",
        "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
        "api_name": "/generate_speech_from_ui",
    },
    {
        "repo_id": "opendatalab/MinerU",
        "name": "pdf_text_extraction_mineru",
        "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
        "api_name": "/to_pdf",
    },
]
# Create tools from predefined Spaces with retry logic.
# The retry-wrapped factory is defined ONCE here; the original defined (and
# re-decorated) it inside the loop body on every iteration for no benefit.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_tool_with_retry(repo_id, name, description, api_name):
    """Build a smolagents Tool from a Hugging Face Space.

    Retries up to 3 times with exponential backoff (4-10s). Re-raises the
    underlying exception from Tool.from_space once retries are exhausted.
    """
    print(f"Attempting to create tool: '{name}' from space: {repo_id} with api_name: {api_name}")
    new_tool = Tool.from_space(repo_id, name=name, description=description, api_name=api_name)
    # Sanity-check that the created tool actually carries the requested name.
    if not hasattr(new_tool, 'name') or new_tool.name != name:
        print(f"WARNING: Tool '{name}' from space {repo_id} might have a name mismatch or missing name attribute after creation. Actual name: {getattr(new_tool, 'name', 'MISSING')}")
    return new_tool

tools = []
for space_info in spaces:
    repo_id = space_info['repo_id']
    # Fall back to a snake_cased repo name when no explicit tool name is given.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')
    try:
        tool_instance = create_tool_with_retry(repo_id, name, description, api_name)
        tools.append(tool_instance)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")
    except Exception as e:
        # Best-effort startup: one unavailable Space must not break the app.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
# --- Refactored HuggingFaceSpaceSearcherTool ---
class HuggingFaceSpaceSearcherTool(Tool):
    """smolagents Tool that searches Hugging Face Spaces for a given task."""

    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query for Hugging Face Spaces.",
        },
        "top_k": {
            "type": "integer",
            "description": "The number of top results to return (default is 3).",
            "nullable": True,
        },
    }
    output_type = "string"

    def forward(self, query: str, top_k: int = 3) -> str:
        """Return a like-sorted listing of matching Spaces, or an error string."""
        try:
            # top_k may arrive as None because the input is declared nullable.
            limit = 3 if top_k is None else top_k
            print(f"Searching spaces with query: {query}, top_k: {limit}")
            matches = list(list_spaces(search=query, full=True, limit=limit, sort="likes", direction=-1))
            if not matches:
                return "No Spaces found for your query."
            chunks = ["Found the following Spaces (sorted by likes):\n"]
            for idx, space in enumerate(matches, start=1):
                # Prefer the card's description; fall back to the title, then a stock line.
                card = getattr(space, 'cardData', None)
                if card and 'description' in card:
                    desc = card['description']
                elif getattr(space, 'title', None):
                    desc = space.title
                else:
                    desc = "No description provided."
                chunks.append(
                    f"{idx}. ID: {space.id}\n"
                    f" Description: {desc}\n"
                    f" Likes: {getattr(space, 'likes', 'N/A')}\n"
                    f" Last Modified: {getattr(space, 'lastModified', 'N/A')}\n\n"
                )
            # Emphasized MUST: steer the agent toward Tool.from_space over raw clients.
            chunks.append(
                "\nTo use one of these, you **MUST** first try creating a tool using "
                "`Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name')`. "
                "Then call that new tool: `result = custom_tool_name(argument_name=value)`. "
                "The arguments depend on the specific Space. If `Tool.from_space` fails, "
                "the Space might not have a compatible public API for this method."
            )
            return "".join(chunks)
        except Exception as e:
            print(f"Error searching Spaces: {str(e)}")
            return f"Error searching Spaces: {str(e)}"
# Register the Space-searcher tool alongside the predefined Space tools.
space_search_tool = HuggingFaceSpaceSearcherTool()
tools.append(space_search_tool)

# Initialize the LLM backing the agent.
model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")

# Create the agent.
# Fix: the original list also authorized 'Pillow', but that is the PyPI
# distribution name, not an importable module ('import Pillow' fails);
# the importable module is 'PIL', which is already authorized and is what
# the agent instructions tell the model to use.
agent = CodeAgent(
    tools=tools,
    model=model,
    additional_authorized_imports=['PIL', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
    add_base_tools=True
)
AGENT_INSTRUCTIONS = """You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.
Follow these steps:
1. **Understand the Request:** Carefully analyze the user's prompt. Identify the core task and any specific requirements or inputs.
2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
* For the 'huggingface_space_searcher' tool, call it with direct keyword arguments like: `huggingface_space_searcher(query="your search term", top_k=3)`. The `query` is mandatory. `top_k` is optional and defaults to 3 if not provided.
3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool as described above. The search results will explicitly tell you to use `Tool.from_space()`.
4. **Select and Instantiate a Space Tool (CRITICAL PRIORITY):** From the search results, choose the most promising Space. **You MUST attempt to use this Space by creating a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_and_descriptive_tool_name')`. DO NOT use `gradio_client.Client()` directly unless `Tool.from_space()` explicitly fails for that Space.**
* If `Tool.from_space()` succeeds, you now have a new tool. Call this new tool with the appropriate arguments for that Space (e.g., `newly_created_tool(prompt="some prompt")`).
* If `Tool.from_space()` fails (e.g., raises an exception), print a message saying it failed and then you may consider trying the next Space from your search results using `Tool.from_space()` again, or falling back to a predefined tool if appropriate. Only consider `gradio_client.Client()` as an absolute last resort if all other methods fail and you have a very specific understanding of the Space's raw API.
5. **Execute the Tool:** Call the tool (predefined, or dynamically created via `Tool.from_space()`) with the necessary arguments.
* **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None. Pass these file paths as arguments to tools that require them.
* **Imports in Generated Code:** If your code block for execution uses modules like `os` or `uuid`, **you must include the import statements (e.g., `import os`, `import uuid`) within that specific code block.**
6. **Output Management & Concluding a Step:**
* When your code block for a step is complete and has a result (e.g., a text string, a filepath from a tool), use the `return` statement (e.g., `return my_result_variable`).
* The system will use this returned value. You might see "ReturnException" in system logs; this is a normal part of a successful `return` and not an error you need to act upon. Based on the returned value, decide on your next action or if the task is complete.
* **If the entire user request is satisfied by the value you are returning, that `return` statement concludes your work for the current task.** You do not need to call `final_answer()` yourself; the system handles this based on your `return`.
7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.
Example of the **CORRECT AND PREFERRED** way to use a discovered Space:
```python
# User prompt: "Find a space that can make an image of a cat and use it."
#
# Step 1: Search for the space
# search_results = huggingface_space_searcher(query="text to image cat", top_k=1)
# print(search_results) # Assume 'someuser/cat-image-generator' is found.
#
# Step 2: Try to create a tool from the discovered space using Tool.from_space()
# try:
# cat_tool = Tool.from_space(repo_id="someuser/cat-image-generator", name="cat_image_generator_tool")
# # Now use the newly created tool. Arguments depend on the Space's API.
# # Let's assume it takes a 'prompt'.
# image_filepath = cat_tool(prompt="A fluffy siamese cat, cyberpunk style")
# return image_filepath # Return the filepath directly. This is the final result for this task.
# except Exception as e:
# print(f"Failed to create or use tool from Space 'someuser/cat-image-generator' using Tool.from_space(): {e}")
# # If Tool.from_space() fails, DO NOT immediately try gradio_client.Client().
# # Instead, consider another space or a predefined tool.
# # return "Could not use the discovered space via Tool.from_space(). Trying a fallback..." (then try another step)
```
Example of using a predefined tool that returns a filepath:
```python
# User prompt: "Generate an image of a happy robot."
# (Assuming 'image_generator_flux_schnell' is a predefined tool)
#
# image_filepath = image_generator_flux_schnell(prompt="A happy robot coding on a laptop, cyberpunk style")
# return image_filepath # Return the filepath string directly. This is the final result for this task.
```
Always ensure your generated Python code is complete and directly callable.
You have access to `PIL.Image` (as `Image`), `os`, `sys`, `numpy`, `huggingface_hub`, `gradio_client`, `uuid`. Remember to import them if you use them in a code block.
"""
# Gradio interface function
def gradio_interface(user_prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on the user's prompt plus any uploaded files.

    Uploaded file paths are injected into the agent's python interpreter
    globals for the duration of the run, then restored. The agent's string
    result is routed to the matching Gradio output component by file
    extension (or shown as text).

    Returns a 6-tuple of gr.update payloads for:
    (image, file, path, audio, model3d, text) outputs. On any exception,
    returns Nones plus the error message in the text output.
    """
    try:
        progress(0, desc="Initializing...")  # Step 0
        print("Progress: 0% - Initializing...")
        full_prompt_with_instructions = f"{AGENT_INSTRUCTIONS}\n\nUSER PROMPT: {user_prompt}"

        # Collect uploaded file paths to expose to the agent as globals.
        dynamic_globals_for_run = {}
        if input_image_path: dynamic_globals_for_run["input_image_path"] = str(input_image_path)
        if input_audio_path: dynamic_globals_for_run["input_audio_path"] = str(input_audio_path)
        if input_video_path: dynamic_globals_for_run["input_video_path"] = str(input_video_path)
        if input_3d_model_path: dynamic_globals_for_run["input_3d_model_path"] = str(input_3d_model_path)
        if input_file_path: dynamic_globals_for_run["input_file_path"] = str(input_file_path)

        # Locate the agent's python interpreter; the attribute/tool name
        # varies between smolagents versions.
        interpreter_tool = None
        if hasattr(agent, 'python_interpreter') and agent.python_interpreter is not None:  # Common attribute name
            interpreter_tool = agent.python_interpreter
        elif 'python_interpreter' in agent.tools and agent.tools['python_interpreter'] is not None:
            interpreter_tool = agent.tools['python_interpreter']
        elif 'python' in agent.tools and agent.tools['python'] is not None:  # Another common name for the tool
            interpreter_tool = agent.tools['python']

        # BUGFIX: track whether we actually saved+modified the globals.
        # The original code restored `interpreter_tool.globals = {}` even when
        # the isinstance(dict) check failed, clobbering the interpreter state.
        globals_modified = False
        original_interpreter_globals = {}
        if interpreter_tool and hasattr(interpreter_tool, 'globals') and isinstance(interpreter_tool.globals, dict):
            original_interpreter_globals = interpreter_tool.globals.copy()
            interpreter_tool.globals.update(dynamic_globals_for_run)
            globals_modified = True
            print(f"Updated agent.python_interpreter.globals with: {dynamic_globals_for_run}")
        else:
            print("Warning: Could not find or update python_interpreter globals on the agent.")

        progress(0.2, desc="Agent processing request...")
        result = None
        try:
            result = agent.run(full_prompt_with_instructions)
        finally:
            # Restore the agent's original python_interpreter globals only if
            # we modified them in the first place.
            if globals_modified:
                interpreter_tool.globals = original_interpreter_globals
                print("Restored agent.python_interpreter.globals.")
            else:
                print("Warning: Could not restore python_interpreter globals.")

        # Default output state: everything hidden except the text panel.
        outputs = {
            "image": gr.update(value=None, visible=False), "file": gr.update(value=None, visible=False),
            "path": gr.update(value=None, visible=False), "audio": gr.update(value=None, visible=False),
            "model3d": gr.update(value=None, visible=False), "text": gr.update(value=None, visible=True),
        }
        if isinstance(result, str):
            if os.path.isfile(result):
                file_path = result
                outputs["file"] = gr.update(value=file_path, visible=True)
                outputs["path"] = gr.update(value=file_path, visible=True)
                # Route the file to the matching media component by extension.
                ext = os.path.splitext(file_path.lower())[1]
                if ext in ('.png', '.jpg', '.jpeg', '.gif', '.webp'): outputs["image"] = gr.update(value=file_path, visible=True)
                elif ext in ('.mp3', '.wav', '.ogg', '.flac'): outputs["audio"] = gr.update(value=file_path, visible=True)
                elif ext == '.glb': outputs["model3d"] = gr.update(value=file_path, visible=True)
                else: outputs["text"] = gr.update(value=f"Output is a file: {os.path.basename(file_path)}. Download it.", visible=True)
            else: outputs["text"] = gr.update(value=result, visible=True)
        elif result is None: outputs["text"] = gr.update(value="Agent returned no result (None).", visible=True)
        else: outputs["text"] = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)

        progress(1, desc="Done!")  # Step 3: All processing finished
        print("Progress: 100% - Done!")
        return (outputs["image"], outputs["file"], outputs["path"], outputs["audio"], outputs["model3d"], outputs["text"])
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        progress(1, desc="Error occurred.")  # Ensure progress completes on error
        return (None, None, None, None, None, gr.update(value=error_msg, visible=True))
# Create the Gradio app: prompt box, optional multi-modal file inputs,
# and one output component per media type the agent can return.
# NOTE(review): "π€" / "π" in the Markdown and button labels look like
# mis-encoded emoji from the extraction — confirm the intended glyphs.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## π€ Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks...")
    with gr.Row():
        prompt_input = gr.Textbox(label="Enter your prompt", placeholder="e.g., 'Generate an image of a futuristic city'", lines=3, elem_id="user_prompt_textbox")
    # Optional uploads; their paths are passed through to gradio_interface.
    with gr.Accordion("Optional File Inputs", open=False):
        # Using gr.Group for better visual separation of input groups
        with gr.Group():
            with gr.Row():
                input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image_upload")
                input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio_upload")
        with gr.Group():
            with gr.Row():
                input_video = gr.Video(label="Video Input", sources=["upload"], elem_id="input_video_upload")
                input_model3d = gr.Model3D(label="3D Model Input", elem_id="input_model3d_upload")
        with gr.Group():
            with gr.Row():
                input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file_upload")
    submit_button = gr.Button("π Generate", variant="primary", elem_id="submit_button_generate")
    gr.Markdown("### Outputs:")
    # Output components start hidden; gradio_interface toggles visibility
    # based on the type of result the agent returns.
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True, elem_id="output_image_display")
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True, elem_id="output_audio_display")
    with gr.Row():
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, elem_id="output_model3d_display")
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=20, elem_id="output_text_log")
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False, elem_id="output_file_download")
        path_output = gr.Textbox(label="Output File Path", interactive=False, visible=False, elem_id="output_file_path_text")
    # Define the list of inputs and outputs for the click and submit events.
    # Order must match gradio_interface's parameters and return tuple.
    event_inputs = [prompt_input, input_image, input_audio, input_video, input_model3d, input_file]
    event_outputs = [image_output, file_output, path_output, audio_output, model3d_output, text_output]
    submit_button.click(
        fn=gradio_interface,
        inputs=event_inputs,
        outputs=event_outputs
    )
    # Add the submit event to the prompt_input Textbox (Enter key submits).
    prompt_input.submit(
        fn=gradio_interface,
        inputs=event_inputs,
        outputs=event_outputs
    )
    gr.Examples(
        examples=[
            ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
            ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
            ["Search for a Hugging Face Space that can perform image captioning. Describe the Caption the following image.", "Wizard Oasis.webp", None, None, None, None],
            ["I have an image of a robot. Make this image Ghibli style.", "Happy Robot Coding.webp", None, None, None, None],
            ["Generate an EDM jazz song about a futuristic city.", None, None, None, None, None],
            ["Generate audio of a dog barking.", None, None, None, None, None],
        ],
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file first or ensure the named file exists in the Space's root)"
    )

if __name__ == "__main__":
    app.launch(debug=True)
|