jkorstad commited on
Commit
cb57dca
Β·
verified Β·
1 Parent(s): c78470e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -117
app.py CHANGED
@@ -1,49 +1,55 @@
1
  import gradio as gr
2
  import os
3
  import shutil
4
- from gradio_client import Client, handle_file # handle_file might be used by the agent if it constructs client calls manually
5
- from smolagents import Tool, CodeAgent, HfApiModel, ToolCollection
 
6
  import uuid
7
- import httpx
8
  from tenacity import retry, stop_after_attempt, wait_exponential
9
- from huggingface_hub import list_spaces # For the new search tool
10
  from PIL import Image # For potential image manipulation by the agent
 
11
 
12
- # Define initial tools from Spaces (your existing list)
 
 
 
 
13
  spaces = [
14
- {"repo_id": "black-forest-labs/FLUX.1-schnell",
15
- "name": "image_generator_flux_schnell", # Renamed for clarity if multiple image generators exist
16
  "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
17
  "api_name": "/infer"},
18
- {"repo_id": "Remsky/Kokoro-TTS-Zero",
19
- "name": "text_to_speech_kokoro",
20
  "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
21
  "api_name": "/generate_speech_from_ui"},
22
- {"repo_id": "jamesliu1217/EasyControl_Ghibli",
23
- "name": "ghibli_style_image_control", # Renamed for clarity
24
  "description": "Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
25
  "api_name": "/single_condition_generate_image"},
26
- {"repo_id": "opendatalab/MinerU",
27
- "name": "pdf_text_extraction_mineru", # Renamed for clarity
28
  "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
29
  "api_name": "/to_pdf"},
30
- {"repo_id": "InstantX/InstantCharacter",
31
- "name": "instant_character_customization", # Renamed for clarity
32
- "description": "Personalize Any Characters with a Scalable Diffusion Transformer Framework to any style or pose using InstantCharacter. Expects an input image and potentially pose/style images or prompts.",
33
- "api_name": "/predict"}, # Common API name, verify for this space
34
- {"repo_id": "fotographerai/Zen-Style-Shape",
35
- "name": "img_to_img_style_transfer_zen_shape", # Renamed for clarity
36
- "description": "Flux[dev] Redux + Flux[dev] Canny. Implements a custom image-to-image style transfer pipeline blending style from Image A to structure of Image B. Expects two images.",
37
- "api_name": "/predict"}, # Common API name, verify for this space
38
- {"repo_id": "moonshotai/Kimi-VL-A3B-Thinking",
39
- "name": "multimodal_vlm_llm_kimi", # Renamed for clarity
40
- "description": "Kimi-VL-A3B-Thinking is a multi-modal LLM that can understand text and images, and generate text with thinking processes. Ask any question about an image. Expects text and optionally an image.",
41
- "api_name": "/chat"}, # Verify this api_name for Kimi spaces
42
  ]
43
 
44
  # Create tools from predefined Spaces with retry logic
45
  tools = []
46
- for space_info in spaces: # Renamed 'space' to 'space_info' to avoid conflict
47
  repo_id = space_info['repo_id']
48
  name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_')) # Default name from repo_id
49
  description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
@@ -62,16 +68,17 @@ for space_info in spaces: # Renamed 'space' to 'space_info' to avoid conflict
62
  print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
63
 
64
  # Load tools from a Hugging Face Collection
65
- collection_slug = "jkorstad/tools-680127d17eed47e759549ff4"
66
  try:
67
- collection = ToolCollection.from_hub(collection_slug=collection_slug)
 
68
  tools.extend(collection.tools)
69
  print(f"Successfully loaded tools from collection: {collection_slug}")
70
  except Exception as e:
71
  print(f"Warning: Failed to load collection {collection_slug}. Error: {str(e)}")
72
 
73
 
74
- # NEW: Tool for searching Hugging Face Spaces
75
  def search_hf_spaces(query: str, top_k: int = 3) -> str:
76
  """
77
  Searches Hugging Face Spaces for a given query and returns the top_k results.
@@ -82,23 +89,26 @@ def search_hf_spaces(query: str, top_k: int = 3) -> str:
82
  """
83
  try:
84
  print(f"Searching spaces with query: {query}, top_k: {top_k}")
 
 
85
  spaces_found = list(list_spaces(search=query, full=True, limit=top_k, sort="likes", direction=-1))
86
  if not spaces_found:
87
  return "No Spaces found for your query."
88
-
89
  results = "Found the following Spaces (sorted by likes):\n"
90
  for i, space_data in enumerate(spaces_found):
91
- description = "No description."
92
- if space_data.cardData and 'description' in space_data.cardData:
 
93
  description = space_data.cardData['description']
94
- elif space_data.title: # Fallback to title if description missing
95
  description = space_data.title
96
 
97
  results += (
98
  f"{i+1}. ID: {space_data.id}\n"
99
  f" Description: {description}\n"
100
- f" Likes: {space_data.likes}\n"
101
- f" Last Modified: {space_data.lastModified}\n\n"
102
  )
103
  results += ("\nTo use one of these, you can try creating a tool in the code like this: "
104
  "my_new_tool = Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name'). "
@@ -108,85 +118,97 @@ def search_hf_spaces(query: str, top_k: int = 3) -> str:
108
  return results
109
  except Exception as e:
110
  print(f"Error searching Spaces: {str(e)}")
 
111
  return f"Error searching Spaces: {str(e)}"
112
 
113
  space_search_tool = Tool(
114
  name="huggingface_space_searcher",
115
  description="Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them.",
116
  func=search_hf_spaces,
117
- # args_schema can be defined if you want Pydantic validation for args, e.g., using a class Query(BaseModel): query: str; top_k: int = 3
118
  )
119
  tools.append(space_search_tool)
120
 
121
 
122
- # Initialize the model
123
- model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct") # Or your preferred model
124
 
125
- # Create the agent with extended imports and a more detailed system prompt
126
  agent = CodeAgent(
127
  tools=tools,
128
  model=model,
129
  additional_authorized_imports=['PIL', 'Pillow', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
130
  add_base_tools=True, # Includes web search, python interpreter
131
- system_prompt="""You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.
132
-
133
- Follow these steps:
134
- 1. **Understand the Request:** Carefully analyze the user's prompt. Identify the core task and any specific requirements or inputs.
135
- 2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
136
- 3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool. Provide a concise search query related to the task (e.g., "image classification", "voice cloning", "document question answering").
137
- 4. **Select and Instantiate a Space Tool:** From the search results, choose the most promising Space. Attempt to create a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_tool_name')`. You might need to give it a unique name. If `Tool.from_space` fails, the Space might not be compatible, or you could try another one from the search results. Note that some Spaces might not have a public API or may require a specific `api_name` that `Tool.from_space` cannot infer; in such cases, you might not be able to use them.
138
- 5. **Execute the Tool:** Call the tool (either predefined or dynamically created) with the necessary arguments.
139
- * **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None (e.g., `if 'input_image_path' in globals() and input_image_path:`). Pass these file paths as arguments to tools that require them. `Tool.from_space` handles file uploads for compatible Spaces when you pass the filepath string.
140
- * **Chaining Tools:** If the task requires multiple steps, chain the tools together, passing the output of one tool as the input to the next.
141
- 6. **Output Management:**
142
- * If a tool generates a file (image, audio, etc.), save it to the current working directory using a unique filename (e.g., `output_filename = os.path.join(os.getcwd(), f"{uuid.uuid4()}.png")`).
143
- * **Return the RESULT:** Your final response should be either:
144
- * A string containing the direct text answer.
145
- * The string path to the generated output file (e.g., `return output_filename`).
146
- 7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.
147
-
148
- Example of dynamically using a Space after searching:
149
- ```python
150
- # user_prompt = "Find a space that can make an image of a cat and then use it."
151
- # First, I would use huggingface_space_searcher to find relevant spaces.
152
- # search_results = huggingface_space_searcher(query="text to image cat")
153
- # print(search_results) # This would show me some options. Let's say 'user/cat-generator' is found.
154
- # try:
155
- # cat_image_tool = Tool.from_space(repo_id="user/cat-generator", name="cat_generator_tool")
156
- # # The arguments for cat_image_tool depend on the Space. I'll assume it takes a 'prompt' argument.
157
- # image_path = cat_image_tool(prompt="A fluffy siamese cat")
158
- # # image_path should be a path to the generated image file
159
- # return image_path
160
- # except Exception as e:
161
- # return f"Failed to use the cat generator Space: {e}"
162
- ```
163
- Always ensure your generated Python code is complete and directly callable. Use `print()` for debugging if necessary, but the final returned value should be the result or file path.
164
- You have access to `os`, `uuid`, `PIL.Image`.
165
- """
166
  )
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # Gradio interface function
169
- def gradio_interface(prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
170
  try:
171
  progress(0, desc="Initializing Agent...")
 
 
 
 
172
  # Prepare a dictionary of potential inputs for the agent's execution scope
173
- agent_context_inputs = {"prompt": prompt}
174
  # These will be available as global variables in the agent's Python execution environment
 
 
175
  if input_image_path:
176
- agent_context_inputs["input_image_path"] = str(input_image_path) # Ensure it's a string path
177
  if input_audio_path:
178
- agent_context_inputs["input_audio_path"] = str(input_audio_path)
179
  if input_video_path:
180
- agent_context_inputs["input_video_path"] = str(input_video_path)
181
  if input_3d_model_path:
182
- agent_context_inputs["input_3d_model_path"] = str(input_3d_model_path) # Path to .glb or similar
183
  if input_file_path:
184
- agent_context_inputs["input_file_path"] = str(input_file_path) # Path to PDF, TXT etc.
185
-
186
- # The agent will use these global variables based on the system prompt's guidance
187
- # The `prompt` variable is the main user query.
188
  progress(0.2, desc="Agent processing request...")
189
- result = agent.run(**agent_context_inputs) # Pass main prompt and other inputs to be set in global scope
 
 
190
 
191
  progress(0.8, desc="Processing result...")
192
  # Default all outputs to invisible and None
@@ -196,7 +218,7 @@ def gradio_interface(prompt, input_image_path, input_audio_path, input_video_pat
196
  "path": gr.update(value=None, visible=False),
197
  "audio": gr.update(value=None, visible=False),
198
  "model3d": gr.update(value=None, visible=False),
199
- "text": gr.update(value=None, visible=False),
200
  }
201
 
202
  if isinstance(result, str):
@@ -204,23 +226,23 @@ def gradio_interface(prompt, input_image_path, input_audio_path, input_video_pat
204
  file_path = result
205
  outputs["file"] = gr.update(value=file_path, visible=True)
206
  outputs["path"] = gr.update(value=file_path, visible=True)
207
- ext = file_path.lower().split('.')[-1]
208
- if ext in ('png', 'jpg', 'jpeg', 'gif', 'webp'):
209
  outputs["image"] = gr.update(value=file_path, visible=True)
210
- elif ext in ('mp3', 'wav', 'ogg', 'flac'):
211
  outputs["audio"] = gr.update(value=file_path, visible=True)
212
- elif ext == 'glb': # Common format for Model3D
213
  outputs["model3d"] = gr.update(value=file_path, visible=True)
214
  else: # Other file types like PDF, TXT - user can download via file component
215
- outputs["text"] = gr.update(value=f"Output is a file (e.g., PDF, TXT): {os.path.basename(file_path)}. Download it above.", visible=True)
216
  else:
217
- # Result is a string (e.g., text output from a tool)
218
  outputs["text"] = gr.update(value=result, visible=True)
219
  elif result is None:
220
- outputs["text"] = gr.update(value="Agent returned no result (None). Check logs if available.", visible=True)
221
  else: # Other types (e.g. if agent returns a dict or list by mistake)
222
- outputs["text"] = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)
223
-
224
  progress(1, desc="Done!")
225
  return (
226
  outputs["image"], outputs["file"], outputs["path"],
@@ -230,11 +252,11 @@ def gradio_interface(prompt, input_image_path, input_audio_path, input_video_pat
230
  except Exception as e:
231
  error_msg = f"An error occurred in the Gradio interface or agent execution: {str(e)}"
232
  print(error_msg) # Also print to console for server-side logs
233
- # traceback.print_exc() # For more detailed debugging
234
  return (
235
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
236
- gr.update(visible=False), gr.update(visible=False),
237
- gr.update(value=error_msg, visible=True)
238
  )
239
 
240
  # Create the Gradio app
@@ -246,31 +268,32 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
246
  prompt_input = gr.Textbox(
247
  label="Enter your prompt for the agent",
248
  placeholder="e.g., 'Generate an image of a futuristic city', 'Convert this text to speech: Hello world', or 'Search for a space that translates English to French and use it for: Good morning'",
249
- lines=3
 
250
  )
251
-
252
  with gr.Accordion("Optional File Inputs (for tasks requiring them)", open=False):
253
  with gr.Row():
254
- input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image")
255
- input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio")
256
  with gr.Row():
257
- input_video = gr.Video(label="Video Input", type="filepath", sources=["upload"], elem_id="input_video")
258
- input_model3d = gr.Model3D(label="3D Model Input (.glb)", type="filepath", elem_id="input_model3d") # Gradio Model3D component expects .glb usually
259
  with gr.Row():
260
- input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file")
261
 
262
- submit_button = gr.Button("πŸš€ Generate", variant="primary")
263
 
264
  gr.Markdown("### Outputs:")
265
  with gr.Row():
266
- image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True)
267
- audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True)
268
  with gr.Row():
269
- model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, show_download_button=True)
270
- text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=10) # Start visible for logs/text
271
  with gr.Row():
272
- file_output = gr.File(label="Download File Output", interactive=False, visible=False)
273
- path_output = gr.Textbox(label="Output File Path (Copyable)", interactive=True, visible=False) # Keep for copying if needed
274
 
275
  # Link button click to the interface function
276
  submit_button.click(
@@ -278,18 +301,22 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
278
  inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
279
  outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
280
  )
281
-
282
  gr.Examples(
283
  examples=[
284
  ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
285
  ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
286
  ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
287
- ["I have an image of a cat (you'll need to upload one). Find a space that can make it look like a painting and apply it.", "path/to/your/cat_image.png", None, None, None, None], # User would replace path or upload
 
 
288
  ],
289
  inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
290
- label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file)"
291
  )
292
 
293
  # Launch the app
294
  if __name__ == "__main__":
295
- app.launch(debug=True) # Enable debug for more detailed logs during development
 
 
 
1
  import gradio as gr
2
  import os
3
  import shutil
4
+ from gradio_client import Client, handle_file # handle_file might be used by the agent
5
+ # Use InferenceClientModel instead of HfApiModel
6
+ from smolagents import Tool, CodeAgent, InferenceClientModel, ToolCollection
7
  import uuid
8
+ import httpx # Often a dependency for HTTP clients, good to have
9
  from tenacity import retry, stop_after_attempt, wait_exponential
10
+ from huggingface_hub import list_spaces
11
  from PIL import Image # For potential image manipulation by the agent
12
+ import traceback # For more detailed error logging if needed
13
 
14
+ # Define initial tools from Spaces
15
+ # Commenting out problematic spaces for now.
16
+ # You'll need to verify their api_name or compatibility if you re-enable them.
17
+ # Ensure the api_name is correct if you uncomment these.
18
+ # Visit the HF Space page and look for "API - via gradio_client" for hints.
19
  spaces = [
20
+ {"repo_id": "black-forest-labs/FLUX.1-schnell",
21
+ "name": "image_generator_flux_schnell",
22
  "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
23
  "api_name": "/infer"},
24
+ {"repo_id": "Remsky/Kokoro-TTS-Zero",
25
+ "name": "text_to_speech_kokoro",
26
  "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
27
  "api_name": "/generate_speech_from_ui"},
28
+ {"repo_id": "jamesliu1217/EasyControl_Ghibli",
29
+ "name": "ghibli_style_image_control",
30
  "description": "Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
31
  "api_name": "/single_condition_generate_image"},
32
+ {"repo_id": "opendatalab/MinerU",
33
+ "name": "pdf_text_extraction_mineru",
34
  "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
35
  "api_name": "/to_pdf"},
36
+ # {"repo_id": "InstantX/InstantCharacter",
37
+ # "name": "instant_character_customization",
38
+ # "description": "Personalize Any Characters with a Scalable Diffusion Transformer Framework to any style or pose using InstantCharacter. Expects an input image and potentially pose/style images or prompts.",
39
+ # "api_name": "/predict"}, # Example: Verify this api_name if re-enabling
40
+ # {"repo_id": "fotographerai/Zen-Style-Shape",
41
+ # "name": "img_to_img_style_transfer_zen_shape",
42
+ # "description": "Flux[dev] Redux + Flux[dev] Canny. Implements a custom image-to-image style transfer pipeline blending style from Image A to structure of Image B. Expects two images.",
43
+ # "api_name": "/predict"}, # Example: Verify this api_name if re-enabling
44
+ # {"repo_id": "moonshotai/Kimi-VL-A3B-Thinking",
45
+ # "name": "multimodal_vlm_llm_kimi",
46
+ # "description": "Kimi-VL-A3B-Thinking is a multi-modal LLM that can understand text and images, and generate text with thinking processes. Ask any question about an image. Expects text and optionally an image.",
47
+ # "api_name": "/chat"}, # Example: Verify this api_name if re-enabling
48
  ]
49
 
50
  # Create tools from predefined Spaces with retry logic
51
  tools = []
52
+ for space_info in spaces:
53
  repo_id = space_info['repo_id']
54
  name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_')) # Default name from repo_id
55
  description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
 
68
  print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
69
 
70
  # Load tools from a Hugging Face Collection
71
+ collection_slug = "jkorstad/tools-680127d17eed47e759549ff4"
72
  try:
73
+ # Added trust_remote_code=True
74
+ collection = ToolCollection.from_hub(collection_slug=collection_slug, trust_remote_code=True)
75
  tools.extend(collection.tools)
76
  print(f"Successfully loaded tools from collection: {collection_slug}")
77
  except Exception as e:
78
  print(f"Warning: Failed to load collection {collection_slug}. Error: {str(e)}")
79
 
80
 
81
+ # Tool for searching Hugging Face Spaces
82
  def search_hf_spaces(query: str, top_k: int = 3) -> str:
83
  """
84
  Searches Hugging Face Spaces for a given query and returns the top_k results.
 
89
  """
90
  try:
91
  print(f"Searching spaces with query: {query}, top_k: {top_k}")
92
+ # Using list_spaces, ensure it's imported: from huggingface_hub import list_spaces
93
+ # full=True gives more metadata, sort by likes, direction=-1 for descending
94
  spaces_found = list(list_spaces(search=query, full=True, limit=top_k, sort="likes", direction=-1))
95
  if not spaces_found:
96
  return "No Spaces found for your query."
97
+
98
  results = "Found the following Spaces (sorted by likes):\n"
99
  for i, space_data in enumerate(spaces_found):
100
+ # Safely access attributes, as they might not always be present
101
+ description = "No description provided."
102
+ if hasattr(space_data, 'cardData') and space_data.cardData and 'description' in space_data.cardData:
103
  description = space_data.cardData['description']
104
+ elif hasattr(space_data, 'title') and space_data.title: # Fallback to title
105
  description = space_data.title
106
 
107
  results += (
108
  f"{i+1}. ID: {space_data.id}\n"
109
  f" Description: {description}\n"
110
+ f" Likes: {space_data.likes if hasattr(space_data, 'likes') else 'N/A'}\n"
111
+ f" Last Modified: {space_data.lastModified if hasattr(space_data, 'lastModified') else 'N/A'}\n\n"
112
  )
113
  results += ("\nTo use one of these, you can try creating a tool in the code like this: "
114
  "my_new_tool = Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name'). "
 
118
  return results
119
  except Exception as e:
120
  print(f"Error searching Spaces: {str(e)}")
121
+ # traceback.print_exc() # Uncomment for detailed search error debugging
122
  return f"Error searching Spaces: {str(e)}"
123
 
124
  space_search_tool = Tool(
125
  name="huggingface_space_searcher",
126
  description="Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them.",
127
  func=search_hf_spaces,
 
128
  )
129
  tools.append(space_search_tool)
130
 
131
 
132
+ # Initialize the model - Use InferenceClientModel
133
+ model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct") # Or your preferred model
134
 
135
+ # Create the agent - Removed system_prompt from constructor
136
  agent = CodeAgent(
137
  tools=tools,
138
  model=model,
139
  additional_authorized_imports=['PIL', 'Pillow', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
140
  add_base_tools=True, # Includes web search, python interpreter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  )
142
 
143
+ # This is the detailed instruction set that was previously in system_prompt
144
+ AGENT_INSTRUCTIONS = """You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.
145
+
146
+ Follow these steps:
147
+ 1. **Understand the Request:** Carefully analyze the user's prompt (which will follow these instructions). Identify the core task and any specific requirements or inputs.
148
+ 2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
149
+ 3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool. Provide a concise search query related to the task (e.g., "image classification", "voice cloning", "document question answering").
150
+ 4. **Select and Instantiate a Space Tool:** From the search results, choose the most promising Space. Attempt to create a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_tool_name')`. You might need to give it a unique name. If `Tool.from_space` fails, the Space might not be compatible, or you could try another one from the search results. Note that some Spaces might not have a public API or may require a specific `api_name` that `Tool.from_space` cannot infer; in such cases, you might not be able to use them.
151
+ 5. **Execute the Tool:** Call the tool (either predefined or dynamically created) with the necessary arguments.
152
+ * **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None (e.g., `if 'input_image_path' in globals() and input_image_path:`). Pass these file paths as arguments to tools that require them. `Tool.from_space` handles file uploads for compatible Spaces when you pass the filepath string.
153
+ * **Chaining Tools:** If the task requires multiple steps, chain the tools together, passing the output of one tool as the input to the next.
154
+ 6. **Output Management:**
155
+ * If a tool generates a file (image, audio, etc.), save it to the current working directory using a unique filename (e.g., `output_filename = os.path.join(os.getcwd(), f"{uuid.uuid4()}.png")`).
156
+ * **Return the RESULT:** Your final response should be either:
157
+ * A string containing the direct text answer.
158
+ * The string path to the generated output file (e.g., `return output_filename`).
159
+ 7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.
160
+
161
+ Example of dynamically using a Space after searching:
162
+ ```python
163
+ # This is an example of how I, the agent, would think and act.
164
+ # User's actual prompt would follow these instructions.
165
+ # Example user prompt: "Find a space that can make an image of a cat and then use it."
166
+ #
167
+ # My thought process:
168
+ # 1. The user wants an image of a cat, and wants me to find a Space for it.
169
+ # 2. I'll use `huggingface_space_searcher`.
170
+ # search_results = huggingface_space_searcher(query="text to image cat")
171
+ # print(search_results) # This would show me some options. Let's say 'user/cat-generator' is found.
172
+ # try:
173
+ # cat_image_tool = Tool.from_space(repo_id="user/cat-generator", name="cat_generator_tool")
174
+ # # The arguments for cat_image_tool depend on the Space. I'll assume it takes a 'prompt' argument.
175
+ # image_path = cat_image_tool(prompt="A fluffy siamese cat")
176
+ # # image_path should be a path to the generated image file
177
+ # return image_path
178
+ # except Exception as e:
179
+ # return f"Failed to use the cat generator Space: {e}"
180
+ ```
181
+ Always ensure your generated Python code is complete and directly callable. Use `print()` for debugging if necessary, but the final returned value should be the result or file path.
182
+ You have access to `os`, `uuid`, `PIL.Image`.
183
+ """
184
+
185
  # Gradio interface function
186
+ def gradio_interface(user_prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
187
  try:
188
  progress(0, desc="Initializing Agent...")
189
+
190
+ # Combine instructions with the user's prompt
191
+ full_prompt_with_instructions = f"{AGENT_INSTRUCTIONS}\n\nUSER PROMPT: {user_prompt}"
192
+
193
  # Prepare a dictionary of potential inputs for the agent's execution scope
 
194
  # These will be available as global variables in the agent's Python execution environment
195
+ # when agent.run is called with keyword arguments.
196
+ agent_kwargs = {}
197
  if input_image_path:
198
+ agent_kwargs["input_image_path"] = str(input_image_path) # Ensure it's a string path
199
  if input_audio_path:
200
+ agent_kwargs["input_audio_path"] = str(input_audio_path)
201
  if input_video_path:
202
+ agent_kwargs["input_video_path"] = str(input_video_path)
203
  if input_3d_model_path:
204
+ agent_kwargs["input_3d_model_path"] = str(input_3d_model_path)
205
  if input_file_path:
206
+ agent_kwargs["input_file_path"] = str(input_file_path)
207
+
 
 
208
  progress(0.2, desc="Agent processing request...")
209
+ # The first argument to agent.run is the main prompt.
210
+ # Other kwargs are set as global variables in the agent's execution context.
211
+ result = agent.run(full_prompt_with_instructions, **agent_kwargs)
212
 
213
  progress(0.8, desc="Processing result...")
214
  # Default all outputs to invisible and None
 
218
  "path": gr.update(value=None, visible=False),
219
  "audio": gr.update(value=None, visible=False),
220
  "model3d": gr.update(value=None, visible=False),
221
+ "text": gr.update(value=None, visible=True), # Text output is often default
222
  }
223
 
224
  if isinstance(result, str):
 
226
  file_path = result
227
  outputs["file"] = gr.update(value=file_path, visible=True)
228
  outputs["path"] = gr.update(value=file_path, visible=True)
229
+ ext = os.path.splitext(file_path.lower())[1] # Get extension like .png
230
+ if ext in ('.png', '.jpg', '.jpeg', '.gif', '.webp'):
231
  outputs["image"] = gr.update(value=file_path, visible=True)
232
+ elif ext in ('.mp3', '.wav', '.ogg', '.flac'):
233
  outputs["audio"] = gr.update(value=file_path, visible=True)
234
+ elif ext == '.glb': # Common format for Model3D
235
  outputs["model3d"] = gr.update(value=file_path, visible=True)
236
  else: # Other file types like PDF, TXT - user can download via file component
237
+ outputs["text"] = gr.update(value=f"Output is a file: {os.path.basename(file_path)}. Download it using the 'Download File Output' component.", visible=True)
238
  else:
239
+ # Result is a string (e.g., text output from a tool or an error message from the agent)
240
  outputs["text"] = gr.update(value=result, visible=True)
241
  elif result is None:
242
+ outputs["text"] = gr.update(value="Agent returned no result (None). This might indicate an issue or that the task didn't produce a specific output string/file.", visible=True)
243
  else: # Other types (e.g. if agent returns a dict or list by mistake)
244
+ outputs["text"] = gr.update(value=f"Unexpected result type from agent: {type(result)}. Content: {str(result)}", visible=True)
245
+
246
  progress(1, desc="Done!")
247
  return (
248
  outputs["image"], outputs["file"], outputs["path"],
 
252
  except Exception as e:
253
  error_msg = f"An error occurred in the Gradio interface or agent execution: {str(e)}"
254
  print(error_msg) # Also print to console for server-side logs
255
+ traceback.print_exc() # For more detailed debugging
256
  return (
257
+ gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False),
258
+ gr.update(value=None, visible=False), gr.update(value=None, visible=False),
259
+ gr.update(value=error_msg, visible=True) # Show error in the text output
260
  )
261
 
262
  # Create the Gradio app
 
268
  prompt_input = gr.Textbox(
269
  label="Enter your prompt for the agent",
270
  placeholder="e.g., 'Generate an image of a futuristic city', 'Convert this text to speech: Hello world', or 'Search for a space that translates English to French and use it for: Good morning'",
271
+ lines=3,
272
+ elem_id="user_prompt_textbox"
273
  )
274
+
275
  with gr.Accordion("Optional File Inputs (for tasks requiring them)", open=False):
276
  with gr.Row():
277
+ input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image_upload")
278
+ input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio_upload")
279
  with gr.Row():
280
+ input_video = gr.Video(label="Video Input", type="filepath", sources=["upload"], elem_id="input_video_upload") # Gradio Video component might have limitations
281
+ input_model3d = gr.Model3D(label="3D Model Input (.glb, .obj, etc.)", type="filepath", elem_id="input_model3d_upload") # Check Gradio docs for supported Model3D types
282
  with gr.Row():
283
+ input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file_upload")
284
 
285
+ submit_button = gr.Button("πŸš€ Generate", variant="primary", elem_id="submit_button_generate")
286
 
287
  gr.Markdown("### Outputs:")
288
  with gr.Row():
289
+ image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True, elem_id="output_image_display")
290
+ audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True, elem_id="output_audio_display")
291
  with gr.Row():
292
+ model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, show_download_button=True, elem_id="output_model3d_display")
293
+ text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=20, elem_id="output_text_log") # Start visible for logs/text
294
  with gr.Row():
295
+ file_output = gr.File(label="Download File Output", interactive=False, visible=False, elem_id="output_file_download")
296
+ path_output = gr.Textbox(label="Output File Path (Copyable)", interactive=False, visible=False, elem_id="output_file_path_text") # Keep for copying if needed
297
 
298
  # Link button click to the interface function
299
  submit_button.click(
 
301
  inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
302
  outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
303
  )
304
+
305
  gr.Examples(
306
  examples=[
307
  ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
308
  ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
309
  ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
310
+ # For examples with file inputs, the user needs to upload a file manually.
311
+ # The string path here is just a placeholder for the example text.
312
+ ["I have an image of a cat. Find a space that can make it look like a painting and apply it. You will need to use the 'input_image_path' variable which will contain the path to the uploaded cat image.", "path/to/your/cat_image.png", None, None, None, None],
313
  ],
314
  inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
315
+ label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file first using the 'Optional File Inputs' section)"
316
  )
317
 
318
  # Launch the app
319
if __name__ == "__main__":
    # Entry point when run as a script: start the Gradio server.
    # debug=True turns on verbose Gradio logging; add share=True here to
    # expose a temporary public URL when testing from another device.
    app.launch(debug=True)