jkorstad commited on
Commit
675bab3
·
verified ·
1 Parent(s): 1ed6758

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -57
app.py CHANGED
@@ -1,86 +1,295 @@
1
  import gradio as gr
2
  import os
3
  import shutil
4
- from gradio_client import Client, handle_file
5
- from smolagents import Tool, CodeAgent, HfApiModel
 
 
 
 
 
6
 
7
- # import spaces - if using ZeroGPU
8
-
9
- # Define tools from Spaces
10
  spaces = [
11
  {"repo_id": "black-forest-labs/FLUX.1-schnell",
12
- "name": "image_generator",
13
- "description": "Generate an image from a prompt"},
14
-
 
 
 
 
15
  {"repo_id": "jamesliu1217/EasyControl_Ghibli",
16
- "name": "Ghibli_style_Image_control",
17
- "description": "Create Ghibli style image"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ]
19
 
 
20
  tools = []
21
- for space in spaces:
22
- # Access repo_id, name, and description
23
- repo_id = space['repo_id']
24
- name = space.get('name', repo_id) # Use repo_id as name if not specified
25
- description = space.get('description', '') # Use empty string if not specified
 
 
 
 
 
26
 
27
- # Create Tool instance
28
- tool = Tool.from_space(repo_id, name=name, description=description)
29
- tools.append(tool)
 
 
 
30
 
31
- # Define a custom tool
32
- class CustomTool(Tool):
33
- name = "custom_tool"
34
- description = "A custom tool that processes input text"
35
- inputs = {"input": {"type": "string", "description": "Some input text to process"}}
36
- output_type = "string"
37
- def forward(self, input: str):
38
- return f"Processed: {input}"
39
 
40
- tools.append(CustomTool())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  # Initialize the model
44
- model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
45
 
46
- # Create the agent
47
- agent = CodeAgent(tools=tools, model=model)
 
 
 
 
 
48
 
49
- # Function to run the agent and return the image path
50
- def generate_and_transform(prompt):
51
- result = agent.run(prompt)
52
-
53
- if isinstance(result, str): # Assuming result is a file path
54
- # Copy the temporary file to a permanent location
55
- permanent_path = "ghibli_output.webp"
56
- shutil.copy(result, permanent_path)
57
- return permanent_path
58
- else:
59
- raise ValueError("Unexpected result type from agent")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Gradio interface function
62
- def gradio_interface(prompt):
63
  try:
64
- image_path = generate_and_transform(prompt)
65
- return image_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  except Exception as e:
67
- return str(e)
 
 
 
 
 
 
 
68
 
69
  # Create the Gradio app
70
- with gr.Blocks() as app:
71
- gr.Markdown("### Smolagent Image Generator with Ghibli Style")
 
 
72
  with gr.Row():
73
- prompt_input = gr.Textbox(label="Enter your prompt", placeholder="e.g., Generate an image of a dog and then make an 'xyz' style version of that image")
74
- submit_button = gr.Button("Generate")
75
- output_image = gr.Image(label="Generated Image")
76
- download_button = gr.File(label="Download Image")
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Connect the button to the function
79
- def on_submit(prompt):
80
- image_path = gradio_interface(prompt)
81
- return image_path, image_path
82
 
83
- submit_button.click(on_submit, inputs=prompt_input, outputs=[output_image, download_button])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  # Launch the app
86
- app.launch()
 
 
1
  import gradio as gr
2
  import os
3
  import shutil
4
+ from gradio_client import Client, handle_file # handle_file might be used by the agent if it constructs client calls manually
5
+ from smolagents import Tool, CodeAgent, HfApiModel, ToolCollection
6
+ import uuid
7
+ import httpx
8
+ from tenacity import retry, stop_after_attempt, wait_exponential
9
+ from huggingface_hub import list_spaces # For the new search tool
10
+ from PIL import Image # For potential image manipulation by the agent
11
 
12
# Registry of predefined Hugging Face Spaces the agent can call as tools.
# Each entry carries the Space repo id, a tool name, an agent-facing
# description, and the Space's API endpoint (api_name should be verified
# against each Space's exposed API).
spaces = [
    {
        "repo_id": "black-forest-labs/FLUX.1-schnell",
        "name": "image_generator_flux_schnell",
        "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
        "api_name": "/infer",
    },
    {
        "repo_id": "Remsky/Kokoro-TTS-Zero",
        "name": "text_to_speech_kokoro",
        "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
        "api_name": "/generate_speech_from_ui",
    },
    {
        "repo_id": "jamesliu1217/EasyControl_Ghibli",
        "name": "ghibli_style_image_control",
        "description": "Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
        "api_name": "/single_condition_generate_image",
    },
    {
        "repo_id": "opendatalab/MinerU",
        "name": "pdf_text_extraction_mineru",
        "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
        "api_name": "/to_pdf",
    },
    {
        "repo_id": "InstantX/InstantCharacter",
        "name": "instant_character_customization",
        "description": "Personalize Any Characters with a Scalable Diffusion Transformer Framework to any style or pose using InstantCharacter. Expects an input image and potentially pose/style images or prompts.",
        "api_name": "/predict",  # Common API name; verify for this Space
    },
    {
        "repo_id": "fotographerai/Zen-Style-Shape",
        "name": "img_to_img_style_transfer_zen_shape",
        "description": "Flux[dev] Redux + Flux[dev] Canny. Implements a custom image-to-image style transfer pipeline blending style from Image A to structure of Image B. Expects two images.",
        "api_name": "/predict",  # Common API name; verify for this Space
    },
    {
        "repo_id": "moonshotai/Kimi-VL-A3B-Thinking",
        "name": "multimodal_vlm_llm_kimi",
        "description": "Kimi-VL-A3B-Thinking is a multi-modal LLM that can understand text and images, and generate text with thinking processes. Ask any question about an image. Expects text and optionally an image.",
        "api_name": "/chat",  # Verify this api_name for Kimi spaces
    },
]
43
 
44
# Create tools from the predefined Spaces, with retry logic for transient
# Hub/network failures.
tools = []

# FIX: this helper used to be defined (and re-decorated with @retry) inside
# the loop body on every iteration; it is loop-invariant, so hoist it out.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_tool_with_retry(repo_id, name, description, api_name):
    """Wrap a Hugging Face Space as a smolagents Tool, retrying up to 3 times.

    If api_name is None, Tool.from_space will try to find a public API endpoint.
    """
    return Tool.from_space(repo_id, name=name, description=description, api_name=api_name)

for space_info in spaces:
    repo_id = space_info['repo_id']
    # Default tool name: snake_case derived from the repo id's final segment.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')  # May be None; Tool.from_space will infer.

    try:
        tool = create_tool_with_retry(repo_id, name, description, api_name)
        tools.append(tool)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")
    except Exception as e:
        # Best-effort: one unreachable Space must not prevent app startup.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
63
 
64
# Pull in any extra tools published in a Hugging Face Collection (best-effort:
# the app still runs with only the predefined tools if this fails).
collection_slug = "jkorstad/tools-680127d17eed47e759549ff4"
try:
    hub_collection = ToolCollection.from_hub(collection_slug=collection_slug)
    tools.extend(hub_collection.tools)
    print(f"Successfully loaded tools from collection: {collection_slug}")
except Exception as e:
    print(f"Warning: Failed to load collection {collection_slug}. Error: {str(e)}")
72
 
73
+
74
# Helper backing the Space-discovery tool: searches the Hub for Spaces.
def search_hf_spaces(query: str, top_k: int = 3) -> str:
    """Search Hugging Face Spaces and describe the top_k most-liked matches.

    Returns a human/agent-readable report listing each Space's repo id,
    description, like count and last-modified date, followed by instructions
    for wrapping a found Space via Tool.from_space. Errors are printed and
    returned as a string rather than raised, so the agent can react to them.
    """
    try:
        print(f"Searching spaces with query: {query}, top_k: {top_k}")
        hits = list(list_spaces(search=query, full=True, limit=top_k, sort="likes", direction=-1))
        if not hits:
            return "No Spaces found for your query."

        parts = ["Found the following Spaces (sorted by likes):\n"]
        for rank, info in enumerate(hits, start=1):
            # Prefer the model card's description; fall back to the title,
            # then to a stub.
            if info.cardData and 'description' in info.cardData:
                blurb = info.cardData['description']
            elif info.title:  # NOTE(review): assumes SpaceInfo exposes .title — confirm against installed huggingface_hub
                blurb = info.title
            else:
                blurb = "No description."

            parts.append(
                f"{rank}. ID: {info.id}\n"
                f"   Description: {blurb}\n"
                f"   Likes: {info.likes}\n"
                f"   Last Modified: {info.lastModified}\n\n"
            )

        parts.append(
            "\nTo use one of these, you can try creating a tool in the code like this: "
            "my_new_tool = Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name'). "
            "Then you can call it: result = my_new_tool(argument_name=value). "
            "The arguments depend on the specific Space. If Tool.from_space fails or the tool doesn't work, "
            "the Space might not have a compatible public API or may require a specific api_name."
        )
        return "".join(parts)
    except Exception as e:
        print(f"Error searching Spaces: {str(e)}")
        return f"Error searching Spaces: {str(e)}"
112
+
113
# BUG FIX: smolagents' `Tool` base class cannot be instantiated directly with
# a `func=` keyword (that constructor shape belongs to other agent
# frameworks). A concrete tool must declare name/description/inputs/
# output_type and implement forward() — the previous
# `Tool(name=..., func=search_hf_spaces)` call failed at startup, so the
# searcher was never registered.
class SpaceSearchTool(Tool):
    # Attributes consumed by the smolagents tool-calling machinery.
    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."
    inputs = {
        "query": {"type": "string", "description": "Search query describing the task, e.g. 'text to image'."},
        "top_k": {"type": "integer", "description": "How many Spaces to return (default 3).", "nullable": True},
    }
    output_type = "string"

    def forward(self, query: str, top_k: int = 3) -> str:
        # Delegate to the module-level search helper.
        return search_hf_spaces(query, top_k)

space_search_tool = SpaceSearchTool()
tools.append(space_search_tool)
120
 
121
 
122
# Initialize the model
model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")  # Or your preferred model

# Create the agent with extended imports and a detailed system prompt.
# NOTE(review): the `system_prompt` kwarg is only accepted by some smolagents
# releases (newer versions use `prompt_templates`) — confirm against the
# pinned smolagents version.
agent = CodeAgent(
    tools=tools,
    model=model,
    # Modules the agent's generated code is allowed to import.
    # FIX: dropped the bogus 'Pillow' entry — the importable module name is
    # 'PIL' (already listed); `import Pillow` can never succeed.
    additional_authorized_imports=['PIL', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
    add_base_tools=True,  # Includes web search, python interpreter
    system_prompt="""You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.

Follow these steps:
1. **Understand the Request:** Carefully analyze the user's prompt. Identify the core task and any specific requirements or inputs.
2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool. Provide a concise search query related to the task (e.g., "image classification", "voice cloning", "document question answering").
4. **Select and Instantiate a Space Tool:** From the search results, choose the most promising Space. Attempt to create a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_tool_name')`. You might need to give it a unique name. If `Tool.from_space` fails, the Space might not be compatible, or you could try another one from the search results. Note that some Spaces might not have a public API or may require a specific `api_name` that `Tool.from_space` cannot infer; in such cases, you might not be able to use them.
5. **Execute the Tool:** Call the tool (either predefined or dynamically created) with the necessary arguments.
   * **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None (e.g., `if 'input_image_path' in globals() and input_image_path:`). Pass these file paths as arguments to tools that require them. `Tool.from_space` handles file uploads for compatible Spaces when you pass the filepath string.
   * **Chaining Tools:** If the task requires multiple steps, chain the tools together, passing the output of one tool as the input to the next.
6. **Output Management:**
   * If a tool generates a file (image, audio, etc.), save it to the current working directory using a unique filename (e.g., `output_filename = os.path.join(os.getcwd(), f"{uuid.uuid4()}.png")`).
   * **Return the RESULT:** Your final response should be either:
     * A string containing the direct text answer.
     * The string path to the generated output file (e.g., `return output_filename`).
7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.

Example of dynamically using a Space after searching:
```python
# user_prompt = "Find a space that can make an image of a cat and then use it."
# First, I would use huggingface_space_searcher to find relevant spaces.
# search_results = huggingface_space_searcher(query="text to image cat")
# print(search_results) # This would show me some options. Let's say 'user/cat-generator' is found.
# try:
#     cat_image_tool = Tool.from_space(repo_id="user/cat-generator", name="cat_generator_tool")
#     # The arguments for cat_image_tool depend on the Space. I'll assume it takes a 'prompt' argument.
#     image_path = cat_image_tool(prompt="A fluffy siamese cat")
#     # image_path should be a path to the generated image file
#     return image_path
# except Exception as e:
#     return f"Failed to use the cat generator Space: {e}"
```
Always ensure your generated Python code is complete and directly callable. Use `print()` for debugging if necessary, but the final returned value should be the result or file path.
You have access to `os`, `uuid`, `PIL.Image`.
"""
)
167
 
168
# Gradio interface function
def gradio_interface(prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on `prompt` (plus optional uploaded files) and route the
    result to the matching output component.

    Returns a 6-tuple of gr.update objects in the order:
    (image, file, path, audio, model3d, text) — must match the `outputs=`
    list wired to the submit button.
    """
    try:
        progress(0, desc="Initializing Agent...")
        # Collect only the file inputs the user actually supplied. These are
        # injected into the agent's Python environment as global variables,
        # matching the names promised in the system prompt.
        extra_inputs = {}
        if input_image_path:
            extra_inputs["input_image_path"] = str(input_image_path)  # ensure plain string path
        if input_audio_path:
            extra_inputs["input_audio_path"] = str(input_audio_path)
        if input_video_path:
            extra_inputs["input_video_path"] = str(input_video_path)
        if input_3d_model_path:
            extra_inputs["input_3d_model_path"] = str(input_3d_model_path)  # path to .glb or similar
        if input_file_path:
            extra_inputs["input_file_path"] = str(input_file_path)  # path to PDF, TXT etc.

        progress(0.2, desc="Agent processing request...")
        # BUG FIX: smolagents' Agent.run() takes the task text as its first
        # positional argument and extra variables via `additional_args`; the
        # previous `agent.run(**agent_context_inputs)` call passed a `prompt`
        # keyword that run() does not accept and raised TypeError.
        result = agent.run(prompt, additional_args=extra_inputs or None)

        progress(0.8, desc="Processing result...")
        # Default all outputs to invisible and None.
        outputs = {
            "image": gr.update(value=None, visible=False),
            "file": gr.update(value=None, visible=False),
            "path": gr.update(value=None, visible=False),
            "audio": gr.update(value=None, visible=False),
            "model3d": gr.update(value=None, visible=False),
            "text": gr.update(value=None, visible=False),
        }

        if isinstance(result, str):
            if os.path.isfile(result):
                # A file path: always expose the raw file and a copyable path,
                # then pick a typed viewer from the extension.
                file_path = result
                outputs["file"] = gr.update(value=file_path, visible=True)
                outputs["path"] = gr.update(value=file_path, visible=True)
                ext = file_path.lower().split('.')[-1]
                if ext in ('png', 'jpg', 'jpeg', 'gif', 'webp'):
                    outputs["image"] = gr.update(value=file_path, visible=True)
                elif ext in ('mp3', 'wav', 'ogg', 'flac'):
                    outputs["audio"] = gr.update(value=file_path, visible=True)
                elif ext == 'glb':  # Common format for Model3D
                    outputs["model3d"] = gr.update(value=file_path, visible=True)
                else:  # Other file types like PDF, TXT - user can download via file component
                    outputs["text"] = gr.update(value=f"Output is a file (e.g., PDF, TXT): {os.path.basename(file_path)}. Download it above.", visible=True)
            else:
                # Result is a string (e.g., text output from a tool)
                outputs["text"] = gr.update(value=result, visible=True)
        elif result is None:
            outputs["text"] = gr.update(value="Agent returned no result (None). Check logs if available.", visible=True)
        else:  # Other types (e.g. if agent returns a dict or list by mistake)
            outputs["text"] = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)

        progress(1, desc="Done!")
        return (
            outputs["image"], outputs["file"], outputs["path"],
            outputs["audio"], outputs["model3d"], outputs["text"]
        )

    except Exception as e:
        error_msg = f"An error occurred in the Gradio interface or agent execution: {str(e)}"
        print(error_msg)  # Also print to console for server-side logs
        # Same 6-tuple shape as the success path: hide everything, show the error.
        return (
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            gr.update(visible=False), gr.update(visible=False),
            gr.update(value=error_msg, visible=True)
        )
239
 
240
# Create the Gradio app: prompt box, optional file inputs, typed outputs.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 🤖 Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks. It will try to use its tools or find Hugging Face Spaces to help. You can provide optional file inputs below if your task requires them (e.g., 'Make this image Ghibli style', 'Summarize this PDF').")

    with gr.Row():
        prompt_input = gr.Textbox(
            label="Enter your prompt for the agent",
            placeholder="e.g., 'Generate an image of a futuristic city', 'Convert this text to speech: Hello world', or 'Search for a space that translates English to French and use it for: Good morning'",
            lines=3
        )

    with gr.Accordion("Optional File Inputs (for tasks requiring them)", open=False):
        with gr.Row():
            input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image")
            input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio")
        with gr.Row():
            # BUG FIX: gr.Video and gr.Model3D accept no `type` kwarg (they
            # always yield file paths); passing type="filepath" raised
            # TypeError at construction and prevented the app from launching.
            input_video = gr.Video(label="Video Input", sources=["upload"], elem_id="input_video")
            input_model3d = gr.Model3D(label="3D Model Input (.glb)", elem_id="input_model3d")  # Gradio Model3D expects .glb usually
        with gr.Row():
            input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file")

    submit_button = gr.Button("🚀 Generate", variant="primary")

    gr.Markdown("### Outputs:")
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True)
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True)
    with gr.Row():
        # NOTE(review): verify the installed Gradio's Model3D supports
        # show_download_button; drop the kwarg if construction fails.
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, show_download_button=True)
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=10)  # Start visible for logs/text
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False)
        path_output = gr.Textbox(label="Output File Path (Copyable)", interactive=True, visible=False)  # Keep for copying if needed

    # Link the button to the handler; `outputs` order must match the 6-tuple
    # returned by gradio_interface.
    submit_button.click(
        fn=gradio_interface,
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
    )

    gr.Examples(
        examples=[
            ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
            ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
            ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
            # NOTE(review): this placeholder path does not exist on disk;
            # Gradio may warn/fail when rendering the example — replace with a
            # bundled sample image or rely on user upload.
            ["I have an image of a cat (you'll need to upload one). Find a space that can make it look like a painting and apply it.", "path/to/your/cat_image.png", None, None, None, None],
        ],
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file)"
    )

# Launch the app only when executed as a script (not when imported).
if __name__ == "__main__":
    app.launch(debug=True)  # Enable debug for more detailed logs during development