File size: 17,624 Bytes
24a4bd7
 
 
cb57dca
 
24fb7b9
675bab3
cb57dca
675bab3
cb57dca
675bab3
cb57dca
9549eae
cb57dca
3b02325
cb57dca
 
675bab3
 
cb57dca
 
675bab3
 
cb57dca
 
675bab3
 
cb57dca
 
675bab3
 
3b02325
24a4bd7
675bab3
3b02325
cb57dca
675bab3
24fb7b9
675bab3
24fb7b9
675bab3
 
 
2994b0b
 
 
 
 
8fd199a
675bab3
24fb7b9
 
675bab3
 
 
3b02325
24fb7b9
 
 
 
 
b24bf0f
 
 
 
 
 
 
 
 
 
fbfcf3c
b24bf0f
 
 
24fb7b9
 
b24bf0f
24fb7b9
 
 
 
 
 
fbfcf3c
 
b24bf0f
 
 
24fb7b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b24bf0f
 
24fb7b9
b24bf0f
24fb7b9
 
b24bf0f
24fb7b9
b24bf0f
24fb7b9
675bab3
3b02325
24fb7b9
2994b0b
 
 
 
 
 
 
 
b24bf0f
2994b0b
 
 
 
 
 
3b02325
cb57dca
24fb7b9
3b02325
24fb7b9
675bab3
 
 
 
24fb7b9
675bab3
3b02325
cb57dca
 
 
 
b24bf0f
fbfcf3c
b24bf0f
24fb7b9
cb57dca
24fb7b9
 
cb57dca
24fb7b9
 
 
cb57dca
 
 
b24bf0f
24fb7b9
cb57dca
 
24fb7b9
cb57dca
 
 
 
24fb7b9
cb57dca
 
 
3b02325
cb57dca
24a4bd7
675bab3
cb57dca
 
24fb7b9
 
 
 
 
cb57dca
675bab3
cb57dca
675bab3
 
 
24fb7b9
 
 
675bab3
 
 
 
 
 
 
2994b0b
24fb7b9
 
 
 
 
 
 
 
675bab3
24fb7b9
675bab3
24a4bd7
24fb7b9
2994b0b
 
24fb7b9
24a4bd7
3b02325
675bab3
 
24fb7b9
675bab3
3b02325
24fb7b9
 
 
675bab3
cb57dca
 
675bab3
1cf26dc
 
24fb7b9
675bab3
24fb7b9
24a4bd7
cb57dca
24a4bd7
675bab3
 
cb57dca
 
675bab3
cb57dca
2994b0b
675bab3
cb57dca
24fb7b9
675bab3
 
 
1cf26dc
675bab3
 
24fb7b9
675bab3
 
 
 
 
cb57dca
675bab3
 
24fb7b9
675bab3
24a4bd7
675bab3
1cf26dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import gradio as gr
import os
import shutil
from gradio_client import Client, handle_file # handle_file might be used by the agent
# Use InferenceClientModel instead of HfApiModel
from smolagents import Tool, CodeAgent, InferenceClientModel, ToolCollection # Tool is needed for subclassing
import uuid
import httpx # Often a dependency for HTTP clients, good to have
from tenacity import retry, stop_after_attempt, wait_exponential
from huggingface_hub import list_spaces
from PIL import Image # For potential image manipulation by the agent
import traceback # For more detailed error logging if needed

# Define initial tools from Spaces
# Each entry describes one Hugging Face Space to wrap as an agent tool:
#   repo_id     - the Space to call,
#   name        - the tool name exposed to the agent,
#   description - what the agent reads to decide when to use it,
#   api_name    - the specific Gradio API endpoint on that Space.
spaces = [
    {"repo_id": "black-forest-labs/FLUX.1-schnell",
     "name": "image_generator_flux_schnell",
     "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
     "api_name": "/infer"},
    {"repo_id": "Remsky/Kokoro-TTS-Zero",
     "name": "text_to_speech_kokoro",
     "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
     "api_name": "/generate_speech_from_ui"},
    {"repo_id": "jamesliu1217/EasyControl_Ghibli",
     "name": "ghibli_style_image_control",
     "description": "Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
     "api_name": "/single_condition_generate_image"},
    {"repo_id": "opendatalab/MinerU",
     "name": "pdf_text_extraction_mineru",
     "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
     "api_name": "/to_pdf"},
]

# Create tools from predefined Spaces with retry logic
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_tool_with_retry(repo_id, name, description, api_name):
    """Build a smolagents Tool from a Space, retrying up to 3 times on failure.

    Hoisted out of the loop below: the original redefined (and re-wrapped with
    tenacity's @retry) this exact function once per Space, which is wasteful
    and obscures that a single factory is being used.

    Args:
        repo_id: Hugging Face Space repo id (e.g. "user/space").
        name: tool name to expose to the agent.
        description: tool description shown to the agent.
        api_name: Gradio API endpoint on the Space to bind to.

    Returns:
        The Tool instance created by Tool.from_space.

    Raises:
        Whatever Tool.from_space raises, after the retries are exhausted.
    """
    print(f"Attempting to create tool: '{name}' from space: {repo_id} with api_name: {api_name}")
    new_tool = Tool.from_space(repo_id, name=name, description=description, api_name=api_name)
    # Sanity-check the created tool actually carries the requested name.
    if not hasattr(new_tool, 'name') or new_tool.name != name:
        print(f"WARNING: Tool '{name}' from space {repo_id} might have a name mismatch or missing name attribute after creation. Actual name: {getattr(new_tool, 'name', 'MISSING')}")
    return new_tool

tools = []
for space_info in spaces:
    repo_id = space_info['repo_id']
    # Fall back to a snake_cased repo name when no explicit tool name is given.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')

    try:
        tool_instance = create_tool_with_retry(repo_id, name, description, api_name)
        tools.append(tool_instance)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")
    except Exception as e:
        # Best-effort: a single failing Space must not prevent app startup.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")

# --- Refactored HuggingFaceSpaceSearcherTool ---
class HuggingFaceSpaceSearcherTool(Tool):
    """smolagents Tool that searches the Hugging Face Hub for Spaces matching a query."""

    # Schema attributes read by the smolagents Tool base class.
    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."

    inputs = {
        "query": {
            "type": "string",
            "description": "The search query for Hugging Face Spaces."
        },
        "top_k": {
            "type": "integer",
            "description": "The number of top results to return (default is 3).",
            "nullable": True  # marks top_k as optional/None-able for the agent
        }
    }
    output_type = "string"

    def forward(self, query: str, top_k: int = 3) -> str:
        """Search the Hub and return a human-readable summary of the top Spaces.

        Each hit lists repo id, description, likes and last-modified date,
        followed by boilerplate instructions on wrapping a hit with
        Tool.from_space. On failure, returns an error string instead of raising.
        """
        try:
            # The agent may pass top_k=None since the schema marks it nullable.
            limit = 3 if top_k is None else top_k
            print(f"Searching spaces with query: {query}, top_k: {limit}")
            hits = list(list_spaces(search=query, full=True, limit=limit, sort="likes", direction=-1))
            if not hits:
                return "No Spaces found for your query."

            chunks = ["Found the following Spaces (sorted by likes):\n"]
            for position, space_data in enumerate(hits, start=1):
                # Prefer the card description, then the title, then a placeholder.
                card = getattr(space_data, 'cardData', None)
                if card and 'description' in card:
                    summary = card['description']
                elif getattr(space_data, 'title', None):
                    summary = space_data.title
                else:
                    summary = "No description provided."

                chunks.append(
                    f"{position}. ID: {space_data.id}\n"
                    f"   Description: {summary}\n"
                    f"   Likes: {getattr(space_data, 'likes', 'N/A')}\n"
                    f"   Last Modified: {getattr(space_data, 'lastModified', 'N/A')}\n\n"
                )

            chunks.append(
                "\nTo use one of these, you can try creating a tool in the code like this: "
                "my_new_tool = Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name'). "
                "Then you can call it: result = my_new_tool(argument_name=value). "
                "The arguments depend on the specific Space. If Tool.from_space fails or the tool doesn't work, "
                "the Space might not have a compatible public API or may require a specific api_name."
            )
            return "".join(chunks)
        except Exception as e:
            print(f"Error searching Spaces: {str(e)}")
            return f"Error searching Spaces: {str(e)}"

# Instantiate the custom tool
space_search_tool = HuggingFaceSpaceSearcherTool()
# ---- Debug print for the refactored tool ----
# Sanity-check that the class-level schema attributes (name/inputs) survived
# instantiation; an AttributeError here would indicate the Tool base class
# stripped or rejected them.
try:
    print(f"\nDEBUG: 'space_search_tool' (refactored class) immediately after creation.")
    print(f"DEBUG: Name: {space_search_tool.name}") 
    print(f"DEBUG: Inputs: {space_search_tool.inputs}") # Check if inputs are set
    print(f"DEBUG: Type: {type(space_search_tool)}")
    # print(f"DEBUG: All attributes: {dir(space_search_tool)}\n") # Can be verbose
except AttributeError as e:
    print(f"\nDEBUG: 'space_search_tool' (refactored class) immediately after creation.")
    print(f"DEBUG: Attribute MISSING. Error: {e}")
    print(f"DEBUG: Type: {type(space_search_tool)}")
    # print(f"DEBUG: All attributes: {dir(space_search_tool)}\n")
# ---- END Debug print ----
tools.append(space_search_tool)


# --- Debugging: Inspect tools before CodeAgent initialization ---
# Dump every tool's name/inputs so schema problems surface here instead of
# deep inside the CodeAgent constructor.
print("\n--- Inspecting tools before CodeAgent initialization ---")
for idx, candidate in enumerate(tools):
    if candidate is None:
        print(f"Tool at index {idx} is None!")
    else:
        try:
            print(f"Tool {idx}: Name='{candidate.name}', Type={type(candidate)}, Inputs: {getattr(candidate, 'inputs', 'Not defined')}")
        except AttributeError:
            print(f"!!! CRITICAL: Tool at index {idx} (Type={type(candidate)}) is missing 'name' attribute.")
        except Exception as e:
            print(f"!!! ERROR inspecting tool at index {idx} (Type={type(candidate)}): {str(e)}")
print("-------------------------------------------------------\n")


# Initialize the model - Use InferenceClientModel
# (per the import note above: InferenceClientModel replaces HfApiModel here)
model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")

# Create the agent
# additional_authorized_imports whitelists modules the agent's generated code
# may import; add_base_tools=True attaches smolagents' built-in tools alongside
# the Space-backed tools loaded above.
agent = CodeAgent(
    tools=tools,
    model=model,
    additional_authorized_imports=['PIL', 'Pillow', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
    add_base_tools=True,
)

# System-style preamble prepended to every user prompt in gradio_interface.
# NOTE: this is runtime prompt text consumed by the model — edit with care.
AGENT_INSTRUCTIONS = """You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.

Follow these steps:
1.  **Understand the Request:** Carefully analyze the user's prompt (which will follow these instructions). Identify the core task and any specific requirements or inputs.
2.  **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
    * For the 'huggingface_space_searcher' tool, you MUST provide its arguments as a dictionary. For example: `huggingface_space_searcher(arguments={"query": "your search term", "top_k": 3})`. The `query` is mandatory. `top_k` is optional (defaults to 3 if not provided in the dictionary or if the key is absent).
3.  **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool as described above.
4.  **Select and Instantiate a Space Tool:** From the search results, choose the most promising Space. Attempt to create a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_tool_name')`. You might need to give it a unique name. If `Tool.from_space` fails, the Space might not be compatible, or you could try another one from the search results.
5.  **Execute the Tool:** Call the tool (either predefined or dynamically created) with the necessary arguments.
    * **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None. Pass these file paths as arguments to tools that require them.
    * **Chaining Tools:** If the task requires multiple steps, chain the tools together.
6.  **Output Management:**
    * If a tool generates a file, save it to the current working directory using a unique filename (e.g., `output_filename = os.path.join(os.getcwd(), f"{uuid.uuid4()}.png")`).
    * **Return the RESULT:** Your final response should be either a string text answer or the string path to the generated output file.
7.  **Clarity and Error Handling:** If you encounter issues, explain the problem.

Example of dynamically using a Space after searching:
```python
# search_results = huggingface_space_searcher(arguments={"query": "text to image cat", "top_k": 1}) # Note the arguments dictionary
# print(search_results)
# try:
#     cat_image_tool = Tool.from_space(repo_id="user/cat-generator", name="cat_generator_tool")
#     image_path = cat_image_tool(prompt="A fluffy siamese cat") # Arguments depend on the Space
#     return image_path
# except Exception as e:
#     return f"Failed to use the cat generator Space: {e}"
```
Always ensure your generated Python code is complete and directly callable.
You have access to `os`, `uuid`, `PIL.Image`.
"""

# Gradio interface function
def gradio_interface(user_prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on the user's prompt and route its result to the right output widget.

    Returns a 6-tuple of gr.update values for
    (image, file, path, audio, model3d, text) in that order.
    """
    try:
        progress(0, desc="Initializing Agent...")
        composed_prompt = f"{AGENT_INSTRUCTIONS}\n\nUSER PROMPT: {user_prompt}"

        # Only the uploads the user actually provided are forwarded to the
        # agent (as string paths, under the names AGENT_INSTRUCTIONS documents).
        uploads = {
            "input_image_path": input_image_path,
            "input_audio_path": input_audio_path,
            "input_video_path": input_video_path,
            "input_3d_model_path": input_3d_model_path,
            "input_file_path": input_file_path,
        }
        agent_kwargs = {key: str(value) for key, value in uploads.items() if value}

        progress(0.2, desc="Agent processing request...")
        result = agent.run(composed_prompt, **agent_kwargs)

        progress(0.8, desc="Processing result...")
        # Default state: everything hidden except the text panel.
        image_out = gr.update(value=None, visible=False)
        file_out = gr.update(value=None, visible=False)
        path_out = gr.update(value=None, visible=False)
        audio_out = gr.update(value=None, visible=False)
        model3d_out = gr.update(value=None, visible=False)
        text_out = gr.update(value=None, visible=True)

        if isinstance(result, str):
            if os.path.isfile(result):
                # Any file result is downloadable and has its path shown;
                # the extension decides which rich viewer also lights up.
                file_out = gr.update(value=result, visible=True)
                path_out = gr.update(value=result, visible=True)
                ext = os.path.splitext(result.lower())[1]
                if ext in ('.png', '.jpg', '.jpeg', '.gif', '.webp'):
                    image_out = gr.update(value=result, visible=True)
                elif ext in ('.mp3', '.wav', '.ogg', '.flac'):
                    audio_out = gr.update(value=result, visible=True)
                elif ext == '.glb':
                    model3d_out = gr.update(value=result, visible=True)
                else:
                    text_out = gr.update(value=f"Output is a file: {os.path.basename(result)}. Download it.", visible=True)
            else:
                text_out = gr.update(value=result, visible=True)
        elif result is None:
            text_out = gr.update(value="Agent returned no result (None).", visible=True)
        else:
            text_out = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)

        progress(1, desc="Done!")
        return (image_out, file_out, path_out, audio_out, model3d_out, text_out)

    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return (None, None, None, None, None, gr.update(value=error_msg, visible=True))

# Create the Gradio app
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## πŸ€– Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks...")

    with gr.Row():
        prompt_input = gr.Textbox(label="Enter your prompt", placeholder="e.g., 'Generate an image of a futuristic city'", lines=3, elem_id="user_prompt_textbox")
    
    with gr.Accordion("Optional File Inputs", open=False):
        with gr.Row():
            input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image_upload")
            input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio_upload")
        with gr.Row():
            # Removed type="filepath" from gr.Video
            input_video = gr.Video(label="Video Input", sources=["upload"], elem_id="input_video_upload")
            input_model3d = gr.Model3D(label="3D Model Input", type="filepath", elem_id="input_model3d_upload")
        with gr.Row():
            input_file = gr.File(label="Generic File Input", type="filepath", elem_id="input_file_upload")

    submit_button = gr.Button("πŸš€ Generate", variant="primary", elem_id="submit_button_generate")

    gr.Markdown("### Outputs:")
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True, elem_id="output_image_display")
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True, elem_id="output_audio_display")
    with gr.Row():
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, show_download_button=True, elem_id="output_model3d_display")
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=20, elem_id="output_text_log")
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False, elem_id="output_file_download")
        path_output = gr.Textbox(label="Output File Path", interactive=False, visible=False, elem_id="output_file_path_text")

    # BUG FIX: the inputs list previously referenced `input_3d_model_path`,
    # which is a *parameter name* of gradio_interface, not a component in this
    # Blocks scope — building the app raised NameError. The 3D-model component
    # is `input_model3d` (as the Examples block below already uses).
    submit_button.click(
        fn=gradio_interface,
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
    )
    
    gr.Examples(
        examples=[
            ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
            ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
            ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
            ["I have an image of a cat. Find a space that can make it look like a painting and apply it. You will need to use the 'input_image_path' variable which will contain the path to the uploaded cat image.", "path/to/your/cat_image.png", None, None, None, None],
        ],
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file first)"
    )

if __name__ == "__main__":
    # debug=True makes Gradio report full tracebacks for handler errors.
    app.launch(debug=True)