Spaces:

jkorstad
/

Easy-Spaces

Runtime error

File size: 16,925 Bytes

24a4bd7
 
 
cb57dca
 
24fb7b9
675bab3
cb57dca
675bab3
cb57dca
675bab3
cb57dca
9549eae
cb57dca
3b02325
cb57dca
 
675bab3
 
cb57dca
 
675bab3
 
cb57dca
 
675bab3
 
cb57dca
 
675bab3
 
3b02325
24a4bd7
675bab3
3b02325
cb57dca
675bab3
24fb7b9
675bab3
24fb7b9
675bab3
 
 
2994b0b
 
 
 
 
8fd199a
675bab3
24fb7b9
 
675bab3
 
 
3b02325
24fb7b9
 
 
 
b24bf0f
 
 
 
 
 
 
 
b593116
b24bf0f
 
b593116
24fb7b9
 
 
b593116
b24bf0f
 
24fb7b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b593116
 
 
 
 
24fb7b9
 
 
 
 
 
675bab3
3b02325
b593116
24fb7b9
3b02325
24fb7b9
675bab3
 
 
 
24fb7b9
675bab3
3b02325
cb57dca
 
 
b593116
b24bf0f
b593116
b24bf0f
b593116
 
 
 
24fb7b9
b593116
cb57dca
b593116
 
 
cb57dca
b593116
cb57dca
b593116
 
 
 
 
 
 
cb57dca
b593116
 
 
 
 
cb57dca
b593116
 
 
 
 
 
 
 
 
 
 
 
cb57dca
24fb7b9
b593116
cb57dca
 
3b02325
cb57dca
24a4bd7
675bab3
cb57dca
 
24fb7b9
 
 
 
 
cb57dca
675bab3
cb57dca
675bab3
 
 
24fb7b9
 
 
675bab3
 
 
 
 
 
 
2994b0b
24fb7b9
 
 
 
 
 
 
 
675bab3
24fb7b9
675bab3
24a4bd7
24fb7b9
2994b0b
 
24fb7b9
24a4bd7
3b02325
675bab3
 
24fb7b9
675bab3
3b02325
24fb7b9
 
 
675bab3
cb57dca
 
675bab3
1cf26dc
91093b4
675bab3
24fb7b9
24a4bd7
cb57dca
24a4bd7
675bab3
 
cb57dca
 
675bab3
92d9b2d
2994b0b
675bab3
cb57dca
24fb7b9
675bab3
 
 
487e8eb
675bab3
 
24fb7b9
675bab3
 
 
 
 
cb57dca
675bab3
 
24fb7b9
675bab3
24a4bd7
675bab3
b593116

import gradio as gr
import os
import shutil
from gradio_client import Client, handle_file # handle_file might be used by the agent
# Use InferenceClientModel instead of HfApiModel
from smolagents import Tool, CodeAgent, InferenceClientModel, ToolCollection # Tool is needed for subclassing
import uuid
import httpx # Often a dependency for HTTP clients, good to have
from tenacity import retry, stop_after_attempt, wait_exponential
from huggingface_hub import list_spaces
from PIL import Image # For potential image manipulation by the agent
import traceback # For more detailed error logging if needed

# Hugging Face Spaces to expose as predefined tools. Each entry carries the
# Space repo id, the tool name and description, and the api_name endpoint that
# is passed along when the tool is created from the Space.
# NOTE(review): the MinerU entry is described as text extraction but targets
# the "/to_pdf" endpoint — confirm that this is the intended endpoint.
spaces = [
    dict(
        repo_id="black-forest-labs/FLUX.1-schnell",
        name="image_generator_flux_schnell",
        description="Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
        api_name="/infer",
    ),
    dict(
        repo_id="Remsky/Kokoro-TTS-Zero",
        name="text_to_speech_kokoro",
        description="Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
        api_name="/generate_speech_from_ui",
    ),
    dict(
        repo_id="jamesliu1217/EasyControl_Ghibli",
        name="ghibli_style_image_control",
        description="Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
        api_name="/single_condition_generate_image",
    ),
    dict(
        repo_id="opendatalab/MinerU",
        name="pdf_text_extraction_mineru",
        description="Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
        api_name="/to_pdf",
    ),
]

# Create tools from predefined Spaces with retry logic.
# The retrying helper is defined once instead of being re-created on every
# loop iteration. reraise=True makes tenacity re-raise the last underlying
# exception once the attempts are exhausted (rather than a RetryError wrapper),
# so the failure message printed below names the real cause.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    reraise=True,
)
def create_tool_with_retry(repo_id, name, description, api_name):
    """Create a tool from a Hugging Face Space, retrying up to three attempts.

    Args:
        repo_id: Id of the Space to wrap (e.g. "owner/space-name").
        name: Name the resulting tool should carry.
        description: Description given to the tool.
        api_name: API endpoint of the Space to call; may be None.

    Returns:
        The tool object returned by ``Tool.from_space``.

    Raises:
        Exception: Whatever ``Tool.from_space`` raised on the final attempt.
    """
    print(f"Attempting to create tool: '{name}' from space: {repo_id} with api_name: {api_name}")
    new_tool = Tool.from_space(repo_id, name=name, description=description, api_name=api_name)
    # Only warn on a mismatch; the tool is still returned and used.
    if not hasattr(new_tool, 'name') or new_tool.name != name:
        print(f"WARNING: Tool '{name}' from space {repo_id} might have a name mismatch or missing name attribute after creation. Actual name: {getattr(new_tool, 'name', 'MISSING')}")
    return new_tool


tools = []
for space_info in spaces:
    repo_id = space_info['repo_id']
    # Fall back to a name derived from the repo id, and to a generic description.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')

    try:
        tool_instance = create_tool_with_retry(repo_id, name, description, api_name)
    except Exception as e:
        # Best effort: one unavailable Space must not prevent the app from starting.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
    else:
        tools.append(tool_instance)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")

# --- Refactored HuggingFaceSpaceSearcherTool ---
class HuggingFaceSpaceSearcherTool(Tool):
    """Agent tool that searches the Hugging Face Hub for Spaces matching a query.

    The search is delegated to ``huggingface_hub.list_spaces`` (sorted by
    likes) and the matches are rendered as a plain-text list followed by a
    short hint on how to turn one of them into a tool.
    """

    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."
    inputs = {
        "query": {
            "type": "string",
            "description": "The search query for Hugging Face Spaces."
        },
        "top_k": {
            "type": "integer",
            "description": "The number of top results to return (default is 3).",
            "nullable": True
        }
    }
    output_type = "string"

    def _first_attr(self, obj, *attr_names):
        """Return the first truthy attribute of ``obj`` among ``attr_names``, else None."""
        for attr_name in attr_names:
            value = getattr(obj, attr_name, None)
            if value:
                return value
        return None

    def _card_field(self, card, key):
        """Read ``key`` from card metadata that may be a mapping or a plain object."""
        if not card:
            return None
        getter = getattr(card, "get", None)
        if callable(getter):
            return getter(key)
        return getattr(card, key, None)

    def _describe(self, space_data):
        """Pick the best available human-readable description for a Space."""
        # Both the snake_case and the legacy camelCase attribute names are
        # probed, so the lookup works whichever spelling the installed
        # huggingface_hub exposes.
        card = self._first_attr(space_data, "card_data", "cardData")
        for key in ("description", "short_description", "title"):
            value = self._card_field(card, key)
            if value:
                return value
        return self._first_attr(space_data, "title") or "No description provided."

    def forward(self, query: str, top_k: int = 3) -> str:
        """Search Spaces and return a formatted, human-readable result list.

        Args:
            query: Search string forwarded to ``list_spaces``.
            top_k: Maximum number of results; ``None`` means 3 and values
                below 1 are raised to 1.

        Returns:
            The formatted result text, a "no results" message, or an error
            message string if anything fails (errors are never raised).
        """
        try:
            actual_top_k = 3 if top_k is None else max(1, int(top_k))
            print(f"Searching spaces with query: {query}, top_k: {actual_top_k}")
            spaces_found = list(list_spaces(search=query, full=True, limit=actual_top_k, sort="likes", direction=-1))
            if not spaces_found:
                return "No Spaces found for your query."

            parts = ["Found the following Spaces (sorted by likes):\n"]
            for position, space_data in enumerate(spaces_found, start=1):
                likes = getattr(space_data, 'likes', None)
                last_modified = self._first_attr(space_data, 'last_modified', 'lastModified')
                parts.append(
                    f"{position}. ID: {space_data.id}\n"
                    f"   Description: {self._describe(space_data)}\n"
                    f"   Likes: {likes if likes is not None else 'N/A'}\n"
                    f"   Last Modified: {last_modified if last_modified else 'N/A'}\n\n"
                )
            # The hint mirrors how Tool.from_space is called elsewhere in this
            # file: the Space id is positional and a description is supplied.
            parts.append(
                "\nTo use one of these, you should first try creating a tool using "
                "`Tool.from_space('SPACE_ID_HERE', name='custom_tool_name', "
                "description='what the tool does')`. "
                "Then call that new tool: `result = custom_tool_name(argument_name=value)`. "
                "The arguments depend on the specific Space. If `Tool.from_space` fails, "
                "the Space might not have a compatible public API."
            )
            return "".join(parts)
        except Exception as e:
            # Report the failure as text so the agent can read it and react.
            print(f"Error searching Spaces: {str(e)}")
            return f"Error searching Spaces: {str(e)}"

space_search_tool = HuggingFaceSpaceSearcherTool()
tools.append(space_search_tool)

# Initialize the model
model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")

# Create the agent.
# 'smolagents' is authorized so that agent-generated code can run
# `from smolagents import Tool` and follow the Tool.from_space() workflow that
# AGENT_INSTRUCTIONS asks for. 'Pillow' is only the distribution name of 'PIL';
# it is kept so the list stays a superset of the previous configuration.
agent = CodeAgent(
    tools=tools,
    model=model,
    additional_authorized_imports=['PIL', 'Pillow', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid', 'smolagents'],
    add_base_tools=True,
)

# Instructions prepended to every user prompt. The examples finish with
# final_answer(...) because agent code runs as a script, where a top-level
# `return` is a syntax error, and they call Tool.from_space() with the Space id
# as first positional argument plus the `description` it requires.
AGENT_INSTRUCTIONS = """You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.

Follow these steps:
1.  **Understand the Request:** Carefully analyze the user's prompt. Identify the core task and any specific requirements or inputs.
2.  **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
    * For the 'huggingface_space_searcher' tool, call it with direct keyword arguments like: `huggingface_space_searcher(query="your search term", top_k=3)`. The `query` is mandatory. `top_k` is optional and defaults to 3 if not provided.
3.  **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool as described above.
4.  **Select and Instantiate a Space Tool (PRIORITY):** From the search results, choose the most promising Space. **Your primary method to use this Space is to attempt to create a tool from it using `Tool.from_space('SELECTED_SPACE_ID', name='a_unique_tool_name', description='what the tool does')`.** Run `from smolagents import Tool` first. You *must* give it a unique and descriptive name, and a description.
    * If `Tool.from_space()` succeeds, you now have a new tool. Call this new tool with the appropriate arguments for that Space.
    * If `Tool.from_space()` fails (e.g., raises an exception), the Space might not have a compatible public API for this method. In this case, you can *then* consider if direct `gradio_client.Client` usage is feasible, but this is a fallback and often more complex.
5.  **Execute the Tool:** Call the tool (predefined, or dynamically created via `Tool.from_space()`) with the necessary arguments.
    * **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None. Pass these file paths as arguments to tools that require them.
    * **Imports in Generated Code:** If your code block for execution uses modules like `os` or `uuid`, **you must include the import statements (e.g., `import os`, `import uuid`) within that specific code block.**
6.  **Output Management:**
    * **If a tool returns a filepath string (e.g., to an image, audio, or other file), your final answer for this step should usually be that direct filepath string, passed to `final_answer(...)`.** Do NOT attempt to re-save the file using `os.path.join` or `image.save()` unless you are performing an explicit transformation on the file content that requires loading and then saving. The system is designed to handle these returned filepaths.
    * If a tool returns text, pass that text to `final_answer(...)`.
    * Never use a bare `return` statement at the top level of your code; it is a syntax error there. Always finish with `final_answer(...)`.
7.  **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.

Example of the PREFERRED way to use a discovered Space:
```python
# User prompt: "Find a space that can make an image of a cat and use it."
#
# Step 1: Search for the space
# search_results = huggingface_space_searcher(query="text to image cat", top_k=1)
# print(search_results) # Assume 'someuser/cat-image-generator' is found.
#
# Step 2: Try to create a tool from the discovered space
# from smolagents import Tool
# try:
#     cat_tool = Tool.from_space("someuser/cat-image-generator", name="cat_image_generator_tool", description="Generates an image of a cat from a prompt")
#     # Now use the newly created tool. Arguments depend on the Space's API.
#     # Let's assume it takes a 'prompt'.
#     image_filepath = cat_tool(prompt="A fluffy siamese cat, cyberpunk style")
#     final_answer(image_filepath) # Give the filepath directly as the final answer
# except Exception as e:
#     print(f"Failed to create or use tool from Space 'someuser/cat-image-generator': {e}")
#     # Optionally, try another space or a predefined tool if appropriate.
#     # If nothing is left to try: final_answer("Could not use the discovered space.")
```

Example of using a predefined tool that returns a filepath:
```python
# User prompt: "Generate an image of a happy robot."
# (Assuming 'image_generator_flux_schnell' is a predefined tool)
#
# image_filepath = image_generator_flux_schnell(prompt="A happy robot coding on a laptop, cyberpunk style")
# final_answer(image_filepath) # Give the filepath string directly as the final answer.
```
Always ensure your generated Python code is complete and directly callable.
You have access to `PIL.Image` (as `Image`), `os`, `sys`, `numpy`, `huggingface_hub`, `gradio_client`, `uuid`, `smolagents`. Remember to import them if you use them in a code block.
"""

# Gradio interface function
def gradio_interface(user_prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on the user's request and map its result onto the output widgets.

    Args:
        user_prompt: Text entered by the user.
        input_image_path: Path of the uploaded image, or None.
        input_audio_path: Path of the uploaded audio, or None.
        input_video_path: Path of the uploaded video, or None.
        input_3d_model_path: Path of the uploaded 3D model, or None.
        input_file_path: Path of the uploaded generic file, or None.
        progress: Gradio progress tracker used to report coarse stages.

    Returns:
        A 6-tuple of component updates in the order
        (image, file, path, audio, model3d, text). On failure every widget
        except the text box is hidden and the text box shows the error.
    """
    output_order = ("image", "file", "path", "audio", "model3d", "text")

    def blank_outputs():
        # Start from "everything hidden except the text box" so that results
        # from a previous run never stay on screen.
        updates = {key: gr.update(value=None, visible=False) for key in output_order}
        updates["text"] = gr.update(value=None, visible=True)
        return updates

    try:
        progress(0, desc="Initializing Agent...")
        full_prompt_with_instructions = f"{AGENT_INSTRUCTIONS}\n\nUSER PROMPT: {user_prompt}"
        uploaded = {
            "input_image_path": input_image_path,
            "input_audio_path": input_audio_path,
            "input_video_path": input_video_path,
            "input_3d_model_path": input_3d_model_path,
            "input_file_path": input_file_path,
        }
        # Only inputs that were actually provided are forwarded, as strings.
        agent_kwargs = {key: str(value) for key, value in uploaded.items() if value}

        progress(0.2, desc="Agent processing request...")
        # Extra variables must go through `additional_args`; agent.run() does
        # not accept arbitrary keyword arguments. None is passed when nothing
        # was uploaded.
        result = agent.run(full_prompt_with_instructions, additional_args=agent_kwargs or None)

        progress(0.8, desc="Processing result...")
        outputs = blank_outputs()

        if isinstance(result, str):
            if os.path.isfile(result):
                file_path = result
                outputs["file"] = gr.update(value=file_path, visible=True)
                outputs["path"] = gr.update(value=file_path, visible=True)
                # Pick the preview widget from the (case-insensitive) extension.
                ext = os.path.splitext(file_path.lower())[1]
                if ext in ('.png', '.jpg', '.jpeg', '.gif', '.webp'):
                    outputs["image"] = gr.update(value=file_path, visible=True)
                elif ext in ('.mp3', '.wav', '.ogg', '.flac'):
                    outputs["audio"] = gr.update(value=file_path, visible=True)
                elif ext == '.glb':
                    outputs["model3d"] = gr.update(value=file_path, visible=True)
                else:
                    outputs["text"] = gr.update(value=f"Output is a file: {os.path.basename(file_path)}. Download it.", visible=True)
            else:
                outputs["text"] = gr.update(value=result, visible=True)
        elif result is None:
            outputs["text"] = gr.update(value="Agent returned no result (None).", visible=True)
        else:
            outputs["text"] = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)

        progress(1, desc="Done!")
        return tuple(outputs[key] for key in output_order)

    except Exception as e:
        # Top-level UI boundary: log the traceback and show the message to the
        # user instead of letting the request fail.
        error_msg = f"An error occurred: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        outputs = blank_outputs()
        outputs["text"] = gr.update(value=error_msg, visible=True)
        return tuple(outputs[key] for key in output_order)

# Create the Gradio app: a prompt box, optional file inputs, and one output
# widget per result kind (the click handler decides which ones become visible).
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 🤖 Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks...")

    with gr.Row():
        prompt_input = gr.Textbox(label="Enter your prompt", placeholder="e.g., 'Generate an image of a futuristic city'", lines=3, elem_id="user_prompt_textbox")
    
    with gr.Accordion("Optional File Inputs", open=False):
        with gr.Row():
            input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image_upload")
            input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio_upload")
        with gr.Row():
            input_video = gr.Video(label="Video Input", sources=["upload"], elem_id="input_video_upload")
            input_model3d = gr.Model3D(label="3D Model Input", elem_id="input_model3d_upload")
        with gr.Row():
            input_file = gr.File(label="Generic File Input", type="filepath", elem_id="input_file_upload")

    submit_button = gr.Button("🚀 Generate", variant="primary", elem_id="submit_button_generate")

    gr.Markdown("### Outputs:")
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True, elem_id="output_image_display")
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True, elem_id="output_audio_display")
    with gr.Row():
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, elem_id="output_model3d_display")
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=20, elem_id="output_text_log")
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False, elem_id="output_file_download")
        path_output = gr.Textbox(label="Output File Path", interactive=False, visible=False, elem_id="output_file_path_text")

    # The order of `outputs` must match the tuple returned by gradio_interface.
    submit_button.click(
        fn=gradio_interface,
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
    )
    
    # Example rows only pre-fill the prompt. The image slot of the last row is
    # None rather than a placeholder path: the placeholder file does not exist,
    # and an example file that cannot be found can fail while the examples are
    # being prepared. The user uploads their own image instead (see the label).
    gr.Examples(
        examples=[
            ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
            ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
            ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
            ["I have an image of a cat. Find a space that can make it look like a painting and apply it. You will need to use the 'input_image_path' variable which will contain the path to the uploaded cat image.", None, None, None, None, None],
        ],
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file first)"
    )

if __name__ == "__main__":
    app.launch(debug=True)