jkorstad commited on
Commit
675bab3
·
verified ·
1 Parent(s): 1ed6758

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -57
app.py CHANGED
@@ -1,86 +1,295 @@
1
  import gradio as gr
2
  import os
3
  import shutil
4
- from gradio_client import Client, handle_file
5
- from smolagents import Tool, CodeAgent, HfApiModel
 
 
 
 
 
6
 
7
- # import spaces - if using ZeroGPU
8
-
9
- # Define tools from Spaces
10
  spaces = [
11
  {"repo_id": "black-forest-labs/FLUX.1-schnell",
12
- "name": "image_generator",
13
- "description": "Generate an image from a prompt"},
14
-
 
 
 
 
15
  {"repo_id": "jamesliu1217/EasyControl_Ghibli",
16
- "name": "Ghibli_style_Image_control",
17
- "description": "Create Ghibli style image"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ]
19
 
 
20
  tools = []
21
- for space in spaces:
22
- # Access repo_id, name, and description
23
- repo_id = space['repo_id']
24
- name = space.get('name', repo_id) # Use repo_id as name if not specified
25
- description = space.get('description', '') # Use empty string if not specified
 
 
 
 
 
26
 
27
- # Create Tool instance
28
- tool = Tool.from_space(repo_id, name=name, description=description)
29
- tools.append(tool)
 
 
 
30
 
31
- # Define a custom tool
32
- class CustomTool(Tool):
33
- name = "custom_tool"
34
- description = "A custom tool that processes input text"
35
- inputs = {"input": {"type": "string", "description": "Some input text to process"}}
36
- output_type = "string"
37
- def forward(self, input: str):
38
- return f"Processed: {input}"
39
 
40
- tools.append(CustomTool())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  # Initialize the model
44
- model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
45
 
46
- # Create the agent
47
- agent = CodeAgent(tools=tools, model=model)
 
 
 
 
 
48
 
49
- # Function to run the agent and return the image path
50
- def generate_and_transform(prompt):
51
- result = agent.run(prompt)
52
-
53
- if isinstance(result, str): # Assuming result is a file path
54
- # Copy the temporary file to a permanent location
55
- permanent_path = "ghibli_output.webp"
56
- shutil.copy(result, permanent_path)
57
- return permanent_path
58
- else:
59
- raise ValueError("Unexpected result type from agent")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Gradio interface function
62
- def gradio_interface(prompt):
63
  try:
64
- image_path = generate_and_transform(prompt)
65
- return image_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  except Exception as e:
67
- return str(e)
 
 
 
 
 
 
 
68
 
69
  # Create the Gradio app
70
- with gr.Blocks() as app:
71
- gr.Markdown("### Smolagent Image Generator with Ghibli Style")
 
 
72
  with gr.Row():
73
- prompt_input = gr.Textbox(label="Enter your prompt", placeholder="e.g., Generate an image of a dog and then make an 'xyz' style version of that image")
74
- submit_button = gr.Button("Generate")
75
- output_image = gr.Image(label="Generated Image")
76
- download_button = gr.File(label="Download Image")
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Connect the button to the function
79
- def on_submit(prompt):
80
- image_path = gradio_interface(prompt)
81
- return image_path, image_path
82
 
83
- submit_button.click(on_submit, inputs=prompt_input, outputs=[output_image, download_button])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  # Launch the app
86
- app.launch()
 
 
1
  import gradio as gr
2
  import os
3
  import shutil
4
+ from gradio_client import Client, handle_file # handle_file might be used by the agent if it constructs client calls manually
5
+ from smolagents import Tool, CodeAgent, HfApiModel, ToolCollection
6
+ import uuid
7
+ import httpx
8
+ from tenacity import retry, stop_after_attempt, wait_exponential
9
+ from huggingface_hub import list_spaces # For the new search tool
10
+ from PIL import Image # For potential image manipulation by the agent
11
 
12
# Registry of predefined Hugging Face Spaces the agent can call as tools.
# Each entry carries the Space repo id, a tool name, an agent-facing
# description, and the Space's API endpoint (api_name should be verified
# against each Space's exposed API).
spaces = [
    {
        "repo_id": "black-forest-labs/FLUX.1-schnell",
        "name": "image_generator_flux_schnell",
        "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
        "api_name": "/infer",
    },
    {
        "repo_id": "Remsky/Kokoro-TTS-Zero",
        "name": "text_to_speech_kokoro",
        "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
        "api_name": "/generate_speech_from_ui",
    },
    {
        "repo_id": "jamesliu1217/EasyControl_Ghibli",
        "name": "ghibli_style_image_control",
        "description": "Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
        "api_name": "/single_condition_generate_image",
    },
    {
        "repo_id": "opendatalab/MinerU",
        "name": "pdf_text_extraction_mineru",
        "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
        "api_name": "/to_pdf",
    },
    {
        "repo_id": "InstantX/InstantCharacter",
        "name": "instant_character_customization",
        "description": "Personalize Any Characters with a Scalable Diffusion Transformer Framework to any style or pose using InstantCharacter. Expects an input image and potentially pose/style images or prompts.",
        "api_name": "/predict",  # Common API name; verify for this Space
    },
    {
        "repo_id": "fotographerai/Zen-Style-Shape",
        "name": "img_to_img_style_transfer_zen_shape",
        "description": "Flux[dev] Redux + Flux[dev] Canny. Implements a custom image-to-image style transfer pipeline blending style from Image A to structure of Image B. Expects two images.",
        "api_name": "/predict",  # Common API name; verify for this Space
    },
    {
        "repo_id": "moonshotai/Kimi-VL-A3B-Thinking",
        "name": "multimodal_vlm_llm_kimi",
        "description": "Kimi-VL-A3B-Thinking is a multi-modal LLM that can understand text and images, and generate text with thinking processes. Ask any question about an image. Expects text and optionally an image.",
        "api_name": "/chat",  # Verify this api_name for Kimi spaces
    },
]
43
 
44
# Create tools from the predefined Spaces, with retry logic for transient
# Hub/network failures.
tools = []

# FIX: this helper used to be defined (and re-decorated with @retry) inside
# the loop body on every iteration; it is loop-invariant, so hoist it out.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_tool_with_retry(repo_id, name, description, api_name):
    """Wrap a Hugging Face Space as a smolagents Tool, retrying up to 3 times.

    If api_name is None, Tool.from_space will try to find a public API endpoint.
    """
    return Tool.from_space(repo_id, name=name, description=description, api_name=api_name)

for space_info in spaces:
    repo_id = space_info['repo_id']
    # Default tool name: snake_case derived from the repo id's final segment.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')  # May be None; Tool.from_space will infer.

    try:
        tool = create_tool_with_retry(repo_id, name, description, api_name)
        tools.append(tool)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")
    except Exception as e:
        # Best-effort: one unreachable Space must not prevent app startup.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
63
 
64
# Pull in any extra tools published in a Hugging Face Collection (best-effort:
# the app still runs with only the predefined tools if this fails).
collection_slug = "jkorstad/tools-680127d17eed47e759549ff4"
try:
    hub_collection = ToolCollection.from_hub(collection_slug=collection_slug)
    tools.extend(hub_collection.tools)
    print(f"Successfully loaded tools from collection: {collection_slug}")
except Exception as e:
    print(f"Warning: Failed to load collection {collection_slug}. Error: {str(e)}")
72
 
73
+
74
# Helper backing the Space-discovery tool: searches the Hub for Spaces.
def search_hf_spaces(query: str, top_k: int = 3) -> str:
    """Search Hugging Face Spaces and describe the top_k most-liked matches.

    Returns a human/agent-readable report listing each Space's repo id,
    description, like count and last-modified date, followed by instructions
    for wrapping a found Space via Tool.from_space. Errors are printed and
    returned as a string rather than raised, so the agent can react to them.
    """
    try:
        print(f"Searching spaces with query: {query}, top_k: {top_k}")
        hits = list(list_spaces(search=query, full=True, limit=top_k, sort="likes", direction=-1))
        if not hits:
            return "No Spaces found for your query."

        parts = ["Found the following Spaces (sorted by likes):\n"]
        for rank, info in enumerate(hits, start=1):
            # Prefer the model card's description; fall back to the title,
            # then to a stub.
            if info.cardData and 'description' in info.cardData:
                blurb = info.cardData['description']
            elif info.title:  # NOTE(review): assumes SpaceInfo exposes .title — confirm against installed huggingface_hub
                blurb = info.title
            else:
                blurb = "No description."

            parts.append(
                f"{rank}. ID: {info.id}\n"
                f"   Description: {blurb}\n"
                f"   Likes: {info.likes}\n"
                f"   Last Modified: {info.lastModified}\n\n"
            )

        parts.append(
            "\nTo use one of these, you can try creating a tool in the code like this: "
            "my_new_tool = Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name'). "
            "Then you can call it: result = my_new_tool(argument_name=value). "
            "The arguments depend on the specific Space. If Tool.from_space fails or the tool doesn't work, "
            "the Space might not have a compatible public API or may require a specific api_name."
        )
        return "".join(parts)
    except Exception as e:
        print(f"Error searching Spaces: {str(e)}")
        return f"Error searching Spaces: {str(e)}"
112
+
113
# BUG FIX: smolagents' `Tool` base class cannot be instantiated directly with
# a `func=` keyword (that constructor shape belongs to other agent
# frameworks). A concrete tool must declare name/description/inputs/
# output_type and implement forward() — the previous
# `Tool(name=..., func=search_hf_spaces)` call failed at startup, so the
# searcher was never registered.
class SpaceSearchTool(Tool):
    # Attributes consumed by the smolagents tool-calling machinery.
    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."
    inputs = {
        "query": {"type": "string", "description": "Search query describing the task, e.g. 'text to image'."},
        "top_k": {"type": "integer", "description": "How many Spaces to return (default 3).", "nullable": True},
    }
    output_type = "string"

    def forward(self, query: str, top_k: int = 3) -> str:
        # Delegate to the module-level search helper.
        return search_hf_spaces(query, top_k)

space_search_tool = SpaceSearchTool()
tools.append(space_search_tool)
120
 
121
 
122
# Initialize the model
model = HfApiModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")  # Or your preferred model

# Create the agent with extended imports and a detailed system prompt.
# NOTE(review): the `system_prompt` kwarg is only accepted by some smolagents
# releases (newer versions use `prompt_templates`) — confirm against the
# pinned smolagents version.
agent = CodeAgent(
    tools=tools,
    model=model,
    # Modules the agent's generated code is allowed to import.
    # FIX: dropped the bogus 'Pillow' entry — the importable module name is
    # 'PIL' (already listed); `import Pillow` can never succeed.
    additional_authorized_imports=['PIL', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
    add_base_tools=True,  # Includes web search, python interpreter
    system_prompt="""You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.

Follow these steps:
1. **Understand the Request:** Carefully analyze the user's prompt. Identify the core task and any specific requirements or inputs.
2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool. Provide a concise search query related to the task (e.g., "image classification", "voice cloning", "document question answering").
4. **Select and Instantiate a Space Tool:** From the search results, choose the most promising Space. Attempt to create a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_tool_name')`. You might need to give it a unique name. If `Tool.from_space` fails, the Space might not be compatible, or you could try another one from the search results. Note that some Spaces might not have a public API or may require a specific `api_name` that `Tool.from_space` cannot infer; in such cases, you might not be able to use them.
5. **Execute the Tool:** Call the tool (either predefined or dynamically created) with the necessary arguments.
   * **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None (e.g., `if 'input_image_path' in globals() and input_image_path:`). Pass these file paths as arguments to tools that require them. `Tool.from_space` handles file uploads for compatible Spaces when you pass the filepath string.
   * **Chaining Tools:** If the task requires multiple steps, chain the tools together, passing the output of one tool as the input to the next.
6. **Output Management:**
   * If a tool generates a file (image, audio, etc.), save it to the current working directory using a unique filename (e.g., `output_filename = os.path.join(os.getcwd(), f"{uuid.uuid4()}.png")`).
   * **Return the RESULT:** Your final response should be either:
     * A string containing the direct text answer.
     * The string path to the generated output file (e.g., `return output_filename`).
7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.

Example of dynamically using a Space after searching:
```python
# user_prompt = "Find a space that can make an image of a cat and then use it."
# First, I would use huggingface_space_searcher to find relevant spaces.
# search_results = huggingface_space_searcher(query="text to image cat")
# print(search_results) # This would show me some options. Let's say 'user/cat-generator' is found.
# try:
#     cat_image_tool = Tool.from_space(repo_id="user/cat-generator", name="cat_generator_tool")
#     # The arguments for cat_image_tool depend on the Space. I'll assume it takes a 'prompt' argument.
#     image_path = cat_image_tool(prompt="A fluffy siamese cat")
#     # image_path should be a path to the generated image file
#     return image_path
# except Exception as e:
#     return f"Failed to use the cat generator Space: {e}"
```
Always ensure your generated Python code is complete and directly callable. Use `print()` for debugging if necessary, but the final returned value should be the result or file path.
You have access to `os`, `uuid`, `PIL.Image`.
"""
)
167
 
168
# Gradio interface function
def gradio_interface(prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on `prompt` (plus optional uploaded files) and route the
    result to the matching output component.

    Returns a 6-tuple of gr.update objects in the order:
    (image, file, path, audio, model3d, text) — must match the `outputs=`
    list wired to the submit button.
    """
    try:
        progress(0, desc="Initializing Agent...")
        # Collect only the file inputs the user actually supplied. These are
        # injected into the agent's Python environment as global variables,
        # matching the names promised in the system prompt.
        extra_inputs = {}
        if input_image_path:
            extra_inputs["input_image_path"] = str(input_image_path)  # ensure plain string path
        if input_audio_path:
            extra_inputs["input_audio_path"] = str(input_audio_path)
        if input_video_path:
            extra_inputs["input_video_path"] = str(input_video_path)
        if input_3d_model_path:
            extra_inputs["input_3d_model_path"] = str(input_3d_model_path)  # path to .glb or similar
        if input_file_path:
            extra_inputs["input_file_path"] = str(input_file_path)  # path to PDF, TXT etc.

        progress(0.2, desc="Agent processing request...")
        # BUG FIX: smolagents' Agent.run() takes the task text as its first
        # positional argument and extra variables via `additional_args`; the
        # previous `agent.run(**agent_context_inputs)` call passed a `prompt`
        # keyword that run() does not accept and raised TypeError.
        result = agent.run(prompt, additional_args=extra_inputs or None)

        progress(0.8, desc="Processing result...")
        # Default all outputs to invisible and None.
        outputs = {
            "image": gr.update(value=None, visible=False),
            "file": gr.update(value=None, visible=False),
            "path": gr.update(value=None, visible=False),
            "audio": gr.update(value=None, visible=False),
            "model3d": gr.update(value=None, visible=False),
            "text": gr.update(value=None, visible=False),
        }

        if isinstance(result, str):
            if os.path.isfile(result):
                # A file path: always expose the raw file and a copyable path,
                # then pick a typed viewer from the extension.
                file_path = result
                outputs["file"] = gr.update(value=file_path, visible=True)
                outputs["path"] = gr.update(value=file_path, visible=True)
                ext = file_path.lower().split('.')[-1]
                if ext in ('png', 'jpg', 'jpeg', 'gif', 'webp'):
                    outputs["image"] = gr.update(value=file_path, visible=True)
                elif ext in ('mp3', 'wav', 'ogg', 'flac'):
                    outputs["audio"] = gr.update(value=file_path, visible=True)
                elif ext == 'glb':  # Common format for Model3D
                    outputs["model3d"] = gr.update(value=file_path, visible=True)
                else:  # Other file types like PDF, TXT - user can download via file component
                    outputs["text"] = gr.update(value=f"Output is a file (e.g., PDF, TXT): {os.path.basename(file_path)}. Download it above.", visible=True)
            else:
                # Result is a string (e.g., text output from a tool)
                outputs["text"] = gr.update(value=result, visible=True)
        elif result is None:
            outputs["text"] = gr.update(value="Agent returned no result (None). Check logs if available.", visible=True)
        else:  # Other types (e.g. if agent returns a dict or list by mistake)
            outputs["text"] = gr.update(value=f"Unexpected result type: {type(result)}. Content: {str(result)}", visible=True)

        progress(1, desc="Done!")
        return (
            outputs["image"], outputs["file"], outputs["path"],
            outputs["audio"], outputs["model3d"], outputs["text"]
        )

    except Exception as e:
        error_msg = f"An error occurred in the Gradio interface or agent execution: {str(e)}"
        print(error_msg)  # Also print to console for server-side logs
        # Same 6-tuple shape as the success path: hide everything, show the error.
        return (
            gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
            gr.update(visible=False), gr.update(visible=False),
            gr.update(value=error_msg, visible=True)
        )
239
 
240
# Create the Gradio app: prompt box, optional file inputs, typed outputs.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 🤖 Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks. It will try to use its tools or find Hugging Face Spaces to help. You can provide optional file inputs below if your task requires them (e.g., 'Make this image Ghibli style', 'Summarize this PDF').")

    with gr.Row():
        prompt_input = gr.Textbox(
            label="Enter your prompt for the agent",
            placeholder="e.g., 'Generate an image of a futuristic city', 'Convert this text to speech: Hello world', or 'Search for a space that translates English to French and use it for: Good morning'",
            lines=3
        )

    with gr.Accordion("Optional File Inputs (for tasks requiring them)", open=False):
        with gr.Row():
            input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image")
            input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio")
        with gr.Row():
            # BUG FIX: gr.Video and gr.Model3D accept no `type` kwarg (they
            # always yield file paths); passing type="filepath" raised
            # TypeError at construction and prevented the app from launching.
            input_video = gr.Video(label="Video Input", sources=["upload"], elem_id="input_video")
            input_model3d = gr.Model3D(label="3D Model Input (.glb)", elem_id="input_model3d")  # Gradio Model3D expects .glb usually
        with gr.Row():
            input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file")

    submit_button = gr.Button("🚀 Generate", variant="primary")

    gr.Markdown("### Outputs:")
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True)
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True)
    with gr.Row():
        # NOTE(review): verify the installed Gradio's Model3D supports
        # show_download_button; drop the kwarg if construction fails.
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, show_download_button=True)
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=10)  # Start visible for logs/text
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False)
        path_output = gr.Textbox(label="Output File Path (Copyable)", interactive=True, visible=False)  # Keep for copying if needed

    # Link the button to the handler; `outputs` order must match the 6-tuple
    # returned by gradio_interface.
    submit_button.click(
        fn=gradio_interface,
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
    )

    gr.Examples(
        examples=[
            ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
            ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
            ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
            # NOTE(review): this placeholder path does not exist on disk;
            # Gradio may warn/fail when rendering the example — replace with a
            # bundled sample image or rely on user upload.
            ["I have an image of a cat (you'll need to upload one). Find a space that can make it look like a painting and apply it.", "path/to/your/cat_image.png", None, None, None, None],
        ],
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file)"
    )

# Launch the app only when executed as a script (not when imported).
if __name__ == "__main__":
    app.launch(debug=True)  # Enable debug for more detailed logs during development