Easy-Spaces / app.py
jkorstad's picture
Update app.py
2994b0b verified
raw
history blame
20.6 kB
import gradio as gr
import os
import shutil
from gradio_client import Client, handle_file # handle_file might be used by the agent
# Use InferenceClientModel instead of HfApiModel
from smolagents import Tool, CodeAgent, InferenceClientModel, ToolCollection
import uuid
import httpx # Often a dependency for HTTP clients, good to have
from tenacity import retry, stop_after_attempt, wait_exponential
from huggingface_hub import list_spaces
from PIL import Image # For potential image manipulation by the agent
import traceback # For more detailed error logging if needed
# Define initial tools from Spaces.
# Commenting out problematic spaces for now.
# You'll need to verify their api_name or compatibility if you re-enable them.
# Ensure the api_name is correct if you uncomment these.
# Visit the HF Space page and look for "API - via gradio_client" for hints.
#
# Each entry describes one Hugging Face Space to wrap as an agent tool:
#   repo_id     - the Space to connect to via gradio_client
#   name        - the tool identifier exposed to the agent
#   description - shown to the agent so it can select the right tool
#   api_name    - the Gradio endpoint to call (None lets Tool.from_space infer one)
spaces = [
    {"repo_id": "black-forest-labs/FLUX.1-schnell",
     "name": "image_generator_flux_schnell",
     "description": "Generate an image from a prompt using FLUX.1-schnell. Expects a text prompt.",
     "api_name": "/infer"},
    {"repo_id": "Remsky/Kokoro-TTS-Zero",
     "name": "text_to_speech_kokoro",
     "description": "Generates speech (audio) from input text using Kokoro TTS Zero. Expects text input.",
     "api_name": "/generate_speech_from_ui"},
    {"repo_id": "jamesliu1217/EasyControl_Ghibli",
     "name": "ghibli_style_image_control",
     "description": "Create Ghibli style image from an input image using EasyControl_Ghibli. Expects an image and a prompt/control parameters.",
     "api_name": "/single_condition_generate_image"},
    {"repo_id": "opendatalab/MinerU",
     "name": "pdf_text_extraction_mineru",
     "description": "Extracts the text of a PDF up to 20 pages long using MinerU. Expects a PDF file.",
     "api_name": "/to_pdf"},
    # {"repo_id": "InstantX/InstantCharacter",
    #  "name": "instant_character_customization",
    #  "description": "Personalize Any Characters with a Scalable Diffusion Transformer Framework to any style or pose using InstantCharacter. Expects an input image and potentially pose/style images or prompts.",
    #  "api_name": "/predict"},  # Example: Verify this api_name if re-enabling
    # {"repo_id": "fotographerai/Zen-Style-Shape",
    #  "name": "img_to_img_style_transfer_zen_shape",
    #  "description": "Flux[dev] Redux + Flux[dev] Canny. Implements a custom image-to-image style transfer pipeline blending style from Image A to structure of Image B. Expects two images.",
    #  "api_name": "/predict"},  # Example: Verify this api_name if re-enabling
    # {"repo_id": "moonshotai/Kimi-VL-A3B-Thinking",
    #  "name": "multimodal_vlm_llm_kimi",
    #  "description": "Kimi-VL-A3B-Thinking is a multi-modal LLM that can understand text and images, and generate text with thinking processes. Ask any question about an image. Expects text and optionally an image.",
    #  "api_name": "/chat"},  # Example: Verify this api_name if re-enabling
]
# Create tools from predefined Spaces with retry logic.
#
# FIX: the original defined (and re-decorated) the retried factory function
# inside the loop body on every iteration; it only needs to be defined once.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def create_tool_with_retry(repo_id, name, description, api_name):
    """Build a smolagents Tool from a Hugging Face Space, retrying on failure.

    Args:
        repo_id: The Space repository id (e.g. "org/space-name").
        name: The tool name to expose to the agent.
        description: Human/agent-readable description of what the tool does.
        api_name: Gradio endpoint to bind; if None, Tool.from_space tries to
            find a public API endpoint on its own.

    Returns:
        The constructed Tool instance.

    Raises:
        Whatever Tool.from_space raises after 3 attempts (exponential backoff).
    """
    print(f"Attempting to create tool: '{name}' from space: {repo_id} with api_name: {api_name}")
    new_tool = Tool.from_space(repo_id, name=name, description=description, api_name=api_name)
    # Explicitly check if name attribute is set after creation by Tool.from_space
    if getattr(new_tool, 'name', None) != name:
        print(f"WARNING: Tool '{name}' from space {repo_id} might have a name mismatch or missing name attribute after creation. Actual name: {getattr(new_tool, 'name', 'MISSING')}")
    return new_tool


tools = []
for space_info in spaces:
    repo_id = space_info['repo_id']
    # Default name derived from the repo_id if none was provided.
    name = space_info.get('name', repo_id.split('/')[-1].replace('-', '_'))
    description = space_info.get('description', f'A tool to interact with the Hugging Face Space: {repo_id}')
    api_name = space_info.get('api_name')  # Can be None; Tool.from_space will try to infer
    try:
        tool = create_tool_with_retry(repo_id, name, description, api_name)
        tools.append(tool)
        print(f"Successfully loaded predefined tool: {name} from {repo_id}")
    except Exception as e:
        # Best-effort loading: a single broken Space must not take down the app.
        print(f"Failed to load predefined tool from {repo_id}. Error: {str(e)}. Continuing with available tools.")
# Load tools from a Hugging Face Collection (User has this commented out)
#collection_slug = "jkorstad/tools-680127d17eed47e759549ff4"
#try:
# collection = ToolCollection.from_hub(collection_slug=collection_slug, trust_remote_code=True)
# tools.extend(collection.tools)
# print(f"Successfully loaded tools from collection: {collection_slug}")
#except Exception as e:
# print(f"Warning: Failed to load collection {collection_slug}. Error: {str(e)}")
# Tool for searching Hugging Face Spaces
def search_hf_spaces(query: str, top_k: int = 3) -> str:
    """Search Hugging Face Spaces and report the top_k matches.

    Each result includes the repo_id, a description, the like count, and the
    last-modified date. Intended for the agent to discover new tools when the
    predefined ones don't fit; a found Space can then be wrapped via
    Tool.from_space(repo_id='the_space_id', name='a_descriptive_name') and
    called like any other tool.
    """
    try:
        print(f"Searching spaces with query: {query}, top_k: {top_k}")
        spaces_found = list(list_spaces(search=query, full=True, limit=top_k, sort="likes", direction=-1))
        if not spaces_found:
            return "No Spaces found for your query."
        chunks = ["Found the following Spaces (sorted by likes):\n"]
        for rank, space_data in enumerate(spaces_found, start=1):
            # Prefer the card's description; fall back to the title, then a stub.
            card = getattr(space_data, 'cardData', None)
            if card and 'description' in card:
                description = card['description']
            elif getattr(space_data, 'title', None):
                description = space_data.title
            else:
                description = "No description provided."
            chunks.append(
                f"{rank}. ID: {space_data.id}\n"
                f" Description: {description}\n"
                f" Likes: {getattr(space_data, 'likes', 'N/A')}\n"
                f" Last Modified: {getattr(space_data, 'lastModified', 'N/A')}\n\n"
            )
        chunks.append("\nTo use one of these, you can try creating a tool in the code like this: "
                      "my_new_tool = Tool.from_space(repo_id='SPACE_ID_HERE', name='custom_tool_name'). "
                      "Then you can call it: result = my_new_tool(argument_name=value). "
                      "The arguments depend on the specific Space. If Tool.from_space fails or the tool doesn't work, "
                      "the Space might not have a compatible public API or may require a specific api_name.")
        return "".join(chunks)
    except Exception as e:
        print(f"Error searching Spaces: {str(e)}")
        return f"Error searching Spaces: {str(e)}"
# FIX: smolagents' Tool base class cannot be instantiated directly with
# name/description/func keyword arguments -- it must be subclassed with
# `name`, `description`, `inputs`, `output_type` class attributes and a
# `forward` method. The original direct instantiation produced a broken tool
# (the "'Tool' object has no attribute 'name'" failure noted in the debug
# section below).
class HuggingFaceSpaceSearchTool(Tool):
    """Agent tool that searches Hugging Face Spaces for a given task."""

    name = "huggingface_space_searcher"
    description = "Searches for Hugging Face Spaces that can perform a specific task. Input is a search query string (e.g., 'text to image', 'speech recognition'). Returns a list of Space IDs, their descriptions, and instructions on how to try using them."
    inputs = {
        "query": {
            "type": "string",
            "description": "Search query describing the task, e.g. 'text to image' or 'speech recognition'.",
        }
    }
    output_type = "string"

    def forward(self, query: str) -> str:
        # Delegate to the module-level search helper.
        return search_hf_spaces(query)


space_search_tool = HuggingFaceSpaceSearchTool()
tools.append(space_search_tool)
# --- Debugging: Inspect tools before CodeAgent initialization ---
# Surface any malformed entries (None, or missing the 'name' attribute the
# agent requires) before handing the list to CodeAgent.
print("\n--- Inspecting tools before CodeAgent initialization ---")
for idx, candidate in enumerate(tools):
    if candidate is None:
        # A None entry would fail later inside CodeAgent; flag it and move on.
        print(f"Tool at index {idx} is None!")
        continue
    try:
        label = candidate.name  # AttributeError here means a malformed tool
        print(f"Tool {idx}: Name='{label}', Type={type(candidate)}")
    except AttributeError:
        print(f"!!! CRITICAL: Tool at index {idx} (Type={type(candidate)}) is missing 'name' attribute.")
    except Exception as exc:
        print(f"!!! ERROR inspecting tool at index {idx} (Type={type(candidate)}): {str(exc)}")
print("-------------------------------------------------------\n")
# Initialize the model - Use InferenceClientModel (replaces HfApiModel; see
# the note at the top-of-file imports).
model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")  # Or your preferred model
# Create the agent - Removed system_prompt from constructor
agent = CodeAgent(
    tools=tools,
    model=model,
    # Modules the agent-generated Python code is allowed to import at runtime.
    additional_authorized_imports=['PIL', 'Pillow', 'os', 'sys', 'numpy', 'huggingface_hub', 'gradio_client', 'uuid'],
    add_base_tools=True,  # Includes web search, python interpreter
)
# This is the detailed instruction set that was previously in system_prompt.
# It is prepended to every user prompt in gradio_interface rather than passed
# to the CodeAgent constructor. NOTE: this is a runtime string the agent
# reads verbatim -- do not reformat it casually.
AGENT_INSTRUCTIONS = """You are a highly capable AI assistant. Your primary goal is to accomplish tasks using a variety of tools, prioritizing Hugging Face Spaces.
Follow these steps:
1. **Understand the Request:** Carefully analyze the user's prompt (which will follow these instructions). Identify the core task and any specific requirements or inputs.
2. **Check Predefined Tools:** Review your list of available tools. If a predefined tool can directly address the request, use it.
3. **Search for Spaces (If Needed):** If no predefined tool is suitable, use the `huggingface_space_searcher` tool. Provide a concise search query related to the task (e.g., "image classification", "voice cloning", "document question answering").
4. **Select and Instantiate a Space Tool:** From the search results, choose the most promising Space. Attempt to create a tool from it using `Tool.from_space(repo_id='SELECTED_SPACE_ID', name='a_unique_tool_name')`. You might need to give it a unique name. If `Tool.from_space` fails, the Space might not be compatible, or you could try another one from the search results. Note that some Spaces might not have a public API or may require a specific `api_name` that `Tool.from_space` cannot infer; in such cases, you might not be able to use them.
5. **Execute the Tool:** Call the tool (either predefined or dynamically created) with the necessary arguments.
* **File Inputs:** If the user uploads files, their paths will be available as global string variables: `input_image_path`, `input_audio_path`, `input_video_path`, `input_3d_model_path`, `input_file_path`. Before using these variables, check if they exist and are not None (e.g., `if 'input_image_path' in globals() and input_image_path:`). Pass these file paths as arguments to tools that require them. `Tool.from_space` handles file uploads for compatible Spaces when you pass the filepath string.
* **Chaining Tools:** If the task requires multiple steps, chain the tools together, passing the output of one tool as the input to the next.
6. **Output Management:**
* If a tool generates a file (image, audio, etc.), save it to the current working directory using a unique filename (e.g., `output_filename = os.path.join(os.getcwd(), f"{uuid.uuid4()}.png")`).
* **Return the RESULT:** Your final response should be either:
* A string containing the direct text answer.
* The string path to the generated output file (e.g., `return output_filename`).
7. **Clarity and Error Handling:** If you encounter issues (e.g., a Space tool fails, required inputs are missing), clearly explain the problem in your response. If a Space doesn't work, try to explain why or suggest an alternative if possible.
Example of dynamically using a Space after searching:
```python
# This is an example of how I, the agent, would think and act.
# User's actual prompt would follow these instructions.
# Example user prompt: "Find a space that can make an image of a cat and then use it."
#
# My thought process:
# 1. The user wants an image of a cat, and wants me to find a Space for it.
# 2. I'll use `huggingface_space_searcher`.
# search_results = huggingface_space_searcher(query="text to image cat")
# print(search_results) # This would show me some options. Let's say 'user/cat-generator' is found.
# try:
# cat_image_tool = Tool.from_space(repo_id="user/cat-generator", name="cat_generator_tool")
# # The arguments for cat_image_tool depend on the Space. I'll assume it takes a 'prompt' argument.
# image_path = cat_image_tool(prompt="A fluffy siamese cat")
# # image_path should be a path to the generated image file
# return image_path
# except Exception as e:
# return f"Failed to use the cat generator Space: {e}"
```
Always ensure your generated Python code is complete and directly callable. Use `print()` for debugging if necessary, but the final returned value should be the result or file path.
You have access to `os`, `uuid`, `PIL.Image`.
"""
# Gradio interface function
def _route_file_output(outputs, file_path):
    """Route an agent-produced file path to the matching output component.

    Mutates `outputs` in place: always fills the download + path components,
    then picks the preview widget (image/audio/3D) by file extension, or a
    text hint for unrecognized types.
    """
    outputs["file"] = gr.update(value=file_path, visible=True)
    outputs["path"] = gr.update(value=file_path, visible=True)
    # FIX: split first, then lowercase the extension (the original lowercased
    # the whole path before splitting, which also mangled the reported path
    # semantics; result for the extension comparison is the same).
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.png', '.jpg', '.jpeg', '.gif', '.webp'):
        outputs["image"] = gr.update(value=file_path, visible=True)
    elif ext in ('.mp3', '.wav', '.ogg', '.flac'):
        outputs["audio"] = gr.update(value=file_path, visible=True)
    elif ext == '.glb':
        outputs["model3d"] = gr.update(value=file_path, visible=True)
    else:
        outputs["text"] = gr.update(value=f"Output is a file: {os.path.basename(file_path)}. Download it using the 'Download File Output' component.", visible=True)


def gradio_interface(user_prompt, input_image_path, input_audio_path, input_video_path, input_3d_model_path, input_file_path, progress=gr.Progress(track_tqdm=True)):
    """Run the agent on the user's prompt and map its result to the UI.

    Args:
        user_prompt: Free-form task description typed by the user.
        input_image_path / input_audio_path / input_video_path /
        input_3d_model_path / input_file_path: Optional filepaths from the
            upload widgets; forwarded to the agent as named variables.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        A 6-tuple of gr.update objects for (image, file, path, audio,
        model3d, text) output components. On error, only the text component
        is visible and carries the error message.
    """
    try:
        progress(0, desc="Initializing Agent...")
        # Combine instructions with the user's prompt
        full_prompt_with_instructions = f"{AGENT_INSTRUCTIONS}\n\nUSER PROMPT: {user_prompt}"
        # Collect only the file inputs the user actually supplied.
        provided = {
            "input_image_path": input_image_path,
            "input_audio_path": input_audio_path,
            "input_video_path": input_video_path,
            "input_3d_model_path": input_3d_model_path,
            "input_file_path": input_file_path,
        }
        agent_kwargs = {key: str(value) for key, value in provided.items() if value}
        progress(0.2, desc="Agent processing request...")
        # FIX: CodeAgent.run does not accept arbitrary **kwargs. Variables must
        # be injected via additional_args, which makes them available to the
        # agent's code as globals -- exactly what AGENT_INSTRUCTIONS promises.
        result = agent.run(full_prompt_with_instructions, additional_args=agent_kwargs)
        progress(0.8, desc="Processing result...")
        # Default: everything hidden except the text log.
        outputs = {
            "image": gr.update(value=None, visible=False),
            "file": gr.update(value=None, visible=False),
            "path": gr.update(value=None, visible=False),
            "audio": gr.update(value=None, visible=False),
            "model3d": gr.update(value=None, visible=False),
            "text": gr.update(value=None, visible=True),
        }
        if isinstance(result, str):
            if os.path.isfile(result):
                _route_file_output(outputs, result)
            else:
                outputs["text"] = gr.update(value=result, visible=True)
        elif result is None:
            outputs["text"] = gr.update(value="Agent returned no result (None). This might indicate an issue or that the task didn't produce a specific output string/file.", visible=True)
        else:
            outputs["text"] = gr.update(value=f"Unexpected result type from agent: {type(result)}. Content: {str(result)}", visible=True)
        progress(1, desc="Done!")
        return (
            outputs["image"], outputs["file"], outputs["path"],
            outputs["audio"], outputs["model3d"], outputs["text"]
        )
    except Exception as e:
        error_msg = f"An error occurred in the Gradio interface or agent execution: {str(e)}"
        print(error_msg)
        traceback.print_exc()
        return (
            gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False),
            gr.update(value=None, visible=False), gr.update(value=None, visible=False),
            gr.update(value=error_msg, visible=True)
        )
# Create the Gradio app.
# FIX: the user-facing emoji strings were mojibake (UTF-8 bytes displayed as
# Latin-1, e.g. "robot" and "rocket"); restored the intended characters.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 🤖 Smolagent: Multi-Modal Agent with Hugging Face Space Discovery")
    gr.Markdown("Ask the agent to perform tasks. It will try to use its tools or find Hugging Face Spaces to help. You can provide optional file inputs below if your task requires them (e.g., 'Make this image Ghibli style', 'Summarize this PDF').")
    with gr.Row():
        prompt_input = gr.Textbox(
            label="Enter your prompt for the agent",
            placeholder="e.g., 'Generate an image of a futuristic city', 'Convert this text to speech: Hello world', or 'Search for a space that translates English to French and use it for: Good morning'",
            lines=3,
            elem_id="user_prompt_textbox"
        )
    # Optional file uploads -- forwarded to the agent as input_*_path variables.
    with gr.Accordion("Optional File Inputs (for tasks requiring them)", open=False):
        with gr.Row():
            input_image = gr.Image(label="Image Input", type="filepath", sources=["upload", "clipboard"], elem_id="input_image_upload")
            input_audio = gr.Audio(label="Audio Input", type="filepath", sources=["upload", "microphone"], elem_id="input_audio_upload")
        with gr.Row():
            input_video = gr.Video(label="Video Input", type="filepath", sources=["upload"], elem_id="input_video_upload")
            input_model3d = gr.Model3D(label="3D Model Input (.glb, .obj, etc.)", type="filepath", elem_id="input_model3d_upload")
        with gr.Row():
            input_file = gr.File(label="Generic File Input (PDF, TXT, etc.)", type="filepath", elem_id="input_file_upload")
    submit_button = gr.Button("🚀 Generate", variant="primary", elem_id="submit_button_generate")
    gr.Markdown("### Outputs:")
    # Output widgets start hidden (except the text log); gradio_interface
    # toggles visibility based on the agent's result type.
    with gr.Row():
        image_output = gr.Image(label="Image Output", interactive=False, visible=False, show_download_button=True, elem_id="output_image_display")
        audio_output = gr.Audio(label="Audio Output", interactive=False, visible=False, show_download_button=True, elem_id="output_audio_display")
    with gr.Row():
        model3d_output = gr.Model3D(label="3D Model Output", interactive=False, visible=False, show_download_button=True, elem_id="output_model3d_display")
        text_output = gr.Textbox(label="Text / Log Output", interactive=False, visible=True, lines=5, max_lines=20, elem_id="output_text_log")
    with gr.Row():
        file_output = gr.File(label="Download File Output", interactive=False, visible=False, elem_id="output_file_download")
        path_output = gr.Textbox(label="Output File Path (Copyable)", interactive=False, visible=False, elem_id="output_file_path_text")
    submit_button.click(
        fn=gradio_interface,
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        outputs=[image_output, file_output, path_output, audio_output, model3d_output, text_output]
    )
    gr.Examples(
        examples=[
            ["Generate an image of a happy robot coding on a laptop, cyberpunk style.", None, None, None, None, None],
            ["Convert the following text to speech: 'Smolagents are amazing for building AI applications.'", None, None, None, None, None],
            ["Search for a Hugging Face Space that can perform image captioning. Describe the first result.", None, None, None, None, None],
            # NOTE(review): this placeholder path does not exist on the server;
            # the example only works after the user uploads a real image.
            ["I have an image of a cat. Find a space that can make it look like a painting and apply it. You will need to use the 'input_image_path' variable which will contain the path to the uploaded cat image.", "path/to/your/cat_image.png", None, None, None, None],
        ],
        inputs=[prompt_input, input_image, input_audio, input_video, input_model3d, input_file],
        label="Example Prompts (Note: For examples with file inputs, you'll need to upload a relevant file first using the 'Optional File Inputs' section)"
    )

if __name__ == "__main__":
    # debug=True surfaces tracebacks in the browser during development.
    app.launch(debug=True)