# multimodal_agent / main.py
# laxminarasimha6's picture
# Update main.py
# 5529d68 verified
import os
import random
import uuid
import time
import asyncio
from threading import Thread
import gradio as gr
import torch
from PIL import Image
from models import (
load_base_model,
load_vision_model,
load_image_model,
load_shape_model,
generate_3d_model
)
from utils import (
clean_history,
save_image,
text_to_speech,
web_search
)
from config import (
MODEL_CONFIG,
COMMANDS,
VOICE_OPTIONS,
MAX_TOKENS,
DEFAULT_TOKENS
)
def setup_interface(description, css):
    """Build and return the NexusAI Gradio chat interface.

    Loads the base/vision/image/shape models once at startup and wires a
    streaming ``generate`` handler that routes chat messages to the image,
    voice, web-search, or 3D-generation pipelines based on command prefixes
    from ``COMMANDS``.

    Args:
        description: Markdown/HTML description shown under the title.
        css: Custom CSS string applied to the interface.

    Returns:
        A configured ``gr.ChatInterface`` instance (not launched).
    """
    # Load models once; the nested handlers close over them.
    base_model, tokenizer = load_base_model(MODEL_CONFIG["base_model"])
    vision_model, processor = load_vision_model(MODEL_CONFIG["vision_model"])
    image_model = load_image_model(MODEL_CONFIG["image_model"])
    shape_model = load_shape_model(MODEL_CONFIG["shape_model"])

    async def generate_voice(text, voice):
        """Synthesize *text* with *voice*.

        Returns an audio payload dict on success, otherwise a user-facing
        error string (never raises to the caller).
        """
        try:
            audio_file = await text_to_speech(text, voice)
            if audio_file and os.path.exists(audio_file):
                return {"type": "audio", "value": audio_file}
            return "Sorry, voice generation failed. Please try again."
        except Exception as e:
            return f"Error generating voice: {str(e)}"

    def generate(
        message: str,
        chat_history: list[dict],
        max_new_tokens: int = DEFAULT_TOKENS,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 50,
        repetition_penalty: float = 1.2,
    ):
        """Route a chat message to the matching handler, streaming results.

        Yields intermediate status strings followed by the final response.
        Sampling parameters are currently unused by the placeholder text
        path but are part of the ChatInterface contract.
        """
        try:
            # Normalize once. The original sliced the *unstripped* message
            # after checking startswith() on the stripped one, so leading
            # whitespace corrupted the extracted prompt for every command.
            msg = message.strip()
            lowered = msg.lower()

            # Handle Image Generation
            if lowered.startswith(COMMANDS["image"]):
                prompt = msg[len(COMMANDS["image"]):].strip()
                yield "🎨 Generating image..."
                try:
                    image = image_model(
                        prompt=prompt,
                        num_inference_steps=30,
                        guidance_scale=7.5,
                    ).images[0]
                    image_path = save_image(image)
                    yield f"![Generated Image]({image_path})"
                except Exception as e:
                    yield f"Error generating image: {str(e)}"
                return

            # Handle Voice Synthesis (@voice1 / @voice2)
            voice_command = COMMANDS["voice"]
            if lowered.startswith(voice_command):
                # Read the voice index from the character right after the
                # command, instead of searching the whole message (which
                # matched "@voice1" mentioned anywhere in the text).
                digit = lowered[len(voice_command):len(voice_command) + 1]
                if digit not in ("1", "2"):
                    # Previously this fell through silently to the text echo.
                    yield (
                        f"Please pick a voice: `{voice_command}1` or "
                        f"`{voice_command}2`."
                    )
                    return
                voice = VOICE_OPTIONS[int(digit) - 1]
                text = msg[len(voice_command) + 1:].strip()
                yield "🎀 Generating voice..."
                # Run the coroutine on a dedicated loop and always close it;
                # the original could leak freshly created loops and called
                # run_until_complete() on a loop already marked running.
                loop = asyncio.new_event_loop()
                try:
                    result = loop.run_until_complete(generate_voice(text, voice))
                finally:
                    loop.close()
                yield result
                return

            # Handle Web Search
            if lowered.startswith(COMMANDS["search"]):
                query = msg[len(COMMANDS["search"]):].strip()
                yield "πŸ” Searching..."
                yield web_search(query)
                return

            # Handle 3D Generation
            if lowered.startswith(COMMANDS["shape"]):
                prompt = msg[len(COMMANDS["shape"]):].strip()
                yield "🌟 Creating 3D model..."
                model_path = generate_3d_model(shape_model, prompt)
                if model_path:
                    yield f"[Download 3D Model]({model_path})"
                else:
                    yield "Sorry, 3D model generation failed. Please try again."
                return

            # Default text response (placeholder — real LLM path not wired yet)
            yield "Processing your message..."
            yield f"I received your message: {message}"
        except Exception as e:
            # Top-level boundary: surface any unexpected failure to the chat.
            yield f"An error occurred: {str(e)}"

    # Create Gradio Interface
    demo = gr.ChatInterface(
        fn=generate,
        additional_inputs=[
            gr.Slider(label="Response Length", minimum=1, maximum=MAX_TOKENS, value=DEFAULT_TOKENS),
            gr.Slider(label="Creativity", minimum=0.1, maximum=1.0, value=0.7),
            gr.Slider(label="Focus", minimum=0.05, maximum=1.0, value=0.9),
            gr.Slider(label="Diversity", minimum=1, maximum=100, value=50),
            gr.Slider(label="Consistency", minimum=1.0, maximum=2.0, value=1.2),
        ],
        title="NexusAI",
        description=description,
        theme=gr.themes.Soft(),
        css=css,
        examples=[
            ["@voice2 Tell me about quantum computing"],
            ["@create A beautiful sunset over mountains"],
            ["@search Latest developments in AI"],
            ["Tell me a story about space exploration"],
        ]
    )
    return demo