"""Gradio app that sends one prompt to several Hugging Face Inference API models.

The user picks any number of models from ``MODEL_REGISTRY``, supplies text and
optionally an image and/or an audio clip, and the replies of all selected
models are shown side by side in a single chat message.
"""

import base64
import io
import os
import tempfile

import gradio as gr
import numpy as np
import requests
from PIL import Image

# ==============================
# Configuration
# ==============================

HF_TOKEN = os.getenv("HF_TOKEN")  # optional but recommended

# Upper bound (seconds) for a single inference request, so that a stalled
# endpoint cannot block the UI handler forever.
REQUEST_TIMEOUT = 120

# Example Models (can be extended)
MODEL_REGISTRY = {
    "Text - Mistral 7B Instruct": {
        "type": "text",
        "endpoint": "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
    },
    "Text - Llama 3 8B Instruct": {
        "type": "text",
        "endpoint": "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct",
    },
    "Vision - LLaVA": {
        "type": "vision",
        "endpoint": "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf",
    },
    "Audio - Whisper": {
        "type": "audio",
        "endpoint": "https://api-inference.huggingface.co/models/openai/whisper-large-v3",
    },
}

# Only send an Authorization header when a token is configured; anonymous
# requests go out with no auth header at all rather than a ``None`` value.
headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}


# ==============================
# Helper Functions
# ==============================

def _post_json(endpoint, **kwargs):
    """POST to *endpoint* and return the decoded JSON body.

    Extra keyword arguments (``json=...`` or ``data=...``) are forwarded to
    ``requests.post``. The HTTP status is deliberately not checked: error
    responses that carry a JSON body are returned to the caller so the
    message can be displayed.

    Raises:
        RuntimeError: if the response body is not valid JSON.
        requests.RequestException: on connection failures or timeouts.
    """
    response = requests.post(
        endpoint, headers=headers, timeout=REQUEST_TIMEOUT, **kwargs
    )
    try:
        return response.json()
    except ValueError as err:  # requests' JSON decode errors subclass ValueError
        raise RuntimeError(
            f"Non-JSON response (HTTP {response.status_code}): "
            f"{response.text[:200]}"
        ) from err


def query_text_model(endpoint, prompt):
    """Send *prompt* to a text-generation endpoint and return the reply.

    Returns ``data[0]["generated_text"]`` when the response has that shape;
    otherwise (for example an error object) the stringified JSON payload.
    """
    data = _post_json(endpoint, json={"inputs": prompt})
    try:
        return data[0]["generated_text"]
    except (KeyError, IndexError, TypeError):
        # Unexpected shape: show the raw payload instead of failing.
        return str(data)


def query_vision_model(endpoint, prompt, image):
    """Send *prompt* plus a PIL *image* to a vision endpoint.

    The image is PNG-encoded in memory and base64-embedded in the JSON
    payload, so no temporary file is left behind. Returns the decoded JSON
    response unchanged.
    """
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    payload = {
        "inputs": {
            "image": base64.b64encode(buffer.getvalue()).decode("utf-8"),
            "text": prompt,
        }
    }
    return _post_json(endpoint, json=payload)


def query_audio_model(endpoint, audio_path):
    """Upload the raw bytes of the file at *audio_path* to an audio endpoint.

    Returns the decoded JSON response unchanged.
    """
    with open(audio_path, "rb") as f:
        data = f.read()
    return _post_json(endpoint, data=data)


def _run_model(model_name, prompt, image, audio):
    """Dispatch one registry entry to the query helper for its modality."""
    model = MODEL_REGISTRY[model_name]
    kind = model["type"]
    endpoint = model["endpoint"]

    if kind == "text":
        return query_text_model(endpoint, prompt)
    if kind == "vision" and image is not None:
        return query_vision_model(endpoint, prompt, image)
    if kind == "audio" and audio is not None:
        return query_audio_model(endpoint, audio)
    return "Unsupported input for this model"


def _format_outputs(outputs):
    """Render a ``{model name: result}`` mapping as one Markdown string."""
    if not outputs:
        return "No model selected."
    return "\n\n".join(
        f"**{name}**\n\n{result}" for name, result in outputs.items()
    )


# ==============================
# Main Chat Function
# ==============================

def multimodal_chat(prompt, image, audio, selected_models, history):
    """Query every selected model and append the combined reply to the chat.

    Args:
        prompt: Text typed by the user.
        image: PIL image from the image input, or ``None``.
        audio: Path of the recorded/uploaded audio file, or ``None``.
        selected_models: Names (keys of ``MODEL_REGISTRY``) to query.
        history: Current chat history as a list of ``(user, bot)`` pairs;
            ``None`` is treated as an empty history.

    Returns:
        ``(history, "", None, None)``: the updated history followed by the
        values that reset the text, image and audio inputs.
    """
    outputs = {}
    for model_name in selected_models or []:
        try:
            outputs[model_name] = _run_model(model_name, prompt, image, audio)
        except Exception as e:  # UI boundary: one failing model must not abort the rest
            outputs[model_name] = f"Error: {e}"

    # The chatbot needs a string for the bot message, not a dict of results.
    # Copy instead of mutating the caller's list in place.
    history = list(history or [])
    history.append((prompt, _format_outputs(outputs)))
    return history, "", None, None


# ==============================
# UI
# ==============================

with gr.Blocks(theme=gr.themes.Soft(), title="Multimodal Model Comparison") as demo:
    gr.Markdown(
        "# Multimodal Chat + Model Comparison\n"
        "Compare HuggingFace models across modalities"
    )

    with gr.Row():
        with gr.Column(scale=3):
            # NOTE(review): history uses (user, bot) pairs, i.e. the Chatbot
            # "tuples" format -- confirm against the installed Gradio version.
            chatbot = gr.Chatbot(height=500)
            prompt = gr.Textbox(
                placeholder="Enter your prompt...",
                label="Text Input",
            )
            with gr.Row():
                image_input = gr.Image(type="pil", label="Image Input")
                audio_input = gr.Audio(type="filepath", label="Audio Input")
            submit = gr.Button("Send")

        with gr.Column(scale=1):
            gr.Markdown("### Model Selection")
            model_selector = gr.CheckboxGroup(
                choices=list(MODEL_REGISTRY.keys()),
                value=["Text - Mistral 7B Instruct"],
                label="Select Models",
            )
            clear = gr.Button("Clear")

    # The chatbot itself is the history source, so the "Clear" button resets
    # the conversation for real (a separate state object would keep the old
    # messages and bring them back on the next submit).
    submit.click(
        multimodal_chat,
        inputs=[prompt, image_input, audio_input, model_selector, chatbot],
        outputs=[chatbot, prompt, image_input, audio_input],
    )

    clear.click(lambda: [], None, chatbot)


# ==============================
# Launch
# ==============================

if __name__ == "__main__":
    demo.launch()