File size: 4,720 Bytes
a05fede
 
 
 
 
 
 
 
 
 
 
b236948
 
a05fede
b236948
a05fede
2763883
 
 
 
b236948
7af49e8
b236948
 
2763883
a05fede
 
 
 
 
 
 
 
2763883
 
 
 
 
 
 
 
a05fede
2763883
 
7af49e8
2763883
7af49e8
2763883
 
 
 
 
b236948
 
 
 
 
 
 
 
2763883
 
a05fede
 
 
 
 
 
 
2763883
 
 
 
 
 
a05fede
2763883
 
 
a05fede
 
2763883
 
 
 
 
 
 
a05fede
2763883
a05fede
2763883
a05fede
 
 
 
 
 
 
 
 
 
 
2763883
a05fede
 
 
 
 
2763883
a05fede
 
 
 
 
 
 
 
 
 
 
 
 
2763883
a05fede
 
 
 
 
 
 
 
 
 
2763883
 
a05fede
 
 
2763883
 
 
 
a05fede
 
2763883
 
a05fede
 
 
b236948
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from huggingface_hub import InferenceClient
import gradio as gr
import base64
from PIL import Image
import io

def image_to_data_url(image_path):
    """Encode a local image file as a base64 ``data:`` URL.

    Args:
        image_path: Filesystem path to the image, or None.

    Returns:
        A ``data:image/<fmt>;base64,...`` string, or None when no path
        was given.
    """
    if image_path is None:
        return None
    with Image.open(image_path) as img:
        buffered = io.BytesIO()
        # Re-save in the original format when Pillow detected one.
        # Fall back to PNG (not JPEG): JPEG cannot encode RGBA/P-mode
        # images and img.save() would raise for them.
        img_format = img.format if img.format else "PNG"
        img.save(buffered, format=img_format)
        img_str = base64.b64encode(buffered.getvalue()).decode()
        return f"data:image/{img_format.lower()};base64,{img_str}"

def process_input(image, image_url, prompt, model, hf_token):
    """Stream a vision-language chat completion for an image + prompt.

    Args:
        image: Local file path of an uploaded image, or None.
        image_url: Remote image URL (used only when no upload is given).
        prompt: Text instruction sent alongside the image.
        model: Model id to query (one of the ``models`` choices).
        hf_token: Hugging Face access token (must start with ``hf_``).

    Yields:
        The accumulated response text after each streamed chunk, so the
        UI textbox updates incrementally.

    Raises:
        gr.Error: For a missing/malformed token, missing image input, or
            an API failure.
    """
    # Guard None/blank tokens before the prefix check: a None token would
    # otherwise raise AttributeError instead of the friendly gr.Error.
    # strip() tolerates accidental whitespace from copy-pasting the token.
    if not hf_token or not hf_token.strip().startswith("hf_"):
        raise gr.Error("Invalid Hugging Face token. It should start with 'hf_'")

    client = InferenceClient(
        api_key=hf_token.strip(),
        provider="cohere"
    )

    # Prefer the uploaded file; fall back to the URL field.
    image_data = None
    if image is not None:
        image_data = image_to_data_url(image)
    elif image_url:
        image_data = image_url

    if not image_data:
        raise gr.Error("Please provide either an image upload or image URL")

    # OpenAI-style multimodal message: one text part plus one image part.
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": image_data}}
        ]
    }]

    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=8000,
            stream=True,
        )

        full_response = ""
        for chunk in stream:
            # Standard streaming shape: text arrives in choices[0].delta.content.
            if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
                content = chunk.choices[0].delta.content or ""
                full_response += content
                yield full_response
            # Fallback for providers that attach text directly to the chunk.
            elif hasattr(chunk, 'content'):
                content = chunk.content or ""
                full_response += content
                yield full_response
    except Exception as e:
        # Surface any client/transport failure as a UI-visible error,
        # preserving the original exception as the cause.
        raise gr.Error(f"API Error: {str(e)}") from e

# Aya-Vision model ids served on the Hugging Face hub (largest first);
# these populate the model dropdown and the example rows below.
models = [
    "CohereLabs/aya-vision-32b",
    "CohereLabs/aya-vision-8b",
]

# Declarative Gradio UI: left column collects inputs, right column shows the
# streamed model response.
with gr.Blocks() as demo:
    gr.Markdown("""
    # πŸ” Aya-Vision Model Interface
    
    *Explore state-of-the-art vision-language models by Cohere through this interface.  
    Supports image inputs via upload or URL, with streaming responses.*
    Read more about Aya Vision [here](https://cohere.com/research/aya)
    
    **Get your HF token:** [Hugging Face Settings](https://huggingface.co/settings/tokens)
    """)

    with gr.Row():
        with gr.Column():
            # Token is entered per-request and never persisted by the app.
            hf_token = gr.Textbox(
                label="Hugging Face Token",
                type="password",
                placeholder="hf_XXXXXXXXXXXXXX",
                info="Token is used temporarily for the request"
            )
            
            model_choice = gr.Dropdown(
                label="Model Selection",
                choices=models,
                value=models[0]
            )
            
            # Two mutually-exclusive image sources; process_input prefers
            # the upload and falls back to the URL.
            with gr.Tab("Upload Image"):
                image_input = gr.Image(
                    label="Upload Image",
                    type="filepath",
                    sources=["upload"]
                )
            with gr.Tab("Image URL"):
                image_url = gr.Textbox(
                    label="Image URL",
                    placeholder="https://example.com/image.jpg",
                )
            
            prompt = gr.Textbox(
                label="Prompt",
                value="Describe this image in one sentence.",
                lines=3
            )
            submit_btn = gr.Button("Generate", variant="primary")
        
        with gr.Column():
            # Streaming target: process_input yields growing text, so the
            # box autoscrolls as chunks arrive.
            output = gr.Textbox(
                label="Model Response",
                interactive=False,
                lines=10,
                autoscroll=True
            )

    # Wire the button to the (generator) handler; input order must match
    # process_input's signature.
    submit_btn.click(
        fn=process_input,
        inputs=[image_input, image_url, prompt, model_choice, hf_token],
        outputs=output,
        concurrency_limit=None
    )

    # Clickable example rows; the token column is intentionally blank so
    # users must supply their own.
    gr.Examples(
        examples=[
            [
                None,
                "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
                "Describe this image in one sentence.",
                models[0],
                ""
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png",
                "What is unique about this image format?",
                models[1],
                ""
            ]
        ],
        inputs=[image_input, image_url, prompt, model_choice, hf_token],
        label="Try these examples:"
    )

# Launch the app only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()