File size: 4,720 Bytes
a05fede
 
 
 
 
 
 
 
 
 
 
b236948
 
a05fede
b236948
a05fede
2763883
 
 
 
b236948
7af49e8
b236948
 
2763883
a05fede
 
 
 
 
 
 
 
2763883
 
 
 
 
 
 
 
a05fede
2763883
 
7af49e8
2763883
7af49e8
2763883
 
 
 
 
b236948
 
 
 
 
 
 
 
2763883
 
a05fede
 
 
 
 
 
 
2763883
 
 
 
 
 
a05fede
2763883
 
 
a05fede
 
2763883
 
 
 
 
 
 
a05fede
2763883
a05fede
2763883
a05fede
 
 
 
 
 
 
 
 
 
 
2763883
a05fede
 
 
 
 
2763883
a05fede
 
 
 
 
 
 
 
 
 
 
 
 
2763883
a05fede
 
 
 
 
 
 
 
 
 
2763883
 
a05fede
 
 
2763883
 
 
 
a05fede
 
2763883
 
a05fede
 
 
b236948
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from huggingface_hub import InferenceClient
import gradio as gr
import base64
from PIL import Image
import io

def image_to_data_url(image_path):
    """Encode a local image file as a base64 ``data:`` URL.

    Args:
        image_path: Filesystem path to the image, or None.

    Returns:
        A ``data:image/<fmt>;base64,...`` string, or None when no path
        was given.
    """
    if image_path is None:
        return None
    with Image.open(image_path) as img:
        buffered = io.BytesIO()
        # Re-save in the original format when Pillow detected one.
        # Fall back to PNG (not JPEG): JPEG cannot encode RGBA/P-mode
        # images and img.save() would raise for them.
        img_format = img.format if img.format else "PNG"
        img.save(buffered, format=img_format)
        img_str = base64.b64encode(buffered.getvalue()).decode()
        return f"data:image/{img_format.lower()};base64,{img_str}"

def process_input(image, image_url, prompt, model, hf_token):
    """Stream a vision-language chat completion for an image + prompt.

    Args:
        image: Local file path of an uploaded image, or None.
        image_url: Remote image URL (used only when no upload is given).
        prompt: Text instruction sent alongside the image.
        model: Model id to query (one of the ``models`` choices).
        hf_token: Hugging Face access token (must start with ``hf_``).

    Yields:
        The accumulated response text after each streamed chunk, so the
        UI textbox updates incrementally.

    Raises:
        gr.Error: For a missing/malformed token, missing image input, or
            an API failure.
    """
    # Guard None/blank tokens before the prefix check: a None token would
    # otherwise raise AttributeError instead of the friendly gr.Error.
    # strip() tolerates accidental whitespace from copy-pasting the token.
    if not hf_token or not hf_token.strip().startswith("hf_"):
        raise gr.Error("Invalid Hugging Face token. It should start with 'hf_'")

    client = InferenceClient(
        api_key=hf_token.strip(),
        provider="cohere"
    )

    # Prefer the uploaded file; fall back to the URL field.
    image_data = None
    if image is not None:
        image_data = image_to_data_url(image)
    elif image_url:
        image_data = image_url

    if not image_data:
        raise gr.Error("Please provide either an image upload or image URL")

    # OpenAI-style multimodal message: one text part plus one image part.
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": image_data}}
        ]
    }]

    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=8000,
            stream=True,
        )

        full_response = ""
        for chunk in stream:
            # Standard streaming shape: text arrives in choices[0].delta.content.
            if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
                content = chunk.choices[0].delta.content or ""
                full_response += content
                yield full_response
            # Fallback for providers that attach text directly to the chunk.
            elif hasattr(chunk, 'content'):
                content = chunk.content or ""
                full_response += content
                yield full_response
    except Exception as e:
        # Surface any client/transport failure as a UI-visible error,
        # preserving the original exception as the cause.
        raise gr.Error(f"API Error: {str(e)}") from e

# Aya-Vision model ids served on the Hugging Face hub (largest first);
# these populate the model dropdown and the example rows below.
models = [
    "CohereLabs/aya-vision-32b",
    "CohereLabs/aya-vision-8b",
]

# Declarative Gradio UI: left column collects inputs, right column shows the
# streamed model response.
with gr.Blocks() as demo:
    gr.Markdown("""
    # πŸ” Aya-Vision Model Interface
    
    *Explore state-of-the-art vision-language models by Cohere through this interface.  
    Supports image inputs via upload or URL, with streaming responses.*
    Read more about Aya Vision [here](https://cohere.com/research/aya)
    
    **Get your HF token:** [Hugging Face Settings](https://huggingface.co/settings/tokens)
    """)

    with gr.Row():
        with gr.Column():
            # Token is entered per-request and never persisted by the app.
            hf_token = gr.Textbox(
                label="Hugging Face Token",
                type="password",
                placeholder="hf_XXXXXXXXXXXXXX",
                info="Token is used temporarily for the request"
            )
            
            model_choice = gr.Dropdown(
                label="Model Selection",
                choices=models,
                value=models[0]
            )
            
            # Two mutually-exclusive image sources; process_input prefers
            # the upload and falls back to the URL.
            with gr.Tab("Upload Image"):
                image_input = gr.Image(
                    label="Upload Image",
                    type="filepath",
                    sources=["upload"]
                )
            with gr.Tab("Image URL"):
                image_url = gr.Textbox(
                    label="Image URL",
                    placeholder="https://example.com/image.jpg",
                )
            
            prompt = gr.Textbox(
                label="Prompt",
                value="Describe this image in one sentence.",
                lines=3
            )
            submit_btn = gr.Button("Generate", variant="primary")
        
        with gr.Column():
            # Streaming target: process_input yields growing text, so the
            # box autoscrolls as chunks arrive.
            output = gr.Textbox(
                label="Model Response",
                interactive=False,
                lines=10,
                autoscroll=True
            )

    # Wire the button to the (generator) handler; input order must match
    # process_input's signature.
    submit_btn.click(
        fn=process_input,
        inputs=[image_input, image_url, prompt, model_choice, hf_token],
        outputs=output,
        concurrency_limit=None
    )

    # Clickable example rows; the token column is intentionally blank so
    # users must supply their own.
    gr.Examples(
        examples=[
            [
                None,
                "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
                "Describe this image in one sentence.",
                models[0],
                ""
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png",
                "What is unique about this image format?",
                models[1],
                ""
            ]
        ],
        inputs=[image_input, image_url, prompt, model_choice, hf_token],
        label="Try these examples:"
    )

# Launch the app only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()