|
|
import gradio as gr |
|
|
import base64 |
|
|
import time |
|
|
import html |
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
|
|
|
def progress_bar_html(label: str) -> str:
    """
    Return an HTML snippet showing *label* next to a thin animated progress bar.

    The bar is a dark indigo strip sliding across a lighter violet track
    (pure CSS animation, no JS).

    Args:
        label: Text shown to the left of the bar. It is HTML-escaped here,
            so arbitrary text cannot inject markup into the page.

    Returns:
        An HTML string suitable for yielding into a Gradio chat stream.
    """
    # Fix: the original interpolated `label` unescaped into the markup.
    safe_label = html.escape(label)
    return f"""
<div style="display: flex; align-items: center;">
<span style="margin-right: 10px; font-size: 14px;">{safe_label}</span>
<div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">
<div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>
</div>
</div>
<style>
@keyframes loading {{
0% {{ transform: translateX(-100%); }}
100% {{ transform: translateX(100%); }}
}}
</style>
"""
|
|
|
|
|
|
|
|
# Hosted checkpoint used for every inference request (see `model_inference`).
model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
|
|
|
|
|
|
|
|
def model_inference(input_dict, history, hf_token: gr.OAuthToken):
    """
    Stream a multimodal chat completion through the HF InferenceClient.

    Signature matches the ChatInterface call pattern:
    ``(input_dict, history, *additional_inputs)``; the OAuth token produced
    by ``gr.LoginButton`` arrives as ``hf_token``.

    Args:
        input_dict: MultimodalTextbox payload: {"text": str, "files": [paths/URLs]}.
        history: Chat history (unused — the model only sees the last input).
        hf_token: OAuth token object; ``hf_token.token`` is the bearer token.

    Yields:
        A progress-bar HTML placeholder first, then progressively longer
        HTML-escaped response strings as tokens stream in.

    Raises:
        gr.Error: On empty text input or when the user is not logged in.
    """
    text = input_dict.get("text", "")
    files = input_dict.get("files", []) or []

    # Fix: gr.Error must be *raised* to surface in the Gradio UI; the
    # original merely instantiated it and returned, silently doing nothing.
    if text == "" and not files:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and files:
        raise gr.Error("Please input a text query along with the image(s).")

    # Build the multimodal message content: one image entry per file,
    # followed by the text prompt.
    content_list = []
    for f in files:
        try:
            if isinstance(f, str) and f.startswith("http"):
                # Remote image: pass the URL straight through.
                content_list.append({"type": "image_url", "image_url": {"url": f}})
            else:
                # Local file: inline it as a base64 data URL.
                with open(f, "rb") as fh:
                    b64 = base64.b64encode(fh.read()).decode("utf-8")
                data_url = f"data:image/jpeg;base64,{b64}"
                content_list.append(
                    {"type": "image_url", "image_url": {"url": data_url}}
                )
        except Exception:
            # Best-effort: skip unreadable/missing files rather than
            # failing the whole query.
            continue

    content_list.append({"type": "text", "text": text})

    messages = [{"role": "user", "content": content_list}]

    if hf_token is None or not getattr(hf_token, "token", None):
        raise gr.Error(
            "Please login with a Hugging Face account (use the Login button in the sidebar)."
        )

    client = InferenceClient(token=hf_token.token, model=model_name)

    response = ""
    yield progress_bar_html("Processing...")

    try:
        stream = client.chat.completions.create(messages=messages, stream=True)
    except TypeError:
        # Older huggingface_hub releases expose chat_completion() instead of
        # the OpenAI-style client.chat.completions.create().
        stream = client.chat_completion(messages=messages, stream=True)

    for chunk in stream:
        piece = _delta_content(chunk)
        if piece:
            # Escape so model output cannot inject HTML into the chat pane.
            response += html.escape(piece)
            yield response


def _delta_content(chunk) -> str:
    """Best-effort extraction of the delta text from one stream chunk.

    Handles both dict-shaped chunks and attribute-style objects; returns ""
    when the chunk carries no content (or has an unexpected shape).
    """
    try:
        if isinstance(chunk, dict):
            choices = chunk.get("choices")
            if choices and len(choices) > 0:
                return choices[0].get("delta", {}).get("content") or ""
        else:
            choices = getattr(chunk, "choices", None)
            if choices and len(choices) > 0:
                delta = getattr(choices[0], "delta", None)
                if isinstance(delta, dict):
                    return delta.get("content") or ""
                return getattr(delta, "content", "") or ""
    except Exception:
        pass
    return ""
|
|
|
|
|
|
|
|
# Clickable example prompts for the ChatInterface. Each entry matches the
# MultimodalTextbox payload shape consumed by `model_inference`:
# {"text": <prompt>, "files": [<image paths>]}.
examples = [
    [
        {
            "text": "Write a descriptive caption for this image in a formal tone.",
            "files": ["example_images/example.png"],
        }
    ],
    [
        {
            "text": "What are the characters wearing?",
            "files": ["example_images/example.png"],
        }
    ],
]
|
|
|
|
|
# UI wiring: a sidebar holding the OAuth login button, plus a multimodal
# chat interface bound to `model_inference`.
with gr.Blocks() as demo:
    with gr.Sidebar():
        # The token produced by this button is forwarded to `model_inference`
        # as its `hf_token` argument via `additional_inputs` below.
        login_btn = gr.LoginButton(label="Login with Hugging Face")

    chatbot = gr.ChatInterface(
        fn=model_inference,
        # NOTE(review): description says "Smolvlm2-500M" but `model_name`
        # points at the 256M checkpoint — confirm which is intended.
        description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
        examples=examples,
        fill_height=True,
        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        # Passes the LoginButton's OAuth token as the extra positional
        # argument expected by `model_inference`.
        additional_inputs=[login_btn],
    )

    chatbot.render()
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # debug=True surfaces server-side tracebacks in the console during development.
    demo.launch(debug=True)
|
|
|