import base64
import html
import mimetypes

import gradio as gr
from huggingface_hub import InferenceClient


def progress_bar_html(label: str) -> str:
"""
    Return an HTML snippet for a thin, labelled progress bar.
    The bar is a dark indigo stripe animating across a purple track.
"""
return f"""
<div style="display: flex; align-items: center;">
<span style="margin-right: 10px; font-size: 14px;">{label}</span>
<div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">
<div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>
</div>
</div>
<style>
@keyframes loading {{
0% {{ transform: translateX(-100%); }}
100% {{ transform: translateX(100%); }}
}}
</style>
"""

model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"


def model_inference(input_dict, history, *additional_inputs):
"""
    Perform a streaming multimodal chat completion via the Hugging Face
    InferenceClient. The signature matches the gr.ChatInterface call pattern:
    (input_dict, history, *additional_inputs). The OAuth token produced by
    gr.LoginButton arrives through additional_inputs and is extracted below.
"""
    # Extract the OAuth token from additional_inputs in a robust way; depending
    # on the Gradio version it may arrive as an object with a `token` attribute
    # (e.g. gr.OAuthToken), a dict with a "token" key, or a bare string.
    hf_token = None
    for ai in additional_inputs:
        if ai is None:
            continue
        if hasattr(ai, "token"):
            hf_token = ai.token
            break
        if isinstance(ai, dict) and "token" in ai:
            hf_token = ai["token"]
            break
        if isinstance(ai, str):
            hf_token = ai
            break
text = input_dict.get("text", "")
files = input_dict.get("files", []) or []
if text == "" and not files:
# yield an error text so the streaming generator produces at least one value
yield "Please input a query and optionally image(s)."
return
if text == "" and files:
yield "Please input a text query along with the image(s)."
return
# Build the content list: images (as URLs or data URLs) followed by the text
content_list = []
for f in files:
try:
# If file looks like a URL, send as image_url
if isinstance(f, str) and f.startswith("http"):
content_list.append({"type": "image_url", "image_url": {"url": f}})
            else:
                # f is a local path; read it and embed it as a base64 data URL
                with open(f, "rb") as fh:
                    b64 = base64.b64encode(fh.read()).decode("utf-8")
                # Guess the mime type from the file extension, falling back to
                # JPEG for files without a recognisable extension.
                mime = mimetypes.guess_type(str(f))[0] or "image/jpeg"
                content_list.append(
                    {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
                )
        except Exception:
            # If a file cannot be read, skip embedding it rather than failing
            continue
content_list.append({"type": "text", "text": text})
messages = [{"role": "user", "content": content_list}]
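    # The payload follows the OpenAI-style chat format, e.g.:
    # [{"role": "user", "content": [
    #     {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #     {"type": "text", "text": "Describe this image."}]}]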
    if not hf_token:
        yield "Please login with a Hugging Face account (use the Login button in the sidebar)."
        return
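    # provider="hf-inference" routes the request to Hugging Face's own
    # serverless Inference API rather than a third-party inference provider.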
client = InferenceClient(
        token=hf_token, model=model_name, provider="hf-inference"
)
response = ""
for message in client.chat_completion(
messages,
max_tokens=1024,
stream=True,
):
choices = message.choices
token = ""
if len(choices) and choices[0].delta.content:
token = choices[0].delta.content
response += token
yield response
# for chunk in stream:
# # chunk can be an object with attributes or a dict depending on client version
# token = ""
# try:
# # attempt dict-style
# if isinstance(chunk, dict):
# choices = chunk.get("choices")
# if choices and len(choices) > 0:
# delta = choices[0].get("delta", {})
# token = delta.get("content") or ""
# else:
# # attribute-style
# choices = getattr(chunk, "choices", None)
# if choices and len(choices) > 0:
# delta = getattr(choices[0], "delta", None)
# if isinstance(delta, dict):
# token = delta.get("content") or ""
# else:
# token = getattr(delta, "content", "")
# except Exception:
# token = ""
# if token:
# # escape incremental token to avoid raw HTML breaking the chat box
# response += html.escape(token)
# time.sleep(0.001)
# yield response
# # ensure we yield at least one final message so the async iterator doesn't see StopIteration
# if response:
# yield response
# else:
# yield "(no text was returned by the model)"
examples = [
[
{
"text": "Write a descriptive caption for this image in a formal tone.",
"files": ["example_images/example.png"],
}
],
[
{
"text": "What are the characters wearing?",
"files": ["example_images/example.png"],
}
],
]
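# NOTE: the example image paths are relative to the app's working directory;
# example_images/example.png must exist in the repo for these to load.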

with gr.Blocks() as demo:
with gr.Sidebar():
# Gradio LoginButton may not accept a `label` kwarg depending on the installed version
# so create it without that argument for maximum compatibility.
login_btn = gr.LoginButton()
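        # On Spaces, gr.LoginButton requires OAuth to be enabled via
        # `hf_oauth: true` in the Space's README metadata (assumed here).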
chatbot = gr.ChatInterface(
fn=model_inference,
        description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the latest input and ignores the previous conversation history.",
examples=examples,
fill_height=True,
textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
stop_btn="Stop Generation",
multimodal=True,
cache_examples=False,
additional_inputs=[login_btn],
)
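    # Passing login_btn via additional_inputs forwards the user's login state
    # (including the OAuth token) into model_inference as an extra positional
    # argument, which the token-extraction loop above unpacks.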
    # The ChatInterface is already rendered inside this Blocks context;
    # calling chatbot.render() again here would duplicate it, so we don't.

if __name__ == "__main__":
demo.launch(debug=True)