import base64
import mimetypes

import gradio as gr
from huggingface_hub import InferenceClient


def progress_bar_html(label: str) -> str:
    """
    Return an HTML snippet for a thin progress bar with a label.

    The bar is styled as a dark animated strip that slides across a lighter
    track in an infinite loop.
    """
    return f"""
<div style="display: flex; align-items: center;">
  <span style="margin-right: 10px; font-size: 14px;">{label}</span>
  <div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">
    <div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>
  </div>
</div>
<style>
@keyframes loading {{
  0% {{ transform: translateX(-100%); }}
  100% {{ transform: translateX(100%); }}
}}
</style>
    """


# Model served through the hosted Inference API (provider="hf-inference").
model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"


def model_inference(input_dict, history, *additional_inputs):
    """
    Stream a multimodal chat completion through the Hugging Face InferenceClient.

    The signature matches the gr.ChatInterface call pattern:
    (input_dict, history, *additional_inputs). The OAuth token from
    gr.LoginButton arrives through additional_inputs.
    """
    # The token can arrive in several shapes depending on the Gradio version:
    # a gr.OAuthToken-like object with a .token attribute, a dict, or a bare
    # string. Normalize all of them to a plain string.
    hf_token = None
    for ai in additional_inputs:
        if ai is None:
            continue
        if hasattr(ai, "token"):
            hf_token = ai.token
            break
        if isinstance(ai, dict) and "token" in ai:
            hf_token = ai["token"]
            break
        if isinstance(ai, str):
            hf_token = ai
            break

    text = input_dict.get("text", "")
    files = input_dict.get("files", []) or []

    if text == "" and not files:
        yield "Please input a query and optionally image(s)."
        return
    if text == "" and files:
        yield "Please input a text query along with the image(s)."
        return
    # Build the multimodal message content: one entry per image, then the text.
    content_list = []
    for f in files:
        try:
            if isinstance(f, str) and f.startswith("http"):
                # Remote image: pass the URL through unchanged.
                content_list.append({"type": "image_url", "image_url": {"url": f}})
            else:
                # Local file: inline it as a base64 data URL, guessing the MIME
                # type from the filename and falling back to JPEG.
                with open(f, "rb") as fh:
                    b64 = base64.b64encode(fh.read()).decode("utf-8")
                mime = mimetypes.guess_type(f)[0] or "image/jpeg"
                content_list.append(
                    {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
                )
        except Exception:
            # Skip unreadable files rather than failing the whole request.
            continue

    content_list.append({"type": "text", "text": text})

    messages = [{"role": "user", "content": content_list}]
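    # `messages` now matches the OpenAI-style chat schema that
    # InferenceClient.chat_completion accepts, e.g.:
    #   [{"role": "user", "content": [
    #       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #       {"type": "text", "text": "Describe this image."}]}]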

    if not hf_token:
        yield "Please log in with a Hugging Face account (use the Login button in the sidebar)."
        return

    client = InferenceClient(
        token=hf_token, model=model_name, provider="hf-inference"
    )
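
    # Show an animated placeholder until the first streamed token arrives; every
    # later yield replaces it. This assumes the chat component renders the inline
    # HTML/CSS (gr.Chatbot allows a sanitized subset); if it is stripped, the
    # label still shows as plain text.
    yield progress_bar_html("Processing...")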

    # gr.ChatInterface expects each yield to be the full message so far, so
    # accumulate the streamed deltas rather than yielding them one at a time.
    response = ""
    for message in client.chat_completion(
        messages,
        max_tokens=1024,
        stream=True,
    ):
        choices = message.choices
        token = ""
        if choices and choices[0].delta.content:
            token = choices[0].delta.content
        response += token
        yield response


examples = [
    [
        {
            "text": "Write a descriptive caption for this image in a formal tone.",
            "files": ["example_images/example.png"],
        }
    ],
    [
        {
            "text": "What are the characters wearing?",
            "files": ["example_images/example.png"],
        }
    ],
]
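
# Example file paths are resolved relative to the app's working directory, so
# the example_images/ folder must ship alongside this script.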

with gr.Blocks() as demo:
    with gr.Sidebar():
        login_btn = gr.LoginButton()

    chatbot = gr.ChatInterface(
        fn=model_inference,
        description="# **SmolVLM2-256M-Video-Instruct** \n (running on CPU) The model only sees the last input; it ignores the previous conversation history.",
        examples=examples,
        fill_height=True,
        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        additional_inputs=[login_btn],
    )
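
    # Passing login_btn through additional_inputs is what routes the user's
    # OAuth token into model_inference. Depending on the Gradio version it can
    # arrive as a gr.OAuthToken-like object, a dict, or a raw string, which is
    # why the handler normalizes all three shapes.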


if __name__ == "__main__":
    demo.launch(debug=True)
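
# Note: gr.LoginButton only yields a usable OAuth token when the app runs as a
# Hugging Face Space with OAuth enabled (hf_oauth: true in the Space README
# metadata); elsewhere the handler will keep prompting the user to log in.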