"""Gradio demo: multimodal (image + text) chat with SmolVLM2 through the
Hugging Face Inference API.

Users authenticate with ``gr.LoginButton``; the resulting OAuth token is
forwarded to ``InferenceClient`` for a streamed chat completion.
"""

import base64
import mimetypes

import gradio as gr
from huggingface_hub import InferenceClient

model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"


def progress_bar_html(label: str) -> str:
    """Return an HTML snippet for a thin progress bar with a label.

    The progress bar is styled as a dark animated bar.
    (NOTE(review): the original markup was lost to whitespace mangling;
    this is a minimal reconstruction matching the documented intent.)
    """
    return f"""
<div style="display:flex;align-items:center;gap:8px;">
  <span>{label}</span>
  <div style="flex:1;height:4px;background:#222;border-radius:2px;overflow:hidden;">
    <div style="width:40%;height:100%;background:#666;animation:gr-slide 1s linear infinite;"></div>
  </div>
</div>
"""


def _extract_token(additional_inputs) -> str | None:
    """Pull an HF OAuth token out of gradio's extra positional args.

    Depending on the gradio version, the LoginButton value may arrive as
    an object exposing ``.token``, a dict with a ``"token"`` key, or the
    bare token string. Returns the token string, or None if absent.
    """
    for ai in additional_inputs:
        if ai is None:
            continue
        token = getattr(ai, "token", None)
        if token:
            return token
        if isinstance(ai, dict) and ai.get("token"):
            return ai["token"]
        if isinstance(ai, str) and ai:
            return ai
    return None


def model_inference(input_dict, history, *additional_inputs):
    """Stream a multimodal chat completion from the HF Inference API.

    Signature matches the gr.ChatInterface call pattern:
    ``(input_dict, history, *additional_inputs)``. The OAuth token from
    ``gr.LoginButton`` is passed inside ``additional_inputs``.

    Yields the accumulated response text as tokens stream in (each yield
    is the full response so far, as gr.ChatInterface expects).
    """
    text = input_dict.get("text", "")
    files = input_dict.get("files", []) or []

    if not text:
        # Yield at least one value so the streaming generator never
        # raises a bare StopIteration at the caller.
        if files:
            yield "Please input a text query along with the image(s)."
        else:
            yield "Please input a query and optionally image(s)."
        return

    # Build the content list: images (as URLs or data URLs) followed by text.
    content_list = []
    for f in files:
        try:
            if isinstance(f, str) and f.startswith("http"):
                # Remote image: pass the URL straight through.
                content_list.append({"type": "image_url", "image_url": {"url": f}})
            else:
                # Local path-like object: read and embed as a base64 data URL.
                with open(f, "rb") as fh:
                    b64 = base64.b64encode(fh.read()).decode("utf-8")
                # Guess the MIME type from the filename; jpeg is a safe
                # fallback for most common images.
                mime = mimetypes.guess_type(str(f))[0] or "image/jpeg"
                content_list.append(
                    {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
                )
        except Exception:
            # Best-effort: if a file cannot be read, skip embedding it
            # rather than failing the whole request.
            continue
    content_list.append({"type": "text", "text": text})

    messages = [{"role": "user", "content": content_list}]

    hf_token = _extract_token(additional_inputs)
    if not hf_token:
        yield "Please login with a Hugging Face account (use the Login button in the sidebar)."
        return

    client = InferenceClient(token=hf_token, model=model_name, provider="hf-inference")

    response = ""
    for message in client.chat_completion(messages, max_tokens=1024, stream=True):
        choices = message.choices
        if choices and choices[0].delta.content:
            response += choices[0].delta.content
        yield response


examples = [
    [
        {
            "text": "Write a descriptive caption for this image in a formal tone.",
            "files": ["example_images/example.png"],
        }
    ],
    [
        {
            "text": "What are the characters wearing?",
            "files": ["example_images/example.png"],
        }
    ],
]

with gr.Blocks() as demo:
    with gr.Sidebar():
        # Some gradio versions reject a `label` kwarg on LoginButton, so
        # create it without one for maximum compatibility.
        login_btn = gr.LoginButton()
    chatbot = gr.ChatInterface(
        fn=model_inference,
        description=(
            "# **SmolVLM2-256M-Video-Instruct** \n (running on CPU) The model only "
            "sees the last input, it ignores the previous conversation history."
        ),
        examples=examples,
        fill_height=True,
        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        additional_inputs=[login_btn],
    )
    # ChatInterface is already rendered inside this Blocks context; calling
    # chatbot.render() here would duplicate it, so we don't.

if __name__ == "__main__":
    demo.launch(debug=True)