import base64
import html
import mimetypes
import time

import gradio as gr
from huggingface_hub import InferenceClient

model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"


def progress_bar_html(label: str) -> str:
    """
    Return an HTML snippet for a thin progress bar with a label.

    The progress bar is styled as a dark animated bar; *label* is
    HTML-escaped so arbitrary text cannot break the markup.
    """
    return f"""
<div style="display:flex;align-items:center;gap:8px;">
  <span style="font-size:0.9em;">{html.escape(label)}</span>
  <div style="flex:1;height:4px;background:#2b2b2b;border-radius:2px;overflow:hidden;position:relative;">
    <div style="position:absolute;top:0;bottom:0;width:40%;background:#888;border-radius:2px;animation:pb-slide 1s linear infinite;"></div>
  </div>
</div>
<style>
@keyframes pb-slide {{
  0%   {{ transform: translateX(-100%); }}
  100% {{ transform: translateX(250%); }}
}}
</style>
"""


def _image_content(f):
    """Build one image_url content entry for a file/URL, or None on failure."""
    try:
        if isinstance(f, str) and f.startswith("http"):
            # Remote image: pass the URL straight through to the API.
            return {"type": "image_url", "image_url": {"url": f}}
        # Local path-like object: read and embed as a base64 data URL.
        with open(f, "rb") as fh:
            b64 = base64.b64encode(fh.read()).decode("utf-8")
        # Guess the mime type from the filename; fall back to jpeg, which
        # works for most common images (the previous hard-coded default).
        mime, _ = mimetypes.guess_type(str(f))
        if not mime or not mime.startswith("image/"):
            mime = "image/jpeg"
        return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
    except Exception:
        # Best-effort: if anything goes wrong reading the file, skip it
        # rather than failing the whole request.
        return None


def _token_from_chunk(chunk) -> str:
    """Extract the incremental text token from a streaming chunk.

    Handles both dict-style and attribute-style chunk objects, since the
    shape depends on the installed huggingface_hub client version.
    """
    try:
        if isinstance(chunk, dict):
            choices = chunk.get("choices")
            if choices:
                return choices[0].get("delta", {}).get("content") or ""
            return ""
        choices = getattr(chunk, "choices", None)
        if choices:
            delta = getattr(choices[0], "delta", None)
            if isinstance(delta, dict):
                return delta.get("content") or ""
            return getattr(delta, "content", "") or ""
    except Exception:
        pass
    return ""


def model_inference(input_dict, history, hf_token: gr.OAuthToken):
    """
    Use Hugging Face InferenceClient (streaming) to perform the multimodal
    chat completion.

    Signature matches the ChatInterface call pattern:
    ``(input_dict, history, *additional_inputs)``.  The OAuth token (from
    ``gr.LoginButton``) is passed as *hf_token*.

    Yields:
        Progress-bar HTML first, then the growing (HTML-escaped) response.

    Raises:
        gr.Error: on missing input text or missing login token.
    """
    text = input_dict.get("text", "")
    files = input_dict.get("files", []) or []

    # gr.Error is an Exception subclass: it must be *raised* to be shown.
    if text == "" and not files:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and files:
        raise gr.Error("Please input a text query along with the image(s).")

    # Build the content list: images (as URLs or data URLs) followed by the text.
    content_list = []
    for f in files:
        entry = _image_content(f)
        if entry is not None:
            content_list.append(entry)
    content_list.append({"type": "text", "text": text})

    messages = [{"role": "user", "content": content_list}]

    if hf_token is None or not getattr(hf_token, "token", None):
        raise gr.Error(
            "Please login with a Hugging Face account (use the Login button in the sidebar)."
        )

    client = InferenceClient(token=hf_token.token, model=model_name)

    response = ""
    yield progress_bar_html("Processing...")

    # The API may stream tokens; client method name varies across versions.
    try:
        stream = client.chat.completions.create(messages=messages, stream=True)
    except TypeError:
        # Older/newer client variants: try the alternative method name.
        stream = client.chat_completion(messages=messages, stream=True)

    for chunk in stream:
        token = _token_from_chunk(chunk)
        if token:
            # Escape incremental token to avoid raw HTML breaking the chat box.
            response += html.escape(token)
            time.sleep(0.001)
            yield response


examples = [
    [
        {
            "text": "Write a descriptive caption for this image in a formal tone.",
            "files": ["example_images/example.png"],
        }
    ],
    [
        {
            "text": "What are the characters wearing?",
            "files": ["example_images/example.png"],
        }
    ],
]

with gr.Blocks() as demo:
    with gr.Sidebar():
        # Gradio LoginButton may not accept a `label` kwarg depending on the
        # installed version, so create it without that argument for maximum
        # compatibility.
        login_btn = gr.LoginButton()

    chatbot = gr.ChatInterface(
        fn=model_inference,
        description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
        examples=examples,
        fill_height=True,
        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        additional_inputs=[login_btn],
    )
    chatbot.render()

if __name__ == "__main__":
    demo.launch(debug=True)