# Spaces: monurcan/efficient_test_time_scaling (Sleeping)
# File size: 6,757 Bytes

import base64
import html
import mimetypes
import time

import gradio as gr
from huggingface_hub import InferenceClient


def progress_bar_html(label: str) -> str:
    """
    Build the HTML for a small labelled loading indicator.

    The markup is a flex row holding the label text followed by a 110x5 px
    track whose inner bar slides across it through a looping CSS keyframe
    animation named ``loading``. ``label`` is inserted as-is (not escaped).
    """
    # Label plus the track/bar pair.
    indicator_row = (
        '<div style="display: flex; align-items: center;">\n'
        f'    <span style="margin-right: 10px; font-size: 14px;">{label}</span>\n'
        '    <div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">\n'
        '        <div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>\n'
        "    </div>\n"
        "</div>\n"
    )
    # Keyframes that move the inner bar from fully left to fully right.
    keyframes_css = (
        "<style>\n"
        "@keyframes loading {\n"
        "    0% { transform: translateX(-100%); }\n"
        "    100% { transform: translateX(100%); }\n"
        "}\n"
        "</style>\n"
    )
    # The snippet is framed by a leading newline and a trailing four-space indent.
    return "\n" + indicator_row + keyframes_css + "    "


# Hub repo id of the model that `model_inference` passes to InferenceClient.
model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"


def _extract_token(additional_inputs):
    """Return the first token-like value found in *additional_inputs*, or None.

    Each non-None entry is checked, in this order, for one of three shapes:
    an object exposing a ``token`` attribute, a dict containing a ``"token"``
    key, or a bare string (taken to be the token itself). The first entry
    matching any shape wins, even when the token it carries is empty.
    """
    for candidate in additional_inputs:
        if candidate is None:
            continue
        if hasattr(candidate, "token"):
            return candidate.token
        if isinstance(candidate, dict) and "token" in candidate:
            return candidate["token"]
        if isinstance(candidate, str):
            return candidate
    return None


def _image_part(file_ref):
    """Build one ``image_url`` content part for *file_ref*, or None if unreadable.

    ``http://`` / ``https://`` strings are forwarded untouched; anything else
    is opened as a local file and embedded as a base64 data URL.
    """
    if isinstance(file_ref, str) and file_ref.startswith(("http://", "https://")):
        return {"type": "image_url", "image_url": {"url": file_ref}}

    try:
        with open(file_ref, "rb") as fh:
            raw = fh.read()
    except (OSError, TypeError, ValueError):
        # Best effort: a missing/unreadable file or an unsupported reference
        # is dropped so the remaining images and the text still get sent.
        return None

    # Label the payload with its real type (e.g. image/png); fall back to JPEG
    # only when the extension is unknown or does not denote an image.
    mime_type, _ = mimetypes.guess_type(str(file_ref))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    encoded = base64.b64encode(raw).decode("ascii")
    return {
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
    }


def model_inference(input_dict, history, *additional_inputs):
    """
    Stream a multimodal chat completion from the Hugging Face InferenceClient.

    Signature matches the ChatInterface call pattern:
    (input_dict, history, *additional_inputs).

    Args:
        input_dict: Mapping with a ``"text"`` query and an optional ``"files"``
            list of image URLs or local image paths.
        history: Previous conversation turns. Unused: only the current
            message is sent to the model.
        *additional_inputs: Extra values from the UI; the first one that looks
            like a token (see ``_extract_token``) authenticates the request.

    Yields:
        A single guidance message when the query text is missing or no token
        is available; otherwise the accumulated response text after every
        streamed chunk.

    NOTE(review): Gradio documents receiving the login token through a
    parameter annotated ``gr.OAuthToken``. Confirm that routing the
    LoginButton through ``additional_inputs`` really delivers a token here
    rather than the button's own value.
    """
    # `or` also normalises an explicit None stored under either key.
    text = input_dict.get("text") or ""
    files = input_dict.get("files") or []

    if not text:
        # Yield a message so the streaming generator produces at least one value.
        if files:
            yield "Please input a text query along with the image(s)."
        else:
            yield "Please input a query and optionally image(s)."
        return

    # Check credentials before reading and encoding any image files.
    token = _extract_token(additional_inputs)
    if not token:
        yield "Please login with a Hugging Face account (use the Login button in the sidebar)."
        return

    # Content order: images first, then the text query.
    content_list = [
        part for part in map(_image_part, files) if part is not None
    ]
    content_list.append({"type": "text", "text": text})
    messages = [{"role": "user", "content": content_list}]

    client = InferenceClient(
        token=token, model=model_name, provider="hf-inference"
    )

    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=1024,
        stream=True,
    ):
        choices = chunk.choices
        if choices and choices[0].delta.content:
            response += choices[0].delta.content
        # Re-yield the running text on every chunk, including empty ones.
        yield response


# Sample queries offered in the chat UI; every one is paired with the same image.
_EXAMPLE_IMAGE = "example_images/example.png"
_EXAMPLE_PROMPTS = (
    "Write a descriptive caption for this image in a formal tone.",
    "What are the characters wearing?",
)

# One single-element row per prompt, each holding a {"text", "files"} message dict.
examples = [
    [{"text": prompt, "files": [_EXAMPLE_IMAGE]}] for prompt in _EXAMPLE_PROMPTS
]

# Markdown shown above the chat.
# NOTE(review): this text names "Smolvlm2-500M-illustration-description" and
# says "running on CPU", while requests go to `model_name` through
# InferenceClient — confirm which wording is intended.
_CHAT_DESCRIPTION = "# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history."

with gr.Blocks() as demo:
    # The sidebar hosts the Hugging Face login control.
    with gr.Sidebar():
        # Built without arguments: a `label` kwarg may not be accepted by
        # every installed Gradio version.
        login_btn = gr.LoginButton()

    # Input box that accepts text plus image attachments.
    query_box = gr.MultimodalTextbox(label="Query Input", file_types=["image"])

    # Creating the ChatInterface inside the Blocks context already places it;
    # an extra chatbot.render() call can duplicate it, so none is made.
    chatbot = gr.ChatInterface(
        fn=model_inference,
        description=_CHAT_DESCRIPTION,
        examples=examples,
        fill_height=True,
        textbox=query_box,
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        # The login button's value is forwarded to model_inference as an extra argument.
        additional_inputs=[login_btn],
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)