xx
app.py CHANGED
@@ -1,15 +1,8 @@
 import gradio as gr
-import torch
-from transformers import (
-    AutoModelForImageTextToText,
-    AutoProcessor,
-    TextIteratorStreamer,
-)
-from peft import PeftModel
-from transformers.image_utils import load_image
-from threading import Thread
+import base64
 import time
 import html
+from huggingface_hub import InferenceClient
 
 
 def progress_bar_html(label: str) -> str:
@@ -35,63 +28,93 @@ def progress_bar_html(label: str) -> str:
 
 model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
 
-model = AutoModelForImageTextToText.from_pretrained(
-    model_name, dtype=torch.bfloat16, device_map="auto"
-).eval()
 
-[… 6 deleted lines not recoverable from the diff view …]
-    text = input_dict["text"]
-    files = input_dict["files"]
-
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
+def model_inference(input_dict, history, hf_token: gr.OAuthToken):
+    """
+    Use Hugging Face InferenceClient (streaming) to perform the multimodal chat completion.
+    The signature matches the ChatInterface call pattern: (input_dict, history, *additional_inputs).
+    The OAuth token (from gr.LoginButton) is passed as `hf_token`.
+    """
+    text = input_dict.get("text", "")
+    files = input_dict.get("files", []) or []
 
-    if text == "" and not images:
+    if text == "" and not files:
         gr.Error("Please input a query and optionally image(s).")
         return
-    if text == "" and images:
+    if text == "" and files:
         gr.Error("Please input a text query along with the image(s).")
         return
 
-[… 23 deleted lines not recoverable from the diff view …]
+    # Build the content list: images (as URLs or data URLs) followed by the text
+    content_list = []
+    for f in files:
+        try:
+            # If the file looks like a URL, send it as an image_url
+            if isinstance(f, str) and f.startswith("http"):
+                content_list.append({"type": "image_url", "image_url": {"url": f}})
+            else:
+                # f is a local path-like object; read it and convert to a base64 data URL
+                with open(f, "rb") as fh:
+                    b = fh.read()
+                b64 = base64.b64encode(b).decode("utf-8")
+                # naive MIME type (jpeg); this works for most common images
+                data_url = f"data:image/jpeg;base64,{b64}"
+                content_list.append(
+                    {"type": "image_url", "image_url": {"url": data_url}}
+                )
+        except Exception:
+            # if anything goes wrong reading the file, skip embedding it
+            continue
+
+    content_list.append({"type": "text", "text": text})
+
+    messages = [{"role": "user", "content": content_list}]
+
+    if hf_token is None or not getattr(hf_token, "token", None):
+        gr.Error(
+            "Please login with a Hugging Face account (use the Login button in the sidebar)."
+        )
+        return
+
+    client = InferenceClient(token=hf_token.token, model=model_name)
+
+    response = ""
     yield progress_bar_html("Processing...")
-    for new_text in streamer:
-        escaped_new_text = html.escape(new_text)
-        buffer += escaped_new_text
 
-[… 2 deleted lines not recoverable from the diff view …]
+    # The API may stream tokens. Iterate the streaming generator and extract token deltas.
+    try:
+        stream = client.chat.completions.create(messages=messages, stream=True)
+    except TypeError:
+        # older/newer client variants: try the alternative method name
+        stream = client.chat_completion(messages=messages, stream=True)
+
+    for chunk in stream:
+        # chunk can be an object with attributes or a dict, depending on the client version
+        token = ""
+        try:
+            # attempt dict-style access
+            if isinstance(chunk, dict):
+                choices = chunk.get("choices")
+                if choices and len(choices) > 0:
+                    delta = choices[0].get("delta", {})
+                    token = delta.get("content") or ""
+            else:
+                # attribute-style access
+                choices = getattr(chunk, "choices", None)
+                if choices and len(choices) > 0:
+                    delta = getattr(choices[0], "delta", None)
+                    if isinstance(delta, dict):
+                        token = delta.get("content") or ""
+                    else:
+                        token = getattr(delta, "content", "")
+        except Exception:
+            token = ""
+
+        if token:
+            # escape each incremental token so raw HTML can't break the chat box
+            response += html.escape(token)
+            time.sleep(0.001)
+            yield response
 
 
 examples = [
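Note on the data-URL branch above: it hard-codes `image/jpeg` (the "naive MIME type" comment). If that ever needs tightening, the standard library's `mimetypes` module can guess the type from the file extension. A minimal sketch of a hypothetical helper, not part of this commit:

```python
import base64
import mimetypes


def file_to_data_url(path: str) -> str:
    """Encode a local image file as a data URL, guessing the MIME type
    from the file extension instead of assuming image/jpeg."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "image/jpeg"  # fall back to jpeg for unknown extensions
    with open(path, "rb") as fh:
        b64 = base64.b64encode(fh.read()).decode("utf-8")
    return f"data:{mime};base64,{b64}"
```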
@@ -109,15 +132,24 @@ examples = [
     ],
 ]
 
-[… 9 deleted lines not recoverable from the diff view …]
-)
-
-
+with gr.Blocks() as demo:
+    with gr.Sidebar():
+        login_btn = gr.LoginButton(label="Login with Hugging Face")
+
+    chatbot = gr.ChatInterface(
+        fn=model_inference,
+        description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
+        examples=examples,
+        fill_height=True,
+        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
+        stop_btn="Stop Generation",
+        multimodal=True,
+        cache_examples=False,
+        additional_inputs=[login_btn],
+    )
+
+    chatbot.render()
+
+
+if __name__ == "__main__":
+    demo.launch(debug=True)
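For reference, the new streaming loop can be exercised outside Gradio. A minimal sketch, assuming a valid token in the `HF_TOKEN` environment variable and that the model is reachable through the Inference API; in recent huggingface_hub versions `chat_completion(..., stream=True)` yields chunks whose `choices[0].delta.content` carries the incremental text:

```python
import os

from huggingface_hub import InferenceClient

client = InferenceClient(
    token=os.environ["HF_TOKEN"],  # assumes HF_TOKEN is set in the environment
    model="HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
)

messages = [
    {"role": "user", "content": [{"type": "text", "text": "Describe a sunset in one sentence."}]}
]

response = ""
for chunk in client.chat_completion(messages=messages, stream=True):
    # each chunk carries an incremental delta; content can be None on the final chunk
    token = chunk.choices[0].delta.content or ""
    response += token
    print(token, end="", flush=True)
```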