Spaces:

huggingface-projects
/

gemma-4-e4b-it

Running on Zero

App Files Files Community

hysts HF Staff committed on Apr 20

Commit

8aaee8a

1 Parent(s): 2c95709

Enable ChatInterface stop button

Browse files

Files changed (1) hide show

app.py +26 -9

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ from threading import Thread
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForMultimodalLM, AutoProcessor, BatchFeature
 from transformers.generation.streamers import TextIteratorStreamer
 MODEL_ID = "google/gemma-4-e4b-it"
@@ -91,6 +91,14 @@ def process_history(history: list[dict]) -> list[dict]:
     return messages
 @spaces.GPU(duration=120)
 @torch.inference_mode()
 def _generate_on_gpu(inputs: BatchFeature, max_new_tokens: int, thinking: bool) -> Iterator[str]:
@@ -102,9 +110,11 @@ def _generate_on_gpu(inputs: BatchFeature, max_new_tokens: int, thinking: bool)
         skip_prompt=True,
         skip_special_tokens=not thinking,
     )
     generate_kwargs = {
         **inputs,
         "streamer": streamer,
         "max_new_tokens": max_new_tokens,
         "disable_compile": True,
     }
@@ -121,13 +131,20 @@ def _generate_on_gpu(inputs: BatchFeature, max_new_tokens: int, thinking: bool)
     thread.start()
     chunks: list[str] = []
-    for text in streamer:
-        chunks.append(text)
-        accumulated = "".join(chunks)
-        if thinking:
-            yield _strip_special_tokens(accumulated)
-        else:
-            yield accumulated
     thread.join()
     if exception_holder:
@@ -292,6 +309,7 @@ demo = gr.ChatInterface(
         file_types=[*IMAGE_FILE_TYPES, *AUDIO_FILE_TYPES, *VIDEO_FILE_TYPES],
         file_count="multiple",
         autofocus=True,
     ),
     multimodal=True,
     additional_inputs=[
@@ -306,7 +324,6 @@ demo = gr.ChatInterface(
         gr.Textbox(label="System Prompt", value=""),
     ],
     additional_inputs_accordion=gr.Accordion("Settings", open=True),
-    stop_btn=False,
     title="Gemma 4 E4B It",
     examples=examples,
     run_examples_on_click=False,

 import gradio as gr
 import spaces
 import torch
+from transformers import AutoModelForMultimodalLM, AutoProcessor, BatchFeature, StoppingCriteria
 from transformers.generation.streamers import TextIteratorStreamer
 MODEL_ID = "google/gemma-4-e4b-it"
     return messages
+class StopOnSignal(StoppingCriteria):
+    # Stopping criterion driven by an externally set flag: it reports "stop"
+    # as soon as `stopped` has been set to True by the code consuming the stream.
+    def __init__(self) -> None:
+        # Starts cleared, so generation proceeds until someone sets the flag.
+        self.stopped = False
+    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, **kwargs: object) -> bool:  # noqa: ARG002
+        # `input_ids`, `scores` and `kwargs` are intentionally unused; the
+        # answer depends only on the current value of the flag.
+        return self.stopped
 @spaces.GPU(duration=120)
 @torch.inference_mode()
 def _generate_on_gpu(inputs: BatchFeature, max_new_tokens: int, thinking: bool) -> Iterator[str]:
         skip_prompt=True,
         skip_special_tokens=not thinking,
     )
+    stop_criteria = StopOnSignal()
     generate_kwargs = {
         **inputs,
         "streamer": streamer,
+        "stopping_criteria": [stop_criteria],
         "max_new_tokens": max_new_tokens,
         "disable_compile": True,
     }
     thread.start()
     chunks: list[str] = []
+    try:
+        for text in streamer:
+            chunks.append(text)
+            accumulated = "".join(chunks)
+            if thinking:
+                yield _strip_special_tokens(accumulated)
+            else:
+                yield accumulated
+    except GeneratorExit:
+        stop_criteria.stopped = True
+        for _ in streamer:
+            pass
+        thread.join()
+        raise
     thread.join()
     if exception_holder:
         file_types=[*IMAGE_FILE_TYPES, *AUDIO_FILE_TYPES, *VIDEO_FILE_TYPES],
         file_count="multiple",
         autofocus=True,
+        stop_btn=True,
     ),
     multimodal=True,
     additional_inputs=[
         gr.Textbox(label="System Prompt", value=""),
     ],
     additional_inputs_accordion=gr.Accordion("Settings", open=True),
     title="Gemma 4 E4B It",
     examples=examples,
     run_examples_on_click=False,