Spaces:

Pectics
/

Softie

Runtime error

Pectics committed on Jan 23, 2025

Commit

dc15a3f

verified ·

1 Parent(s): af0c8f0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,26 +1,40 @@
-from gradio import ChatInterface, Textbox, Slider
-from spaces import GPU
 from threading import Thread
-from torch import bfloat16
 from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
 from qwen_vl_utils import process_vision_info
 model_path = "Pectics/Softie-VL-7B-250123"
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
-    torch_dtype=bfloat16,
-    attn_implementation="flash_attention_2",
     device_map="auto",
 )
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 @GPU
-def infer(inputs: BatchFeature, streamer, kwargs: dict):
     inputs = inputs.to("cuda")
-    thread = Thread(target=model.generate, kwargs={**inputs, **kwargs})
     thread.start()
     response = ""
     for token in streamer:
@@ -48,14 +62,7 @@ def respond(
         padding = True,
         return_tensors = "pt",
     )
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    kwargs = dict(
-        streamer=streamer,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-    )
-    for response in infer(inputs, streamer, kwargs):
         yield response
 app = ChatInterface(

 from threading import Thread
 from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
+from gradio import ChatInterface, Textbox, Slider
+from spaces import GPU
 from qwen_vl_utils import process_vision_info
 model_path = "Pectics/Softie-VL-7B-250123"
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_path,
+    torch_dtype="auto",
     device_map="auto",
+    attn_implementation="flash_attention_2",
 )
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
 processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
 @GPU
+def infer(
+    inputs: BatchFeature,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+):
     inputs = inputs.to("cuda")
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    thread = Thread(target=model.generate, kwargs=kwargs)
     thread.start()
     response = ""
     for token in streamer:
         padding = True,
         return_tensors = "pt",
     )
+    for response in infer(inputs, max_tokens, temperature, top_p):
         yield response
 app = ChatInterface(