Spaces:

manthilaffs
/

Gamunu-Inference

Sleeping

manthilaffs committed on Nov 4, 2025

Commit

aaa6d06

verified ·

1 Parent(s): 1925104

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -15,7 +15,23 @@ alpaca_prompt = """පහත දැක්වෙන්නේ යම් කාර
 ### ප්‍රතිචාරය:
 {}"""
-def infer_stream(message, history, enable_history=False, max_new_tokens=512):
     global model, tokenizer
     if model is None:
@@ -45,18 +61,8 @@ def infer_stream(message, history, enable_history=False, max_new_tokens=512):
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    # Setup streaming
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(
-        **inputs,
-        max_new_tokens=max_new_tokens,
-        streamer=streamer,
-    )
-    # Start generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
     # Stream the output
     partial_text = ""
@@ -75,11 +81,6 @@ def infer_stream(message, history, enable_history=False, max_new_tokens=512):
     thread.join()
-@spaces.GPU
-def infer(message, history, enable_history=False, max_new_tokens=512):
-    # Return the generator for streaming
-    return infer_stream(message, history, enable_history, max_new_tokens)
 # Custom CSS for styling
 custom_css = """
 #splash-screen {

 ### ප්‍රතිචාරය:
 {}"""
+@spaces.GPU
+def generate_with_streaming(inputs, max_new_tokens):
+    global model
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        streamer=streamer,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    return streamer, thread
+def infer(message, history, enable_history=False, max_new_tokens=512):
     global model, tokenizer
     if model is None:
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # Get streamer and thread from GPU function
+    streamer, thread = generate_with_streaming(inputs, max_new_tokens)
     # Stream the output
     partial_text = ""
     thread.join()
 # Custom CSS for styling
 custom_css = """
 #splash-screen {