manthilaffs committed on
Commit
04d4513
·
verified ·
1 Parent(s): aaa6d06

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -33
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import gradio as gr
2
  import torch
3
  import spaces
4
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
- from threading import Thread
6
 
7
  model = None
8
  tokenizer = None
@@ -16,21 +15,6 @@ alpaca_prompt = """පහත දැක්වෙන්නේ යම් කාර
16
  {}"""
17
 
18
  @spaces.GPU
19
- def generate_with_streaming(inputs, max_new_tokens):
20
- global model
21
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
22
-
23
- generation_kwargs = dict(
24
- **inputs,
25
- max_new_tokens=max_new_tokens,
26
- streamer=streamer,
27
- )
28
-
29
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
30
- thread.start()
31
-
32
- return streamer, thread
33
-
34
  def infer(message, history, enable_history=False, max_new_tokens=512):
35
  global model, tokenizer
36
 
@@ -61,25 +45,15 @@ def infer(message, history, enable_history=False, max_new_tokens=512):
61
 
62
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
63
 
64
- # Get streamer and thread from GPU function
65
- streamer, thread = generate_with_streaming(inputs, max_new_tokens)
66
 
67
- # Stream the output
68
- partial_text = ""
69
- response_started = False
70
 
71
- for new_text in streamer:
72
- partial_text += new_text
73
-
74
- # Check if we've reached the response section
75
- if not response_started and "### ප්‍රතිචාරය:" in partial_text:
76
- partial_text = partial_text.split("### ප්‍රතිචාරය:")[-1].strip()
77
- response_started = True
78
-
79
- if response_started:
80
- yield partial_text
81
 
82
- thread.join()
83
 
84
  # Custom CSS for styling
85
  custom_css = """
 
1
  import gradio as gr
2
  import torch
3
  import spaces
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
 
5
 
6
  model = None
7
  tokenizer = None
 
15
  {}"""
16
 
17
  @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def infer(message, history, enable_history=False, max_new_tokens=512):
19
  global model, tokenizer
20
 
 
45
 
46
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
47
 
48
+ with torch.inference_mode():
49
+ outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
50
 
51
+ text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
52
 
53
+ if "### ප්‍රතිචාරය:" in text:
54
+ text = text.split("### ප්‍රතිචාරය:")[-1].strip()
 
 
 
 
 
 
 
 
55
 
56
+ return text
57
 
58
  # Custom CSS for styling
59
  custom_css = """