TobDeBer committed on
Commit
c88e367
·
1 Parent(s): cab2b06

GUI streaming with tps

Browse files
Files changed (1) hide show
  1. app.py +43 -24
app.py CHANGED
@@ -1,8 +1,9 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  import time
5
  import random
 
6
 
7
  # Model configuration - using SmolLM2 for efficient inference
8
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
@@ -24,7 +25,7 @@ def load_model():
24
  device_map="auto"
25
  )
26
 
27
- # Create text generation pipeline
28
  text_generator = pipeline(
29
  "text-generation",
30
  model=model,
@@ -59,45 +60,64 @@ def generate_text(
59
  repetition_penalty=1.1,
60
  system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
61
  ):
62
- """Generate text using the loaded model"""
63
- global text_generator
64
 
65
- if text_generator is None:
66
- return "⚠️ Please wait for the model to finish loading..."
 
67
 
68
  if not prompt.strip():
69
- return "⚠️ Please enter a prompt."
 
70
 
71
  try:
72
  # Format the prompt
73
  formatted_prompt = format_prompt(prompt, system_prompt)
 
74
 
75
- # Generate response
76
- start_time = time.time()
77
- result = text_generator(
78
- formatted_prompt,
 
 
 
79
  max_new_tokens=max_length,
80
  temperature=temperature,
81
  top_p=top_p,
82
  repetition_penalty=repetition_penalty,
83
  do_sample=True,
84
  pad_token_id=tokenizer.eos_token_id,
85
- eos_token_id=tokenizer.eos_token_id,
86
- return_full_text=False
87
  )
88
 
89
- generation_time = time.time() - start_time
90
-
91
- # Extract the generated response directly
92
- response = result[0]["generated_text"].strip()
93
-
94
- # Format output with metadata
95
- output = f"**Response:**\n{response}\n\n---\n*Generated in {generation_time:.2f} seconds*"
96
 
97
- return output
 
 
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  except Exception as e:
100
- return f"❌ Error during generation: {str(e)}"
101
 
102
  def clear_chat():
103
  """Clear the chat interface"""
@@ -124,7 +144,6 @@ with gr.Blocks() as demo:
124
  """
125
  # 🤖 Smol LLM Inference GUI
126
 
127
- **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)** -
128
  Efficient text generation using SmolLM2-135M
129
 
130
  This application runs a compact language model locally for text generation.
@@ -255,4 +274,4 @@ demo.launch(
255
  ],
256
  share=False,
257
  show_error=True
258
- )
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextIteratorStreamer
4
  import time
5
  import random
6
+ from threading import Thread
7
 
8
  # Model configuration - using SmolLM2 for efficient inference
9
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
 
25
  device_map="auto"
26
  )
27
 
28
+ # Create text generation pipeline (still useful for non-streaming checks if needed, but we use model.generate for streaming)
29
  text_generator = pipeline(
30
  "text-generation",
31
  model=model,
 
60
  repetition_penalty=1.1,
61
  system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
62
  ):
63
+ """Generate text using the loaded model with streaming"""
64
+ global model, tokenizer
65
 
66
+ if model is None or tokenizer is None:
67
+ yield "⚠️ Please wait for the model to finish loading..."
68
+ return
69
 
70
  if not prompt.strip():
71
+ yield "⚠️ Please enter a prompt."
72
+ return
73
 
74
  try:
75
  # Format the prompt
76
  formatted_prompt = format_prompt(prompt, system_prompt)
77
+ inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
78
 
79
+ # Setup streamer
80
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
81
+
82
+ # Generation arguments
83
+ generation_kwargs = dict(
84
+ **inputs,
85
+ streamer=streamer,
86
  max_new_tokens=max_length,
87
  temperature=temperature,
88
  top_p=top_p,
89
  repetition_penalty=repetition_penalty,
90
  do_sample=True,
91
  pad_token_id=tokenizer.eos_token_id,
92
+ eos_token_id=tokenizer.eos_token_id
 
93
  )
94
 
95
+ # Start generation in a separate thread
96
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
97
+ thread.start()
 
 
 
 
98
 
99
+ # Consume the stream
100
+ generated_text = ""
101
+ start_time = time.time()
102
+ token_count = 0
103
 
104
+ for new_text in streamer:
105
+ generated_text += new_text
106
+ token_count += 1
107
+ current_time = time.time()
108
+ elapsed_time = current_time - start_time
109
+
110
+ # Avoid division by zero
111
+ if elapsed_time > 0:
112
+ tps = token_count / elapsed_time
113
+ stats = f"\n\n---\n*Generated {token_count} tokens in {elapsed_time:.2f} seconds ({tps:.2f} tokens/s)*"
114
+ else:
115
+ stats = "\n\n---\n*Starting generation...*"
116
+
117
+ yield f"**Response:**\n{generated_text}{stats}"
118
+
119
  except Exception as e:
120
+ yield f"❌ Error during generation: {str(e)}"
121
 
122
  def clear_chat():
123
  """Clear the chat interface"""
 
144
  """
145
  # 🤖 Smol LLM Inference GUI
146
 
 
147
  Efficient text generation using SmolLM2-135M
148
 
149
  This application runs a compact language model locally for text generation.
 
274
  ],
275
  share=False,
276
  show_error=True
277
+ )