TobDeBer committed on
Commit
c88e367
·
1 Parent(s): cab2b06

GUI streaming with tps

Browse files
Files changed (1) hide show
  1. app.py +43 -24
app.py CHANGED
@@ -1,8 +1,9 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  import time
5
  import random
 
6
 
7
  # Model configuration - using SmolLM2 for efficient inference
8
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
@@ -24,7 +25,7 @@ def load_model():
24
  device_map="auto"
25
  )
26
 
27
- # Create text generation pipeline
28
  text_generator = pipeline(
29
  "text-generation",
30
  model=model,
@@ -59,45 +60,64 @@ def generate_text(
59
  repetition_penalty=1.1,
60
  system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
61
  ):
62
- """Generate text using the loaded model"""
63
- global text_generator
64
 
65
- if text_generator is None:
66
- return "⚠️ Please wait for the model to finish loading..."
 
67
 
68
  if not prompt.strip():
69
- return "⚠️ Please enter a prompt."
 
70
 
71
  try:
72
  # Format the prompt
73
  formatted_prompt = format_prompt(prompt, system_prompt)
 
74
 
75
- # Generate response
76
- start_time = time.time()
77
- result = text_generator(
78
- formatted_prompt,
 
 
 
79
  max_new_tokens=max_length,
80
  temperature=temperature,
81
  top_p=top_p,
82
  repetition_penalty=repetition_penalty,
83
  do_sample=True,
84
  pad_token_id=tokenizer.eos_token_id,
85
- eos_token_id=tokenizer.eos_token_id,
86
- return_full_text=False
87
  )
88
 
89
- generation_time = time.time() - start_time
90
-
91
- # Extract the generated response directly
92
- response = result[0]["generated_text"].strip()
93
-
94
- # Format output with metadata
95
- output = f"**Response:**\n{response}\n\n---\n*Generated in {generation_time:.2f} seconds*"
96
 
97
- return output
 
 
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  except Exception as e:
100
- return f"❌ Error during generation: {str(e)}"
101
 
102
  def clear_chat():
103
  """Clear the chat interface"""
@@ -124,7 +144,6 @@ with gr.Blocks() as demo:
124
  """
125
  # 🤖 Smol LLM Inference GUI
126
 
127
- **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)** -
128
  Efficient text generation using SmolLM2-135M
129
 
130
  This application runs a compact language model locally for text generation.
@@ -255,4 +274,4 @@ demo.launch(
255
  ],
256
  share=False,
257
  show_error=True
258
- )
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextIteratorStreamer
4
  import time
5
  import random
6
+ from threading import Thread
7
 
8
  # Model configuration - using SmolLM2 for efficient inference
9
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
 
25
  device_map="auto"
26
  )
27
 
28
+ # Create text generation pipeline (still useful for non-streaming checks if needed, but we use model.generate for streaming)
29
  text_generator = pipeline(
30
  "text-generation",
31
  model=model,
 
60
  repetition_penalty=1.1,
61
  system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
62
  ):
63
+ """Generate text using the loaded model with streaming"""
64
+ global model, tokenizer
65
 
66
+ if model is None or tokenizer is None:
67
+ yield "⚠️ Please wait for the model to finish loading..."
68
+ return
69
 
70
  if not prompt.strip():
71
+ yield "⚠️ Please enter a prompt."
72
+ return
73
 
74
  try:
75
  # Format the prompt
76
  formatted_prompt = format_prompt(prompt, system_prompt)
77
+ inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
78
 
79
+ # Setup streamer
80
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
81
+
82
+ # Generation arguments
83
+ generation_kwargs = dict(
84
+ **inputs,
85
+ streamer=streamer,
86
  max_new_tokens=max_length,
87
  temperature=temperature,
88
  top_p=top_p,
89
  repetition_penalty=repetition_penalty,
90
  do_sample=True,
91
  pad_token_id=tokenizer.eos_token_id,
92
+ eos_token_id=tokenizer.eos_token_id
 
93
  )
94
 
95
+ # Start generation in a separate thread
96
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
97
+ thread.start()
 
 
 
 
98
 
99
+ # Consume the stream
100
+ generated_text = ""
101
+ start_time = time.time()
102
+ token_count = 0
103
 
104
+ for new_text in streamer:
105
+ generated_text += new_text
106
+ token_count += 1
107
+ current_time = time.time()
108
+ elapsed_time = current_time - start_time
109
+
110
+ # Avoid division by zero
111
+ if elapsed_time > 0:
112
+ tps = token_count / elapsed_time
113
+ stats = f"\n\n---\n*Generated {token_count} tokens in {elapsed_time:.2f} seconds ({tps:.2f} tokens/s)*"
114
+ else:
115
+ stats = "\n\n---\n*Starting generation...*"
116
+
117
+ yield f"**Response:**\n{generated_text}{stats}"
118
+
119
  except Exception as e:
120
+ yield f"❌ Error during generation: {str(e)}"
121
 
122
  def clear_chat():
123
  """Clear the chat interface"""
 
144
  """
145
  # 🤖 Smol LLM Inference GUI
146
 
 
147
  Efficient text generation using SmolLM2-135M
148
 
149
  This application runs a compact language model locally for text generation.
 
274
  ],
275
  share=False,
276
  show_error=True
277
+ )