vuminhtue committed on
Commit
e871c64
·
verified ·
1 Parent(s): 738bcf1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -286
app.py CHANGED
@@ -1,12 +1,4 @@
1
- """
2
- Qwen3 Text Generation App for Hugging Face Spaces
3
-
4
- This app allows you to generate text using a trained Qwen3 model.
5
- You can control:
6
- - The starting text (prompt)
7
- - How many new words to generate (max_new_tokens)
8
- - How creative the output should be (temperature)
9
- """
10
 
11
  import gradio as gr
12
  import torch
@@ -14,303 +6,112 @@ import tiktoken
14
  from pathlib import Path
15
  from huggingface_hub import hf_hub_download
16
 
17
- # Import our Qwen3 model
 
18
  from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text
19
 
20
 
21
  class TextGenerator:
22
- """
23
- A simple class to load the model and generate text
24
-
25
- This makes it easy to:
26
- 1. Load the trained model once at startup
27
- 2. Generate text multiple times without reloading
28
- """
29
-
30
  def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
31
- """
32
- Initialize the text generator
33
-
34
- Parameters:
35
- -----------
36
- repo_id : str
37
- HuggingFace repository ID to download the model from
38
- Default: "vuminhtue/qwen3_sentiment_tinystories"
39
- """
40
- print("🚀 Loading Qwen3 model from HuggingFace...")
41
- print(f" Repository: {repo_id}")
42
-
43
- # Configuration for Qwen3 0.6B model
44
- # These settings define the architecture of the model
45
  self.config = {
46
- "vocab_size": 151_936, # Number of different tokens the model knows
47
- "context_length": 40_960, # Maximum length of text it can process
48
- "emb_dim": 1024, # Size of the embedding vectors
49
- "n_heads": 16, # Number of attention heads
50
- "n_layers": 28, # Number of transformer layers
51
- "hidden_dim": 3072, # Size of the feed-forward network
52
- "head_dim": 128, # Size of each attention head
53
- "qk_norm": True, # Whether to normalize queries and keys
54
- "n_kv_groups": 8, # Number of key-value groups
55
- "rope_base": 1_000_000.0, # Base for rotary position encoding
56
- "dtype": torch.bfloat16, # Data type for model weights
57
  }
58
-
59
- # Detect if we have a GPU available
60
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
61
- print(f" Using device: {self.device}")
62
-
63
- # Load the tokenizer (converts text to numbers and back)
64
- # We use GPT-2's tokenizer which works well for English text
65
  self.tokenizer = tiktoken.get_encoding("gpt2")
66
- print(" ✓ Tokenizer loaded")
67
-
68
- # Download the model file from HuggingFace
69
- # This will cache the file locally, so it only downloads once
70
- print(" 📥 Downloading model from HuggingFace (this may take a moment)...")
71
- try:
72
- model_path = hf_hub_download(
73
- repo_id=repo_id,
74
- filename="Qwen3_200k_model_params.pt",
75
- repo_type="model"
76
- )
77
- print(f" ✓ Model downloaded to: {model_path}")
78
- except Exception as e:
79
- print(f" ❌ Error downloading model: {e}")
80
- raise
81
-
82
- # Create the model with our configuration
83
  self.model = Qwen3Model(self.config)
84
-
85
- # Load the trained weights from the downloaded file
86
- print(" ⚙️ Loading model weights...")
87
  self.model.load_state_dict(
88
- torch.load(
89
- model_path,
90
- map_location=torch.device(self.device),
91
- weights_only=True
92
- )
93
  )
94
-
95
- # Move model to the appropriate device (CPU or GPU)
96
- self.model = self.model.to(self.device)
97
-
98
- # Set to evaluation mode (disables training-specific features)
99
- self.model.eval()
100
-
101
- print(" ✓ Model loaded successfully!")
102
- print("✅ Ready to generate text!\n")
103
-
104
- def generate(self, prompt, max_new_tokens=50, temperature=1.0):
105
- """
106
- Generate text based on a prompt
107
-
108
- Parameters:
109
- -----------
110
- prompt : str
111
- The starting text (what you want the model to continue)
112
- max_new_tokens : int
113
- How many new tokens (roughly words) to generate
114
- temperature : float
115
- Controls creativity:
116
- - Lower (0.1-0.7): More predictable, focused
117
- - Medium (0.8-1.0): Balanced
118
- - Higher (1.1-2.0): More creative, random
119
-
120
- Returns:
121
- --------
122
- str : The generated text (including the original prompt)
123
- """
124
- try:
125
- # Convert the text prompt to token IDs (numbers)
126
- input_ids = text_to_token_ids(prompt, self.tokenizer)
127
- input_ids = input_ids.to(self.device)
128
-
129
- # Generate new tokens
130
- output_ids = generate_text_simple(
131
- model=self.model,
132
- idx=input_ids,
133
- max_new_tokens=max_new_tokens,
134
- context_size=self.config["context_length"],
135
- temperature=temperature
136
- )
137
-
138
- # Convert the token IDs back to text
139
- generated_text = token_ids_to_text(output_ids, self.tokenizer)
140
-
141
- return generated_text
142
-
143
- except Exception as e:
144
- return f"❌ Error generating text: {str(e)}"
145
 
146
 
147
- # Initialize the generator once when the app starts
148
- print("="*70)
149
  print("INITIALIZING TEXT GENERATION APP")
150
- print("="*70)
151
  generator = TextGenerator()
152
 
153
 
154
- def generate_text_interface(prompt, max_new_tokens, temperature):
155
- """
156
- Interface function for Gradio
157
-
158
- This function:
159
- 1. Takes inputs from the user interface
160
- 2. Calls our generator
161
- 3. Returns the result to display
162
- """
163
- # Check if prompt is empty
164
- if not prompt or len(prompt.strip()) == 0:
165
- return "⚠️ Please enter some text to start with!"
166
-
167
- # Limit max tokens to prevent very long generation times
168
- max_new_tokens = min(max_new_tokens, 200)
169
-
170
- # Generate text
171
- result = generator.generate(prompt, max_new_tokens, temperature)
172
-
173
- return result
174
 
 
 
 
 
 
 
 
175
 
176
- # Create the Gradio interface
177
- # This defines what the web app looks like and how it behaves
178
- with gr.Blocks(title="Qwen3 Text Generator", theme=gr.themes.Soft()) as demo:
179
-
180
- # Header
181
- gr.Markdown(
182
- """
183
- # 🤖 Qwen3 Text Generator
184
-
185
- Generate creative stories and text using a Qwen3 model trained on TinyStories!
186
-
187
- ### How to use:
188
- 1. **Enter your starting text** (e.g., "Once upon a time")
189
- 2. **Adjust the sliders** to control the output
190
- 3. **Click Generate** to create text
191
- """
192
- )
193
-
194
- # Main content area
195
- with gr.Row():
196
- with gr.Column(scale=1):
197
- # Input section
198
- gr.Markdown("### 📝 Input")
199
-
200
- prompt_input = gr.Textbox(
201
- label="Starting Text (Prompt)",
202
- placeholder="Once upon a time...",
203
- lines=3,
204
- info="Enter the text you want the model to continue"
205
- )
206
-
207
- # Control sliders
208
- gr.Markdown("### ⚙️ Generation Settings")
209
-
210
- max_tokens_slider = gr.Slider(
211
- minimum=10,
212
- maximum=200,
213
- value=50,
214
- step=10,
215
- label="Max New Tokens",
216
- info="How many new tokens to generate (roughly = number of words)"
217
- )
218
-
219
- temperature_slider = gr.Slider(
220
- minimum=0.1,
221
- maximum=2.0,
222
- value=1.0,
223
- step=0.1,
224
- label="Temperature",
225
- info="Lower = more predictable, Higher = more creative"
226
- )
227
-
228
- # Generate button
229
- generate_btn = gr.Button(
230
- "✨ Generate Text",
231
- variant="primary",
232
- size="lg"
233
- )
234
-
235
- with gr.Column(scale=1):
236
- # Output section
237
- gr.Markdown("### 📖 Generated Text")
238
-
239
- output_text = gr.Textbox(
240
- label="Result",
241
- lines=15,
242
- interactive=False,
243
- show_copy_button=True
244
- )
245
-
246
- # Example prompts to try
247
- gr.Markdown("### 💡 Try these examples:")
248
- gr.Examples(
249
- examples=[
250
- ["Once upon a time", 50, 0.8],
251
- ["There was a little girl named", 60, 1.0],
252
- ["In a magical forest", 70, 1.2],
253
- ["A brave knight", 50, 0.7],
254
- ["The sun was shining and", 60, 0.9],
255
- ],
256
- inputs=[prompt_input, max_tokens_slider, temperature_slider],
257
- label="Click any example to try it"
258
- )
259
-
260
- # Information section
261
- gr.Markdown(
262
- """
263
- ---
264
- ### 📊 About This Model
265
-
266
- - **Model**: Qwen3 0.6B (596M parameters)
267
- - **Training Data**: TinyStories dataset (children's stories)
268
- - **Architecture**: 28 transformer layers with Grouped Query Attention
269
- - **Model Source**: [vuminhtue/qwen3_sentiment_tinystories](https://huggingface.co/vuminhtue/qwen3_sentiment_tinystories)
270
-
271
- ### 🎯 Understanding the Parameters
272
-
273
- **Max New Tokens:**
274
- - Controls the length of generated text
275
- - One token ≈ one word (roughly)
276
- - More tokens = longer output = slower generation
277
-
278
- **Temperature:**
279
- - `0.1 - 0.7`: Safe, predictable, focused responses
280
- - `0.8 - 1.0`: Balanced creativity and coherence
281
- - `1.1 - 2.0`: Very creative but may be less coherent
282
-
283
- ### ⚠️ Note
284
-
285
- This model was trained on children's stories, so it works best for:
286
- - Simple, clear narratives
287
- - Stories about everyday situations
288
- - Children's vocabulary and themes
289
-
290
- ---
291
- *Built with Qwen3 architecture • Trained on TinyStories • Powered by PyTorch • Model hosted on 🤗 HuggingFace*
292
- """
293
- )
294
-
295
- # Connect the button to the generation function
296
- generate_btn.click(
297
- fn=generate_text_interface,
298
- inputs=[prompt_input, max_tokens_slider, temperature_slider],
299
- outputs=output_text
300
- )
301
-
302
- # Also allow pressing Enter in the text box to generate
303
- prompt_input.submit(
304
- fn=generate_text_interface,
305
- inputs=[prompt_input, max_tokens_slider, temperature_slider],
306
- outputs=output_text
307
  )
308
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
- # Launch the app
311
- if __name__ == "__main__":
312
- print("\n" + "="*70)
313
- print("LAUNCHING GRADIO APP")
314
- print("="*70)
315
- demo.launch()
316
 
 
 
 
 
1
+ # app.py — only the changed/added parts shown
 
 
 
 
 
 
 
 
2
 
3
  import gradio as gr
4
  import torch
 
6
  from pathlib import Path
7
  from huggingface_hub import hf_hub_download
8
 
9
+ import spaces # <-- NEW: required for the ZeroGPU decorator
10
+
11
  from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text
12
 
13
 
class TextGenerator:
    """Hold the Qwen3 model and its tokenizer, loaded once on the CPU.

    Construction downloads the checkpoint from the Hugging Face Hub, builds a
    ``Qwen3Model`` from ``self.config`` and loads the weights onto the CPU.
    No CUDA call is made here; the final status message states that the GPU
    is only used inside the ``@spaces.GPU`` function.
    """

    def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
        """Download the checkpoint from ``repo_id`` and prepare the model on CPU.

        Parameters
        ----------
        repo_id : str
            Hugging Face model repository that hosts
            ``Qwen3_200k_model_params.pt``.
        """
        print(" Loading Qwen3 model from HuggingFace...")
        print(f" Repository: {repo_id}")

        # Architecture settings handed to Qwen3Model. dtype is float32 rather
        # than bfloat16 so the CPU-side load is safe; any cast for the GPU is
        # left to the code that later moves the model to CUDA.
        self.config = dict(
            vocab_size=151_936,
            context_length=40_960,
            emb_dim=1024,
            n_heads=16,
            n_layers=28,
            hidden_dim=3072,
            head_dim=128,
            qk_norm=True,
            n_kv_groups=8,
            rope_base=1_000_000.0,
            dtype=torch.float32,
        )

        # The main process deliberately stays on the CPU.
        self.device = "cpu"
        print(f" Using device: {self.device}")

        self.tokenizer = tiktoken.get_encoding("gpt2")
        print(" ✓ Tokenizer loaded")

        # hf_hub_download returns the local path of the downloaded file.
        print(" Downloading model from HuggingFace (this may take a moment)...")
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename="Qwen3_200k_model_params.pt",
            repo_type="model",
        )
        print(f" ✓ Model downloaded to: {model_path}")

        # Instantiate the network, then read the weights with an explicit CPU
        # map_location so no tensor is placed on a GPU while loading.
        self.model = Qwen3Model(self.config)
        print(" ⚙️ Loading model weights (CPU)...")
        cpu_weights = torch.load(
            model_path,
            map_location=torch.device("cpu"),
            weights_only=True,
        )
        self.model.load_state_dict(cpu_weights)
        self.model = self.model.to("cpu").eval()
        print(" Model loaded successfully on CPU")
        print("✅ Ready to generate text on CPU; GPU will be used only inside @spaces.GPU\n")

    def _prepare_inputs_cpu(self, prompt: str):
        """Tokenize ``prompt`` with ``text_to_token_ids``; no CUDA is involved."""
        return text_to_token_ids(prompt, self.tokenizer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
# Build the shared generator a single time at start-up (CPU only); the banner
# is emitted with one print call, one line per argument.
print("=" * 70, "INITIALIZING TEXT GENERATION APP", "=" * 70, sep="\n")
generator = TextGenerator()
73
 
74
 
75
+ # === NEW: ZeroGPU entrypoint ===
76
@spaces.GPU(duration=120)  # the actual GPU work happens only inside this call
def zero_gpu_generate(prompt: str, max_new_tokens: int, temperature: float):
    """Generate a continuation of ``prompt`` on CUDA and return it as text.

    This is the only function in the file that touches CUDA. It takes the
    CPU-resident model held by the module-level ``generator``, places it on
    the GPU in float16, runs ``generate_text_simple`` and decodes the result
    on the CPU.

    Parameters
    ----------
    prompt : str
        Text to continue.
    max_new_tokens : int
        Requested number of new tokens. Coerced to ``int`` (UI sliders may
        deliver floats) and capped at 200.
    temperature : float
        Sampling temperature, forwarded unchanged to ``generate_text_simple``.

    Returns
    -------
    str
        The token ids returned by ``generate_text_simple`` decoded with
        ``token_ids_to_text``.
    """
    device = torch.device("cuda")

    # Cast to float16 and move to CUDA in one pass. ``Module.to`` leaves
    # parameters that already match untouched, so repeated calls are cheap
    # and no separate dtype/device checks are needed.
    generator.model = generator.model.to(device=device, dtype=torch.float16).eval()

    # Tokenize on the CPU, then ship the ids to the GPU.
    input_ids = generator._prepare_inputs_cpu(prompt).to(device)

    # A float token count would break integer-based generation loops, so
    # normalise it before applying the 200-token ceiling.
    token_budget = min(int(max_new_tokens), 200)

    # Inference only: do not record an autograd graph while sampling.
    with torch.no_grad():
        output_ids = generate_text_simple(
            model=generator.model,
            idx=input_ids,
            max_new_tokens=token_budget,
            context_size=generator.config["context_length"],
            temperature=temperature,
        )

    # Decode on the CPU so the tokenizer never sees a CUDA tensor.
    output_ids_cpu = output_ids.detach().to("cpu")
    return token_ids_to_text(output_ids_cpu, generator.tokenizer)
105
+
106
+
107
def generate_text_interface(prompt, max_new_tokens, temperature):
    """Gradio callback: validate the inputs on the CPU, then generate on the GPU.

    Parameters
    ----------
    prompt : str
        Starting text typed by the user.
    max_new_tokens : int or float
        Slider value for the number of tokens to generate.
    temperature : int or float
        Slider value for the sampling temperature.

    Returns
    -------
    str
        A warning message when the prompt is empty or whitespace-only,
        otherwise whatever ``zero_gpu_generate`` returns.
    """
    # Reject blank prompts before any GPU time is requested.
    if not prompt or not prompt.strip():
        return "⚠️ Please enter some text to start with!"

    # Slider values can arrive as floats; normalise them here, still on the
    # CPU, so the GPU function receives a real int token count.
    # IMPORTANT: CUDA must not be used in this function.
    return zero_gpu_generate(prompt, int(max_new_tokens), float(temperature))
113
 
 
 
 
 
 
 
114
 
115
+ # ... keep your Gradio UI identical ...
116
+ # demo = gr.Blocks(...); generate_btn.click(fn=generate_text_interface, ...)
117
+ # demo.launch(...)