# app.py
import gradio as gr
import torch
import tiktoken
from pathlib import Path
from huggingface_hub import hf_hub_download
import spaces # <-- NEW: required for the ZeroGPU decorator
from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text
class TextGenerator:
    def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
        print("Loading Qwen3 model from HuggingFace...")
        print(f"Repository: {repo_id}")

        # Keep the config, but DON'T bind dtype to bfloat16 here (the T4 has
        # no bf16 support). We'll control dtype when moving to CUDA later.
        self.config = {
            "vocab_size": 151_936,
            "context_length": 40_960,
            "emb_dim": 1024,
            "n_heads": 16,
            "n_layers": 28,
            "hidden_dim": 3072,
            "head_dim": 128,
            "qk_norm": True,
            "n_kv_groups": 8,
            "rope_base": 1_000_000.0,
            "dtype": torch.float32,  # safe on CPU; we'll cast on GPU
        }
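        # Note (added): with n_heads=16 and n_kv_groups=8, each key/value head
        # serves 16 / 8 = 2 query heads -- the Grouped Query Attention setup
        # mentioned in the About section of the UI below.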
        # IMPORTANT: stay on CPU in the main process
        self.device = "cpu"
        print(f"Using device: {self.device}")

        # Tokenizer
        self.tokenizer = tiktoken.get_encoding("gpt2")
        print("✓ Tokenizer loaded")

        # Download checkpoint (cached by HF)
        print("Downloading model from HuggingFace (this may take a moment)...")
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename="Qwen3_200k_model_params.pt",
            repo_type="model"
        )
        print(f"✓ Model downloaded to: {model_path}")
        # Build model on CPU and load weights onto CPU
        self.model = Qwen3Model(self.config)
        print("Loading model weights (CPU)...")
        self.model.load_state_dict(
            torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
        )
        self.model = self.model.to("cpu").eval()
        print("✓ Model loaded successfully on CPU")
        print("✅ Ready to generate text on CPU; GPU will be used only inside @spaces.GPU\n")
    # Keep this as a thin CPU helper; no CUDA here.
    def _prepare_inputs_cpu(self, prompt: str):
        ids = text_to_token_ids(prompt, self.tokenizer)  # CPU tensor
        return ids
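    # For reference, a minimal sketch of what text_to_token_ids /
    # token_ids_to_text are assumed to do (the real helpers live in
    # Qwen3_model and may differ):
    #
    #   def text_to_token_ids(text, tokenizer):
    #       ids = tokenizer.encode(text)
    #       return torch.tensor(ids).unsqueeze(0)  # add a batch dimension
    #
    #   def token_ids_to_text(token_ids, tokenizer):
    #       return tokenizer.decode(token_ids.squeeze(0).tolist())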
# Initialize the generator once at startup (CPU only)
print("=" * 70)
print("INITIALIZING TEXT GENERATION APP")
print("=" * 70)
generator = TextGenerator()
# === NEW: ZeroGPU entrypoint ===
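# Note: `duration` caps the GPU time requested per call; the spaces docs note
# that shorter durations get higher priority in the ZeroGPU queue.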
@spaces.GPU(duration=30) # the actual GPU work happens only here
def zero_gpu_generate(prompt: str, max_new_tokens: int, temperature: float):
    # ZeroGPU child-process context: safe to touch CUDA here
    device = torch.device("cuda")

    # 1) Ensure model & dtype are on CUDA (the T4 lacks bfloat16; use float16).
    #    If your blocks support fp16, cast for speed; otherwise keep float32.
    target_dtype = torch.float16
    if next(generator.model.parameters()).dtype != target_dtype:
        generator.model = generator.model.half()
    if next(generator.model.parameters()).device.type != "cuda":
        generator.model = generator.model.to(device).eval()
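    # Hedged alternative: on GPUs that do support bfloat16 you could pick the
    # dtype dynamically instead of hard-coding float16, e.g.:
    #
    #   target_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    #
    # torch.cuda.is_bf16_supported() is standard PyTorch; float16 is simply
    # the safe choice for T4-class cards.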
    # 2) Prepare inputs and move them to CUDA
    input_ids = generator._prepare_inputs_cpu(prompt).to(device)

    # 3) Generate on CUDA (keep your existing generation function)
    output_ids = generate_text_simple(
        model=generator.model,
        idx=input_ids,
        max_new_tokens=min(max_new_tokens, 200),
        context_size=generator.config["context_length"],
        temperature=temperature,
    )

    # 4) Back to text on the CPU
    # (token_ids_to_text likely uses CPU paths; ensure the tensor is on CPU)
    output_ids_cpu = output_ids.detach().to("cpu")
    return token_ids_to_text(output_ids_cpu, generator.tokenizer)
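# Quick smoke test (a sketch, assuming a local CUDA device is available):
# outside of a ZeroGPU Space the @spaces.GPU decorator is documented to be a
# no-op, so the function can be sanity-checked from a Python shell:
#
#   print(zero_gpu_generate("Once upon a time", max_new_tokens=20, temperature=0.8))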
def generate_text_interface(prompt, max_new_tokens, temperature):
    if not prompt or len(prompt.strip()) == 0:
        return "⚠️ Please enter some text to start with!"
    # IMPORTANT: call the GPU function; DO NOT use CUDA here
    return zero_gpu_generate(prompt, max_new_tokens, temperature)
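# Hedged sketch: ZeroGPU can raise errors when a visitor's GPU quota is
# exhausted; if that matters, the call above could be wrapped so the UI shows
# a message instead of a stack trace:
#
#   try:
#       return zero_gpu_generate(prompt, max_new_tokens, temperature)
#   except Exception as exc:  # e.g. quota errors from the spaces runtime
#       return f"⚠️ Generation failed: {exc}"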
# === Gradio UI ===
with gr.Blocks(title="Qwen3 Text Generator", theme=gr.themes.Soft()) as demo:
    # Header
    gr.Markdown(
        """
        # 🤖 Qwen3 Text Generator

        Generate creative stories and text using a Qwen3 model trained on TinyStories!

        ### How to use:
        1. **Enter your starting text** (e.g., "Once upon a time")
        2. **Adjust the sliders** to control the output
        3. **Click Generate** to create text
        """
    )
    # Main content area
    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            gr.Markdown("### 📝 Input")
            prompt_input = gr.Textbox(
                label="Starting Text (Prompt)",
                placeholder="Once upon a time...",
                lines=3,
                info="Enter the text you want the model to continue"
            )

            # Control sliders
            gr.Markdown("### ⚙️ Generation Settings")
            max_tokens_slider = gr.Slider(
                minimum=10,
                maximum=200,
                value=50,
                step=10,
                label="Max New Tokens",
                info="How many new tokens to generate (one token is roughly one word)"
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Lower = more predictable, higher = more creative"
            )

            # Generate button
            generate_btn = gr.Button(
                "✨ Generate Text",
                variant="primary",
                size="lg"
            )

        with gr.Column(scale=1):
            # Output section
            gr.Markdown("### 📄 Generated Text")
            output_text = gr.Textbox(
                label="Result",
                lines=15,
                interactive=False,
                show_copy_button=True
            )

    # Example prompts to try
    gr.Markdown("### 💡 Try these examples:")
    gr.Examples(
        examples=[
            ["Once upon a time", 50, 0.8],
            ["There was a little girl named", 60, 1.0],
            ["In a magical forest", 70, 1.2],
            ["A brave knight", 50, 0.7],
            ["The sun was shining and", 60, 0.9],
        ],
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        label="Click any example to try it"
    )
    # Information section
    gr.Markdown(
        """
        ---
        ### 📊 About This Model
        - **Model**: Qwen3 0.6B (596M parameters)
        - **Training Data**: TinyStories dataset (children's stories)
        - **Architecture**: 28 transformer layers with Grouped Query Attention
        - **Model Source**: [vuminhtue/qwen3_sentiment_tinystories](https://huggingface.co/vuminhtue/qwen3_sentiment_tinystories)

        ### 🎯 Understanding the Parameters

        **Max New Tokens:**
        - Controls the length of the generated text
        - One token ≈ one word
        - More tokens = longer output = slower generation

        **Temperature:**
        - `0.1 - 0.7`: safe, predictable, focused responses
        - `0.8 - 1.0`: balanced creativity and coherence
        - `1.1 - 2.0`: very creative but may be less coherent

        ### ⚠️ Note
        This model was trained on children's stories, so it works best for:
        - Simple, clear narratives
        - Stories about everyday situations
        - Children's vocabulary and themes

        ---
        *Built with the Qwen3 architecture • Trained on TinyStories • Powered by PyTorch • Model hosted on 🤗 HuggingFace*
        """
    )
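    # A conventional sketch of how temperature is applied during sampling
    # (the actual logic lives in generate_text_simple and may differ):
    #
    #   probs = torch.softmax(logits / temperature, dim=-1)  # T<1 sharpens, T>1 flattens
    #   next_token = torch.multinomial(probs, num_samples=1)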
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text
    )

    # Also allow pressing Enter in the textbox to generate
    prompt_input.submit(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text
    )
# Launch the app
if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("LAUNCHING GRADIO APP")
    print("=" * 70)
    demo.launch()