File size: 8,767 Bytes
e871c64
738bcf1
 
 
 
 
 
 
e871c64
 
738bcf1
 
 
 
 
e871c64
 
 
 
 
738bcf1
e871c64
 
 
 
 
 
 
 
 
 
 
738bcf1
e871c64
 
 
 
 
 
738bcf1
e871c64
 
 
 
 
 
 
 
 
 
 
 
738bcf1
e871c64
738bcf1
e871c64
738bcf1
e871c64
 
 
 
 
 
 
 
738bcf1
 
e871c64
 
738bcf1
e871c64
738bcf1
 
 
e871c64
226d165
e871c64
 
 
738bcf1
e871c64
 
 
 
 
 
 
738bcf1
e871c64
 
 
 
 
 
 
 
 
 
738bcf1
 
e871c64
 
 
 
 
 
 
 
 
 
 
 
738bcf1
 
e871c64
 
 
875e08c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# app.py  β€” only the changed/added parts shown

import gradio as gr
import torch
import tiktoken
from pathlib import Path
from huggingface_hub import hf_hub_download

import spaces  # <-- NEW: required for the ZeroGPU decorator

from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text


class TextGenerator:
    """Load the Qwen3 checkpoint from the HuggingFace Hub onto the CPU.

    The model deliberately stays on the CPU in the main process; all CUDA
    work is deferred to the ZeroGPU-decorated entrypoint elsewhere in this
    file.
    """

    def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
        print(" Loading Qwen3 model from HuggingFace...")
        print(f" Repository: {repo_id}")

        # Model hyper-parameters. dtype stays float32 here because the main
        # process is CPU-only (T4 usually lacks bf16); any GPU-side cast
        # happens later in the ZeroGPU worker.
        self.config = dict(
            vocab_size=151_936,
            context_length=40_960,
            emb_dim=1024,
            n_heads=16,
            n_layers=28,
            hidden_dim=3072,
            head_dim=128,
            qk_norm=True,
            n_kv_groups=8,
            rope_base=1_000_000.0,
            dtype=torch.float32,
        )

        # Never touch CUDA in the main process — ZeroGPU requirement.
        self.device = "cpu"
        print(f" Using device: {self.device}")

        # NOTE(review): GPT-2 BPE tokenizer while vocab_size matches Qwen3 —
        # presumably intentional for this fine-tune; confirm against training code.
        self.tokenizer = tiktoken.get_encoding("gpt2")
        print(" βœ“ Tokenizer loaded")

        # Fetch the checkpoint (huggingface_hub caches it locally).
        print(" Downloading model from HuggingFace (this may take a moment)...")
        ckpt_path = hf_hub_download(
            repo_id=repo_id,
            filename="Qwen3_200k_model_params.pt",
            repo_type="model",
        )
        print(f" βœ“ Model downloaded to: {ckpt_path}")

        # Instantiate on CPU, then restore the trained weights onto CPU.
        self.model = Qwen3Model(self.config)
        print(" βš™οΈ Loading model weights (CPU)...")
        state = torch.load(ckpt_path, map_location=torch.device("cpu"), weights_only=True)
        self.model.load_state_dict(state)
        self.model = self.model.to("cpu").eval()
        print(" βœ“ Model loaded successfully on CPU")
        print("βœ… Ready to generate text on CPU; GPU will be used only inside @spaces.GPU\n")

    def _prepare_inputs_cpu(self, prompt: str):
        """Tokenize *prompt* and return the token ids as a CPU tensor (no CUDA here)."""
        return text_to_token_ids(prompt, self.tokenizer)


# One-time, CPU-only model initialization at import time.
_BANNER = "=" * 70
print(_BANNER)
print("INITIALIZING TEXT GENERATION APP")
print(_BANNER)
generator = TextGenerator()


# === NEW: ZeroGPU entrypoint ===
@spaces.GPU(duration=30)  # the actual GPU work happens only here
def zero_gpu_generate(prompt: str, max_new_tokens: int, temperature: float) -> str:
    """Generate a continuation of *prompt* on CUDA inside a ZeroGPU worker.

    On first call, casts the shared ``generator.model`` to float16 and moves
    it to the GPU — both mutations persist on the global object, so later
    calls skip them. Generation is hard-capped at 200 new tokens. Returns
    the decoded text as a string.
    """
    # ZeroGPU child process context: safe to touch CUDA here
    device = torch.device("cuda")

    # 1) Move/ensure model & dtype on CUDA (T4 lacks bfloat16; use float16)
    #    If your block supports fp16, cast for speed. Otherwise keep float32.
    # NOTE(review): .half() permanently mutates the shared model; the dtype
    # check below is False on subsequent calls, so the cast runs only once.
    target_dtype = torch.float16
    if next(generator.model.parameters()).dtype != target_dtype:
        generator.model = generator.model.half()
    if next(generator.model.parameters()).device.type != "cuda":
        generator.model = generator.model.to(device).eval()

    # 2) Prepare inputs and move to CUDA
    input_ids = generator._prepare_inputs_cpu(prompt).to(device)

    # 3) Generate on CUDA (keep your existing generation function)
    output_ids = generate_text_simple(
        model=generator.model,
        idx=input_ids,
        max_new_tokens=min(max_new_tokens, 200),  # hard cap; matches the UI slider maximum
        context_size=generator.config["context_length"],
        temperature=temperature,
    )

    # 4) Back to text on CPU
    #    (token_ids_to_text likely uses CPU paths; ensure tensor is on CPU)
    output_ids_cpu = output_ids.detach().to("cpu")
    return token_ids_to_text(output_ids_cpu, generator.tokenizer)


def generate_text_interface(prompt, max_new_tokens, temperature):
    """Gradio callback: validate the prompt, then delegate to the GPU worker.

    Returns a warning string for an empty/whitespace-only prompt, otherwise
    the generated text. Runs in the main (CPU-only) process — CUDA must only
    be touched inside ``zero_gpu_generate``.
    """
    # Idiomatic truthiness check instead of len(...) == 0; also guards None.
    if not prompt or not prompt.strip():
        return "⚠️ Please enter some text to start with!"

    # IMPORTANT: call the GPU function; DO NOT use CUDA here
    return zero_gpu_generate(prompt, max_new_tokens, temperature)


# ... keep your Gradio UI identical ...
# demo = gr.Blocks(...); generate_btn.click(fn=generate_text_interface, ...)
# demo.launch(...)

# Declarative Gradio UI. Both triggers (button click and textbox submit) call
# generate_text_interface, which runs in the CPU-only main process.
with gr.Blocks(title="Qwen3 Text Generator", theme=gr.themes.Soft()) as demo:
    
    # Header
    gr.Markdown(
        """
        # πŸ€– Qwen3 Text Generator
        
        Generate creative stories and text using a Qwen3 model trained on TinyStories!
        
        ### How to use:
        1. **Enter your starting text** (e.g., "Once upon a time")
        2. **Adjust the sliders** to control the output
        3. **Click Generate** to create text
        """
    )
    
    # Main content area: two equal-width columns (inputs left, output right).
    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            gr.Markdown("### πŸ“ Input")
            
            prompt_input = gr.Textbox(
                label="Starting Text (Prompt)",
                placeholder="Once upon a time...",
                lines=3,
                info="Enter the text you want the model to continue"
            )
            
            # Control sliders
            gr.Markdown("### βš™οΈ Generation Settings")
            
            # Slider maximum (200) matches the hard cap inside zero_gpu_generate.
            max_tokens_slider = gr.Slider(
                minimum=10,
                maximum=200,
                value=50,
                step=10,
                label="Max New Tokens",
                info="How many new tokens to generate (roughly = number of words)"
            )
            
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Lower = more predictable, Higher = more creative"
            )
            
            # Generate button
            generate_btn = gr.Button(
                "✨ Generate Text", 
                variant="primary", 
                size="lg"
            )
        
        with gr.Column(scale=1):
            # Output section
            gr.Markdown("### πŸ“– Generated Text")
            
            output_text = gr.Textbox(
                label="Result",
                lines=15,
                interactive=False,
                show_copy_button=True
            )
    
    # Example prompts to try — clicking one fills all three inputs at once.
    gr.Markdown("### πŸ’‘ Try these examples:")
    gr.Examples(
        examples=[
            ["Once upon a time", 50, 0.8],
            ["There was a little girl named", 60, 1.0],
            ["In a magical forest", 70, 1.2],
            ["A brave knight", 50, 0.7],
            ["The sun was shining and", 60, 0.9],
        ],
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        label="Click any example to try it"
    )
    
    # Information section (static help text)
    gr.Markdown(
        """
        ---
        ### πŸ“Š About This Model
        
        - **Model**: Qwen3 0.6B (596M parameters)
        - **Training Data**: TinyStories dataset (children's stories)
        - **Architecture**: 28 transformer layers with Grouped Query Attention
        - **Model Source**: [vuminhtue/qwen3_sentiment_tinystories](https://huggingface.co/vuminhtue/qwen3_sentiment_tinystories)
        
        ### 🎯 Understanding the Parameters
        
        **Max New Tokens:**
        - Controls the length of generated text
        - One token β‰ˆ one word (roughly)
        - More tokens = longer output = slower generation
        
        **Temperature:**
        - `0.1 - 0.7`: Safe, predictable, focused responses
        - `0.8 - 1.0`: Balanced creativity and coherence
        - `1.1 - 2.0`: Very creative but may be less coherent
        
        ### ⚠️ Note
        
        This model was trained on children's stories, so it works best for:
        - Simple, clear narratives
        - Stories about everyday situations
        - Children's vocabulary and themes
        
        ---
        *Built with Qwen3 architecture β€’ Trained on TinyStories β€’ Powered by PyTorch β€’ Model hosted on πŸ€— HuggingFace*
        """
    )
    
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text
    )
    
    # Also allow pressing Enter in the text box to generate
    prompt_input.submit(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text
    )


# Launch the app
if __name__ == "__main__":
    # Entry point: print a banner, then serve the Gradio UI.
    divider = "=" * 70
    print(f"\n{divider}")
    print("LAUNCHING GRADIO APP")
    print(divider)
    demo.launch()