# app.py — only the changed/added parts shown
import gradio as gr
import torch
import tiktoken
from huggingface_hub import hf_hub_download
import spaces  # <-- NEW: required for the ZeroGPU decorator

from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text


class TextGenerator:
    def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
        print(" Loading Qwen3 model from HuggingFace...")
        print(f" Repository: {repo_id}")

        # Keep the config, but DON'T bind dtype to bfloat16 here (a T4 usually
        # lacks bf16 support). We'll control dtype when moving to CUDA later.
        self.config = {
            "vocab_size": 151_936,
            "context_length": 40_960,
            "emb_dim": 1024,
            "n_heads": 16,
            "n_layers": 28,
            "hidden_dim": 3072,
            "head_dim": 128,
            "qk_norm": True,
            "n_kv_groups": 8,
            "rope_base": 1_000_000.0,
            "dtype": torch.float32,  # <-- safe on CPU; we'll cast on GPU
        }

        # IMPORTANT: stay on CPU in the main process
        self.device = "cpu"
        print(f" Using device: {self.device}")

        # Tokenizer
        self.tokenizer = tiktoken.get_encoding("gpt2")
        print(" ✓ Tokenizer loaded")

        # Download the checkpoint (cached by the HF hub)
        print(" Downloading model from HuggingFace (this may take a moment)...")
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename="Qwen3_200k_model_params.pt",
            repo_type="model",
        )
        print(f" ✓ Model downloaded to: {model_path}")

        # Build the model on CPU and load the weights onto CPU
        self.model = Qwen3Model(self.config)
        print(" ⚙️ Loading model weights (CPU)...")
        self.model.load_state_dict(
            torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
        )
        self.model = self.model.to("cpu").eval()
        print(" ✓ Model loaded successfully on CPU")
        print("✅ Ready to generate text on CPU; GPU will be used only inside @spaces.GPU\n")

    # Keep this as a thin CPU helper; no CUDA here.
    def _prepare_inputs_cpu(self, prompt: str):
        ids = text_to_token_ids(prompt, self.tokenizer)  # CPU tensor
        return ids


# Initialize the generator once at startup (CPU only)
print("=" * 70)
print("INITIALIZING TEXT GENERATION APP")
print("=" * 70)
generator = TextGenerator()


# === NEW: ZeroGPU entrypoint ===
@spaces.GPU(duration=30)  # the actual GPU work happens only here
def zero_gpu_generate(prompt: str, max_new_tokens: int, temperature: float):
    # ZeroGPU child-process context: safe to touch CUDA here
    device = torch.device("cuda")

    # 1) Move/ensure model & dtype on CUDA (a T4 lacks bfloat16; use float16).
    #    If your model supports fp16, cast for speed; otherwise keep float32.
    target_dtype = torch.float16
    if next(generator.model.parameters()).dtype != target_dtype:
        generator.model = generator.model.half()
    if next(generator.model.parameters()).device.type != "cuda":
        generator.model = generator.model.to(device).eval()

    # 2) Prepare inputs on CPU, then move them to CUDA
    input_ids = generator._prepare_inputs_cpu(prompt).to(device)

    # 3) Generate on CUDA (keep your existing generation function)
    output_ids = generate_text_simple(
        model=generator.model,
        idx=input_ids,
        max_new_tokens=min(int(max_new_tokens), 200),  # sliders pass floats; range() needs an int
        context_size=generator.config["context_length"],
        temperature=temperature,
    )

    # 4) Back to text on CPU
    #    (token_ids_to_text likely uses CPU paths; ensure the tensor is on CPU)
    output_ids_cpu = output_ids.detach().to("cpu")
    return token_ids_to_text(output_ids_cpu, generator.tokenizer)


def generate_text_interface(prompt, max_new_tokens, temperature):
    if not prompt or not prompt.strip():
        return "⚠️ Please enter some text to start with!"
    # IMPORTANT: call the GPU function; DO NOT use CUDA here
    return zero_gpu_generate(prompt, max_new_tokens, temperature)


# ... the Gradio UI below is identical to the CPU-only version ...
with gr.Blocks(title="Qwen3 Text Generator", theme=gr.themes.Soft()) as demo:
    # Header
    gr.Markdown(
        """
        # 🤖 Qwen3 Text Generator

        Generate creative stories and text using a Qwen3 model trained on TinyStories!

        ### How to use:
        1. **Enter your starting text** (e.g., "Once upon a time")
        2. **Adjust the sliders** to control the output
        3. **Click Generate** to create text
        """
    )

    # Main content area
    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            gr.Markdown("### 📝 Input")
            prompt_input = gr.Textbox(
                label="Starting Text (Prompt)",
                placeholder="Once upon a time...",
                lines=3,
                info="Enter the text you want the model to continue",
            )

            # Control sliders
            gr.Markdown("### ⚙️ Generation Settings")
            max_tokens_slider = gr.Slider(
                minimum=10,
                maximum=200,
                value=50,
                step=10,
                label="Max New Tokens",
                info="How many new tokens to generate (roughly = number of words)",
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Lower = more predictable, Higher = more creative",
            )

            # Generate button
            generate_btn = gr.Button(
                "✨ Generate Text",
                variant="primary",
                size="lg",
            )

        with gr.Column(scale=1):
            # Output section
            gr.Markdown("### 📖 Generated Text")
            output_text = gr.Textbox(
                label="Result",
                lines=15,
                interactive=False,
                show_copy_button=True,
            )

    # Example prompts to try
    gr.Markdown("### 💡 Try these examples:")
    gr.Examples(
        examples=[
            ["Once upon a time", 50, 0.8],
            ["There was a little girl named", 60, 1.0],
            ["In a magical forest", 70, 1.2],
            ["A brave knight", 50, 0.7],
            ["The sun was shining and", 60, 0.9],
        ],
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        label="Click any example to try it",
    )

    # Information section
    gr.Markdown(
        """
        ---
        ### 📊 About This Model
        - **Model**: Qwen3 0.6B (596M parameters)
        - **Training Data**: TinyStories dataset (children's stories)
        - **Architecture**: 28 transformer layers with Grouped Query Attention
        - **Model Source**: [vuminhtue/qwen3_sentiment_tinystories](https://huggingface.co/vuminhtue/qwen3_sentiment_tinystories)

        ### 🎯 Understanding the Parameters

        **Max New Tokens:**
        - Controls the length of the generated text
        - One token ≈ one word (roughly)
        - More tokens = longer output = slower generation

        **Temperature:**
        - `0.1 - 0.7`: Safe, predictable, focused responses
        - `0.8 - 1.0`: Balanced creativity and coherence
        - `1.1 - 2.0`: Very creative but may be less coherent

        ### ⚠️ Note
        This model was trained on children's stories, so it works best for:
        - Simple, clear narratives
        - Stories about everyday situations
        - Children's vocabulary and themes

        ---
        *Built with Qwen3 architecture • Trained on TinyStories • Powered by PyTorch • Model hosted on 🤗 HuggingFace*
        """
    )

    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text,
    )

    # Also allow pressing Enter in the text box to generate
    prompt_input.submit(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text,
    )


# Launch the app
if __name__ == "__main__":":
    print("\n" + "=" * 70)
    print("LAUNCHING GRADIO APP")
    print("=" * 70)
    demo.launch()
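
# ---------------------------------------------------------------------------
# Appendix: a minimal sketch of the tokenizer helpers this file imports from
# Qwen3_model.py, for readers adapting the app without that module. These are
# assumptions based on the common text_to_token_ids / token_ids_to_text
# pattern for tiktoken, not the actual module contents:
#
#     def text_to_token_ids(text, tokenizer):
#         # Encode the prompt and add a batch dimension: shape (1, seq_len).
#         ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
#         return torch.tensor(ids).unsqueeze(0)
#
#     def token_ids_to_text(token_ids, tokenizer):
#         # Drop the batch dimension and decode back to a string.
#         return tokenizer.decode(token_ids.squeeze(0).tolist())
# ---------------------------------------------------------------------------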