# app.py
import gradio as gr
import torch
import tiktoken
from pathlib import Path
from huggingface_hub import hf_hub_download
import spaces # <-- NEW: required for the ZeroGPU decorator
from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text
class TextGenerator:
    def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
        print("Loading Qwen3 model from HuggingFace...")
        print(f"Repository: {repo_id}")

        # Keep the config, but DON'T bind dtype to bfloat16 here (the T4 has
        # no bf16 support). We'll control dtype when moving to CUDA later.
        self.config = {
            "vocab_size": 151_936,
            "context_length": 40_960,
            "emb_dim": 1024,
            "n_heads": 16,
            "n_layers": 28,
            "hidden_dim": 3072,
            "head_dim": 128,
            "qk_norm": True,
            "n_kv_groups": 8,
            "rope_base": 1_000_000.0,
            "dtype": torch.float32,  # safe on CPU; we'll cast on GPU
        }
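        # Note (added): with n_heads=16 and n_kv_groups=8, each key/value head
        # serves 16 / 8 = 2 query heads -- the Grouped Query Attention setup
        # mentioned in the About section of the UI below.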
        # IMPORTANT: stay on CPU in the main process
        self.device = "cpu"
        print(f"Using device: {self.device}")

        # Tokenizer
        self.tokenizer = tiktoken.get_encoding("gpt2")
        print("✓ Tokenizer loaded")

        # Download checkpoint (cached by HF)
        print("Downloading model from HuggingFace (this may take a moment)...")
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename="Qwen3_200k_model_params.pt",
            repo_type="model"
        )
        print(f"✓ Model downloaded to: {model_path}")
        # Build model on CPU and load weights onto CPU
        self.model = Qwen3Model(self.config)
        print("Loading model weights (CPU)...")
        self.model.load_state_dict(
            torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
        )
        self.model = self.model.to("cpu").eval()
        print("✓ Model loaded successfully on CPU")
        print("✅ Ready to generate text on CPU; GPU will be used only inside @spaces.GPU\n")
    # Keep this as a thin CPU helper; no CUDA here.
    def _prepare_inputs_cpu(self, prompt: str):
        ids = text_to_token_ids(prompt, self.tokenizer)  # CPU tensor
        return ids
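    # For reference, a minimal sketch of what text_to_token_ids /
    # token_ids_to_text are assumed to do (the real helpers live in
    # Qwen3_model and may differ):
    #
    #   def text_to_token_ids(text, tokenizer):
    #       ids = tokenizer.encode(text)
    #       return torch.tensor(ids).unsqueeze(0)  # add a batch dimension
    #
    #   def token_ids_to_text(token_ids, tokenizer):
    #       return tokenizer.decode(token_ids.squeeze(0).tolist())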
# Initialize the generator once at startup (CPU only)
print("=" * 70)
print("INITIALIZING TEXT GENERATION APP")
print("=" * 70)
generator = TextGenerator()
# === NEW: ZeroGPU entrypoint ===
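# Note: `duration` caps the GPU time requested per call; the spaces docs note
# that shorter durations get higher priority in the ZeroGPU queue.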
@spaces.GPU(duration=30) # the actual GPU work happens only here
def zero_gpu_generate(prompt: str, max_new_tokens: int, temperature: float):
    # ZeroGPU child-process context: safe to touch CUDA here
    device = torch.device("cuda")

    # 1) Ensure model & dtype are on CUDA (the T4 lacks bfloat16; use float16).
    #    If your blocks support fp16, cast for speed; otherwise keep float32.
    target_dtype = torch.float16
    if next(generator.model.parameters()).dtype != target_dtype:
        generator.model = generator.model.half()
    if next(generator.model.parameters()).device.type != "cuda":
        generator.model = generator.model.to(device).eval()
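    # Hedged alternative: on GPUs that do support bfloat16 you could pick the
    # dtype dynamically instead of hard-coding float16, e.g.:
    #
    #   target_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    #
    # torch.cuda.is_bf16_supported() is standard PyTorch; float16 is simply
    # the safe choice for T4-class cards.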
    # 2) Prepare inputs and move them to CUDA
    input_ids = generator._prepare_inputs_cpu(prompt).to(device)

    # 3) Generate on CUDA (keep your existing generation function)
    output_ids = generate_text_simple(
        model=generator.model,
        idx=input_ids,
        max_new_tokens=min(max_new_tokens, 200),
        context_size=generator.config["context_length"],
        temperature=temperature,
    )

    # 4) Back to text on the CPU
    # (token_ids_to_text likely uses CPU paths; ensure the tensor is on CPU)
    output_ids_cpu = output_ids.detach().to("cpu")
    return token_ids_to_text(output_ids_cpu, generator.tokenizer)
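# Quick smoke test (a sketch, assuming a local CUDA device is available):
# outside of a ZeroGPU Space the @spaces.GPU decorator is documented to be a
# no-op, so the function can be sanity-checked from a Python shell:
#
#   print(zero_gpu_generate("Once upon a time", max_new_tokens=20, temperature=0.8))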
def generate_text_interface(prompt, max_new_tokens, temperature):
    if not prompt or len(prompt.strip()) == 0:
        return "⚠️ Please enter some text to start with!"
    # IMPORTANT: call the GPU function; DO NOT use CUDA here
    return zero_gpu_generate(prompt, max_new_tokens, temperature)
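# Hedged sketch: ZeroGPU can raise errors when a visitor's GPU quota is
# exhausted; if that matters, the call above could be wrapped so the UI shows
# a message instead of a stack trace:
#
#   try:
#       return zero_gpu_generate(prompt, max_new_tokens, temperature)
#   except Exception as exc:  # e.g. quota errors from the spaces runtime
#       return f"⚠️ Generation failed: {exc}"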
# === Gradio UI ===
with gr.Blocks(title="Qwen3 Text Generator", theme=gr.themes.Soft()) as demo:
    # Header
    gr.Markdown(
        """
        # 🤖 Qwen3 Text Generator

        Generate creative stories and text using a Qwen3 model trained on TinyStories!

        ### How to use:
        1. **Enter your starting text** (e.g., "Once upon a time")
        2. **Adjust the sliders** to control the output
        3. **Click Generate** to create text
        """
    )
    # Main content area
    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            gr.Markdown("### 📝 Input")
            prompt_input = gr.Textbox(
                label="Starting Text (Prompt)",
                placeholder="Once upon a time...",
                lines=3,
                info="Enter the text you want the model to continue"
            )

            # Control sliders
            gr.Markdown("### ⚙️ Generation Settings")
            max_tokens_slider = gr.Slider(
                minimum=10,
                maximum=200,
                value=50,
                step=10,
                label="Max New Tokens",
                info="How many new tokens to generate (one token is roughly one word)"
            )
            temperature_slider = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Lower = more predictable, higher = more creative"
            )

            # Generate button
            generate_btn = gr.Button(
                "✨ Generate Text",
                variant="primary",
                size="lg"
            )

        with gr.Column(scale=1):
            # Output section
            gr.Markdown("### 📄 Generated Text")
            output_text = gr.Textbox(
                label="Result",
                lines=15,
                interactive=False,
                show_copy_button=True
            )

    # Example prompts to try
    gr.Markdown("### 💡 Try these examples:")
    gr.Examples(
        examples=[
            ["Once upon a time", 50, 0.8],
            ["There was a little girl named", 60, 1.0],
            ["In a magical forest", 70, 1.2],
            ["A brave knight", 50, 0.7],
            ["The sun was shining and", 60, 0.9],
        ],
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        label="Click any example to try it"
    )
    # Information section
    gr.Markdown(
        """
        ---
        ### 📊 About This Model
        - **Model**: Qwen3 0.6B (596M parameters)
        - **Training Data**: TinyStories dataset (children's stories)
        - **Architecture**: 28 transformer layers with Grouped Query Attention
        - **Model Source**: [vuminhtue/qwen3_sentiment_tinystories](https://huggingface.co/vuminhtue/qwen3_sentiment_tinystories)

        ### 🎯 Understanding the Parameters

        **Max New Tokens:**
        - Controls the length of the generated text
        - One token ≈ one word
        - More tokens = longer output = slower generation

        **Temperature:**
        - `0.1 - 0.7`: safe, predictable, focused responses
        - `0.8 - 1.0`: balanced creativity and coherence
        - `1.1 - 2.0`: very creative but may be less coherent

        ### ⚠️ Note
        This model was trained on children's stories, so it works best for:
        - Simple, clear narratives
        - Stories about everyday situations
        - Children's vocabulary and themes

        ---
        *Built with the Qwen3 architecture • Trained on TinyStories • Powered by PyTorch • Model hosted on 🤗 HuggingFace*
        """
    )
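    # A conventional sketch of how temperature is applied during sampling
    # (the actual logic lives in generate_text_simple and may differ):
    #
    #   probs = torch.softmax(logits / temperature, dim=-1)  # T<1 sharpens, T>1 flattens
    #   next_token = torch.multinomial(probs, num_samples=1)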
    # Connect the button to the generation function
    generate_btn.click(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text
    )

    # Also allow pressing Enter in the textbox to generate
    prompt_input.submit(
        fn=generate_text_interface,
        inputs=[prompt_input, max_tokens_slider, temperature_slider],
        outputs=output_text
    )
# Launch the app
if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("LAUNCHING GRADIO APP")
    print("=" * 70)
    demo.launch()