Spaces:

vuminhtue
/

Qwen3_Sentence_Completion

Sleeping

App Files Files Community

vuminhtue committed on Oct 15, 2025

Commit

e871c64

verified ·

1 Parent(s): 738bcf1

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -286

app.py CHANGED Viewed

@@ -1,12 +1,4 @@
-"""
-Qwen3 Text Generation App for Hugging Face Spaces
-This app allows you to generate text using a trained Qwen3 model.
-You can control:
-- The starting text (prompt)
-- How many new words to generate (max_new_tokens)
-- How creative the output should be (temperature)
-"""
 import gradio as gr
 import torch
@@ -14,303 +6,112 @@ import tiktoken
 from pathlib import Path
 from huggingface_hub import hf_hub_download
-# Import our Qwen3 model
 from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text
 class TextGenerator:
-    """
-    A simple class to load the model and generate text
-    This makes it easy to:
-    1. Load the trained model once at startup
-    2. Generate text multiple times without reloading
-    """
     def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
-        """
-        Initialize the text generator
-        Parameters:
-        -----------
-        repo_id : str
-            HuggingFace repository ID to download the model from
-            Default: "vuminhtue/qwen3_sentiment_tinystories"
-        """
-        print("🚀 Loading Qwen3 model from HuggingFace...")
-        print(f"   Repository: {repo_id}")
-        # Configuration for Qwen3 0.6B model
-        # These settings define the architecture of the model
         self.config = {
-            "vocab_size": 151_936,      # Number of different tokens the model knows
-            "context_length": 40_960,   # Maximum length of text it can process
-            "emb_dim": 1024,            # Size of the embedding vectors
-            "n_heads": 16,              # Number of attention heads
-            "n_layers": 28,             # Number of transformer layers
-            "hidden_dim": 3072,         # Size of the feed-forward network
-            "head_dim": 128,            # Size of each attention head
-            "qk_norm": True,            # Whether to normalize queries and keys
-            "n_kv_groups": 8,           # Number of key-value groups
-            "rope_base": 1_000_000.0,   # Base for rotary position encoding
-            "dtype": torch.bfloat16,    # Data type for model weights
         }
-        # Detect if we have a GPU available
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"   Using device: {self.device}")
-        # Load the tokenizer (converts text to numbers and back)
-        # We use GPT-2's tokenizer which works well for English text
         self.tokenizer = tiktoken.get_encoding("gpt2")
-        print("   ✓ Tokenizer loaded")
-        # Download the model file from HuggingFace
-        # This will cache the file locally, so it only downloads once
-        print("   📥 Downloading model from HuggingFace (this may take a moment)...")
-        try:
-            model_path = hf_hub_download(
-                repo_id=repo_id,
-                filename="Qwen3_200k_model_params.pt",
-                repo_type="model"
-            )
-            print(f"   ✓ Model downloaded to: {model_path}")
-        except Exception as e:
-            print(f"   ❌ Error downloading model: {e}")
-            raise
-        # Create the model with our configuration
         self.model = Qwen3Model(self.config)
-        # Load the trained weights from the downloaded file
-        print("   ⚙️  Loading model weights...")
         self.model.load_state_dict(
-            torch.load(
-                model_path,
-                map_location=torch.device(self.device),
-                weights_only=True
-            )
         )
-        # Move model to the appropriate device (CPU or GPU)
-        self.model = self.model.to(self.device)
-        # Set to evaluation mode (disables training-specific features)
-        self.model.eval()
-        print("   ✓ Model loaded successfully!")
-        print("✅ Ready to generate text!\n")
-    def generate(self, prompt, max_new_tokens=50, temperature=1.0):
-        """
-        Generate text based on a prompt
-        Parameters:
-        -----------
-        prompt : str
-            The starting text (what you want the model to continue)
-        max_new_tokens : int
-            How many new tokens (roughly words) to generate
-        temperature : float
-            Controls creativity:
-            - Lower (0.1-0.7): More predictable, focused
-            - Medium (0.8-1.0): Balanced
-            - Higher (1.1-2.0): More creative, random
-        Returns:
-        --------
-        str : The generated text (including the original prompt)
-        """
-        try:
-            # Convert the text prompt to token IDs (numbers)
-            input_ids = text_to_token_ids(prompt, self.tokenizer)
-            input_ids = input_ids.to(self.device)
-            # Generate new tokens
-            output_ids = generate_text_simple(
-                model=self.model,
-                idx=input_ids,
-                max_new_tokens=max_new_tokens,
-                context_size=self.config["context_length"],
-                temperature=temperature
-            )
-            # Convert the token IDs back to text
-            generated_text = token_ids_to_text(output_ids, self.tokenizer)
-            return generated_text
-        except Exception as e:
-            return f"❌ Error generating text: {str(e)}"
-# Initialize the generator once when the app starts
-print("="*70)
 print("INITIALIZING TEXT GENERATION APP")
-print("="*70)
 generator = TextGenerator()
-def generate_text_interface(prompt, max_new_tokens, temperature):
-    """
-    Interface function for Gradio
-    This function:
-    1. Takes inputs from the user interface
-    2. Calls our generator
-    3. Returns the result to display
-    """
-    # Check if prompt is empty
-    if not prompt or len(prompt.strip()) == 0:
-        return "⚠️ Please enter some text to start with!"
-    # Limit max tokens to prevent very long generation times
-    max_new_tokens = min(max_new_tokens, 200)
-    # Generate text
-    result = generator.generate(prompt, max_new_tokens, temperature)
-    return result
-# Create the Gradio interface
-# This defines what the web app looks like and how it behaves
-with gr.Blocks(title="Qwen3 Text Generator", theme=gr.themes.Soft()) as demo:
-    # Header
-    gr.Markdown(
-        """
-        # 🤖 Qwen3 Text Generator
-        Generate creative stories and text using a Qwen3 model trained on TinyStories!
-        ### How to use:
-        1. **Enter your starting text** (e.g., "Once upon a time")
-        2. **Adjust the sliders** to control the output
-        3. **Click Generate** to create text
-        """
-    )
-    # Main content area
-    with gr.Row():
-        with gr.Column(scale=1):
-            # Input section
-            gr.Markdown("### 📝 Input")
-            prompt_input = gr.Textbox(
-                label="Starting Text (Prompt)",
-                placeholder="Once upon a time...",
-                lines=3,
-                info="Enter the text you want the model to continue"
-            )
-            # Control sliders
-            gr.Markdown("### ⚙️ Generation Settings")
-            max_tokens_slider = gr.Slider(
-                minimum=10,
-                maximum=200,
-                value=50,
-                step=10,
-                label="Max New Tokens",
-                info="How many new tokens to generate (roughly = number of words)"
-            )
-            temperature_slider = gr.Slider(
-                minimum=0.1,
-                maximum=2.0,
-                value=1.0,
-                step=0.1,
-                label="Temperature",
-                info="Lower = more predictable, Higher = more creative"
-            )
-            # Generate button
-            generate_btn = gr.Button(
-                "✨ Generate Text",
-                variant="primary",
-                size="lg"
-            )
-        with gr.Column(scale=1):
-            # Output section
-            gr.Markdown("### 📖 Generated Text")
-            output_text = gr.Textbox(
-                label="Result",
-                lines=15,
-                interactive=False,
-                show_copy_button=True
-            )
-    # Example prompts to try
-    gr.Markdown("### 💡 Try these examples:")
-    gr.Examples(
-        examples=[
-            ["Once upon a time", 50, 0.8],
-            ["There was a little girl named", 60, 1.0],
-            ["In a magical forest", 70, 1.2],
-            ["A brave knight", 50, 0.7],
-            ["The sun was shining and", 60, 0.9],
-        ],
-        inputs=[prompt_input, max_tokens_slider, temperature_slider],
-        label="Click any example to try it"
-    )
-    # Information section
-    gr.Markdown(
-        """
-        ---
-        ### 📊 About This Model
-        - **Model**: Qwen3 0.6B (596M parameters)
-        - **Training Data**: TinyStories dataset (children's stories)
-        - **Architecture**: 28 transformer layers with Grouped Query Attention
-        - **Model Source**: [vuminhtue/qwen3_sentiment_tinystories](https://huggingface.co/vuminhtue/qwen3_sentiment_tinystories)
-        ### 🎯 Understanding the Parameters
-        **Max New Tokens:**
-        - Controls the length of generated text
-        - One token ≈ one word (roughly)
-        - More tokens = longer output = slower generation
-        **Temperature:**
-        - `0.1 - 0.7`: Safe, predictable, focused responses
-        - `0.8 - 1.0`: Balanced creativity and coherence
-        - `1.1 - 2.0`: Very creative but may be less coherent
-        ### ⚠️ Note
-        This model was trained on children's stories, so it works best for:
-        - Simple, clear narratives
-        - Stories about everyday situations
-        - Children's vocabulary and themes
-        ---
-        *Built with Qwen3 architecture • Trained on TinyStories • Powered by PyTorch • Model hosted on 🤗 HuggingFace*
-        """
-    )
-    # Connect the button to the generation function
-    generate_btn.click(
-        fn=generate_text_interface,
-        inputs=[prompt_input, max_tokens_slider, temperature_slider],
-        outputs=output_text
-    )
-    # Also allow pressing Enter in the text box to generate
-    prompt_input.submit(
-        fn=generate_text_interface,
-        inputs=[prompt_input, max_tokens_slider, temperature_slider],
-        outputs=output_text
     )
-# Launch the app
-if __name__ == "__main__":
-    print("\n" + "="*70)
-    print("LAUNCHING GRADIO APP")
-    print("="*70)
-    demo.launch()

+# app.py  — only the changed/added parts shown
 import gradio as gr
 import torch
 from pathlib import Path
 from huggingface_hub import hf_hub_download
+import spaces  # <-- NEW: required for the ZeroGPU decorator
 from Qwen3_model import Qwen3Model, generate_text_simple, text_to_token_ids, token_ids_to_text
 class TextGenerator:
     def __init__(self, repo_id="vuminhtue/qwen3_sentiment_tinystories"):
+        print(" Loading Qwen3 model from HuggingFace...")
+        print(f" Repository: {repo_id}")
+        # Keep the config, but DON'T bind dtype to bfloat16 here (T4 usually lacks bf16).
+        # We'll control the dtype later, when the model is moved to CUDA.
         self.config = {
+            "vocab_size": 151_936,
+            "context_length": 40_960,
+            "emb_dim": 1024,
+            "n_heads": 16,
+            "n_layers": 28,
+            "hidden_dim": 3072,
+            "head_dim": 128,
+            "qk_norm": True,
+            "n_kv_groups": 8,
+            "rope_base": 1_000_000.0,
+            "dtype": torch.float32,  # <-- SAFE on CPU; we’ll cast on GPU
         }
+        # IMPORTANT: stay on CPU in the main process
+        self.device = "cpu"
+        print(f" Using device: {self.device}")
+        # Tokenizer
         self.tokenizer = tiktoken.get_encoding("gpt2")
+        print(" ✓ Tokenizer loaded")
+        # Download checkpoint (cached by HF)
+        print(" Downloading model from HuggingFace (this may take a moment)...")
+        model_path = hf_hub_download(
+            repo_id=repo_id,
+            filename="Qwen3_200k_model_params.pt",
+            repo_type="model"
+        )
+        print(f" ✓ Model downloaded to: {model_path}")
+        # Build model on CPU and load weights onto CPU
         self.model = Qwen3Model(self.config)
+        print(" ⚙️ Loading model weights (CPU)...")
         self.model.load_state_dict(
+            torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
         )
+        self.model = self.model.to("cpu").eval()
+        print(" ✓ Model loaded successfully on CPU")
+        print("✅ Ready to generate text on CPU; GPU will be used only inside @spaces.GPU\n")
+    # Keep this as a thin CPU helper; no CUDA here.
+    def _prepare_inputs_cpu(self, prompt: str):
+        ids = text_to_token_ids(prompt, self.tokenizer)  # CPU tensor
+        return ids
+# Initialize the generator once at startup (CPU only)
+print("=" * 70)
 print("INITIALIZING TEXT GENERATION APP")
+print("=" * 70)
 generator = TextGenerator()
+# === NEW: ZeroGPU entrypoint ===
+@spaces.GPU(duration=120)  # the actual GPU work happens only here
+def zero_gpu_generate(prompt: str, max_new_tokens: int, temperature: float):
+    # ZeroGPU child process context: safe to touch CUDA here
+    device = torch.device("cuda")
+    # 1) Ensure the model has the target dtype and is on CUDA (T4 lacks bfloat16; use float16).
+    #    If your model supports fp16, cast for speed; otherwise keep float32.
+    target_dtype = torch.float16
+    if next(generator.model.parameters()).dtype != target_dtype:
+        generator.model = generator.model.half()
+    if next(generator.model.parameters()).device.type != "cuda":
+        generator.model = generator.model.to(device).eval()
+    # 2) Prepare inputs and move to CUDA
+    input_ids = generator._prepare_inputs_cpu(prompt).to(device)
+    # 3) Generate on CUDA (keep your existing generation function)
+    output_ids = generate_text_simple(
+        model=generator.model,
+        idx=input_ids,
+        max_new_tokens=min(max_new_tokens, 200),
+        context_size=generator.config["context_length"],
+        temperature=temperature,
     )
+    # 4) Back to text on CPU
+    #    (token_ids_to_text likely uses CPU paths; ensure tensor is on CPU)
+    output_ids_cpu = output_ids.detach().to("cpu")
+    return token_ids_to_text(output_ids_cpu, generator.tokenizer)
+def generate_text_interface(prompt, max_new_tokens, temperature):
+    if not prompt or len(prompt.strip()) == 0:
+        return "⚠️ Please enter some text to start with!"
+    # IMPORTANT: call the GPU function; DO NOT use CUDA here
+    return zero_gpu_generate(prompt, max_new_tokens, temperature)
+# ... keep your Gradio UI identical ...
+# demo = gr.Blocks(...); generate_btn.click(fn=generate_text_interface, ...)
+# demo.launch(...)