import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
# --- Model setup -------------------------------------------------------------
# Hugging Face model id plus a local cache directory so restarts of the Space
# do not re-download the weights.
MODEL_NAME = "Qwen/Qwen3-0.6B"
cache_dir = "./model_cache"

# Tokenizer: trust_remote_code enables the model repo's custom tokenizer code.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    cache_dir=cache_dir,
)

# Model: half precision shrinks the memory footprint, and device_map="auto"
# lets accelerate place layers on GPU/CPU automatically; eval() disables
# training-only behavior such as dropout.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=cache_dir,
).eval()

# Text-generation pipeline. pad_token_id is pinned to the EOS token because
# the model ships without a dedicated pad token; no explicit device argument
# is needed once device_map has placed the model.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
)
def generate_response(prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
    """Generate a model response for *prompt* via the module-level pipeline.

    Args:
        prompt: User text fed directly to the text-generation pipeline.
        max_new_tokens: Upper bound on generated tokens (cast to int because
            Gradio sliders deliver floats).
        temperature: Sampling temperature (cast to float).
        top_p: Nucleus-sampling probability mass (cast to float).

    Returns:
        The generated text as returned by the pipeline (which includes the
        prompt), or a human-readable error message on failure — the Gradio
        UI displays either one in the output textbox.
    """
    try:
        # NOTE: max_length is deliberately NOT passed alongside
        # max_new_tokens — transformers treats the two as mutually exclusive
        # (max_new_tokens wins, with a warning), and tokenizer.model_max_length
        # can be a huge sentinel value rather than the real context size.
        response = text_generator(
            prompt,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            truncation=True,
        )
        return response[0]["generated_text"]
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"⚠️ Model Error: {str(e)}\n\nTry reducing input length or adjusting generation parameters."
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(theme="soft", title="Qwen3-0.6B Chat Interface") as demo:
    gr.Markdown("# 🧠 Qwen3-0.6B Text-to-Text Chat")
    gr.Markdown("⚡ Optimized for HuggingFace Spaces with GPU acceleration")

    with gr.Row():
        # Left column: prompt entry plus collapsible sampling controls.
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="User Input",
                placeholder="Ask me anything...",
                lines=5,
            )
            with gr.Accordion("⚙️ Generation Parameters", open=False):
                max_new_tokens = gr.Slider(
                    minimum=32,
                    maximum=1024,
                    value=256,
                    step=32,
                    label="Max New Tokens",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.1,
                    label="Top-p Sampling",
                )

        # Right column: response display and the trigger button.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
            submit = gr.Button("💬 Generate Response", variant="primary")

    # Wire the button to the generation function.
    submit.click(
        fn=generate_response,
        inputs=[prompt, max_new_tokens, temperature, top_p],
        outputs=output,
    )

    # Clickable sample prompts that pre-fill the input box.
    gr.Examples(
        examples=[
            ["Explain quantum computing in simple terms"],
            ["Write a poem about autumn leaves"],
            ["Solve this math problem: 2x + 5 = 17"],
        ],
        inputs=prompt,
        label="🎯 Example Prompts",
    )
if __name__ == "__main__":
    # Launch the Gradio server when run as a script (HF Spaces entry point).
    # (Removed a stray trailing "|" extraction artifact that broke the syntax.)
    demo.launch()