import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
# --- Model setup -------------------------------------------------------------
# Hugging Face model id plus a local cache directory so restarts of the Space
# do not re-download the weights.
MODEL_NAME = "Qwen/Qwen3-0.6B"
cache_dir = "./model_cache"

# Tokenizer: trust_remote_code enables the model repo's custom tokenizer code.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    cache_dir=cache_dir,
)

# Model: half precision shrinks the memory footprint, and device_map="auto"
# lets accelerate place layers on GPU/CPU automatically; eval() disables
# training-only behavior such as dropout.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=cache_dir,
).eval()

# Text-generation pipeline. pad_token_id is pinned to the EOS token because
# the model ships without a dedicated pad token; no explicit device argument
# is needed once device_map has placed the model.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
)
def generate_response(prompt, max_new_tokens=256, temperature=0.7, top_p=0.9):
    """Generate a model response for *prompt* via the module-level pipeline.

    Args:
        prompt: User text fed directly to the text-generation pipeline.
        max_new_tokens: Upper bound on generated tokens (cast to int because
            Gradio sliders deliver floats).
        temperature: Sampling temperature (cast to float).
        top_p: Nucleus-sampling probability mass (cast to float).

    Returns:
        The generated text as returned by the pipeline (which includes the
        prompt), or a human-readable error message on failure — the Gradio
        UI displays either one in the output textbox.
    """
    try:
        # NOTE: max_length is deliberately NOT passed alongside
        # max_new_tokens — transformers treats the two as mutually exclusive
        # (max_new_tokens wins, with a warning), and tokenizer.model_max_length
        # can be a huge sentinel value rather than the real context size.
        response = text_generator(
            prompt,
            max_new_tokens=int(max_new_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            truncation=True,
        )
        return response[0]["generated_text"]
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"⚠️ Model Error: {str(e)}\n\nTry reducing input length or adjusting generation parameters."
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(theme="soft", title="Qwen3-0.6B Chat Interface") as demo:
    gr.Markdown("# 🧠 Qwen3-0.6B Text-to-Text Chat")
    gr.Markdown("⚡ Optimized for HuggingFace Spaces with GPU acceleration")

    with gr.Row():
        # Left column: prompt entry plus collapsible sampling controls.
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="User Input",
                placeholder="Ask me anything...",
                lines=5,
            )
            with gr.Accordion("⚙️ Generation Parameters", open=False):
                max_new_tokens = gr.Slider(
                    minimum=32,
                    maximum=1024,
                    value=256,
                    step=32,
                    label="Max New Tokens",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.1,
                    label="Top-p Sampling",
                )

        # Right column: response display and the trigger button.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
            submit = gr.Button("💬 Generate Response", variant="primary")

    # Wire the button to the generation function.
    submit.click(
        fn=generate_response,
        inputs=[prompt, max_new_tokens, temperature, top_p],
        outputs=output,
    )

    # Clickable sample prompts that pre-fill the input box.
    gr.Examples(
        examples=[
            ["Explain quantum computing in simple terms"],
            ["Write a poem about autumn leaves"],
            ["Solve this math problem: 2x + 5 = 17"],
        ],
        inputs=prompt,
        label="🎯 Example Prompts",
    )
if __name__ == "__main__":
    # Launch the Gradio server when run as a script (HF Spaces entry point).
    # (Removed a stray trailing "|" extraction artifact that broke the syntax.)
    demo.launch()