""" Stack 2.9 - HuggingFace Space Gradio 6.x compatible """ import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer import torch print("Loading model...") MODEL_NAME = "Qwen/Qwen2.5-Coder-1.5B-Instruct" tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True, low_cpu_mem_usage=True ) print("Model loaded!") def generate(prompt, max_tokens, temperature): messages = [{"role": "user", "content": prompt}] text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer([text], return_tensors="pt") with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=int(max_tokens), temperature=float(temperature), do_sample=True, pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id ) response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True) return response.strip() demo = gr.Interface( fn=generate, inputs=[ gr.Textbox(label="Prompt", placeholder="Write a Python function to calculate fibonacci...", lines=4), gr.Number(label="Max tokens", value=256, minimum=64, maximum=512), gr.Number(label="Temperature", value=0.7, minimum=0.1, maximum=1.0), ], outputs=gr.Textbox(label="Response", lines=10), title="Stack 2.9 Code Assistant", description="Powered by Qwen2.5-Coder-1.5B", ) demo.launch(server_name="0.0.0.0", server_port=7860)