# app.py - small-model friendly Gradio
"""Minimal Gradio front end for a causal-LM code assistant.

Environment variables:
    MODEL_ID: Hugging Face Hub model id to load
        (default "Salesforce/codegen-6B-multi").
    PORT: TCP port the Gradio server listens on (default 7860).

The tokenizer and model are loaded once at import time and moved to the GPU
when CUDA is available, otherwise they stay on the CPU.
"""
import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = os.environ.get("MODEL_ID", "Salesforce/codegen-6B-multi")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(security): trust_remote_code=True allows the Hub repository named by
# MODEL_ID to execute its own Python code while loading. Only point MODEL_ID
# at repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(DEVICE)
model.eval()


def generate(prompt: str, max_new_tokens: int = 64, temperature: float = 0.2) -> str:
    """Return the model's continuation of *prompt*, without the prompt itself.

    Args:
        prompt: Text to continue. An empty prompt yields an empty string.
        max_new_tokens: Upper bound on the number of generated tokens;
            coerced with ``int()`` because the UI slider may deliver a float.
        temperature: Sampling temperature. Values greater than 0 enable
            sampling; 0 (or less) selects deterministic greedy decoding.

    Returns:
        The decoded completion with special tokens removed and leading
        whitespace stripped.
    """
    if not prompt:
        # Tokenizing an empty string produces no input ids to generate from.
        return ""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)

    temperature = float(temperature)
    # `temperature` only takes effect when sampling is enabled, and sampling
    # requires a strictly positive temperature, so fall back to greedy at 0.
    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "do_sample": temperature > 0.0,
    }
    if gen_kwargs["do_sample"]:
        gen_kwargs["temperature"] = temperature
    if tokenizer.pad_token_id is None:
        # Tokenizers without a pad token: reuse EOS explicitly for padding.
        gen_kwargs["pad_token_id"] = tokenizer.eos_token_id

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # The returned sequence starts with the prompt tokens. Dropping them by
    # position is reliable even when the prompt was truncated or does not
    # decode back to exactly the same string.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return completion.lstrip()


with gr.Blocks() as demo:
    gr.Markdown("## Code assistant")
    prompt = gr.Textbox(lines=6, label="Prompt")
    max_tokens = gr.Slider(16, 256, value=64, step=16, label="Max new tokens")
    temp = gr.Slider(0.0, 1.0, value=0.2, step=0.01, label="Temperature")
    out = gr.Textbox(lines=12, label="Output")
    btn = gr.Button("Generate")
    btn.click(generate, inputs=[prompt, max_tokens, temp], outputs=[out])

PORT = int(os.environ.get("PORT", 7860))
# Bind to all interfaces so the app is reachable from outside a container.
demo.launch(server_name="0.0.0.0", server_port=PORT)