# glm5-server / app.py
# Uploaded by Pilipdagh ("Create app.py", commit 99441f8, verified)
# app.py - small-model friendly Gradio
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Model is overridable via the MODEL_ID env var; defaults to a 6B code model.
MODEL_ID = os.environ.get("MODEL_ID", "Salesforce/codegen-6B-multi")
# Prefer GPU when available; the model is moved to this device below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): trust_remote_code executes custom code shipped by the hub repo —
# only safe for repos you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(DEVICE)
model.eval()  # inference mode (disables dropout etc.)
def generate(prompt, max_new_tokens=64, temperature=0.2):
    """Generate a completion for *prompt* and return only the new text.

    Args:
        prompt: Input text for the model.
        max_new_tokens: Upper bound on generated tokens (coerced to int).
        temperature: Sampling temperature; 0 selects greedy decoding.

    Returns:
        The generated continuation with the echoed prompt stripped.
    """
    temperature = float(temperature)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)

    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        # GPT-style tokenizers (codegen) have no pad token; reuse EOS to
        # silence the pad_token_id warning and keep generation well-defined.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        # BUG FIX: temperature is ignored by generate() unless sampling is
        # enabled, and temperature=0.0 is invalid when do_sample=True —
        # so only sample (and pass temperature) for a positive value.
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    text = tokenizer.decode(out[0], skip_special_tokens=True)
    # Causal LMs echo the prompt; return just the continuation.
    if text.startswith(prompt):
        text = text[len(prompt):].lstrip()
    return text
# Assemble the Gradio UI: prompt + decoding controls feeding generate().
with gr.Blocks() as demo:
    gr.Markdown("## Code assistant")
    prompt_box = gr.Textbox(lines=6, label="Prompt")
    tokens_slider = gr.Slider(16, 256, value=64, step=16, label="Max new tokens")
    temperature_slider = gr.Slider(0.0, 1.0, value=0.2, step=0.01, label="Temperature")
    output_box = gr.Textbox(lines=12, label="Output")
    generate_button = gr.Button("Generate")
    generate_button.click(
        generate,
        inputs=[prompt_box, tokens_slider, temperature_slider],
        outputs=[output_box],
    )
# Hosting platforms inject PORT; fall back to Gradio's default 7860.
PORT = int(os.environ.get("PORT", 7860))

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    # Bind 0.0.0.0 so the container-mapped port is reachable externally.
    demo.launch(server_name="0.0.0.0", server_port=PORT)