"""Gradio text-completion demo backed by the Falcon-7B causal language model."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub identifier of the checkpoint served by this demo.
MODEL_ID = "tiiuae/falcon-7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True,
)


def greet(prompt: str) -> str:
    """Return a sampled continuation of *prompt*.

    Args:
        prompt: Text to continue.

    Returns:
        The decoded output sequence (prompt followed by up to 50 newly
        sampled tokens), or an empty string when *prompt* is empty.
    """
    if not prompt:
        # Nothing to continue from; skip the model call entirely.
        return ""

    # Follow the model's own placement instead of assuming "cuda":
    # device_map="auto" leaves the device choice to the loader.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        max_new_tokens=50,
    )
    # Batch size is 1, so the first row is the only generated sequence.
    return tokenizer.decode(output_ids[0].to("cpu"))


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()