import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# ✅ Use a small, publicly available model that works on CPU
# (the chat-tuned 3B RedPajama variant)
MODEL_NAME = "togethercomputer/RedPajama-INCITE-Chat-3B-v1"

print("Loading model. This may take a few moments…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cpu")
print("Model loaded!")

history = []


def chat_with_airi(user_msg):
    global history

    # Build the conversation prompt from the last 5 exchanges
    prompt = ""
    for u, a in history[-5:]:
        prompt += f"User: {u}\nAiri: {a}\n"
    prompt += f"User: {user_msg}\nAiri:"

    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=100,  # can adjust for longer replies
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (splitting the full decoded
    # text on the first "Airi:" would return old turns from the prompt),
    # then cut the reply off before any hallucinated "User:" turn.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True)
    reply = reply.split("User:", 1)[0].strip()

    history.append([user_msg, reply])
    return history, ""


with gr.Blocks() as demo:
    gr.HTML("<h1>Airi — Mini Chat AI</h1>")
") gr.HTML("

Small, Fast & Public Model

") chat = gr.Chatbot() msg = gr.Textbox(label="Talk to Airi…", placeholder="Write here…") msg.submit(chat_with_airi, msg, [chat, msg]) demo.launch()