import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer model_id = "google/gemma-3-1b-it" tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, low_cpu_mem_usage=True, device_map=“cpu” ) model.eval() def predict(message, history): messages = [] for turn in history: messages.append({“role”: “user”, “content”: turn[0]}) messages.append({“role”: “assistant”, “content”: turn[1]}) messages.append({“role”: “user”, “content”: message[-1000:]}) ``` tokenized = tokenizer.apply_chat_template( messages, return_tensors="pt", add_generation_prompt=True ) input_ids = tokenized.to("cpu") with torch.no_grad(): output = model.generate( input_ids=input_ids, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9, use_cache=True ) new_tokens = output[0][input_ids.shape[-1]:] return tokenizer.decode(new_tokens, skip_special_tokens=True).strip() ``` demo = gr.ChatInterface( fn=predict, title=“Gemma 3 1B (CPU)”, description=“google/gemma-3-1b-it — runs on HF free tier CPU (~4GB RAM)” ) if **name** == “**main**”: demo.launch()