import os
from threading import Thread

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Config
MODEL_ID = "google/gemma-3-270m-it"
HF_TOKEN = os.getenv("HF_TOKEN")

print("--- [1] Loading Assets ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

# Use bfloat16 to keep RAM usage under 1GB
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
print("--- [2] Model Ready ---")


def chat(message, history):
    # Gemma is an instruction-tuned chat model, so the prompt must go through
    # the chat template rather than being tokenized raw. With type="messages",
    # Gradio passes history as a list of {"role", "content"} dicts already.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to("cpu")

    # Stream tokens as they are produced instead of waiting for the full reply
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation settings
    kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
    )

    # model.generate blocks, so run it in a background thread and
    # yield the growing partial output from the streamer as it arrives
    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer


# Build UI
demo = gr.ChatInterface(fn=chat, type="messages")

if __name__ == "__main__":
    print("--- [3] Launching on Port 7860 ---")
    # server_name must be 0.0.0.0 for the platform to see the app
    demo.launch(server_name="0.0.0.0", server_port=7860)
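
# --- Usage sketch (assumptions, not part of the original script) ---
# The dependencies follow from the imports above; the filename app.py and the
# token value are placeholders:
#
#   pip install torch transformers gradio
#   HF_TOKEN=<your-token> python app.py
#
# The UI is then reachable at http://localhost:7860 (or the platform's mapped
# port, since the server binds to 0.0.0.0).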