import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread

# Config
MODEL_ID = "google/gemma-3-270m-it"
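# Gemma checkpoints are gated on the Hub, so HF_TOKEN must come from an
# account that has accepted the Gemma license.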
HF_TOKEN = os.getenv("HF_TOKEN")
print("--- [1] Loading Assets ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# Use bfloat16 to keep RAM usage under 1GB
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
print("--- [2] Model Ready ---")

def chat(message, history):
    # Prepare input: include prior turns and apply the Gemma chat template.
    # Tokenizing the raw message alone would drop the turn markers the
    # instruction-tuned model was trained on (and would ignore history).
    messages = history + [{"role": "user", "content": message}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_dict=True, return_tensors="pt"
    ).to("cpu")
    # skip_prompt keeps the echoed input out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Generation settings
    kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
    )
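    # model.generate blocks until the full sequence is done, so run it on a
    # worker thread while this generator drains the streamer below.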
    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

# Build UI
demo = gr.ChatInterface(fn=chat, type="messages")

if __name__ == "__main__":
    print("--- [3] Launching on Port 7860 ---")
    # server_name must be 0.0.0.0 so the app is reachable from outside the container
    demo.launch(server_name="0.0.0.0", server_port=7860)
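
# Local smoke test (hf_xxx is a placeholder for a token with Gemma access):
#   HF_TOKEN=hf_xxx python app.py
#   then open http://localhost:7860 in a browser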