Spaces:
OrbitMC
/
Configuration error

File size: 1,402 Bytes
170b889
0ca18f0
a6562e8
8732d46
 
0ca18f0
74059a3
ac69c7e
74059a3
ac69c7e
74059a3
57b7ba8
74059a3
 
f71bbec
74059a3
f71bbec
74059a3
 
f71bbec
 
74059a3
e791f29
74059a3
 
 
57b7ba8
a6562e8
74059a3
 
 
57b7ba8
74059a3
57b7ba8
74059a3
8732d46
74059a3
 
e791f29
74059a3
 
57b7ba8
74059a3
 
f71bbec
74059a3
 
7fd018a
170b889
74059a3
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread

# Config
# Gemma 3 270M instruction-tuned checkpoint; small enough for CPU-only Spaces.
MODEL_ID = "google/gemma-3-270m-it"
# Gated model: requires an HF access token supplied via environment/secret.
HF_TOKEN = os.getenv('HF_TOKEN')

print("--- [1] Loading Assets ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

# Use bfloat16 to keep RAM usage under 1GB
# NOTE(review): bfloat16 on CPU is slower than float32 on some hardware but
# halves the memory footprint — the trade-off chosen here is memory.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    torch_dtype=torch.bfloat16, 
    low_cpu_mem_usage=True,
    token=HF_TOKEN
)
print("--- [2] Model Ready ---")

def chat(message, history):
    """Stream a model response for *message*, conditioned on the chat history.

    Parameters
    ----------
    message : str
        The user's latest message.
    history : list[dict]
        Prior turns in Gradio "messages" format: dicts with "role" and
        "content" keys (matches ChatInterface(type="messages")).

    Yields
    ------
    str
        The cumulative response text so far (Gradio streaming convention).
    """
    # Fix: the original ignored `history` (the bot forgot every prior turn)
    # and tokenized the raw message, skipping the chat template that the
    # instruction-tuned (-it) checkpoint expects. Build the full conversation
    # and let the tokenizer apply the model's chat template.
    conversation = list(history) + [{"role": "user", "content": message}]
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cpu")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation Settings
    kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
    )

    # Run generation on a worker thread so this generator can consume the
    # streamer concurrently.
    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer
    # Ensure the worker thread has fully finished before returning.
    thread.join()

# Build UI
# type="messages" makes Gradio pass `history` to chat() as a list of
# {"role": ..., "content": ...} dicts rather than (user, bot) tuples.
demo = gr.ChatInterface(fn=chat, type="messages")

if __name__ == "__main__":
    print("--- [3] Launching on Port 7860 ---")
    # server_name must be 0.0.0.0 for the platform to see the app
    demo.launch(server_name="0.0.0.0", server_port=7860)