# LLM_hello / app.py
# June Hong, commit b1d30fe: enable 4-bit quantization (bitsandbytes) to reduce memory usage.

import os
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login

# EXAONE model to load
MODEL_ID = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# Authenticate using the HF_TOKEN secret
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    hf_token = hf_token.strip()  # secrets may carry a stray trailing newline
    os.environ["HF_TOKEN"] = hf_token
    login(token=hf_token)

print(f"Loading {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=hf_token)

# Configure 4-bit quantization (bitsandbytes) so the 7.8B model fits in modest GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
)
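# Assumption, not in the original commit: recent transformers releases also accept
# bnb_4bit_quant_type="nf4" and bnb_4bit_use_double_quant=True here, which
# typically improve 4-bit quality at little extra cost.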

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    trust_remote_code=True,
    device_map="auto",  # place layers on available GPU(s), offloading to CPU if needed
    token=hf_token,
)
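# Rough arithmetic, not a measurement: 7.8B parameters take ~15.6 GB in bf16 but
# ~3.9 GB at 4 bits, so the quantized weights plus overhead should fit in a 16 GB GPU.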

def chat(message, history):
    # Rebuild the full conversation for the chat template. This assumes Gradio's
    # tuple-style history (a list of (user, assistant) pairs), the default format
    # for gr.ChatInterface in Gradio 4.x.
    messages = [{"role": "system", "content": "You are EXAONE, a helpful AI assistant."}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    with torch.inference_mode():  # no gradients needed for generation
        outputs = model.generate(
            input_ids,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
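
# Quick local sanity check (hypothetical call, assumes an empty history):
#     print(chat("Hello! Who are you?", []))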

demo = gr.ChatInterface(fn=chat, title="EXAONE Chat")

if __name__ == "__main__":
    demo.launch()