# LLM_hello / app.py
# June Hong, commit b1d30fe: enable 4-bit quantization (bitsandbytes) to reduce memory usage.

import os
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login

# EXAONE model to load
MODEL_ID = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# Authenticate using the HF_TOKEN secret
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    hf_token = hf_token.strip()  # secrets may carry a stray trailing newline
    os.environ["HF_TOKEN"] = hf_token
    login(token=hf_token)

print(f"Loading {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=hf_token)

# Configure 4-bit quantization (bitsandbytes) so the 7.8B model fits in modest GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
)
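# Assumption, not in the original commit: recent transformers releases also accept
# bnb_4bit_quant_type="nf4" and bnb_4bit_use_double_quant=True here, which
# typically improve 4-bit quality at little extra cost.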

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    trust_remote_code=True,
    device_map="auto",  # place layers on available GPU(s), offloading to CPU if needed
    token=hf_token,
)
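# Rough arithmetic, not a measurement: 7.8B parameters take ~15.6 GB in bf16 but
# ~3.9 GB at 4 bits, so the quantized weights plus overhead should fit in a 16 GB GPU.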

def chat(message, history):
    # Rebuild the full conversation for the chat template. This assumes Gradio's
    # tuple-style history (a list of (user, assistant) pairs), the default format
    # for gr.ChatInterface in Gradio 4.x.
    messages = [{"role": "system", "content": "You are EXAONE, a helpful AI assistant."}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    with torch.inference_mode():  # no gradients needed for generation
        outputs = model.generate(
            input_ids,
            max_new_tokens=512,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # avoid the missing-pad-token warning
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
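
# Quick local sanity check (hypothetical call, assumes an empty history):
#     print(chat("Hello! Who are you?", []))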

demo = gr.ChatInterface(fn=chat, title="EXAONE Chat")

if __name__ == "__main__":
    demo.launch()