# Hugging Face Spaces app (scraped page status header read: "Spaces: Sleeping")
import os

import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model to serve. EXAONE ships a custom architecture, hence trust_remote_code.
MODEL_ID = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# Authenticate with the Hugging Face Hub using the HF_TOKEN secret, if present.
# The token is stripped because Space secrets sometimes carry trailing newlines.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    hf_token = hf_token.strip()
    os.environ["HF_TOKEN"] = hf_token
    login(token=hf_token)

print(f"Loading {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=hf_token)

# 4-bit quantization keeps the 7.8B model within a small GPU's memory.
# Prefer bf16 compute when the GPU supports it; otherwise fall back to fp16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=(
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float16
    ),
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    trust_remote_code=True,
    device_map="auto",  # let accelerate place layers on the available device(s)
    token=hf_token,
)
| def chat(message, history): | |
| messages = [{"role": "system", "content": "You are EXAONE, a helpful AI assistant."}] | |
| for user_msg, assistant_msg in history: | |
| messages.append({"role": "user", "content": user_msg}) | |
| messages.append({"role": "assistant", "content": assistant_msg}) | |
| messages.append({"role": "user", "content": message}) | |
| input_ids = tokenizer.apply_chat_template( | |
| messages, tokenize=True, add_generation_prompt=True, return_tensors="pt" | |
| ).to(model.device) | |
| outputs = model.generate( | |
| input_ids, | |
| max_new_tokens=512, | |
| eos_token_id=tokenizer.eos_token_id, | |
| do_sample=True, | |
| temperature=0.7, | |
| top_p=0.9 | |
| ) | |
| return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True) | |
# Wire the chat function into a simple chat UI and start the Space's server.
demo = gr.ChatInterface(fn=chat, title="EXAONE Chat")
demo.launch()