# llama3-8b / app.py
# Hugging Face Space by ShravanHN
# History: modified the runtime to not use quantization if GPU is not present (commit 2f9a9e9)
import streamlit as st
import os
from torch import cuda, bfloat16, device as torch_device
import transformers
import requests
# Function to check access to the model
def check_access(model_id, token):
    """Verify that `token` can read the gated model's config on the Hugging Face Hub.

    Issues a HEAD request against the model's config.json. On any failure
    (HTTP error or network problem) an error is shown in the Streamlit UI
    and the script is stopped via st.stop().

    Args:
        model_id: Hub repo id, e.g. 'meta-llama/Meta-Llama-3-8B-Instruct'.
        token: Hugging Face access token, or None/empty for anonymous access.
    """
    url = f"https://huggingface.co/{model_id}/resolve/main/config.json"
    # Only send the Authorization header when a token is actually present;
    # otherwise the original code sent a literal "Bearer None".
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        # Timeout so a hung Hub connection cannot freeze the app forever.
        response = requests.head(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        st.error(f"Cannot access {model_id}: {exc}")
        st.stop()
    if response.status_code == 200:
        print(f"Access to {model_id} is confirmed.")
    else:
        st.error(f"Cannot access {model_id}: {response.status_code} - {response.reason}")
        st.stop()
# Retrieve the Hugging Face authentication token from the environment variables
hf_auth = os.getenv('HF_AUTH_TOKEN')
if not hf_auth:
    # The model is gated; without a token the access check below would fail
    # with a bare 401, so fail early with a clear message.
    st.error("HF_AUTH_TOKEN environment variable is not set.")
    st.stop()

# Model ID
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Check access to the model
check_access(model_id, hf_auth)


@st.cache_resource(show_spinner="Loading model (first run only)...")
def load_model_and_tokenizer(model_id, auth_token):
    """Load the model and tokenizer once per process.

    Streamlit reruns this script on every user interaction; without
    st.cache_resource the 8B model would be reloaded on every button click.

    Returns:
        (model, tokenizer, device) where device is the torch device string
        the inputs should be moved to ('cuda:N' or 'cpu').
    """
    # Determine device configuration: quantize only when a GPU is present.
    if cuda.is_available():
        device = f'cuda:{cuda.current_device()}'
        # 4-bit NF4 quantization lets the 8B model fit in modest GPU memory.
        quantization_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16,
        )
        device_map = 'auto'
    else:
        # bitsandbytes quantization requires CUDA, so run full precision on CPU.
        device = 'cpu'
        quantization_config = None
        device_map = None

    # NOTE(review): use_auth_token is deprecated in newer transformers in
    # favor of token=; kept here for compatibility with the pinned version.
    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        use_auth_token=auth_token,
    )
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=quantization_config,
        device_map=device_map,
        use_auth_token=auth_token,
    )
    # Enable evaluation mode to allow model inference
    model.eval()
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id, use_auth_token=auth_token
    )
    return model, tokenizer, device


model, tokenizer, device = load_model_and_tokenizer(model_id, hf_auth)

st.title("Chatbot using Meta-Llama-3-8B-Instruct")
prompt = st.text_area("Enter your prompt:")
if st.button("Generate"):
    if prompt:
        with st.spinner("Generating response..."):
            inputs = tokenizer(prompt, return_tensors="pt").to(torch_device(device))
            outputs = model.generate(
                inputs["input_ids"],
                # Pass the attention mask explicitly; relying on the default
                # produces warnings and can mis-handle padded inputs.
                attention_mask=inputs["attention_mask"],
                # max_new_tokens counts only generated tokens; the original
                # max_length=100 included the prompt, so long prompts could
                # produce no output at all.
                max_new_tokens=100,
                # Llama-3 has no pad token; use EOS to silence the
                # open-ended-generation warning.
                pad_token_id=tokenizer.eos_token_id,
            )
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(response)

print(f"Model loaded on {device}")