# llama3-8b / app.py
# Hugging Face Space by ShravanHN
# History: modified the runtime to not use quantization if GPU is not present (commit 2f9a9e9)
import streamlit as st
import os
from torch import cuda, bfloat16, device as torch_device
import transformers
import requests
# Function to check access to the model
def check_access(model_id, token):
    """Verify that `token` can read the gated model's config on the Hugging Face Hub.

    Issues a HEAD request against the model's config.json. On any failure
    (HTTP error or network problem) an error is shown in the Streamlit UI
    and the script is stopped via st.stop().

    Args:
        model_id: Hub repo id, e.g. 'meta-llama/Meta-Llama-3-8B-Instruct'.
        token: Hugging Face access token, or None/empty for anonymous access.
    """
    url = f"https://huggingface.co/{model_id}/resolve/main/config.json"
    # Only send the Authorization header when a token is actually present;
    # otherwise the original code sent a literal "Bearer None".
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        # Timeout so a hung Hub connection cannot freeze the app forever.
        response = requests.head(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        st.error(f"Cannot access {model_id}: {exc}")
        st.stop()
    if response.status_code == 200:
        print(f"Access to {model_id} is confirmed.")
    else:
        st.error(f"Cannot access {model_id}: {response.status_code} - {response.reason}")
        st.stop()
# Retrieve the Hugging Face authentication token from the environment variables
hf_auth = os.getenv('HF_AUTH_TOKEN')
if not hf_auth:
    # The model is gated; without a token the access check below would fail
    # with a bare 401, so fail early with a clear message.
    st.error("HF_AUTH_TOKEN environment variable is not set.")
    st.stop()

# Model ID
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Check access to the model
check_access(model_id, hf_auth)


@st.cache_resource(show_spinner="Loading model (first run only)...")
def load_model_and_tokenizer(model_id, auth_token):
    """Load the model and tokenizer once per process.

    Streamlit reruns this script on every user interaction; without
    st.cache_resource the 8B model would be reloaded on every button click.

    Returns:
        (model, tokenizer, device) where device is the torch device string
        the inputs should be moved to ('cuda:N' or 'cpu').
    """
    # Determine device configuration: quantize only when a GPU is present.
    if cuda.is_available():
        device = f'cuda:{cuda.current_device()}'
        # 4-bit NF4 quantization lets the 8B model fit in modest GPU memory.
        quantization_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16,
        )
        device_map = 'auto'
    else:
        # bitsandbytes quantization requires CUDA, so run full precision on CPU.
        device = 'cpu'
        quantization_config = None
        device_map = None

    # NOTE(review): use_auth_token is deprecated in newer transformers in
    # favor of token=; kept here for compatibility with the pinned version.
    model_config = transformers.AutoConfig.from_pretrained(
        model_id,
        use_auth_token=auth_token,
    )
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=quantization_config,
        device_map=device_map,
        use_auth_token=auth_token,
    )
    # Enable evaluation mode to allow model inference
    model.eval()
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id, use_auth_token=auth_token
    )
    return model, tokenizer, device


model, tokenizer, device = load_model_and_tokenizer(model_id, hf_auth)

st.title("Chatbot using Meta-Llama-3-8B-Instruct")
prompt = st.text_area("Enter your prompt:")
if st.button("Generate"):
    if prompt:
        with st.spinner("Generating response..."):
            inputs = tokenizer(prompt, return_tensors="pt").to(torch_device(device))
            outputs = model.generate(
                inputs["input_ids"],
                # Pass the attention mask explicitly; relying on the default
                # produces warnings and can mis-handle padded inputs.
                attention_mask=inputs["attention_mask"],
                # max_new_tokens counts only generated tokens; the original
                # max_length=100 included the prompt, so long prompts could
                # produce no output at all.
                max_new_tokens=100,
                # Llama-3 has no pad token; use EOS to silence the
                # open-ended-generation warning.
                pad_token_id=tokenizer.eos_token_id,
            )
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(response)

print(f"Model loaded on {device}")