"""Streamlit chatbot backed by meta-llama/Meta-Llama-3-8B-Instruct.

Verifies Hugging Face access, loads the model (4-bit quantized when CUDA is
available, full precision on CPU), and serves a minimal prompt/response UI.
"""
import os

import requests
import streamlit as st
import transformers
from torch import bfloat16, cuda, device as torch_device


def check_access(model_id: str, token: str) -> None:
    """Halt the Streamlit app unless `token` can read the (gated) model repo.

    Sends a lightweight HEAD request for the repo's config.json; anything
    other than HTTP 200 shows an error in the UI and stops the script.
    """
    url = f"https://huggingface.co/{model_id}/resolve/main/config.json"
    headers = {"Authorization": f"Bearer {token}"}
    # Timeout so the UI never hangs indefinitely on a slow/unreachable hub.
    response = requests.head(url, headers=headers, timeout=10)
    if response.status_code == 200:
        print(f"Access to {model_id} is confirmed.")
    else:
        st.error(f"Cannot access {model_id}: {response.status_code} - {response.reason}")
        st.stop()


# Retrieve the Hugging Face authentication token from the environment variables.
hf_auth = os.getenv('HF_AUTH_TOKEN')
if not hf_auth:
    # Fail fast with a clear message instead of sending "Bearer None" and
    # surfacing a confusing 401 from the hub later.
    st.error("HF_AUTH_TOKEN environment variable is not set.")
    st.stop()

# Model ID
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Check access to the model before attempting the (large) download.
check_access(model_id, hf_auth)

# Determine device configuration.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
    # Quantization configuration to load the large model with less GPU memory.
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16,
    )
    quantization_config = bnb_config
    device_map = 'auto'
else:
    device = 'cpu'
    quantization_config = None
    device_map = None

# Begin initializing HF items using the access token from the environment.
# NOTE: `token=` replaces the deprecated `use_auth_token=` argument.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_auth,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=quantization_config,
    device_map=device_map,
    token=hf_auth,
)

# Enable evaluation mode to allow model inference.
model.eval()

# Load tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_auth)

st.title("Chatbot using Meta-Llama-3-8B-Instruct")

prompt = st.text_area("Enter your prompt:")

if st.button("Generate"):
    if prompt:
        with st.spinner("Generating response..."):
            inputs = tokenizer(prompt, return_tensors="pt").to(torch_device(device))
            # Unpack the full tokenizer output so attention_mask is passed
            # alongside input_ids (avoids the "attention mask not set"
            # warning and incorrect results when padding is involved).
            outputs = model.generate(**inputs, max_length=100)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(response)

# NOTE(review): this runs on every Streamlit rerun, not only at first load —
# kept from the original; consider moving it directly after model load.
print(f"Model loaded on {device}")