Spaces:
Build error
Build error
| import streamlit as st | |
| import os | |
| from torch import cuda, bfloat16, device as torch_device | |
| import transformers | |
| import requests | |
def check_access(model_id, token):
    """Verify that ``token`` can read ``model_id`` on the Hugging Face Hub.

    Sends a HEAD request for the repository's ``config.json``. On HTTP 200 a
    confirmation is printed. On any other status, or if the request itself
    fails, an error is shown in the Streamlit app and the script run is
    halted with ``st.stop()``.

    Args:
        model_id: Hub repository id, e.g. ``"org/model-name"``.
        token: Hugging Face access token. When ``None`` or empty the request
            is sent without an ``Authorization`` header.
    """
    url = f"https://huggingface.co/{model_id}/resolve/main/config.json"
    # Only attach the header when a token exists; otherwise the literal
    # string "Bearer None" would be sent as the credential.
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        # requests.head() neither follows redirects nor times out by default.
        # Set both so a redirect is not reported as a failure and an
        # unresponsive host cannot hang the app.
        response = requests.head(
            url,
            headers=headers,
            allow_redirects=True,
            timeout=10,
        )
    except requests.RequestException as exc:
        st.error(f"Cannot access {model_id}: {exc}")
        st.stop()
        return

    if response.status_code == 200:
        print(f"Access to {model_id} is confirmed.")
    else:
        st.error(f"Cannot access {model_id}: {response.status_code} - {response.reason}")
        st.stop()
# Retrieve the Hugging Face authentication token from the environment variables
hf_auth = os.getenv('HF_AUTH_TOKEN')
if not hf_auth:
    # hf_auth is the credential handed to check_access() below; stop early
    # with an actionable message instead of passing None along.
    st.error("HF_AUTH_TOKEN is not set. Define it in the environment and restart the app.")
    st.stop()

# Model ID
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Check access to the model before attempting any download
check_access(model_id, hf_auth)
# Pick the execution device and the matching loading options.
if not cuda.is_available():
    # No CUDA device: run on the CPU, unquantized, with no device map.
    device, quantization_config, device_map = 'cpu', None, None
else:
    device = f'cuda:{cuda.current_device()}'
    # 4-bit NF4 quantization with double quantization and bfloat16 compute,
    # so the large model needs less GPU memory.
    bnb_config = transformers.BitsAndBytesConfig(
        bnb_4bit_compute_dtype=bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        load_in_4bit=True,
    )
    quantization_config, device_map = bnb_config, 'auto'
# Streamlit re-executes this script on every widget interaction. Caching the
# loaded objects as a resource keeps the model from being re-created on each
# rerun.
@st.cache_resource
def _load_model_and_tokenizer(repo_id, token, _quantization_config, device_map):
    """Load the config, causal-LM model and tokenizer for ``repo_id``.

    Args:
        repo_id: Hub repository id of the model.
        token: Hugging Face access token used for all three downloads.
        _quantization_config: Quantization settings or ``None``. The leading
            underscore tells Streamlit not to hash this argument for the
            cache key.
        device_map: Value forwarded as ``device_map`` to ``from_pretrained``.

    Returns:
        A ``(config, model, tokenizer)`` tuple; the model is in eval mode.
    """
    config = transformers.AutoConfig.from_pretrained(repo_id, token=token)
    # NOTE(review): trust_remote_code=True was dropped because it lets the
    # repository run its own Python code at load time. Restore it only if this
    # model really ships custom modeling code.
    loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
        repo_id,
        config=config,
        quantization_config=_quantization_config,
        device_map=device_map,
        token=token,
    )
    # Enable evaluation mode to allow model inference
    loaded_model.eval()
    loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(repo_id, token=token)
    return config, loaded_model, loaded_tokenizer


# Begin initializing HF items using the access token from environment
model_config, model, tokenizer = _load_model_and_tokenizer(
    model_id, hf_auth, quantization_config, device_map
)
st.title("Chatbot using Meta-Llama-3-8B-Instruct")
prompt = st.text_area("Enter your prompt:")

if st.button("Generate"):
    if prompt and prompt.strip():
        with st.spinner("Generating response..."):
            # Move the whole encoding to the device the model lives on.
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            # Pass the full encoding so generate() also receives the
            # attention mask. max_new_tokens bounds only the generated part;
            # max_length would count the prompt tokens too, leaving long
            # prompts with little or no room for an answer.
            outputs = model.generate(**inputs, max_new_tokens=100)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.write(response)
    else:
        # Tell the user why nothing happened instead of silently ignoring
        # the click.
        st.warning("Please enter a prompt before generating.")

print(f"Model loaded on {device}")