File size: 2,404 Bytes
2f25f71
 
2f9a9e9
2f25f71
1877151
 
 
 
 
 
 
 
 
 
 
 
2f25f71
 
 
 
1877151
 
 
 
 
 
2f9a9e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f25f71
 
 
 
 
 
 
 
 
 
 
2f9a9e9
 
2f25f71
 
 
 
 
 
 
 
 
1877151
2f25f71
 
 
 
 
 
2f9a9e9
2f25f71
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import streamlit as st
import os
from torch import cuda, bfloat16, device as torch_device
import transformers
import requests

# Function to check access to the model
def check_access(model_id, token, timeout=10):
    """Verify that the current credentials can read ``model_id`` on the Hub.

    Issues a HEAD request for the repository's ``config.json``. On HTTP 200
    a confirmation is printed. On any other status, or when the request
    itself fails (DNS error, connection refused, timeout, ...), the problem
    is reported in the Streamlit app and the script run is halted via
    ``st.stop()``.

    Args:
        model_id: Hub repository id in ``"organisation/name"`` form.
        token: Hugging Face access token. When ``None`` or empty the request
            is sent without an ``Authorization`` header.
        timeout: Seconds to wait for the Hub before giving up.
    """
    url = f"https://huggingface.co/{model_id}/resolve/main/config.json"
    # Only attach credentials when a token exists; otherwise the header
    # would literally read "Bearer None".
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.head(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        st.error(f"Cannot access {model_id}: {exc}")
        st.stop()
        return

    if response.status_code == 200:
        print(f"Access to {model_id} is confirmed.")
    else:
        st.error(f"Cannot access {model_id}: {response.status_code} - {response.reason}")
        st.stop()

# Repository id of the model served by this app
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Hugging Face access token, read from the environment (None when unset)
hf_auth = os.environ.get('HF_AUTH_TOKEN')

# Fail fast: check_access halts the Streamlit run if the repo is unreachable
check_access(model_id=model_id, token=hf_auth)

# Start from CPU defaults: no quantization and no automatic device placement
device = 'cpu'
quantization_config = None
device_map = None

# Override the defaults when a CUDA device is present
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
    # 4-bit NF4 with double quantization and bfloat16 compute, so the large
    # model can be loaded with less GPU memory
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16,
    )
    quantization_config = bnb_config
    device_map = 'auto'

# Begin initializing HF items using the access token from environment
@st.cache_resource
def _load_model_bundle(repo_id, _auth_token, _quantization_config, _device_map):
    """Load the config, model and tokenizer for ``repo_id`` once per process.

    Streamlit re-executes the whole script on every widget interaction.
    ``st.cache_resource`` keeps the returned objects alive across those
    reruns, so the weights are not loaded again each time the user clicks
    a button.

    Args:
        repo_id: Hub repository id; this is the only argument that takes
            part in the cache key.
        _auth_token: Hugging Face access token. The leading underscore
            tells Streamlit not to hash this argument.
        _quantization_config: Quantization config passed to
            ``from_pretrained``, or ``None``. Not hashed.
        _device_map: ``device_map`` value passed to ``from_pretrained``,
            or ``None``. Not hashed.

    Returns:
        A ``(config, model, tokenizer)`` tuple; the model is in eval mode.
    """
    # `token` replaces the deprecated `use_auth_token` keyword.
    config = transformers.AutoConfig.from_pretrained(
        repo_id,
        token=_auth_token,
    )

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally — confirm this checkpoint actually needs it.
    loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
        repo_id,
        trust_remote_code=True,
        config=config,
        quantization_config=_quantization_config,
        device_map=_device_map,
        token=_auth_token,
    )

    # Enable evaluation mode to allow model inference
    loaded_model.eval()

    # Load tokenizer
    loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
        repo_id,
        token=_auth_token,
    )
    return config, loaded_model, loaded_tokenizer


model_config, model, tokenizer = _load_model_bundle(
    model_id, hf_auth, quantization_config, device_map
)

# Page layout: a title, a free-text prompt box and a button that triggers
# generation.
st.title("Chatbot using Meta-Llama-3-8B-Instruct")

prompt = st.text_area("Enter your prompt:")

if st.button("Generate"):
    if prompt:
        with st.spinner("Generating response..."):
            # Tokenize and move the tensors to the selected device.
            inputs = tokenizer(prompt, return_tensors="pt").to(torch_device(device))
            # Unpack the whole encoding so the attention mask reaches
            # generate(). Bound the number of NEW tokens: max_length would
            # also count the prompt, leaving little or no room for a reply
            # to long prompts.
            outputs = model.generate(**inputs, max_new_tokens=100)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(response)
    else:
        # Give feedback instead of silently ignoring the click.
        st.warning("Please enter a prompt before generating.")

print(f"Model loaded on {device}")