"""Streamlit chatbot backed by meta-llama/Meta-Llama-3-8B-Instruct.

Verifies Hugging Face access, loads the model (4-bit quantized when CUDA is
available, full precision on CPU), and serves a minimal prompt/response UI.
"""
import os

import requests
import streamlit as st
import transformers
from torch import bfloat16, cuda, device as torch_device


def check_access(model_id: str, token: str) -> None:
    """Halt the Streamlit app unless `token` can read the (gated) model repo.

    Sends a lightweight HEAD request for the repo's config.json; anything
    other than HTTP 200 shows an error in the UI and stops the script.
    """
    url = f"https://huggingface.co/{model_id}/resolve/main/config.json"
    headers = {"Authorization": f"Bearer {token}"}
    # Timeout so the UI never hangs indefinitely on a slow/unreachable hub.
    response = requests.head(url, headers=headers, timeout=10)
    if response.status_code == 200:
        print(f"Access to {model_id} is confirmed.")
    else:
        st.error(f"Cannot access {model_id}: {response.status_code} - {response.reason}")
        st.stop()


# Retrieve the Hugging Face authentication token from the environment variables.
hf_auth = os.getenv('HF_AUTH_TOKEN')
if not hf_auth:
    # Fail fast with a clear message instead of sending "Bearer None" and
    # surfacing a confusing 401 from the hub later.
    st.error("HF_AUTH_TOKEN environment variable is not set.")
    st.stop()

# Model ID
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Check access to the model before attempting the (large) download.
check_access(model_id, hf_auth)

# Determine device configuration.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
    # Quantization configuration to load the large model with less GPU memory.
    bnb_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16,
    )
    quantization_config = bnb_config
    device_map = 'auto'
else:
    device = 'cpu'
    quantization_config = None
    device_map = None

# Begin initializing HF items using the access token from the environment.
# NOTE: `token=` replaces the deprecated `use_auth_token=` argument.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_auth,
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=quantization_config,
    device_map=device_map,
    token=hf_auth,
)

# Enable evaluation mode to allow model inference.
model.eval()

# Load tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_auth)

st.title("Chatbot using Meta-Llama-3-8B-Instruct")

prompt = st.text_area("Enter your prompt:")

if st.button("Generate"):
    if prompt:
        with st.spinner("Generating response..."):
            inputs = tokenizer(prompt, return_tensors="pt").to(torch_device(device))
            # Unpack the full tokenizer output so attention_mask is passed
            # alongside input_ids (avoids the "attention mask not set"
            # warning and incorrect results when padding is involved).
            outputs = model.generate(**inputs, max_length=100)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            st.write(response)

# NOTE(review): this runs on every Streamlit rerun, not only at first load —
# kept from the original; consider moving it directly after model load.
print(f"Model loaded on {device}")