"""Gradio chat app serving meta-llama/Llama-3.2-1B-Instruct via transformers."""

import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Authenticate with Hugging Face.
# The token comes from the Spaces "Secrets" store via an environment variable;
# gated models like Llama require it.
hf_token = os.getenv("ACCESS_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("ACCESS_TOKEN environment variable not set")

# Load the model and tokenizer once at startup.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
print(f"Loading model: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Llama tokenizers ship without a pad token; reuse EOS so generate() can pad
# without emitting "Setting pad_token_id" warnings.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Place the model on the GPU when one is available (it was previously loaded
# but never moved, so the device variable had no effect).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()


def respond(message, max_tokens=150, temperature=0.7, top_p=0.95):
    """Generate a completion for *message* and return only the new text.

    Args:
        message: The user's prompt string.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature (only meaningful with sampling on).
        top_p: Nucleus-sampling probability mass.

    Returns:
        The generated continuation, without the echoed prompt.
    """
    # Tokenize and move tensors to the same device as the model.
    inputs = tokenizer(message, return_tensors="pt").to(device)
    # no_grad: inference only — skip building the autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            # do_sample=True is required for temperature/top_p to take effect;
            # without it generate() decodes greedily and ignores both.
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Decode only the newly generated tokens so the reply does not start
    # with a copy of the user's prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Create the Gradio interface. Extra respond() parameters keep their defaults
# since only the text input is exposed.
demo = gr.Interface(
    fn=respond,
    inputs=["text"],
    outputs="text",
    title="Chat with Meta Llama",
)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()