"""Gradio chat app serving meta-llama/Llama-3.2-1B-Instruct via transformers."""

import os

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# Authenticate with Hugging Face.
# The token comes from the Spaces "Secrets" store via an environment variable;
# gated models like Llama require it.
hf_token = os.getenv("ACCESS_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("ACCESS_TOKEN environment variable not set")

# Load the model and tokenizer once at startup.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
print(f"Loading model: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Llama tokenizers ship without a pad token; reuse EOS so generate() can pad
# without emitting "Setting pad_token_id" warnings.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Place the model on the GPU when one is available (it was previously loaded
# but never moved, so the device variable had no effect).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()


def respond(message, max_tokens=150, temperature=0.7, top_p=0.95):
    """Generate a completion for *message* and return only the new text.

    Args:
        message: The user's prompt string.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature (only meaningful with sampling on).
        top_p: Nucleus-sampling probability mass.

    Returns:
        The generated continuation, without the echoed prompt.
    """
    # Tokenize and move tensors to the same device as the model.
    inputs = tokenizer(message, return_tensors="pt").to(device)
    # no_grad: inference only — skip building the autograd graph.
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            # do_sample=True is required for temperature/top_p to take effect;
            # without it generate() decodes greedily and ignores both.
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Decode only the newly generated tokens so the reply does not start
    # with a copy of the user's prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


# Create the Gradio interface. Extra respond() parameters keep their defaults
# since only the text input is exposed.
demo = gr.Interface(
    fn=respond,
    inputs=["text"],
    outputs="text",
    title="Chat with Meta Llama",
)

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()