Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from huggingface_hub import login | |
| import os | |
# Authenticate with the Hugging Face Hub.
# The token is injected by the Space's secrets as ACCESS_TOKEN.
hf_token = os.getenv("ACCESS_TOKEN")
if not hf_token:
    # Fail fast with a clear message rather than hitting a 401 later.
    raise ValueError("ACCESS_TOKEN environment variable not set")
login(token=hf_token)
# Fetch tokenizer and weights from the Hub (gated repo; requires the login above).
model_name = "meta-llama/Llama-3.2-1B-Instruct"
print("Loading model: " + model_name + "...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Pick a target device string.
# NOTE(review): `device` is computed but never applied — the model stays on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Define the response function
def respond(message, max_tokens=150, temperature=0.7, top_p=0.95):
    """Generate a completion for *message* and return only the new text.

    Args:
        message: User prompt to complete.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature (higher = more random).
        top_p: Nucleus-sampling probability-mass cutoff.

    Returns:
        The generated continuation as a plain string (prompt not echoed).
    """
    # Tokenize on the model's device so this works whether the model is on CPU or GPU.
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,  # forwards attention_mask too, not just input_ids
            max_new_tokens=max_tokens,
            do_sample=True,  # without this, temperature/top_p are silently ignored
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,  # Llama defines no pad token
        )
    # generate() returns prompt + completion; decode only the newly generated tokens.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
# Create the Gradio interface.
# Expose respond()'s generation knobs as UI controls so they are actually
# reachable; defaults mirror the function signature (150 / 0.7 / 0.95).
demo = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(label="Message"),
        gr.Slider(1, 1024, value=150, step=1, label="Max new tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.05, label="Temperature"),
        gr.Slider(0.05, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    outputs="text",
    title="Chat with Meta Llama",
)

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()