import os

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Available official Llama models (require access approval from Meta):
OFFICIAL_LLAMA_MODELS = {
    "Llama-3.2-1B": "meta-llama/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
    "Llama-3.1-8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama-3.1-70B": "meta-llama/Llama-3.1-70B-Instruct",
    "Llama-3.1-405B": "meta-llama/Llama-3.1-405B-Instruct",  # Requires massive GPU resources
}

# Select your model (start with smaller ones for testing).
# FIX: the original indexed OFFICIAL_LLAMA_MODELS["Llama-3.2-8B"], a key that
# does not exist in the dict (there is no Llama-3.2-8B entry), so the script
# crashed with a KeyError at import time — before the try/except around model
# loading could ever run. "Llama-3.1-8B" is the only 8B entry and the clear
# intent.
MODEL_ID = OFFICIAL_LLAMA_MODELS["Llama-3.1-8B"]

print(f"Loading official Llama model: {MODEL_ID}")
print("Note: This requires approval from Meta. Request access at:")
print(f"https://huggingface.co/{MODEL_ID}")

# Check for Hugging Face token (required for gated Llama models).
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN not found. You need to:")
    print("1. Request access to Llama models from Meta")
    print("2. Create a Hugging Face access token")
    print("3. Add it as a Space secret named 'HF_TOKEN'")

device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    # Load tokenizer with authentication.
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN,
        trust_remote_code=False,  # Security: don't execute remote code
    )

    # Load model with authentication.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
        trust_remote_code=False,  # Security: don't execute remote code
        low_cpu_mem_usage=True,
    )

    model_loaded = True
    print(f"✅ Successfully loaded {MODEL_ID}")
except Exception as e:
    # Broad catch is deliberate: any load failure (auth, gating, network,
    # memory) should degrade to the friendly setup instructions in the UI
    # rather than crash the Space.
    model_loaded = False
    print(f"❌ Failed to load model: {e}")
    print("\nTo fix this:")
    print("1. Request access at: https://huggingface.co/meta-llama")
    print("2. Create token at: https://huggingface.co/settings/tokens")
    print("3. Add token to Space secrets as 'HF_TOKEN'")


@spaces.GPU(duration=60)
def generate_response(
    message,
    history,
    max_tokens=512,
    temperature=0.1,
    top_p=0.95,
):
    """Generate response using official Llama model.

    Args:
        message: The newest user message (str).
        history: Prior turns as ``[user_msg, assistant_msg]`` pairs;
            ``assistant_msg`` may be ``None`` for an unanswered turn.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The decoded assistant reply (str), or a setup warning string when
        the model failed to load at startup.
    """
    if not model_loaded:
        return "⚠️ Model not loaded. Please set up HF_TOKEN and request Llama access from Meta."

    # Convert Gradio's pair-style history into Llama's chat-message format.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Apply Llama's chat template to build the prompt text.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize, truncating overlong conversations to the context budget.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate (inference only — no gradients needed).
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    return response


# Create Gradio interface.
with gr.Blocks(title="Official Llama Chat") as demo:
    gr.Markdown(
        """
    # 🦙 Official Llama Model Chat

    **IMPORTANT SECURITY NOTICE:**
    - This uses ONLY official Llama models from Meta
    - Never download models from unofficial sources
    - Always verify URLs are from trusted domains

    **Model**: {model_name}

    **Setup Required**:
    1. Request access: [Meta Llama on Hugging Face](https://huggingface.co/meta-llama)
    2. Create token: [Hugging Face Settings](https://huggingface.co/settings/tokens)
    3. Add token to Space secrets as 'HF_TOKEN'
    """.format(
            model_name=MODEL_ID if model_loaded else "Not loaded - see setup instructions"
        )
    )

    if not model_loaded:
        gr.Markdown(
            """
        ### ⚠️ Model Not Loaded

        The model could not be loaded. This is usually because:
        - You haven't added your HF_TOKEN to the Space secrets
        - You haven't been granted access to Llama models by Meta

        Please follow the setup instructions above.
        """
        )

    chatbot = gr.Chatbot(height=500)

    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2,
            scale=4,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Accordion("Generation Settings", open=False):
        max_tokens = gr.Slider(minimum=50, maximum=2048, value=512, label="Max Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, label="Top P")

    clear_btn = gr.Button("Clear Chat")

    # Example prompts.
    gr.Examples(
        examples=[
            "What are the key principles of secure coding?",
            "Explain the importance of using official software sources",
            "How can I verify if a download link is legitimate?",
        ],
        inputs=msg,
    )

    # Event handlers.
    def user_submit(message, history):
        """Append the user's message to history and clear the textbox."""
        return "", history + [[message, None]]

    def bot_response(history, max_tokens, temperature, top_p):
        """Fill in the assistant half of the latest history entry."""
        if not history:
            return history
        message = history[-1][0]
        bot_message = generate_response(
            message, history[:-1], max_tokens, temperature, top_p
        )
        history[-1][1] = bot_message
        return history

    # Wire both Enter-to-submit and the Send button to the same two-step flow.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_response, [chatbot, max_tokens, temperature, top_p], chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_response, [chatbot, max_tokens, temperature, top_p], chatbot
    )
    clear_btn.click(lambda: None, outputs=chatbot)

    gr.Markdown(
        """
    ---
    ### 🔒 Security Best Practices
    1. **Only use official model sources** (meta-llama on Hugging Face)
    2. **Never run code from untrusted sources**
    3. **Verify all URLs before downloading**
    4. **Use access tokens securely** (never share them)
    5. **Report suspicious links** to the platform

    ### 📚 Official Resources
    - [Meta AI](https://ai.meta.com/)
    - [Official Llama Page](https://llama.meta.com/)
    - [Hugging Face Meta-Llama](https://huggingface.co/meta-llama)
    """
    )

if __name__ == "__main__":
    demo.launch()