# appy.py — Gradio Space: chat UI for official Meta Llama models via transformers.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces
import os
# Available official Llama models (require access approval from Meta):
OFFICIAL_LLAMA_MODELS = {
    "Llama-3.2-1B": "meta-llama/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
    "Llama-3.1-8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama-3.1-70B": "meta-llama/Llama-3.1-70B-Instruct",
    "Llama-3.1-405B": "meta-llama/Llama-3.1-405B-Instruct",  # Requires massive GPU resources
}

# Select your model (start with smaller ones for testing).
# BUG FIX: the original looked up "Llama-3.2-8B", a key that does not exist in
# OFFICIAL_LLAMA_MODELS, so the script crashed with KeyError at import time.
# "Llama-3.1-8B" is the actual 8B entry in the table.
MODEL_ID = OFFICIAL_LLAMA_MODELS["Llama-3.1-8B"]

print(f"Loading official Llama model: {MODEL_ID}")
print("Note: This requires approval from Meta. Request access at:")
print(f"https://huggingface.co/{MODEL_ID}")

# Check for Hugging Face token (required for the gated meta-llama repos).
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN not found. You need to:")
    print("1. Request access to Llama models from Meta")
    print("2. Create a Hugging Face access token")
    print("3. Add it as a Space secret named 'HF_TOKEN'")

# Prefer GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
# ---------------------------------------------------------------------------
# Load the tokenizer and model. Both calls authenticate with HF_TOKEN because
# the meta-llama repositories are gated. A failure is reported to the console
# instead of raised, so the UI can still start and show setup instructions.
# ---------------------------------------------------------------------------
_auth_kwargs = {
    "token": HF_TOKEN,
    "trust_remote_code": False,  # Security: Don't execute remote code
}
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **_auth_kwargs)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        # fp16 only makes sense on GPU; CPU inference stays in fp32.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
        low_cpu_mem_usage=True,
        **_auth_kwargs,
    )
except Exception as e:
    model_loaded = False
    print(f"❌ Failed to load model: {e}")
    print("\nTo fix this:")
    print("1. Request access at: https://huggingface.co/meta-llama")
    print("2. Create token at: https://huggingface.co/settings/tokens")
    print("3. Add token to Space secrets as 'HF_TOKEN'")
else:
    # Reached only when both loads succeeded.
    model_loaded = True
    print(f"✅ Successfully loaded {MODEL_ID}")
@spaces.GPU(duration=60)
def generate_response(
    message,
    history,
    max_tokens=512,
    temperature=0.1,
    top_p=0.95,
):
    """Generate one assistant reply from the loaded official Llama model.

    Args:
        message: Current user message (str).
        history: Prior turns as (user, assistant) pairs; assistant may be None.
        max_tokens: Cap on newly generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold.

    Returns:
        The decoded model reply, or a setup warning when no model is loaded.
    """
    if not model_loaded:
        return "⚠️ Model not loaded. Please set up HF_TOKEN and request Llama access from Meta."

    # Rebuild the conversation in the role/content form the chat template expects.
    conversation = []
    for past_user, past_assistant in history:
        conversation.append({"role": "user", "content": past_user})
        if past_assistant:
            conversation.append({"role": "assistant", "content": past_assistant})
    conversation.append({"role": "user", "content": message})

    # Render with Llama's own chat template, ending on the assistant header.
    prompt_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize (truncated to the context budget) and move tensors to the device.
    encoded = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=2048)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sample a continuation; eos doubles as pad since Llama has no pad token.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tail, skipping the echoed prompt tokens.
    prompt_len = encoded["input_ids"].shape[1]
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
# ---------------------------------------------------------------------------
# Gradio UI: header, chat window, sampling controls, and event wiring.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Official Llama Chat") as demo:
    # Header with security notice and setup steps; the model name is filled in
    # at build time (falls back to a hint if loading failed at startup).
    gr.Markdown("""
# 🦙 Official Llama Model Chat
**IMPORTANT SECURITY NOTICE:**
- This uses ONLY official Llama models from Meta
- Never download models from unofficial sources
- Always verify URLs are from trusted domains
**Model**: {model_name}
**Setup Required**:
1. Request access: [Meta Llama on Hugging Face](https://huggingface.co/meta-llama)
2. Create token: [Hugging Face Settings](https://huggingface.co/settings/tokens)
3. Add token to Space secrets as 'HF_TOKEN'
""".format(model_name=MODEL_ID if model_loaded else "Not loaded - see setup instructions"))

    # Extra troubleshooting banner shown only when startup loading failed.
    if not model_loaded:
        gr.Markdown("""
### ⚠️ Model Not Loaded
The model could not be loaded. This is usually because:
- You haven't added your HF_TOKEN to the Space secrets
- You haven't been granted access to Llama models by Meta
Please follow the setup instructions above.
""")

    # Chat transcript display; history is a list of [user, assistant] pairs.
    chatbot = gr.Chatbot(height=500)

    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2,
            scale=4
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Sampling controls forwarded to generate_response on every turn.
    with gr.Accordion("Generation Settings", open=False):
        max_tokens = gr.Slider(minimum=50, maximum=2048, value=512, label="Max Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, label="Top P")

    clear_btn = gr.Button("Clear Chat")

    # Example prompts
    gr.Examples(
        examples=[
            "What are the key principles of secure coding?",
            "Explain the importance of using official software sources",
            "How can I verify if a download link is legitimate?",
        ],
        inputs=msg,
    )

    # Event handlers
    def user_submit(message, history):
        # Append the user's turn (assistant slot still None) and clear the box.
        return "", history + [[message, None]]

    def bot_response(history, max_tokens, temperature, top_p):
        # Fill the assistant slot of the newest history entry with a model reply.
        if not history:
            return history
        message = history[-1][0]
        bot_message = generate_response(
            message,
            history[:-1],
            max_tokens,
            temperature,
            top_p
        )
        history[-1][1] = bot_message
        return history

    # Enter key and Send button share the same two-step pipeline:
    # first record the user turn, then generate the assistant reply.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_response, [chatbot, max_tokens, temperature, top_p], chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_response, [chatbot, max_tokens, temperature, top_p], chatbot
    )
    # Sending None to the Chatbot component resets the transcript.
    clear_btn.click(lambda: None, outputs=chatbot)

    # Footer: security guidance and links to official resources.
    gr.Markdown("""
---
### 🔒 Security Best Practices
1. **Only use official model sources** (meta-llama on Hugging Face)
2. **Never run code from untrusted sources**
3. **Verify all URLs before downloading**
4. **Use access tokens securely** (never share them)
5. **Report suspicious links** to the platform
### 📚 Official Resources
- [Meta AI](https://ai.meta.com/)
- [Official Llama Page](https://llama.meta.com/)
- [Hugging Face Meta-Llama](https://huggingface.co/meta-llama)
""")
# Launch the app when run as a script (Hugging Face Spaces invokes this entry).
if __name__ == "__main__":
    demo.launch()