# Hugging Face Space app — official Llama chat demo (scraped page header removed)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces
import os
# Available official Llama models (require access approval from Meta):
OFFICIAL_LLAMA_MODELS = {
    "Llama-3.2-1B": "meta-llama/Llama-3.2-1B-Instruct",
    "Llama-3.2-3B": "meta-llama/Llama-3.2-3B-Instruct",
    "Llama-3.1-8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama-3.1-70B": "meta-llama/Llama-3.1-70B-Instruct",
    "Llama-3.1-405B": "meta-llama/Llama-3.1-405B-Instruct",  # Requires massive GPU resources
}

# Select your model (start with smaller ones for testing).
# BUG FIX: the original indexed "Llama-3.2-8B", a key that does not exist in
# OFFICIAL_LLAMA_MODELS, so the app crashed with KeyError at import time.
# The 8B instruct model belongs to the Llama-3.1 family.
MODEL_ID = OFFICIAL_LLAMA_MODELS["Llama-3.1-8B"]

print(f"Loading official Llama model: {MODEL_ID}")
print("Note: This requires approval from Meta. Request access at:")
print(f"https://huggingface.co/{MODEL_ID}")

# Check for Hugging Face token (required for the gated meta-llama repos).
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN not found. You need to:")
    print("1. Request access to Llama models from Meta")
    print("2. Create a Hugging Face access token")
    print("3. Add it as a Space secret named 'HF_TOKEN'")
# Prefer GPU when available; otherwise fall back to CPU-only inference.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    # Load tokenizer with authentication (meta-llama repos are gated and
    # require an approved HF_TOKEN).
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN,
        trust_remote_code=False  # Security: Don't execute remote code
    )
    # Load model with authentication
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN,
        # Half precision only on GPU; CPUs are typically slower/unsupported at fp16.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",  # let accelerate place weights on available devices
        trust_remote_code=False,  # Security: Don't execute remote code
        low_cpu_mem_usage=True
    )
    model_loaded = True
    print(f"✅ Successfully loaded {MODEL_ID}")
except Exception as e:
    # Broad catch is deliberate: any load failure (missing token, no Meta
    # access grant, OOM, network error) should leave the Space running with
    # model_loaded = False so the UI can show setup instructions instead of
    # crashing at startup.
    model_loaded = False
    print(f"❌ Failed to load model: {e}")
    print("\nTo fix this:")
    print("1. Request access at: https://huggingface.co/meta-llama")
    print("2. Create token at: https://huggingface.co/settings/tokens")
    print("3. Add token to Space secrets as 'HF_TOKEN'")
@spaces.GPU(duration=60)
def generate_response(
    message,
    history,
    max_tokens=512,
    temperature=0.1,
    top_p=0.95,
):
    """Run one chat turn through the loaded Llama model.

    ``history`` is a list of (user, assistant) pairs from the Gradio chatbot;
    ``message`` is appended as the final user turn. Returns the decoded
    assistant reply, or a setup warning string when the model is unavailable.
    """
    if not model_loaded:
        return "⚠️ Model not loaded. Please set up HF_TOKEN and request Llama access from Meta."

    # Rebuild the conversation in the role/content format the chat template expects.
    conversation = []
    for past_user, past_assistant in history:
        conversation.append({"role": "user", "content": past_user})
        if past_assistant:
            conversation.append({"role": "assistant", "content": past_assistant})
    conversation.append({"role": "user", "content": message})

    # Render with Llama's chat template, leaving the assistant turn open.
    prompt_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize (capped at 2048 prompt tokens) and move tensors to the device.
    encoded = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=2048)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Slice off the prompt tokens so only the newly generated reply is decoded.
    prompt_length = encoded["input_ids"].shape[1]
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
# Create Gradio interface
with gr.Blocks(title="Official Llama Chat") as demo:
    # Header: security notice, active model name, and setup checklist.
    gr.Markdown("""
# 🦙 Official Llama Model Chat
**IMPORTANT SECURITY NOTICE:**
- This uses ONLY official Llama models from Meta
- Never download models from unofficial sources
- Always verify URLs are from trusted domains
**Model**: {model_name}
**Setup Required**:
1. Request access: [Meta Llama on Hugging Face](https://huggingface.co/meta-llama)
2. Create token: [Hugging Face Settings](https://huggingface.co/settings/tokens)
3. Add token to Space secrets as 'HF_TOKEN'
""".format(model_name=MODEL_ID if model_loaded else "Not loaded - see setup instructions"))

    # Extra banner shown only when startup loading failed.
    if not model_loaded:
        gr.Markdown("""
### ⚠️ Model Not Loaded
The model could not be loaded. This is usually because:
- You haven't added your HF_TOKEN to the Space secrets
- You haven't been granted access to Llama models by Meta
Please follow the setup instructions above.
""")

    # Chat display plus the message-entry row.
    chatbot = gr.Chatbot(height=500)
    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2,
            scale=4
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Sampling controls (collapsed by default).
    with gr.Accordion("Generation Settings", open=False):
        max_tokens = gr.Slider(minimum=50, maximum=2048, value=512, label="Max Tokens")
        temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, label="Top P")

    clear_btn = gr.Button("Clear Chat")

    # Example prompts
    gr.Examples(
        examples=[
            "What are the key principles of secure coding?",
            "Explain the importance of using official software sources",
            "How can I verify if a download link is legitimate?",
        ],
        inputs=msg,
    )

    # Event handlers
    def user_submit(message, history):
        # Clear the textbox and append the user's turn with a pending (None) reply.
        return "", history + [[message, None]]

    def bot_response(history, max_tokens, temperature, top_p):
        # Fill in the assistant half of the most recent history pair.
        if not history:
            return history
        message = history[-1][0]
        bot_message = generate_response(
            message,
            history[:-1],  # prior completed turns only; last pair holds the new message
            max_tokens,
            temperature,
            top_p
        )
        history[-1][1] = bot_message
        return history

    # Enter key and Send button trigger the same two-step flow:
    # echo the user message first, then fill in the model's reply.
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_response, [chatbot, max_tokens, temperature, top_p], chatbot
    )
    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_response, [chatbot, max_tokens, temperature, top_p], chatbot
    )
    # Returning None resets the gr.Chatbot display.
    clear_btn.click(lambda: None, outputs=chatbot)

    # Footer: security guidance and official links.
    gr.Markdown("""
---
### 🔒 Security Best Practices
1. **Only use official model sources** (meta-llama on Hugging Face)
2. **Never run code from untrusted sources**
3. **Verify all URLs before downloading**
4. **Use access tokens securely** (never share them)
5. **Report suspicious links** to the platform
### 📚 Official Resources
- [Meta AI](https://ai.meta.com/)
- [Official Llama Page](https://llama.meta.com/)
- [Hugging Face Meta-Llama](https://huggingface.co/meta-llama)
""")
# Launch the Gradio server when run directly (Spaces executes this file as a script).
if __name__ == "__main__":
    demo.launch()