import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration ---
# Define available models: Label -> (Repo ID, GGUF Filename)
MODELS = {
    "Llama-3.2-1B": {
        "repo_id": "Emil-Matteus/llama-32-1b",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "Llama-3.2-3B": {
        "repo_id": "Emil-Matteus/llama-3B_model-GGUF",
        "filename": "llama-3B-Q4_K_M.gguf"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "Emil-Matteus/qwen2.5-1.5B_model-GGUF",
        "filename": "qwen2.5-1.5B-Q4_K_M.gguf"
    }
}
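
# Note: the repo IDs above are user uploads; each repo must actually contain
# the listed GGUF file for hf_hub_download to succeed. Q4_K_M is a 4-bit
# "medium" k-quantization, which keeps the models small enough for CPU-only
# inference in a Space.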
# Global state to hold the currently loaded model
current_model_name = None
llm = None


def load_model(model_name):
    """
    Loads the specified model into memory, replacing any previously loaded one.
    """
    global llm, current_model_name

    # If this model is already loaded, do nothing
    if llm is not None and current_model_name == model_name:
        return llm

    if model_name not in MODELS:
        raise ValueError(f"Unknown model: {model_name}")

    print(f"Loading new model: {model_name}...")
    repo_id = MODELS[model_name]["repo_id"]
    filename = MODELS[model_name]["filename"]

    try:
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename
        )
        # Initialize Llama model (n_gpu_layers=0 for CPU)
        # n_ctx=4096 gives a decent context window
        # Rebinding the global `llm` drops the reference to any previously
        # loaded model, letting it be garbage-collected.
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,
            n_ctx=4096,
            verbose=True
        )
        current_model_name = model_name
        print(f"Successfully loaded {model_name}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        raise
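
# A minimal sketch of exercising load_model outside the Gradio app
# (hypothetical, for local debugging; assumes the repos above are reachable):
#
#   llm = load_model("Llama-3.2-1B")
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Say hi in five words."}],
#       max_tokens=32,
#   )
#   print(out["choices"][0]["message"]["content"])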


def respond(
    message,
    history: list[dict[str, str]],
    model_selection,  # First additional input (Dropdown)
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global llm

    # Ensure the correct model is loaded
    try:
        load_model(model_selection)
    except Exception as e:
        yield (
            f"Error loading model '{model_selection}': {e}. "
            "Please check that the model has been uploaded to Hugging Face."
        )
        return

    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""

    # Generate the response as a stream so the UI updates token by token
    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True
    )

    for chunk in completion:
        # The first chunk usually carries only {"role": "assistant"}, so check
        # for a "content" key before appending.
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response
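
# Note: gr.ChatInterface passes the values of `additional_inputs` to `respond`
# as extra positional arguments after (message, history), in the order the
# components are listed below; that ordering is why `model_selection` is the
# third parameter.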


# --- UI Setup ---
"""
For information on how to customize the ChatInterface, see the Gradio docs:
https://www.gradio.app/docs/chatinterface
"""
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        # Model selector dropdown
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Llama-3.2-1B",
            label="Select Model",
            info="Switching models will take a few seconds to download/load."
        ),
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    chatbot.launch()
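
# To run locally (assumed environment; this file does not pin dependencies):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py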