import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- Configuration ---
# Define available models: Label -> (Repo ID, GGUF Filename)
MODELS = {
    "Llama-3.2-1B": {
        "repo_id": "Emil-Matteus/llama-32-1b",
        "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
    },
    "Llama-3.2-3B": {
        "repo_id": "Emil-Matteus/llama-3B_model-GGUF",
        "filename": "llama-3B-Q4_K_M.gguf"
    },
    "Qwen2.5-1.5B": {
        "repo_id": "Emil-Matteus/qwen2.5-1.5B_model-GGUF",
        "filename": "qwen2.5-1.5B-Q4_K_M.gguf"
    }
}
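
# Note: the repo IDs above are user uploads; each repo must actually contain
# the listed GGUF file for hf_hub_download to succeed. Q4_K_M is a 4-bit
# "medium" k-quantization, which keeps the models small enough for CPU-only
# inference in a Space.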
# Global state to hold the currently loaded model
current_model_name = None
llm = None


def load_model(model_name):
    """
    Loads the specified model into memory, replacing any previously loaded one.
    """
    global llm, current_model_name

    # If this model is already loaded, do nothing
    if llm is not None and current_model_name == model_name:
        return llm

    if model_name not in MODELS:
        raise ValueError(f"Unknown model: {model_name}")

    print(f"Loading new model: {model_name}...")
    repo_id = MODELS[model_name]["repo_id"]
    filename = MODELS[model_name]["filename"]

    try:
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename
        )
        # Initialize Llama model (n_gpu_layers=0 for CPU)
        # n_ctx=4096 gives a decent context window
        # Rebinding the global `llm` drops the reference to any previously
        # loaded model, letting it be garbage-collected.
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,
            n_ctx=4096,
            verbose=True
        )
        current_model_name = model_name
        print(f"Successfully loaded {model_name}")
        return llm
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        raise
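
# A minimal sketch of exercising load_model outside the Gradio app
# (hypothetical, for local debugging; assumes the repos above are reachable):
#
#   llm = load_model("Llama-3.2-1B")
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Say hi in five words."}],
#       max_tokens=32,
#   )
#   print(out["choices"][0]["message"]["content"])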


def respond(
    message,
    history: list[dict[str, str]],
    model_selection,  # First additional input (Dropdown)
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    global llm

    # Ensure the correct model is loaded
    try:
        load_model(model_selection)
    except Exception as e:
        yield (
            f"Error loading model '{model_selection}': {e}. "
            "Please check that the model has been uploaded to Hugging Face."
        )
        return

    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})

    response = ""

    # Generate the response as a stream so the UI updates token by token
    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True
    )

    for chunk in completion:
        # The first chunk usually carries only {"role": "assistant"}, so check
        # for a "content" key before appending.
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response
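
# Note: gr.ChatInterface passes the values of `additional_inputs` to `respond`
# as extra positional arguments after (message, history), in the order the
# components are listed below; that ordering is why `model_selection` is the
# third parameter.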


# --- UI Setup ---
"""
For information on how to customize the ChatInterface, see the Gradio docs:
https://www.gradio.app/docs/chatinterface
"""
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        # Model selector dropdown
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Llama-3.2-1B",
            label="Select Model",
            info="Switching models will take a few seconds to download/load."
        ),
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    chatbot.launch()
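
# To run locally (assumed environment; this file does not pin dependencies):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py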