BlazerApp committed on
Commit
e839314
·
1 Parent(s): 5eee29c

added model selection

Browse files
Files changed (1) hide show
  1. app.py +75 -15
app.py CHANGED
@@ -2,36 +2,88 @@ import gradio as gr
2
  from huggingface_hub import hf_hub_download
3
  from llama_cpp import Llama
4
 
5
- # Download the GGUF model from the new Organization Hub
6
- model_path = hf_hub_download(
7
- repo_id="Emil-Matteus/llama-32-1b",
8
- filename="llama-3.2-1b-instruct.Q4_K_M.gguf"
9
- )
 
 
 
 
 
 
 
10
 
11
- # Initialize the local Llama model
12
- # n_gpu_layers=0 forces CPU usage. n_ctx sets context window.
13
- llm = Llama(
14
- model_path=model_path,
15
- n_gpu_layers=0,
16
- n_ctx=4096,
17
- verbose=False
18
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def respond(
21
  message,
22
  history: list[dict[str, str]],
 
23
  system_message,
24
  max_tokens,
25
  temperature,
26
  top_p,
27
  ):
 
 
 
 
 
 
 
 
 
28
  messages = [{"role": "system", "content": system_message}]
29
  messages.extend(history)
30
  messages.append({"role": "user", "content": message})
31
 
32
  response = ""
33
 
34
- # Generate response using the local model
35
  completion = llm.create_chat_completion(
36
  messages=messages,
37
  max_tokens=max_tokens,
@@ -46,6 +98,7 @@ def respond(
46
  response += token
47
  yield response
48
 
 
49
  """
50
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
51
  """
@@ -53,6 +106,13 @@ chatbot = gr.ChatInterface(
53
  respond,
54
  type="messages",
55
  additional_inputs=[
 
 
 
 
 
 
 
56
  gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
57
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
58
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
@@ -67,4 +127,4 @@ chatbot = gr.ChatInterface(
67
  )
68
 
69
  if __name__ == "__main__":
70
- chatbot.launch()
 
2
  from huggingface_hub import hf_hub_download
3
  from llama_cpp import Llama
4
 
5
+ # --- Configuration ---
6
+ # Define available models: Label -> (Repo ID, GGUF Filename)
7
+ MODELS = {
8
+ "Llama-3.2-1B": {
9
+ "repo_id": "Emil-Matteus/llama-32-1b",
10
+ "filename": "llama-3.2-1b-instruct.Q4_K_M.gguf"
11
+ },
12
+ "Llama-3.2-3B": {
13
+ "repo_id": "Emil-Matteus/llama-3B_model-GGUF",
14
+ "filename": "llama-3B-Q4_K_M.gguf"
15
+ }
16
+ }
17
 
18
+ # Global state to hold the currently loaded model
19
+ current_model_name = None
20
+ llm = None
21
+
22
def load_model(model_name):
    """
    Ensure the model named *model_name* is loaded, replacing any other.

    Parameters
    ----------
    model_name : str
        A key of the ``MODELS`` registry.

    Returns
    -------
    Llama
        The llama_cpp model instance now held in the module-global ``llm``.

    Raises
    ------
    ValueError
        If *model_name* is not a key of ``MODELS``.
    Exception
        Whatever ``hf_hub_download`` or ``Llama`` raise on download/load
        failure (re-raised after logging).
    """
    global llm, current_model_name

    # Fast path: the requested model is already resident.
    if llm is not None and current_model_name == model_name:
        return llm

    # Validate before touching the currently loaded model.
    if model_name not in MODELS:
        raise ValueError(f"Unknown model: {model_name}")

    print(f"Loading new model: {model_name}...")

    # Release the previous model before loading the next one so we never
    # hold two GGUF models in RAM at once (important on a small CPU host).
    # NOTE(review): a failed load therefore leaves no model resident; the
    # caller surfaces the error and the next request retries the load.
    llm = None
    current_model_name = None

    spec = MODELS[model_name]
    try:
        model_path = hf_hub_download(
            repo_id=spec["repo_id"],
            filename=spec["filename"],
        )

        # n_gpu_layers=0 forces CPU inference; n_ctx=4096 sets the context window.
        llm = Llama(
            model_path=model_path,
            n_gpu_layers=0,
            n_ctx=4096,
            verbose=True,
        )
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        raise  # bare raise preserves the original traceback

    current_model_name = model_name
    print(f"Successfully loaded {model_name}")
    return llm
61
 
62
  def respond(
63
  message,
64
  history: list[dict[str, str]],
65
+ model_selection, # First additional input (Dropdown)
66
  system_message,
67
  max_tokens,
68
  temperature,
69
  top_p,
70
  ):
71
+ global llm
72
+
73
+ # Ensure the correct model is loaded
74
+ try:
75
+ load_model(model_selection)
76
+ except Exception as e:
77
+ yield f"Error loading model '{model_selection}': {str(e)}. Please check if the model has been uploaded to Hugging Face."
78
+ return
79
+
80
  messages = [{"role": "system", "content": system_message}]
81
  messages.extend(history)
82
  messages.append({"role": "user", "content": message})
83
 
84
  response = ""
85
 
86
+ # Generate response
87
  completion = llm.create_chat_completion(
88
  messages=messages,
89
  max_tokens=max_tokens,
 
98
  response += token
99
  yield response
100
 
101
+ # --- UI Setup ---
102
  """
103
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
104
  """
 
106
  respond,
107
  type="messages",
108
  additional_inputs=[
109
+ # Model Selector Dropdown
110
+ gr.Dropdown(
111
+ choices=list(MODELS.keys()),
112
+ value="Llama-3.2-1B",
113
+ label="Select Model",
114
+ info="Switching models will take a few seconds to download/load."
115
+ ),
116
  gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
117
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
118
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
 
127
  )
128
 
129
if __name__ == "__main__":
    # Start the Gradio app only when executed as a script.
    chatbot.launch()