Beibars003 committed (verified)
Commit 37af6d1 · 1 Parent(s): b7e70d3

Update app.py

Files changed (1):
  app.py +110 -75
app.py CHANGED
@@ -1,3 +1,6 @@
+import warnings
+warnings.filterwarnings("ignore")
+
 import os
 import json
 import subprocess
@@ -11,14 +14,26 @@ from llama_cpp_agent.chat_history.messages import Roles
 from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
 from huggingface_hub import hf_hub_download
 import gradio as gr
+from logger import logging
+from exception import CustomExceptionHandling
 
 
 # Load the Environment Variables from .env file
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 
+# Download gguf model files
+if not os.path.exists("./models"):
+    os.makedirs("./models")
+
 hf_hub_download(
-    repo_id="SRP-base-model-training/gemma_3_800M_sft_v2_translation-kazparc_latest",
-    filename="gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",
+    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
+    filename="google_gemma-3-1b-it-Q4_K_M.gguf",
+    local_dir="./models",
+)
+hf_hub_download(
+    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
+    filename="google_gemma-3-1b-it-Q5_K_M.gguf",
+    local_dir="./models",
 )
 
 
@@ -43,10 +58,12 @@ gemma_3_formatter = MessagesFormatter(
 
 
 # Set the title and description
-title = "Kazakh Language Model"
-description = """Model created in ISSAI by using architecture Gemma 3. Base model was trained on data provided by ISSAI. This interactive chat interface allows you to experiment with the [`gemma-3-1b-it`](https://huggingface.co/google/gemma-3-1b-it) text model using various prompts and generation parameters.
+title = "Gemma Llama.cpp"
+description = """Google released **[Gemma 3](https://blog.google/technology/developers/gemma-3/)**, a family of multimodal models that offers advanced capabilities like large context and multilingual support.
+This interactive chat interface allows you to experiment with the [`gemma-3-1b-it`](https://huggingface.co/google/gemma-3-1b-it) text model using various prompts and generation parameters.
 Users can select different model variants (GGUF format), system prompts, and observe generated responses in real-time.
-Key generation parameters, such as `temperature`, `max_tokens`, `top_k` and others are exposed below for tuning model behavior."""
+Key generation parameters, such as `temperature`, `max_tokens`, `top_k` and others are exposed below for tuning model behavior.
+For a detailed technical walkthrough, please refer to the accompanying **[blog post](https://sitammeur.medium.com/build-your-own-gemma-3-chatbot-with-gradio-and-llama-cpp-46457b22a28e)**."""
 
 
 llm = None
@@ -55,7 +72,7 @@ llm_model = None
 def respond(
     message: str,
     history: List[Tuple[str, str]],
-    model: str = "gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",  # Set default model
+    model: str = "google_gemma-3-1b-it-Q4_K_M.gguf",  # Set default model
     system_message: str = "You are a helpful assistant.",
     max_tokens: int = 1024,
     temperature: float = 0.7,
@@ -78,86 +95,102 @@ def respond(
     Returns:
         str: The response to the message.
     """
-
-    # Load the global variables
-    global llm
-    global llm_model
-
-    # Ensure model is not None
-    if model is None:
-        model = "gemma_3_800M_sft_v2_translation-kazparc_latest.gguf"
-
-    # Load the model
-    if llm is None or llm_model != model:
-        # Check if model file exists
-        model_path = f"{model}"
-        if not os.path.exists(model_path):
-            yield f"Error: Model file not found at {model_path}. Please check your model path."
-            return
-
-        llm = Llama(
-            model_path=f"{model}",
-            flash_attn=False,
-            n_gpu_layers=0,
-            n_batch=8,
-            n_ctx=2048,
-            n_threads=8,
-            n_threads_batch=8,
+    try:
+        # Load the global variables
+        global llm
+        global llm_model
+
+        # Ensure model is not None
+        if model is None:
+            model = "google_gemma-3-1b-it-Q4_K_M.gguf"
+
+        # Load the model
+        if llm is None or llm_model != model:
+            # Check if model file exists
+            model_path = f"models/{model}"
+            if not os.path.exists(model_path):
+                yield f"Error: Model file not found at {model_path}. Please check your model path."
+                return
+
+            llm = Llama(
+                model_path=f"models/{model}",
+                flash_attn=False,
+                n_gpu_layers=0,
+                n_batch=8,
+                n_ctx=2048,
+                n_threads=8,
+                n_threads_batch=8,
+            )
+            llm_model = model
+        provider = LlamaCppPythonProvider(llm)
+
+        # Create the agent
+        agent = LlamaCppAgent(
+            provider,
+            system_prompt=f"{system_message}",
+            custom_messages_formatter=gemma_3_formatter,
+            debug_output=True,
         )
-    llm_model = model
-    provider = LlamaCppPythonProvider(llm)
-
-    # Create the agent
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        custom_messages_formatter=gemma_3_formatter,
-        debug_output=True,
-    )
 
-    # Set the settings like temperature, top-k, top-p, max tokens, etc.
-    settings = provider.get_provider_default_settings()
-    settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
-    settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
-    settings.stream = True
-
-    messages = BasicChatHistory()
-
-    # Add the chat history
-    for msn in history:
-        user = {"role": Roles.user, "content": msn[0]}
-        assistant = {"role": Roles.assistant, "content": msn[1]}
-        messages.add_message(user)
-        messages.add_message(assistant)
-
-    # Get the response stream
-    stream = agent.get_chat_response(
-        message,
-        llm_sampling_settings=settings,
-        chat_history=messages,
-        returns_streaming_generator=True,
-        print_output=False,
-    )
-
-    # Generate the response
-    outputs = ""
-    for output in stream:
-        outputs += output
-        yield outputs
+        # Set the settings like temperature, top-k, top-p, max tokens, etc.
+        settings = provider.get_provider_default_settings()
+        settings.temperature = temperature
+        settings.top_k = top_k
+        settings.top_p = top_p
+        settings.max_tokens = max_tokens
+        settings.repeat_penalty = repeat_penalty
+        settings.stream = True
+
+        messages = BasicChatHistory()
+
+        # Add the chat history
+        for msn in history:
+            user = {"role": Roles.user, "content": msn[0]}
+            assistant = {"role": Roles.assistant, "content": msn[1]}
+            messages.add_message(user)
+            messages.add_message(assistant)
+
+        # Get the response stream
+        stream = agent.get_chat_response(
+            message,
+            llm_sampling_settings=settings,
+            chat_history=messages,
+            returns_streaming_generator=True,
+            print_output=False,
+        )
+
+        # Log the success
+        logging.info("Response stream generated successfully")
+
+        # Generate the response
+        outputs = ""
+        for output in stream:
+            outputs += output
+            yield outputs
+
+    # Handle exceptions that may occur during the process
+    except Exception as e:
+        # Custom exception handling
+        raise CustomExceptionHandling(e, sys) from e
 
 
 # Create a chat interface
 demo = gr.ChatInterface(
     respond,
-    examples=[],
+    examples=[["What is the capital of France?"], ["Tell me something about artificial intelligence."], ["What is gravity?"]],
     additional_inputs_accordion=gr.Accordion(
         label="⚙️ Parameters", open=False, render=False
     ),
     additional_inputs=[
+        gr.Dropdown(
+            choices=[
+                "google_gemma-3-1b-it-Q4_K_M.gguf",
+                "google_gemma-3-1b-it-Q5_K_M.gguf",
+            ],
+            value="google_gemma-3-1b-it-Q4_K_M.gguf",
+            label="Model",
+            info="Select the AI model to use for chat",
+        ),
         gr.Textbox(
            value="You are a helpful assistant.",
            label="System Prompt",
@@ -210,7 +243,9 @@ demo = gr.ChatInterface(
     stop_btn="Stop",
     title=title,
     description=description,
-    chatbot=gr.Chatbot(scale=1, show_copy_button=True),
+    chatbot=gr.Chatbot(scale=1, show_copy_button=True, resizable=True),
+    flagging_mode="never",
+    editable=True,
     cache_examples=False,
 )
 
@@ -222,4 +257,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         show_api=False,
-    )
+    )