Beibars003 committed on
Commit
eb1838b
·
verified ·
1 Parent(s): 04c13ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -84
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import os
2
- import json
3
- import subprocess
4
  import sys
5
  from typing import List, Tuple
6
  from llama_cpp import Llama
@@ -11,9 +9,6 @@ from llama_cpp_agent.chat_history.messages import Roles
11
  from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
12
  from huggingface_hub import hf_hub_download
13
  import gradio as gr
14
- # from logger import logging
15
- # from exception import CustomExceptionHandling
16
-
17
 
18
  # Read the Hugging Face token from the HUGGINGFACE_TOKEN environment variable
19
  huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
@@ -28,25 +23,22 @@ hf_hub_download(
28
  local_dir="./models",
29
  )
30
 
31
-
32
  # Define the prompt markers for Gemma 3
33
  gemma_3_prompt_markers = {
34
- Roles.system: PromptMarkers("<start_of_turn>system\n", "<end_of_turn>\n"), # System prompt should be included within user message
35
  Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
36
  Roles.assistant: PromptMarkers("<start_of_turn>assistant", ""),
37
-
38
- Roles.tool: PromptMarkers("", ""), # If you need tool support
39
  }
40
 
41
- # Create the formatter
42
  gemma_3_formatter = MessagesFormatter(
43
- pre_prompt="", # No pre-prompt
44
  prompt_markers=gemma_3_prompt_markers,
45
- include_sys_prompt_in_first_user_message=True, # Include system prompt in first user message
46
  default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
47
- strip_prompt=False, # Don't strip whitespace from the prompt
48
- bos_token="<bos>", # Beginning of sequence token for Gemma 3
49
- eos_token="<eos>", # End of sequence token for Gemma 3
50
  )
51
 
52
  # Translation direction to prompts mapping
@@ -69,59 +61,46 @@ direction_to_prompts = {
69
  }
70
  }
71
 
72
- # Set the title and description
73
- title = "Kazakh Language Model"
74
- description = """"""
75
-
76
-
77
  llm = None
78
  llm_model = None
79
 
80
  def respond(
81
  message: str,
82
  history: List[Tuple[str, str]],
83
- model: str,
84
  direction: str,
85
- max_tokens: int = 64,
86
  temperature: float = 0.7,
87
  top_p: float = 0.95,
88
  top_k: int = 40,
89
  repeat_penalty: float = 1.1,
90
  ):
91
  """
92
- Respond to a message using the Gemma3 model via Llama.cpp.
 
93
  Args:
94
- - message (str): The message to respond to.
95
- - history (List[Tuple[str, str]]): The chat history.
96
- - model (str): The model to use.
97
- - system_message (str): The system message to use.
98
- - max_tokens (int): The maximum number of tokens to generate.
99
- - temperature (float): The temperature of the model.
100
- - top_p (float): The top-p of the model.
101
- - top_k (int): The top-k of the model.
102
- - repeat_penalty (float): The repetition penalty of the model.
103
- Returns:
104
- str: The response to the message.
 
105
  """
106
- # try:
107
- # Load the global variables
108
- global llm
109
- global llm_model
110
-
111
- # Ensure model is not None
112
- if model is None:
113
- model = "gemma_3_800M_sft_v2_translation-kazparc_latest.gguf"
114
-
115
- # Load the model
116
  if llm is None or llm_model != model:
117
- # Check if model file exists
118
  model_path = f"models/{model}"
119
  if not os.path.exists(model_path):
120
- yield f"Error: Model file not found at {model_path}. Please check your model path."
121
  return
122
-
123
  llm = Llama(
124
- model_path=f"models/{model}",
125
  flash_attn=False,
126
  n_gpu_layers=0,
127
  n_batch=8,
@@ -132,15 +111,18 @@ def respond(
132
  llm_model = model
133
  provider = LlamaCppPythonProvider(llm)
134
 
135
- # Create the agent
 
 
 
 
136
  agent = LlamaCppAgent(
137
  provider,
138
- #system_prompt=f"{system_message}",
139
  custom_messages_formatter=gemma_3_formatter,
140
  debug_output=True,
141
  )
142
 
143
- # Set the settings like temperature, top-k, top-p, max tokens, etc.
144
  settings = provider.get_provider_default_settings()
145
  settings.temperature = temperature
146
  settings.top_k = top_k
@@ -150,45 +132,31 @@ def respond(
150
  settings.stream = True
151
 
152
  messages = BasicChatHistory()
 
 
 
 
153
 
154
- # Add the chat history
155
- for msn in history:
156
- user = {"role": Roles.user, "content": msn[0]}
157
- assistant = {"role": Roles.assistant, "content": msn[1]}
158
- messages.add_message(user)
159
- messages.add_message(assistant)
160
 
161
- # Get the response stream
162
  stream = agent.get_chat_response(
163
- message,
164
  llm_sampling_settings=settings,
165
  chat_history=messages,
166
  returns_streaming_generator=True,
167
  print_output=False,
168
  )
169
 
170
- # Log the success
171
- # logging.info("Response stream generated successfully")
172
-
173
- # Generate the response
174
  outputs = ""
175
  for output in stream:
176
  outputs += output
177
  yield outputs
178
 
179
- # # Handle exceptions that may occur during the process
180
- # except Exception as e:
181
- # # Custom exception handling
182
- # raise CustomExceptionHandling(e, sys) from e
183
-
184
 
185
- # Create a chat interface
186
  demo = gr.ChatInterface(
187
  respond,
188
- examples=[["Сәлем"], ["Привет"], ["Hello"]],
189
- additional_inputs_accordion=gr.Accordion(
190
- label="⚙️ Parameters", open=False, render=False
191
- ),
192
  additional_inputs=[
193
  gr.Dropdown(
194
  choices=[
@@ -209,7 +177,7 @@ demo = gr.ChatInterface(
209
  value=1024,
210
  step=1,
211
  label="Max Tokens",
212
- info="Maximum length of response (higher = longer replies)",
213
  ),
214
  gr.Slider(
215
  minimum=0.1,
@@ -217,7 +185,7 @@ demo = gr.ChatInterface(
217
  value=0.7,
218
  step=0.1,
219
  label="Temperature",
220
- info="Creativity level (higher = more creative, lower = more focused)",
221
  ),
222
  gr.Slider(
223
  minimum=0.1,
@@ -225,7 +193,7 @@ demo = gr.ChatInterface(
225
  value=0.95,
226
  step=0.05,
227
  label="Top-p",
228
- info="Nucleus sampling threshold",
229
  ),
230
  gr.Slider(
231
  minimum=1,
@@ -233,7 +201,7 @@ demo = gr.ChatInterface(
233
  value=40,
234
  step=1,
235
  label="Top-k",
236
- info="Limit vocabulary choices to top K tokens",
237
  ),
238
  gr.Slider(
239
  minimum=1.0,
@@ -241,24 +209,22 @@ demo = gr.ChatInterface(
241
  value=1.1,
242
  step=0.1,
243
  label="Repetition Penalty",
244
- info="Penalize repeated words (higher = less repetition)",
245
  ),
246
  ],
247
  theme="Ocean",
248
- submit_btn="Send",
249
  stop_btn="Stop",
250
- title=title,
251
- description=description,
252
  chatbot=gr.Chatbot(scale=1, show_copy_button=True),
253
  cache_examples=False,
254
  )
255
 
256
-
257
- # Launch the chat interface
258
  if __name__ == "__main__":
259
  demo.launch(
260
  share=False,
261
  server_name="0.0.0.0",
262
  server_port=7860,
263
  show_api=False,
264
- )
 
1
  import os
 
 
2
  import sys
3
  from typing import List, Tuple
4
  from llama_cpp import Llama
 
9
  from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
10
  from huggingface_hub import hf_hub_download
11
  import gradio as gr
 
 
 
12
 
13
  # Read the Hugging Face token from the HUGGINGFACE_TOKEN environment variable
14
  huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 
23
  local_dir="./models",
24
  )
25
 
 
26
  # Define the prompt markers for Gemma 3
27
  gemma_3_prompt_markers = {
28
+ Roles.system: PromptMarkers("<start_of_turn>system\n", "<end_of_turn>\n"),
29
  Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
30
  Roles.assistant: PromptMarkers("<start_of_turn>assistant", ""),
31
+ Roles.tool: PromptMarkers("", ""),
 
32
  }
33
 
 
34
  gemma_3_formatter = MessagesFormatter(
35
+ pre_prompt="",
36
  prompt_markers=gemma_3_prompt_markers,
37
+ include_sys_prompt_in_first_user_message=True,
38
  default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
39
+ strip_prompt=False,
40
+ bos_token="<bos>",
41
+ eos_token="<eos>",
42
  )
43
 
44
  # Translation direction to prompts mapping
 
61
  }
62
  }
63
 
 
 
 
 
 
64
  llm = None
65
  llm_model = None
66
 
67
  def respond(
68
  message: str,
69
  history: List[Tuple[str, str]],
70
+ model: str = "gemma_3_800M_sft_v2_translation-kazparc_latest.gguf",
71
  direction: str,
72
+ max_tokens: int = 1024,
73
  temperature: float = 0.7,
74
  top_p: float = 0.95,
75
  top_k: int = 40,
76
  repeat_penalty: float = 1.1,
77
  ):
78
  """
79
+ Respond to a message by translating it using the specified direction.
80
+
81
  Args:
82
+ message (str): The text to translate.
83
+ history (List[Tuple[str, str]]): The chat history.
84
+ direction (str): The translation direction (e.g., "English to Kazakh").
85
+ model (str): The model file to use.
86
+ max_tokens (int): Maximum number of tokens to generate.
87
+ temperature (float): Sampling temperature.
88
+ top_p (float): Top-p sampling parameter.
89
+ top_k (int): Top-k sampling parameter.
90
+ repeat_penalty (float): Penalty for repetition.
91
+
92
+ Yields:
93
+ str: The translated text as it is generated.
94
  """
95
+
96
+ global llm, llm_model
 
 
 
 
 
 
 
 
97
  if llm is None or llm_model != model:
 
98
  model_path = f"models/{model}"
99
  if not os.path.exists(model_path):
100
+ yield f"Error: Model file not found at {model_path}."
101
  return
 
102
  llm = Llama(
103
+ model_path=model_path,
104
  flash_attn=False,
105
  n_gpu_layers=0,
106
  n_batch=8,
 
111
  llm_model = model
112
  provider = LlamaCppPythonProvider(llm)
113
 
114
+ # Get system prompt and user prefix based on direction
115
+ prompts = direction_to_prompts[direction]
116
+ system_message = prompts["system"]
117
+ user_prefix = prompts["prefix"]
118
+
119
  agent = LlamaCppAgent(
120
  provider,
121
+ system_prompt=system_message,
122
  custom_messages_formatter=gemma_3_formatter,
123
  debug_output=True,
124
  )
125
 
 
126
  settings = provider.get_provider_default_settings()
127
  settings.temperature = temperature
128
  settings.top_k = top_k
 
132
  settings.stream = True
133
 
134
  messages = BasicChatHistory()
135
+ for user_msg, assistant_msg in history:
136
+ full_user_msg = user_prefix + " " + user_msg
137
+ messages.add_message({"role": Roles.user, "content": full_user_msg})
138
+ messages.add_message({"role": Roles.assistant, "content": assistant_msg})
139
 
140
+ full_message = user_prefix + " " + message
 
 
 
 
 
141
 
 
142
  stream = agent.get_chat_response(
143
+ full_message,
144
  llm_sampling_settings=settings,
145
  chat_history=messages,
146
  returns_streaming_generator=True,
147
  print_output=False,
148
  )
149
 
 
 
 
 
150
  outputs = ""
151
  for output in stream:
152
  outputs += output
153
  yield outputs
154
 
 
 
 
 
 
155
 
 
156
  demo = gr.ChatInterface(
157
  respond,
158
+ examples=[["Hello"], ["Сәлем"], ["Привет"]],
159
+ additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
 
 
160
  additional_inputs=[
161
  gr.Dropdown(
162
  choices=[
 
177
  value=1024,
178
  step=1,
179
  label="Max Tokens",
180
+ info="Maximum length of the translation"
181
  ),
182
  gr.Slider(
183
  minimum=0.1,
 
185
  value=0.7,
186
  step=0.1,
187
  label="Temperature",
188
+ info="Controls randomness (higher = more creative)"
189
  ),
190
  gr.Slider(
191
  minimum=0.1,
 
193
  value=0.95,
194
  step=0.05,
195
  label="Top-p",
196
+ info="Nucleus sampling threshold"
197
  ),
198
  gr.Slider(
199
  minimum=1,
 
201
  value=40,
202
  step=1,
203
  label="Top-k",
204
+ info="Limits vocabulary to top K tokens"
205
  ),
206
  gr.Slider(
207
  minimum=1.0,
 
209
  value=1.1,
210
  step=0.1,
211
  label="Repetition Penalty",
212
+ info="Penalizes repeated words"
213
  ),
214
  ],
215
  theme="Ocean",
216
+ submit_btn="Translate",
217
  stop_btn="Stop",
218
+ title="Kazakh Translation Model",
219
+ description="Translate text between Kazakh, English, and Russian using a specialized language model.",
220
  chatbot=gr.Chatbot(scale=1, show_copy_button=True),
221
  cache_examples=False,
222
  )
223
 
 
 
224
  if __name__ == "__main__":
225
  demo.launch(
226
  share=False,
227
  server_name="0.0.0.0",
228
  server_port=7860,
229
  show_api=False,
230
+ )