liumaolin committed on
Commit 6f77a29 · 1 Parent(s): 6d80696

Update LLM response generator and system prompts

- Add a `/no_think` directive to the English and Chinese system prompts (see the prompt sketch after this list).
- Skip bare `<think>` and `</think>` tags during response chunk processing (a standalone sketch follows the diffs below).
- Switch the LLM model file to `Qwen3-8B-Q6_K.gguf`.
- Adjust model parameters: raise the context length, retune `temperature`, `top_p`, and `top_k`, and add a `presence_penalty`.
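
For context, `/no_think` is Qwen3's soft switch for turning off the model's thinking phase on a per-conversation basis, which is why it is appended verbatim to both system prompts. A minimal sketch of how such a prompt feeds a LangChain chat pipeline — illustrative only, not the repo's actual `create_langchain_pipeline` helper, and with the prompt text abridged:

```python
# Sketch only: shows where the "/no_think" suffix lands, not the real helper.
from langchain_core.prompts import ChatPromptTemplate

NO_THINK = "/no_think"  # Qwen3 soft switch: skip the <think> reasoning phase

system_prompt = (
    "You are an AI assistant skilled at simulating authentic thinking processes. "
    "..."  # abridged; the full strings are in the diff below
) + NO_THINK

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])
```

Even with thinking disabled, Qwen3's chat template typically still emits an empty `<think></think>` pair, which is what the new tag check in the streaming loop filters out.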

assets/models/llm/qwen/{Qwen2.5-14B-Instruct.Q4_0.gguf → Qwen3-8B-Q6_K.gguf} RENAMED

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c9d82509108e9f758ff9ad1034d2c23088c611c76eaec308ad246caf8decac5
-size 8517725952
+oid sha256:cb042ccd76795a8830d6be6bd4165245847cc68e41797b13bd61aed4c2cfbce6
+size 6725899040
```
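
Aside: the LFS pointer above carries the new weight file's SHA-256 digest and byte size, so a local download can be checked against the commit. A minimal sketch (the model path is illustrative):

```python
import hashlib
from pathlib import Path

# Expected values come straight from the LFS pointer in this commit.
EXPECTED_SHA256 = "cb042ccd76795a8830d6be6bd4165245847cc68e41797b13bd61aed4c2cfbce6"
EXPECTED_SIZE = 6_725_899_040  # bytes


def verify_gguf(path: Path) -> bool:
    # Cheap size comparison first, then stream the file through SHA-256.
    if path.stat().st_size != EXPECTED_SIZE:
        return False
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for block in iter(lambda: fh.read(1 << 20), b""):
            digest.update(block)
    return digest.hexdigest() == EXPECTED_SHA256


# e.g. verify_gguf(Path("assets/models/llm/qwen/Qwen3-8B-Q6_K.gguf"))
```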
src/voice_dialogue/services/text/generator.py CHANGED

```diff
@@ -12,15 +12,21 @@ from voice_dialogue.models.voice_task import VoiceTask, QuestionDisplayMessage
 from voice_dialogue.services.text.processor import preprocess_sentence_text, \
     create_langchain_chat_llamacpp_instance, create_langchain_pipeline, warmup_langchain_pipeline
 
-CHINESE_SYSTEM_PROMPT = ("你是善于模拟真实的思考过程的AI助手。"
-                         "回答时,必须首先生成一个不超过5个字的简短句子,"
-                         "比如:\"让我想一下\"、\"在我看来\"、\"稍等我理一理\"、\"不错的问题\"、\"稍等片刻\"等,然后再进行正式回答,"
-                         "保持中文口语化表达,禁用emoji和系统相关描述,确保衔接词与内容存在合理逻辑关联。")
-
-ENGLISH_SYSTEM_PROMPT = ("You are an AI assistant skilled at simulating authentic thinking processes. "
-                         "When responding, you must first generate a brief phrase of no more than 5 words, "
-                         "such as: 'Let me think', 'I see', 'Let me process this', 'Good question', 'One moment', etc., then proceed with your formal response. "
-                         "Maintain natural conversational English expression, avoid emojis and system-related descriptions, and ensure logical coherence between transitional phrases and content.")
+CHINESE_SYSTEM_PROMPT = (
+    "你是善于模拟真实的思考过程的AI助手。"
+    "回答时,必须首先生成一个不超过5个字的简短句子,"
+    "比如:\"让我想一下\"、\"在我看来\"、\"稍等我理一理\"、\"不错的问题\"、\"稍等片刻\"等,然后再进行正式回答,"
+    "保持中文口语化表达,禁用emoji和系统相关描述,确保衔接词与内容存在合理逻辑关联。"
+    "/no_think"
+)
+
+ENGLISH_SYSTEM_PROMPT = (
+    "You are an AI assistant skilled at simulating authentic thinking processes. "
+    "When responding, you must first generate a brief phrase of no more than 5 words, "
+    "such as: 'Let me think', 'I see', 'Let me process this', 'Good question', 'One moment', etc., then proceed with your formal response. "
+    "Maintain natural conversational English expression, avoid emojis and system-related descriptions, and ensure logical coherence between transitional phrases and content."
+    "/no_think"
+)
 
 
 class LLMResponseGenerator(BaseThread):
@@ -144,6 +150,9 @@ class LLMResponseGenerator(BaseThread):
         for chunk in pipeline.stream(input={'input': user_question}, config=config):
             if not chunk.content.strip():
                 continue
+            elif chunk.content.strip() in {'<think>', '</think>'}:
+                continue
+
             chunk_content = f'{chunk.content}'
 
             sentence_end_mark, remain_content = self._process_chunk_content(chunk_content)
@@ -181,17 +190,18 @@ class LLMResponseGenerator(BaseThread):
             self._send_sentence_to_queue(voice_task, sentence, answer_index)
 
     def run(self):
-        model_path = paths.LLM_MODELS_PATH / 'qwen' / 'Qwen2.5-14B-Instruct.Q4_0.gguf'
+        model_path = paths.LLM_MODELS_PATH / 'qwen' / 'Qwen3-8B-Q6_K.gguf'
         model_params = {
             'streaming': True,
-            'n_gpu_layers': -1,
-            'n_batch': 512,
-            'n_ctx': 2048,
-            'f16_kv': True,
-            'temperature': 0.8,
-            # 'n_predict': -1,
-            'top_k': 50,
-            'top_p': 1.0,
+            'n_ctx': 32768,
+            'temperature': 0.7,
+            'top_p': 0.9,
+            'top_k': 20,
+            'model_kwargs': {
+                'mini_p': 0,
+                'presence_penalty': 1.5
+            },
+            'verbose': False,
         }
         self.model_instance = create_langchain_chat_llamacpp_instance(
             local_model_path=model_path, model_params=model_params
```
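
For reference, the new chunk filter in isolation. It assumes, as the diff does, that `<think>` and `</think>` arrive as standalone chunks (they are special tokens in Qwen3's tokenizer); a tag split across two chunks would slip through the whole-chunk comparison:

```python
# Self-contained sketch of the filtering added to the streaming loop above.
THINK_TAGS = {'<think>', '</think>'}


def filter_chunks(chunks):
    for content in chunks:
        stripped = content.strip()
        if not stripped or stripped in THINK_TAGS:
            continue  # drop whitespace-only chunks and bare think tags
        yield content


# With /no_think, Qwen3 typically still opens with an empty think block:
assert list(filter_chunks(['<think>', '\n\n', '</think>', 'One moment. '])) == ['One moment. ']
```

As for the new `model_params`, a hedged guess at what `create_langchain_chat_llamacpp_instance` does with them — the helper is not part of this diff, so this is an assumption based on LangChain's `ChatLlamaCpp` wrapper, not the repo's actual code (note that `mini_p` in the committed `model_kwargs` is presumably meant to be llama.cpp's `min_p` sampler setting):

```python
from pathlib import Path

from langchain_community.chat_models import ChatLlamaCpp


def create_langchain_chat_llamacpp_instance(local_model_path: Path,
                                            model_params: dict) -> ChatLlamaCpp:
    # Assumed implementation: forward everything to the LangChain wrapper,
    # which hands the parameters on to llama-cpp-python.
    return ChatLlamaCpp(model_path=str(local_model_path), **model_params)
```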