liumaolin committed on
Commit
1ae18a4
·
1 Parent(s): 8d91cc1

Refactor sentence processing in `text_generator.py`: centralize sentence end mark sets, streamline `_should_end_sentence` logic, and eliminate redundant parameter passing for improved clarity and maintainability.

Browse files
src/VoiceDialogue/services/text/text_generator.py CHANGED
@@ -35,6 +35,10 @@ class LLMResponseGenerator(BaseThread):
35
  self.user_question_queue = user_question_queue
36
  self.generated_answer_queue = generated_answer_queue
37
 
 
 
 
 
38
  def _get_prompt_by_language(self, language: str) -> str:
39
  """根据语言获取对应的 prompt"""
40
  if language == "zh":
@@ -64,19 +68,24 @@ class LLMResponseGenerator(BaseThread):
64
  messages = memory.load_memory_variables({})[key]
65
  return InMemoryChatMessageHistory(messages=messages)
66
 
67
- def _should_end_sentence(self, sentence: str, sentence_end_mark: str,
68
- sentence_end_marks: set, is_first_sentence: bool) -> bool:
69
  """判断是否应该结束当前句子"""
70
- if not sentence or sentence_end_mark not in sentence_end_marks:
71
  return False
72
 
 
 
 
 
73
  # 第一个句子的特殊处理逻辑
74
  if is_first_sentence:
75
- chinese_sentence_end_marks = {',', '。', '!', '?', ':', ';', '、'}
76
- return (len(sentence) > 2 and sentence_end_mark in chinese_sentence_end_marks)
 
 
77
 
78
  # 普通句子的判断逻辑
79
- if sentence_end_mark in {',', '。', '!', '?', ':', ';', '、'}:
80
  sentence_words = len(sentence)
81
  else:
82
  sentence_words = len(sentence.split())
@@ -105,9 +114,6 @@ class LLMResponseGenerator(BaseThread):
105
 
106
  def _process_voice_task(self, voice_task: VoiceTask) -> None:
107
  """处理单个语音任务"""
108
- english_sentence_end_marks = {'!', '?', '.', ',', ':', ';'}
109
- chinese_sentence_end_marks = {',', '。', '!', '?', ':', ';', '、'}
110
- sentence_end_marks = english_sentence_end_marks | chinese_sentence_end_marks
111
 
112
  chunks = []
113
  answer_index = 0
@@ -124,9 +130,9 @@ class LLMResponseGenerator(BaseThread):
124
 
125
  try:
126
  for chunk in pipeline.stream(input={'input': user_question}, config=config):
127
- chunk_content = f'{chunk.content.strip()}'
128
- if not chunk_content:
129
  continue
 
130
 
131
  sentence_end_mark, remain_content = self._process_chunk_content(chunk_content)
132
  chunks.append(sentence_end_mark)
@@ -136,7 +142,7 @@ class LLMResponseGenerator(BaseThread):
136
  continue
137
 
138
  # 检查是否应该结束当前句子
139
- if self._should_end_sentence(sentence, sentence_end_mark, sentence_end_marks, is_first_sentence):
140
  self._send_sentence_to_queue(voice_task, sentence, answer_index)
141
  chunks = self._reset_chunks(remain_content)
142
  answer_index += 1
@@ -146,19 +152,18 @@ class LLMResponseGenerator(BaseThread):
146
  chunks.append(remain_content)
147
 
148
  # 处理最后剩余的 chunks
149
- self._handle_remaining_chunks(voice_task, chunks, answer_index, sentence_end_marks)
150
 
151
  except Exception as e:
152
  print(f'处理语音任务时发生错误: {e}')
153
 
154
- def _handle_remaining_chunks(self, voice_task: VoiceTask, chunks: list,
155
- answer_index: int, sentence_end_marks: set) -> None:
156
  """处理剩余的 chunks"""
157
  if not chunks:
158
  return
159
 
160
  sentence = preprocess_sentence_text(chunks)
161
- if not sentence or sentence.strip() in sentence_end_marks:
162
  return
163
 
164
  self._send_sentence_to_queue(voice_task, sentence, answer_index)
 
35
  self.user_question_queue = user_question_queue
36
  self.generated_answer_queue = generated_answer_queue
37
 
38
+ self.english_sentence_end_marks = {'!', '?', '.', ',', ':', ';'}
39
+ self.chinese_sentence_end_marks = {',', '。', '!', '?', ':', ';', '、'}
40
+ self.sentence_end_marks = self.english_sentence_end_marks | self.chinese_sentence_end_marks
41
+
42
  def _get_prompt_by_language(self, language: str) -> str:
43
  """根据语言获取对应的 prompt"""
44
  if language == "zh":
 
68
  messages = memory.load_memory_variables({})[key]
69
  return InMemoryChatMessageHistory(messages=messages)
70
 
71
+ def _should_end_sentence(self, sentence: str, sentence_end_mark: str, is_first_sentence: bool) -> bool:
 
72
  """判断是否应该结束当前句子"""
73
+ if not sentence or sentence_end_mark not in self.sentence_end_marks:
74
  return False
75
 
76
+ is_chinese_sentence = False
77
+ if sentence_end_mark in self.chinese_sentence_end_marks:
78
+ is_chinese_sentence = True
79
+
80
  # 第一个句子的特殊处理逻辑
81
  if is_first_sentence:
82
+ if is_chinese_sentence:
83
+ return (len(sentence) > 2 and sentence_end_mark in self.chinese_sentence_end_marks)
84
+ else:
85
+ return (len(sentence.split()) > 1 and sentence_end_mark in self.english_sentence_end_marks)
86
 
87
  # 普通句子的判断逻辑
88
+ if is_chinese_sentence:
89
  sentence_words = len(sentence)
90
  else:
91
  sentence_words = len(sentence.split())
 
114
 
115
  def _process_voice_task(self, voice_task: VoiceTask) -> None:
116
  """处理单个语音任务"""
 
 
 
117
 
118
  chunks = []
119
  answer_index = 0
 
130
 
131
  try:
132
  for chunk in pipeline.stream(input={'input': user_question}, config=config):
133
+ if not chunk.content.strip():
 
134
  continue
135
+ chunk_content = f'{chunk.content}'
136
 
137
  sentence_end_mark, remain_content = self._process_chunk_content(chunk_content)
138
  chunks.append(sentence_end_mark)
 
142
  continue
143
 
144
  # 检查是否应该结束当前句子
145
+ if self._should_end_sentence(sentence, sentence_end_mark, is_first_sentence):
146
  self._send_sentence_to_queue(voice_task, sentence, answer_index)
147
  chunks = self._reset_chunks(remain_content)
148
  answer_index += 1
 
152
  chunks.append(remain_content)
153
 
154
  # 处理最后剩余的 chunks
155
+ self._handle_remaining_chunks(voice_task, chunks, answer_index)
156
 
157
  except Exception as e:
158
  print(f'处理语音任务时发生错误: {e}')
159
 
160
+ def _handle_remaining_chunks(self, voice_task: VoiceTask, chunks: list, answer_index: int) -> None:
 
161
  """处理剩余的 chunks"""
162
  if not chunks:
163
  return
164
 
165
  sentence = preprocess_sentence_text(chunks)
166
+ if not sentence or sentence.strip() in self.sentence_end_marks:
167
  return
168
 
169
  self._send_sentence_to_queue(voice_task, sentence, answer_index)