liumaolin committed on
Commit
e3d17e2
·
1 Parent(s): 08fc60d

重构处理LLM生成chunk内容的方法,以提高文本生成的准确性。

Browse files
src/voice_dialogue/services/text/generator.py CHANGED
@@ -1,5 +1,6 @@
1
  import copy
2
  import time
 
3
  from queue import Queue, Empty
4
 
5
  from langchain.memory import ConversationBufferWindowMemory
@@ -99,12 +100,30 @@ class LLMResponseGenerator(BaseThread):
99
  """重置 chunks 列表"""
100
  return [remain_content] if remain_content else []
101
 
 
 
 
 
 
 
 
102
  def _process_chunk_content(self, chunk_content: str) -> tuple:
103
- """处理 chunk 内容,分离句子结束标记和剩余内容"""
104
- if len(chunk_content) > 1:
105
- return chunk_content[0], chunk_content[1:]
106
- else:
107
- return chunk_content, ''
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  def _process_voice_task(self, voice_task: VoiceTask) -> None:
110
  """处理单个语音任务"""
@@ -140,8 +159,11 @@ class LLMResponseGenerator(BaseThread):
140
 
141
  chunk_content = f'{chunk.content}'
142
 
143
- sentence_end_mark, remain_content = self._process_chunk_content(chunk_content)
144
- chunks.append(sentence_end_mark)
 
 
 
145
 
146
  sentence = preprocess_sentence_text(chunks)
147
  if not sentence:
 
1
  import copy
2
  import time
3
+ import unicodedata
4
  from queue import Queue, Empty
5
 
6
  from langchain.memory import ConversationBufferWindowMemory
 
100
  """重置 chunks 列表"""
101
  return [remain_content] if remain_content else []
102
 
103
+ def _is_punctuation(self, char: str) -> bool:
104
+ """判断一个字符是否是标点符号"""
105
+ if not char or len(char) != 1:
106
+ return False
107
+ # 检查字符的 Unicode 类别是否为标点符号 (Punctuation)
108
+ return unicodedata.category(char).startswith('P')
109
+
110
  def _process_chunk_content(self, chunk_content: str) -> tuple:
111
+ """处理 chunk 内容,从右到左找标点符号并分割成三部分"""
112
+ if not chunk_content:
113
+ return '', '', ''
114
+
115
+ # 从右到左迭代,找到第一个标点符号
116
+ for i in range(len(chunk_content) - 1, -1, -1):
117
+ char = chunk_content[i]
118
+ if self._is_punctuation(char):
119
+ # 找到标点符号,分割成三部分
120
+ before_punct = chunk_content[:i] # 标点之前的部分
121
+ punct = char # 标点符号本身
122
+ after_punct = chunk_content[i + 1:] # 标点之后的部分
123
+ return before_punct, punct, after_punct
124
+
125
+ # 如果没有找到标点符号,返回整个内容作为第一部分
126
+ return chunk_content, '', ''
127
 
128
  def _process_voice_task(self, voice_task: VoiceTask) -> None:
129
  """处理单个语音任务"""
 
159
 
160
  chunk_content = f'{chunk.content}'
161
 
162
+ before_punct, sentence_end_mark, remain_content = self._process_chunk_content(chunk_content)
163
+ if before_punct:
164
+ chunks.append(before_punct)
165
+ if sentence_end_mark:
166
+ chunks.append(sentence_end_mark)
167
 
168
  sentence = preprocess_sentence_text(chunks)
169
  if not sentence: