waxz committed on
Commit
7a85341
·
1 Parent(s): 2d85dce

improve text preprocess

Browse files
Files changed (4) hide show
  1. base_model.py +2 -2
  2. kokoro_model.py +4 -18
  3. supertonic_model.py +7 -3
  4. utils.py +3 -1
base_model.py CHANGED
@@ -44,8 +44,8 @@ class BaseEngine:
44
  except NotImplementedError:
45
  pass
46
 
47
- text = self.preprocess_text(text)
48
- chunks = utils.split_text_into_sentences(text, min_chunk_size=150)
49
 
50
  loop = asyncio.get_event_loop()
51
 
 
44
  except NotImplementedError:
45
  pass
46
 
47
+ chunks = self.preprocess_text(text)
48
+
49
 
50
  loop = asyncio.get_event_loop()
51
 
kokoro_model.py CHANGED
@@ -3,6 +3,7 @@ import re
3
  import asyncio
4
  from kokoro import KPipeline
5
  import base_model
 
6
 
7
  class StreamingEngine(base_model.BaseEngine):
8
  def __init__(self, name):
@@ -43,24 +44,9 @@ class StreamingEngine(base_model.BaseEngine):
43
 
44
  def preprocess_text(self, text):
45
  if not text:
46
- return ""
47
-
48
- is_valid, unsupported = True, []
49
- if not is_valid:
50
- print(f" ⚠️ Contains {len(unsupported)} unsupported character(s): {unsupported[:5]}")
51
- # Escape characters safe for regex usage
52
- pattern = f"[{re.escape(''.join(unsupported))}]"
53
- preprocessed = re.sub(pattern, "", text)
54
-
55
- if preprocessed != text:
56
- print(f" After preprocessing: {preprocessed[:50]}...")
57
- text = preprocessed
58
- else:
59
- # Optional: Comment this out in production to reduce log spam
60
- print(" ✓ All characters supported")
61
-
62
- return text
63
-
64
  def generate(self, chunks: str, voice_name: str, speed: float):
65
  """
66
  Generates audio.
 
3
  import asyncio
4
  from kokoro import KPipeline
5
  import base_model
6
+ import utils
7
 
8
  class StreamingEngine(base_model.BaseEngine):
9
  def __init__(self, name):
 
44
 
45
  def preprocess_text(self, text):
46
  if not text:
47
+ return []
48
+ return [text]
49
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def generate(self, chunks: str, voice_name: str, speed: float):
51
  """
52
  Generates audio.
supertonic_model.py CHANGED
@@ -3,7 +3,7 @@ import re
3
  import asyncio
4
  from supertonic import TTS
5
  import base_model
6
-
7
  class StreamingEngine(base_model.BaseEngine):
8
  def __init__(self, name):
9
  # 1. Initialize configuration variables first
@@ -56,7 +56,10 @@ class StreamingEngine(base_model.BaseEngine):
56
 
57
  def preprocess_text(self, text):
58
  if not text:
59
- return ""
 
 
 
60
 
61
  is_valid, unsupported = self.text_processor.validate_text(text)
62
 
@@ -73,7 +76,8 @@ class StreamingEngine(base_model.BaseEngine):
73
  # Optional: Comment this out in production to reduce log spam
74
  print(" ✓ All characters supported")
75
 
76
- return text
 
77
 
78
  def generate(self, chunks: str, voice_name: str, speed: float):
79
  """
 
3
  import asyncio
4
  from supertonic import TTS
5
  import base_model
6
+ import utils
7
  class StreamingEngine(base_model.BaseEngine):
8
  def __init__(self, name):
9
  # 1. Initialize configuration variables first
 
56
 
57
  def preprocess_text(self, text):
58
  if not text:
59
+ return []
60
+
61
+ split_pattern = r'\n+'
62
+
63
 
64
  is_valid, unsupported = self.text_processor.validate_text(text)
65
 
 
76
  # Optional: Comment this out in production to reduce log spam
77
  print(" ✓ All characters supported")
78
 
79
+ chunks = utils.split_text_into_sentences(text, min_chunk_size=150)
80
+ return chunks
81
 
82
  def generate(self, chunks: str, voice_name: str, speed: float):
83
  """
utils.py CHANGED
@@ -3,7 +3,7 @@ import re
3
  import struct
4
  import lameenc
5
 
6
- def split_text_into_sentences(text: str, min_chunk_size: int = 150):
7
  if not text:
8
  return []
9
 
@@ -17,6 +17,8 @@ def split_text_into_sentences(text: str, min_chunk_size: int = 150):
17
  current_atomic = ""
18
 
19
  for part in raw_parts:
 
 
20
  if re.match(r'^[.?!:;]+$', part):
21
  current_atomic += part
22
  if current_atomic.strip():
 
3
  import struct
4
  import lameenc
5
 
6
+ def split_text_into_sentences(text: str, min_chunk_size: int = 150,split_pattern = r'\n+'):
7
  if not text:
8
  return []
9
 
 
17
  current_atomic = ""
18
 
19
  for part in raw_parts:
20
+ if not part.strip():
21
+ continue
22
  if re.match(r'^[.?!:;]+$', part):
23
  current_atomic += part
24
  if current_atomic.strip():