cs2764 committed on
Commit
d67a3fb
·
verified ·
1 Parent(s): 72ed4f2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +29 -14
  2. text_cleaning.py +12 -0
app.py CHANGED
@@ -74,17 +74,17 @@ def estimate_text_duration(text):
74
 
75
  return duration
76
 
77
- def split_text_by_paragraphs(text, max_duration_minutes=5):
78
  """Split text into segments that won't exceed limit with safety margin"""
79
  max_duration = max_duration_minutes
80
  estimated_duration = estimate_text_duration(text)
81
 
82
- logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Limit={max_duration}m")
83
 
84
- if estimated_duration <= max_duration:
85
  return [text]
86
 
87
- logger.info(f"Text duration ({estimated_duration:.2f}m) exceeds limit ({max_duration}m). Splitting...")
88
 
89
  # Split by paragraphs first
90
  paragraphs = text.split('\n\n')
@@ -95,20 +95,31 @@ def split_text_by_paragraphs(text, max_duration_minutes=5):
95
  paragraph_duration = estimate_text_duration(paragraph)
96
 
97
  # If single paragraph is too long, split by sentences
98
- if paragraph_duration > max_duration:
99
- sentences = re.split(r'[.!?]+', paragraph)
100
- for sentence in sentences:
 
 
 
 
 
 
 
 
101
  sentence = sentence.strip()
102
  if not sentence:
103
  continue
104
 
105
- if estimate_text_duration(current_segment + sentence) > max_duration and current_segment:
 
 
106
  segments.append(current_segment.strip())
107
- current_segment = sentence + ". "
108
  else:
109
- current_segment += sentence + ". "
110
  else:
111
- if estimate_text_duration(current_segment + paragraph) > max_duration and current_segment:
 
112
  segments.append(current_segment.strip())
113
  current_segment = paragraph + "\n\n"
114
  else:
@@ -128,9 +139,13 @@ async def generate_audio_segment(text_segment, voice_short_name, rate_str, volum
128
  communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
129
 
130
  audio_data = io.BytesIO()
131
- async for chunk in communicate.stream():
132
- if chunk["type"] == "audio":
133
- audio_data.write(chunk["data"])
 
 
 
 
134
 
135
  audio_data.seek(0)
136
 
 
74
 
75
  return duration
76
 
77
+ def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=1500):
78
  """Split text into segments that won't exceed limit with safety margin"""
79
  max_duration = max_duration_minutes
80
  estimated_duration = estimate_text_duration(text)
81
 
82
+ logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Chars={len(text)}, Limit={max_duration}m/{max_chars}chars")
83
 
84
+ if estimated_duration <= max_duration and len(text) <= max_chars:
85
  return [text]
86
 
87
+ logger.info(f"Text exceeds limits. Splitting...")
88
 
89
  # Split by paragraphs first
90
  paragraphs = text.split('\n\n')
 
95
  paragraph_duration = estimate_text_duration(paragraph)
96
 
97
  # If single paragraph is too long, split by sentences
98
+ # Improved regex to include Chinese punctuation
99
+ if paragraph_duration > max_duration or len(paragraph) > max_chars:
100
+ sentences = re.split(r'([.!?。!?]+)', paragraph)
101
+ # Re-attach delimiters to sentences
102
+ real_sentences = []
103
+ for i in range(0, len(sentences) - 1, 2):
104
+ real_sentences.append(sentences[i] + sentences[i+1])
105
+ if len(sentences) % 2 == 1 and sentences[-1]:
106
+ real_sentences.append(sentences[-1])
107
+
108
+ for sentence in real_sentences:
109
  sentence = sentence.strip()
110
  if not sentence:
111
  continue
112
 
113
+ # Check both duration and char count
114
+ if (estimate_text_duration(current_segment + sentence) > max_duration or
115
+ len(current_segment + sentence) > max_chars) and current_segment:
116
  segments.append(current_segment.strip())
117
+ current_segment = sentence
118
  else:
119
+ current_segment += sentence
120
  else:
121
+ if (estimate_text_duration(current_segment + paragraph) > max_duration or
122
+ len(current_segment + paragraph) > max_chars) and current_segment:
123
  segments.append(current_segment.strip())
124
  current_segment = paragraph + "\n\n"
125
  else:
 
139
  communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
140
 
141
  audio_data = io.BytesIO()
142
+ try:
143
+ async for chunk in communicate.stream():
144
+ if chunk["type"] == "audio":
145
+ audio_data.write(chunk["data"])
146
+ except Exception as e:
147
+ logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
148
+ raise gr.Error(f"Error generating segment {segment_index}: {e}")
149
 
150
  audio_data.seek(0)
151
 
text_cleaning.py CHANGED
@@ -49,8 +49,20 @@ class TextCleaner:
49
  """Normalize whitespace"""
50
  # Replace multiple spaces with single space
51
  text = re.sub(r' +', ' ', text)
 
52
  # Replace multiple newlines with double newline (paragraph break)
53
  text = re.sub(r'\n\s*\n', '\n\n', text)
 
 
 
 
 
 
 
 
 
 
 
54
  return text.strip()
55
 
56
  @staticmethod
 
49
  """Normalize whitespace"""
50
  # Replace multiple spaces with single space
51
  text = re.sub(r' +', ' ', text)
52
+
53
  # Replace multiple newlines with double newline (paragraph break)
54
  text = re.sub(r'\n\s*\n', '\n\n', text)
55
+
56
+ # Merge lines for CJK text (remove single newlines between CJK characters)
57
+ # Lookbehind for CJK/Punctuation, match newline, Lookahead for CJK/Punctuation
58
+ # Ranges:
59
+ # \u4e00-\u9fa5 (Common CJK)
60
+ # \u3000-\u303f (CJK Symbols and Punctuation)
61
+ # \uff00-\uffef (Fullwidth forms)
62
+ cjk_range = r'[\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]'
63
+ pattern = f'(?<={cjk_range})\\s*\\n\\s*(?={cjk_range})'
64
+ text = re.sub(pattern, '', text)
65
+
66
  return text.strip()
67
 
68
  @staticmethod