Spaces:
Running
Running
Upload 2 files
Browse files
- app.py +29 -14
- text_cleaning.py +12 -0
app.py
CHANGED
|
@@ -74,17 +74,17 @@ def estimate_text_duration(text):
|
|
| 74 |
|
| 75 |
return duration
|
| 76 |
|
| 77 |
-
def split_text_by_paragraphs(text, max_duration_minutes=5):
|
| 78 |
"""Split text into segments that won't exceed limit with safety margin"""
|
| 79 |
max_duration = max_duration_minutes
|
| 80 |
estimated_duration = estimate_text_duration(text)
|
| 81 |
|
| 82 |
-
logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Limit={max_duration}m")
|
| 83 |
|
| 84 |
-
if estimated_duration <= max_duration:
|
| 85 |
return [text]
|
| 86 |
|
| 87 |
-
logger.info(f"Text
|
| 88 |
|
| 89 |
# Split by paragraphs first
|
| 90 |
paragraphs = text.split('\n\n')
|
|
@@ -95,20 +95,31 @@ def split_text_by_paragraphs(text, max_duration_minutes=5):
|
|
| 95 |
paragraph_duration = estimate_text_duration(paragraph)
|
| 96 |
|
| 97 |
# If single paragraph is too long, split by sentences
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
sentence = sentence.strip()
|
| 102 |
if not sentence:
|
| 103 |
continue
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
| 106 |
segments.append(current_segment.strip())
|
| 107 |
-
current_segment = sentence
|
| 108 |
else:
|
| 109 |
-
current_segment += sentence
|
| 110 |
else:
|
| 111 |
-
if estimate_text_duration(current_segment + paragraph) > max_duration
|
|
|
|
| 112 |
segments.append(current_segment.strip())
|
| 113 |
current_segment = paragraph + "\n\n"
|
| 114 |
else:
|
|
@@ -128,9 +139,13 @@ async def generate_audio_segment(text_segment, voice_short_name, rate_str, volum
|
|
| 128 |
communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
|
| 129 |
|
| 130 |
audio_data = io.BytesIO()
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
audio_data.seek(0)
|
| 136 |
|
|
|
|
| 74 |
|
| 75 |
return duration
|
| 76 |
|
| 77 |
+
def split_text_by_paragraphs(text, max_duration_minutes=5, max_chars=1500):
|
| 78 |
"""Split text into segments that won't exceed limit with safety margin"""
|
| 79 |
max_duration = max_duration_minutes
|
| 80 |
estimated_duration = estimate_text_duration(text)
|
| 81 |
|
| 82 |
+
logger.info(f"Checking segmentation: Duration={estimated_duration:.2f}m, Chars={len(text)}, Limit={max_duration}m/{max_chars}chars")
|
| 83 |
|
| 84 |
+
if estimated_duration <= max_duration and len(text) <= max_chars:
|
| 85 |
return [text]
|
| 86 |
|
| 87 |
+
logger.info(f"Text exceeds limits. Splitting...")
|
| 88 |
|
| 89 |
# Split by paragraphs first
|
| 90 |
paragraphs = text.split('\n\n')
|
|
|
|
| 95 |
paragraph_duration = estimate_text_duration(paragraph)
|
| 96 |
|
| 97 |
# If single paragraph is too long, split by sentences
|
| 98 |
+
# Improved regex to include Chinese punctuation
|
| 99 |
+
if paragraph_duration > max_duration or len(paragraph) > max_chars:
|
| 100 |
+
sentences = re.split(r'([.!?。!?]+)', paragraph)
|
| 101 |
+
# Re-attach delimiters to sentences
|
| 102 |
+
real_sentences = []
|
| 103 |
+
for i in range(0, len(sentences) - 1, 2):
|
| 104 |
+
real_sentences.append(sentences[i] + sentences[i+1])
|
| 105 |
+
if len(sentences) % 2 == 1 and sentences[-1]:
|
| 106 |
+
real_sentences.append(sentences[-1])
|
| 107 |
+
|
| 108 |
+
for sentence in real_sentences:
|
| 109 |
sentence = sentence.strip()
|
| 110 |
if not sentence:
|
| 111 |
continue
|
| 112 |
|
| 113 |
+
# Check both duration and char count
|
| 114 |
+
if (estimate_text_duration(current_segment + sentence) > max_duration or
|
| 115 |
+
len(current_segment + sentence) > max_chars) and current_segment:
|
| 116 |
segments.append(current_segment.strip())
|
| 117 |
+
current_segment = sentence
|
| 118 |
else:
|
| 119 |
+
current_segment += sentence
|
| 120 |
else:
|
| 121 |
+
if (estimate_text_duration(current_segment + paragraph) > max_duration or
|
| 122 |
+
len(current_segment + paragraph) > max_chars) and current_segment:
|
| 123 |
segments.append(current_segment.strip())
|
| 124 |
current_segment = paragraph + "\n\n"
|
| 125 |
else:
|
|
|
|
| 139 |
communicate = edge_tts.Communicate(text_segment, voice_short_name, rate=rate_str, volume=volume_str, pitch=pitch_str)
|
| 140 |
|
| 141 |
audio_data = io.BytesIO()
|
| 142 |
+
try:
|
| 143 |
+
async for chunk in communicate.stream():
|
| 144 |
+
if chunk["type"] == "audio":
|
| 145 |
+
audio_data.write(chunk["data"])
|
| 146 |
+
except Exception as e:
|
| 147 |
+
logger.error(f"Error generating segment {segment_index} (Length: {len(text_segment)} chars): {e}")
|
| 148 |
+
raise gr.Error(f"Error generating segment {segment_index}: {e}")
|
| 149 |
|
| 150 |
audio_data.seek(0)
|
| 151 |
|
text_cleaning.py
CHANGED
|
@@ -49,8 +49,20 @@ class TextCleaner:
|
|
| 49 |
"""Normalize whitespace"""
|
| 50 |
# Replace multiple spaces with single space
|
| 51 |
text = re.sub(r' +', ' ', text)
|
|
|
|
| 52 |
# Replace multiple newlines with double newline (paragraph break)
|
| 53 |
text = re.sub(r'\n\s*\n', '\n\n', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
return text.strip()
|
| 55 |
|
| 56 |
@staticmethod
|
|
|
|
| 49 |
"""Normalize whitespace"""
|
| 50 |
# Replace multiple spaces with single space
|
| 51 |
text = re.sub(r' +', ' ', text)
|
| 52 |
+
|
| 53 |
# Replace multiple newlines with double newline (paragraph break)
|
| 54 |
text = re.sub(r'\n\s*\n', '\n\n', text)
|
| 55 |
+
|
| 56 |
+
# Merge lines for CJK text (remove single newlines between CJK characters)
|
| 57 |
+
# Lookbehind for CJK/Punctuation, match newline, Lookahead for CJK/Punctuation
|
| 58 |
+
# Ranges:
|
| 59 |
+
# \u4e00-\u9fa5 (Common CJK)
|
| 60 |
+
# \u3000-\u303f (CJK Symbols and Punctuation)
|
| 61 |
+
# \uff00-\uffef (Fullwidth forms)
|
| 62 |
+
cjk_range = r'[\u4e00-\u9fa5\u3000-\u303f\uff00-\uffef]'
|
| 63 |
+
pattern = f'(?<={cjk_range})\\s*\\n\\s*(?={cjk_range})'
|
| 64 |
+
text = re.sub(pattern, '', text)
|
| 65 |
+
|
| 66 |
return text.strip()
|
| 67 |
|
| 68 |
@staticmethod
|