Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -302,34 +302,107 @@ def get_all_phonetics_schemes(text):
|
|
| 302 |
# 4) Return both audio forms + a status message
|
| 303 |
# return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
|
| 304 |
|
|
|
|
|
|
|
| 305 |
def run_task_tts(text: str):
    """Synthesize Tibetan speech for ``text`` in a single TTS call.

    Args:
        text: Text to speak. Non-string values are converted with ``str()``;
            surrounding whitespace is stripped before synthesis.

    Returns:
        A 3-tuple ``(audio, wav_path, status)`` where ``audio`` is
        ``(sampling_rate, samples)`` with ``samples`` a flat float32 array
        clipped to [-1, 1] and followed by one second of silence,
        ``wav_path`` is the path of a PCM_16 WAV file holding the same
        samples, and ``status`` is a human-readable success message.

    The WAV file is created with ``delete=False`` and is not removed here.
    """
    # Ensure input is a string
    if not isinstance(text, str):
        text = str(text)
    text = text.strip()

    # 1) Generate speech; the result is read through its "audio" and
    #    "sampling_rate" keys.
    speech = tts_tibetan(text)  # pipeline expects plain string

    # 2) Clip, cast, flatten for Gradio (browser playback expects float32 in [-1, 1])
    sr = int(speech["sampling_rate"])
    audio = np.clip(speech["audio"].astype(np.float32), -1.0, 1.0).flatten()

    # Append 1 second of silence so the end of the utterance is not cut off.
    # The padded signal is what gets written and returned; previously it was
    # computed and then discarded.
    silence_duration = 1.0  # seconds
    silence = np.zeros(int(sr * silence_duration), dtype=np.float32)
    padded_audio = np.concatenate([audio, silence])

    # 3) Write a WAV file for download/Flutter using PCM_16.
    #    Only the reserved path is needed, so the handle is closed before
    #    soundfile opens the file by name (avoids a leaked descriptor).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        wav_path = tmpfile.name
    sf.write(wav_path, padded_audio, sr, subtype="PCM_16")

    # 4) Return both audio forms + a status message
    return (sr, padded_audio), wav_path, "Tibetan audio generated successfully!"
|
| 333 |
|
| 334 |
|
| 335 |
# Translate/Tokenize function
|
|
|
|
| 302 |
# 4) Return both audio forms + a status message
|
| 303 |
# return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
|
| 304 |
|
| 305 |
+
|
| 306 |
+
########
|
| 307 |
def run_task_tts(text: str):
    """Synthesize Tibetan speech for ``text`` one phrase at a time.

    The input is split on the shad (།) and on newlines, every non-empty
    piece is passed to ``tts_tibetan`` on its own, and the resulting clips
    are concatenated with a short silence after each clip.

    Args:
        text: Text to speak. Non-string values are converted with ``str()``;
            surrounding whitespace is stripped.

    Returns:
        A 3-tuple ``(audio, wav_path, status)``. On success ``audio`` is
        ``(sampling_rate, samples)`` with ``samples`` a flat float32 array
        clipped to [-1, 1], ``wav_path`` is the path of a PCM_16 WAV file
        holding the same samples, and ``status`` is a success message.
        When no usable text remains, or when any exception is raised while
        synthesizing or writing, the result is
        ``((None, ""), "", error_message)``.

    The WAV file is created with ``delete=False`` and is not removed here.
    """
    # Ensure input is a string and strip whitespace
    if not isinstance(text, str):
        text = str(text)
    text = text.strip()

    # 1. Segment the text and drop empty chunks.
    #    The character class matches the shad (།) and the newline; pieces
    #    that are empty after stripping are discarded.
    segments = [s.strip() for s in re.split(r'[།\n]', text) if s.strip()]

    if not segments:
        return (None, ""), "", "⚠️ Error: No valid Tibetan text found after cleaning/segmentation."

    # Silence inserted after every segment, in seconds.
    silence_duration = 0.25

    try:
        # 2. Synthesize each segment separately.
        clips = []
        sr = 0
        for segment in segments:
            # The split above removed the shad, so one is put back (after a
            # space) before the segment is handed to the model.
            speech = tts_tibetan(segment + " །")

            # NOTE(review): assumes every call reports the same sampling
            # rate; the last reported value is used for the whole output.
            sr = int(speech["sampling_rate"])

            # Cast to float32, clip to [-1, 1] and flatten to 1-D.
            clips.append(
                np.clip(speech["audio"].astype(np.float32), -1.0, 1.0).flatten()
            )

        # We must have a valid sampling rate before building silence or
        # writing the file.
        if sr == 0:
            raise ValueError("Sampling rate was not determined during TTS generation.")

        # 3. Interleave clips with one shared silence buffer (built once
        #    rather than once per segment) and concatenate.
        silence = np.zeros(int(sr * silence_duration), dtype=np.float32)
        pieces = []
        for clip in clips:
            pieces.extend((clip, silence))
        final_audio = np.concatenate(pieces)

        # 4. Write a WAV file for download/Flutter using PCM_16.
        #    The temp file is created only after everything else succeeded,
        #    and its handle is closed before soundfile opens it by name, so
        #    no descriptor is leaked.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
            wav_path = tmpfile.name
        sf.write(wav_path, final_audio, sr, subtype="PCM_16")

        # 5. Return both audio forms + a status message
        return (sr, final_audio), wav_path, "Tibetan audio generated successfully via segmentation!"

    except Exception as e:
        # Handler boundary: report any failure during TTS, concatenation or
        # file writing to the caller instead of raising.
        error_message = f"TTS processing failed for a long text segment: {e}. The segmenting process may have failed or the model encountered an unpronounceable character. Try shorter text."
        print(f"TTS Error during segmentation: {e}")
        return (None, ""), "", error_message  # Return empty data on failure
|
| 377 |
+
########
|
| 378 |
+
# def run_task_tts(text: str):
|
| 379 |
+
# Ensure input is a string
|
| 380 |
+
# if not isinstance(text, str):
|
| 381 |
+
# text = str(text)
|
| 382 |
+
|
| 383 |
# Add extra space to prevent cut endings
|
| 384 |
+
# text = text.strip() #+ " །";
|
| 385 |
|
| 386 |
# 1) Generate speech via MMS-TTS
|
| 387 |
+
# speech = tts_tibetan(text) # pipeline expects plain string
|
| 388 |
|
| 389 |
# 2) Clip, cast, flatten for Gradio (browser playback expects float32 in [-1, 1])
|
| 390 |
+
# audio = speech["audio"]
|
| 391 |
+
# sr = int(speech["sampling_rate"])
|
| 392 |
+
# audio = np.clip(audio.astype(np.float32), -1.0, 1.0).flatten()
|
| 393 |
|
| 394 |
# 🔥 Add 1 second of silence padding
|
| 395 |
+
# silence_duration = 1.0 # seconds
|
| 396 |
+
# silence_samples = int(sr * silence_duration)
|
| 397 |
+
# silence = np.zeros(silence_samples, dtype=np.float32)
|
| 398 |
+
# padded_audio = np.concatenate([audio, silence])
|
| 399 |
|
| 400 |
# 3) Write a WAV file for download/Flutter using PCM_16
|
| 401 |
+
# tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
| 402 |
+
# sf.write(tmpfile.name, audio, sr, subtype="PCM_16")
|
| 403 |
|
| 404 |
# 4) Return both audio forms + a status message
|
| 405 |
+
# return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
|
| 406 |
|
| 407 |
|
| 408 |
# Translate/Tokenize function
|