Spaces:

tsuching
/

Tibetan-tts

Sleeping

App Files Files Community

tsuching committed 28 days ago

Commit

29bc350

verified ·

1 Parent(s): dbb9618

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -13

app.py CHANGED Viewed

@@ -302,34 +302,107 @@ def get_all_phonetics_schemes(text):
     # 4) Return both audio forms + a status message
 #    return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
 def run_task_tts(text: str):
-    # Synthesize `text` in a single TTS call and return ((sr, audio), wav_path, status); non-str input is first coerced with str()
     if not isinstance(text, str):
         text = str(text)
     # Strip surrounding whitespace only; the trailing " །" append on the next line is commented out, so nothing is added
-    text = text.strip() #+ " །";
     # 1) Generate speech via MMS-TTS
-    speech = tts_tibetan(text)  # pipeline expects plain string
     # 2) Clip, cast, flatten for Gradio (browser playback expects float32 in [-1, 1])
-    audio = speech["audio"]
-    sr = int(speech["sampling_rate"])
-    audio = np.clip(audio.astype(np.float32), -1.0, 1.0).flatten()
     # 🔥 Build a copy with 1 second of trailing silence. NOTE(review): padded_audio is never used below; the unpadded `audio` is what gets written and returned
-    silence_duration = 1.0  # seconds
-    silence_samples = int(sr * silence_duration)
-    silence = np.zeros(silence_samples, dtype=np.float32)
-    padded_audio = np.concatenate([audio, silence])
     # 3) Write a WAV file for download/Flutter using PCM_16. NOTE(review): the NamedTemporaryFile handle is never closed in this function
-    tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    sf.write(tmpfile.name, audio, sr, subtype="PCM_16")
     # 4) Return both audio forms + a status message
-    return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
 # Translate/Tokenize function

     # 4) Return both audio forms + a status message
 #    return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
+########
 def run_task_tts(text: str):
+    # Synthesize `text` phrase by phrase and return ((sr, audio), wav_path, status); on empty input or any failure it returns ((None, ""), "", message). Non-str input is coerced with str(), then surrounding whitespace is stripped.
     if not isinstance(text, str):
         text = str(text)
+    text = text.strip()
+    # 1. Segment Text and Filter Empty Chunks
+    # Split on the shad (།, U+0F0D) and on newlines so that each TTS call
+    # receives one short phrase; pieces that are empty after stripping are dropped.
+    # NOTE: the character class matches only the shad, not the tsheg (་, U+0F0B).
+    # The 're' module must be imported at module level (the import is not visible in this hunk).
+    segments = [s.strip() for s in re.split(r'[།\n]', text) if s.strip()]
+    if not segments:
+        return (None, ""), "", "⚠️ Error: No valid Tibetan text found after cleaning/segmentation."
+    # Ordered list of per-phrase audio arrays interleaved with silence gaps
+    audio_segments = []
+    # Sampling rate; 0 means "not set yet". It is reassigned on every loop iteration (presumably identical for all segments; TODO confirm)
+    sr = 0
+    try:
+        # 2. Process each segment
+        for segment in segments:
+            # Re-append a closing shad (།), preceded by a space, before synthesis
+            # (intended to improve phrasing and avoid clipped endings). A segment never
+            # already ends in །, because that character was the split delimiter. The variable name says "tsheg" but the mark is a shad.
+            segment_with_tsheg = segment + " །"
+            # Generate speech for the short segment
+            speech = tts_tibetan(segment_with_tsheg)
+            # Pull the raw samples out of the pipeline result (presumably a NumPy array, since .astype is called on it below)
+            audio_data = speech["audio"]
+            sr = int(speech["sampling_rate"]) # Capture the sampling rate
+            # Cast to float32, clip to [-1, 1] (no rescaling is done), and flatten to 1-D
+            segment_audio = np.clip(audio_data.astype(np.float32), -1.0, 1.0).flatten()
+            audio_segments.append(segment_audio)
+            # Append 0.25 s of silence after every segment, including the last one
+            silence_duration = 0.25 # seconds
+            silence_samples = int(sr * silence_duration)
+            silence = np.zeros(silence_samples, dtype=np.float32)
+            audio_segments.append(silence)
+        # 3. Concatenate all audio segments into the final array
+        final_audio = np.concatenate(audio_segments)
+        # 4. Write a WAV file for download/Flutter using PCM_16. NOTE(review): the file is created with delete=False and its handle is never closed in this function
+        tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        # Guard against sr still being 0. NOTE(review): this check runs after the temp file above has already been created
+        if sr == 0:
+             raise ValueError("Sampling rate was not determined during TTS generation.")
+        sf.write(tmpfile.name, final_audio, sr, subtype="PCM_16")
+        # 5. Return both audio forms + a status message
+        return (sr, final_audio), tmpfile.name, "Tibetan audio generated successfully via segmentation!"
+    except Exception as e:
+        # Any exception raised inside the try block (TTS call, concatenation, sr guard, WAV write) is reported as a status string instead of propagating
+        error_message = f"TTS processing failed for a long text segment: {e}. The segmenting process may have failed or the model encountered an unpronounceable character. Try shorter text."
+        print(f"TTS Error during segmentation: {e}")
+        return (None, ""), "", error_message # Return empty data on failure
+########
+#    def run_task_tts(text: str):
+    # Ensure input is a string
+#    if not isinstance(text, str):
+#        text = str(text)
     # Add extra space to prevent cut endings
+#    text = text.strip() #+ " །";
     # 1) Generate speech via MMS-TTS
+#    speech = tts_tibetan(text)  # pipeline expects plain string
     # 2) Clip, cast, flatten for Gradio (browser playback expects float32 in [-1, 1])
+#    audio = speech["audio"]
+#    sr = int(speech["sampling_rate"])
+#    audio = np.clip(audio.astype(np.float32), -1.0, 1.0).flatten()
     # 🔥 Add 1 second of silence padding
+#    silence_duration = 1.0  # seconds
+#    silence_samples = int(sr * silence_duration)
+#    silence = np.zeros(silence_samples, dtype=np.float32)
+#    padded_audio = np.concatenate([audio, silence])
     # 3) Write a WAV file for download/Flutter using PCM_16
+#    tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+#    sf.write(tmpfile.name, audio, sr, subtype="PCM_16")
     # 4) Return both audio forms + a status message
+#    return (sr, audio), tmpfile.name, "Tibetan audio generated successfully!"
 # Translate/Tokenize function