abreza committed on
Commit
da2ee9a
·
1 Parent(s): 81b0116

feat: improved number handling and audio processing

Browse files

- Add sentence splitting functionality
- Ensure audio segments are flattened for consistent output
- Improve number handling

Files changed (4) hide show
  1. interface.py +37 -11
  2. sentence_splitter.py +123 -0
  3. synthesis.py +115 -31
  4. text_utils.py +84 -0
interface.py CHANGED
@@ -32,11 +32,18 @@ def ge2pe_infer(model_name: str, text: str, use_rules: bool, use_dict: bool):
32
 
33
  def create_interface():
34
  with gr.Blocks(title="Persian Speech Suite", css=custom_css) as demo:
35
- gr.Markdown("# Persian Speech Suite: GE2PE & TTS\n" "A unified playground for Persian grapheme‑to‑phoneme conversion (GE2PE) **and** text‑to‑speech synthesis (Mana TTS).")
 
 
 
 
36
 
37
  with gr.Tabs():
38
  with gr.TabItem("Grapheme → Phoneme (GE2PE)"):
39
- gr.Markdown("Convert Persian text to its phonemic transcription. Choose between **Homo‑GE2PE** and **Homo‑T5**, optionally applying short‑vowel rules and/or a custom dictionary.")
 
 
 
40
 
41
  with gr.Row():
42
  model_selector = gr.Radio(
@@ -75,20 +82,34 @@ def create_interface():
75
  )
76
 
77
  with gr.TabItem("Text‑to‑Speech"):
78
- gr.Markdown("Generate natural‑sounding Persian speech from your text using Tacotron2 + HiFiGAN.")
79
-
80
- tts_input = gr.Textbox(
81
- label="Persian Text",
82
- placeholder="مدل تولید گفتار با دادگان نسل مانا",
83
- lines=5,
84
  )
85
 
86
- tts_button = gr.Button("Generate Speech", variant="primary")
87
- tts_output = gr.Audio(label="Generated Speech")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  tts_button.click(
90
  fn=generate_speech,
91
- inputs=[tts_input],
92
  outputs=[tts_output],
93
  )
94
 
@@ -98,6 +119,11 @@ def create_interface():
98
  ["ایران سرزمین زیبایی‌ها و افتخارات است."],
99
  ["فناوری هوش مصنوعی به سرعت در حال پیشرفت است."],
100
  ["مدل تولید گفتار با دادگان نسل مانا"],
 
 
 
 
 
101
  ],
102
  inputs=[tts_input],
103
  )
 
32
 
33
  def create_interface():
34
  with gr.Blocks(title="Persian Speech Suite", css=custom_css) as demo:
35
+ gr.Markdown(
36
+ "# Persian Speech Suite: GE2PE & TTS\n"
37
+ "A unified playground for Persian grapheme‑to‑phoneme conversion (GE2PE) **and** text‑to‑speech synthesis (Mana TTS).\n\n"
38
+ "✨ **Now supports long texts!** The TTS system automatically splits long texts into natural segments."
39
+ )
40
 
41
  with gr.Tabs():
42
  with gr.TabItem("Grapheme → Phoneme (GE2PE)"):
43
+ gr.Markdown(
44
+ "Convert Persian text to its phonemic transcription. "
45
+ "Choose between **Homo‑GE2PE** and **Homo‑T5**, optionally applying short‑vowel rules and/or a custom dictionary."
46
+ )
47
 
48
  with gr.Row():
49
  model_selector = gr.Radio(
 
82
  )
83
 
84
  with gr.TabItem("Text‑to‑Speech"):
85
+ gr.Markdown(
86
+ "Generate natural‑sounding Persian speech from your text using Tacotron2 + HiFiGAN.\n\n"
87
+ "✨ **New:** Supports long texts! The system automatically splits text into natural segments "
88
+ "and adds pauses between them for better readability."
 
 
89
  )
90
 
91
+ with gr.Row():
92
+ with gr.Column(scale=2):
93
+ tts_input = gr.Textbox(
94
+ label="Persian Text",
95
+ placeholder="متن فارسی خود را اینجا بنویسید...",
96
+ lines=8,
97
+ )
98
+
99
+ with gr.Row():
100
+ tts_add_pauses = gr.Checkbox(
101
+ value=True,
102
+ label="Add pauses between segments",
103
+ info="Adds 300ms pause between text segments for natural flow"
104
+ )
105
+
106
+ tts_button = gr.Button("Generate Speech", variant="primary", size="lg")
107
+
108
+ tts_output = gr.Audio(label="Generated Speech", type="filepath")
109
 
110
  tts_button.click(
111
  fn=generate_speech,
112
+ inputs=[tts_input, gr.State(None), tts_add_pauses],
113
  outputs=[tts_output],
114
  )
115
 
 
119
  ["ایران سرزمین زیبایی‌ها و افتخارات است."],
120
  ["فناوری هوش مصنوعی به سرعت در حال پیشرفت است."],
121
  ["مدل تولید گفتار با دادگان نسل مانا"],
122
+ [
123
+ "هوش مصنوعی یکی از شگفت‌انگیزترین دستاوردهای بشر در قرن بیست و یکم است. "
124
+ "این فناوری توانایی یادگیری، استدلال و حل مسئله را به ماشین‌ها می‌دهد. "
125
+ "از پردازش زبان طبیعی گرفته تا بینایی کامپیوتری، هوش مصنوعی در حال تغییر دنیای ماست."
126
+ ],
127
  ],
128
  inputs=[tts_input],
129
  )
sentence_splitter.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
class PersianSentenceSplitter:
    """Split Persian text into TTS-friendly segments.

    Text is first split at strong sentence terminators; any sentence still
    longer than ``max_chars`` is split at weak boundaries (commas), and
    finally on whitespace as a last resort.
    """

    def __init__(self, max_chars: int = 200, min_chars: int = 50):
        # Hard upper bound on segment length, in characters.
        self.max_chars = max_chars
        # NOTE(review): min_chars is stored but never read anywhere in this
        # class — kept for interface compatibility / future merging of tiny
        # segments. Confirm before relying on it.
        self.min_chars = min_chars

        # Strong sentence terminators: Latin ".!?" plus Persian/Arabic
        # question mark and Urdu full stop.
        self.sentence_endings = r'[.!?؟۔]'

        # Weak boundaries, used only when a sentence exceeds max_chars.
        self.weak_boundaries = r'[،,;؛]'

    def clean_text(self, text: str) -> str:
        """Normalize whitespace, letter forms and digits; return stripped text."""
        # Collapse all runs of whitespace to a single space.
        text = re.sub(r'\s+', ' ', text)

        # Underscore is used as a stand-in for the zero-width non-joiner.
        text = text.replace('_', '\u200c')

        # Normalize Arabic kaf/yeh to their Persian forms.
        text = text.replace('ك', 'ک').replace('ي', 'ی')

        # Map Persian digits to ASCII.
        persian_digits = '۰۱۲۳۴۵۶۷۸۹'
        english_digits = '0123456789'
        digit_map = str.maketrans(persian_digits, english_digits)
        text = text.translate(digit_map)

        # Map Arabic-Indic digits to ASCII as well.
        arabic_digits = '٠١٢٣٤٥٦٧٨٩'
        arabic_map = str.maketrans(arabic_digits, english_digits)
        text = text.translate(arabic_map)

        return text.strip()

    def split_by_punctuation(self, text: str) -> List[str]:
        """Split at sentence terminators, keeping each terminator attached
        to its sentence."""
        # re.split with one capture group alternates text and terminator:
        # [chunk, end, chunk, end, ..., tail] — always odd length.
        segments = re.split(f'({self.sentence_endings})', text)

        sentences = []
        for i in range(0, len(segments) - 1, 2):
            # Re-attach the terminator to the text preceding it. (The
            # original had an unreachable `else` branch here: the range
            # step already guarantees i + 1 < len(segments).)
            sentence = (segments[i] + segments[i + 1]).strip()
            if sentence:
                sentences.append(sentence)

        # Trailing text with no terminator.
        if len(segments) % 2 == 1 and segments[-1].strip():
            sentences.append(segments[-1].strip())

        return sentences

    def split_long_sentence(self, sentence: str) -> List[str]:
        """Break a sentence longer than max_chars at weak boundaries,
        falling back to word-level splitting for oversized chunks."""
        if len(sentence) <= self.max_chars:
            return [sentence]

        chunks = []
        current_chunk = ""

        # Iterate text/boundary parts directly instead of range(len(...)).
        for part in re.split(f'({self.weak_boundaries})', sentence):
            if len(current_chunk + part) > self.max_chars and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = part
            else:
                current_chunk += part

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        # Any chunk still too long gets force-split on whitespace.
        final_chunks = []
        for chunk in chunks:
            if len(chunk) > self.max_chars:
                final_chunks.extend(self.force_split_by_words(chunk))
            else:
                final_chunks.append(chunk)

        return final_chunks

    def force_split_by_words(self, text: str) -> List[str]:
        """Split text on whitespace into chunks of at most ~max_chars."""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            word_length = len(word) + 1  # +1 for the joining space

            if current_length + word_length > self.max_chars and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = word_length
            else:
                current_chunk.append(word)
                current_length += word_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def split(self, text: str) -> List[str]:
        """Clean *text* and return segments each at most max_chars long
        (best effort; a single unbreakable word may still exceed it)."""
        text = self.clean_text(text)

        if not text:
            return []

        if len(text) <= self.max_chars:
            return [text]

        final_segments = []
        for sentence in self.split_by_punctuation(text):
            if len(sentence) > self.max_chars:
                final_segments.extend(self.split_long_sentence(sentence))
            else:
                final_segments.append(sentence)

        return [seg.strip() for seg in final_segments if seg.strip()]
synthesis.py CHANGED
@@ -1,83 +1,167 @@
1
  import os
2
  import sys
 
3
  import numpy as np
4
  import torch
5
  import soundfile as sf
6
  import spaces
7
  from config import models_path, results_path, sample_path, BASE_DIR
 
 
8
 
9
  encoder = None
10
  synthesizer = None
11
  vocoder = None
 
12
 
13
  def load_models():
14
- global encoder, synthesizer, vocoder
15
-
16
  try:
17
  sys.path.append(os.path.join(BASE_DIR, 'pmt2'))
18
-
19
  from encoder import inference as encoder_module
20
  from synthesizer.inference import Synthesizer
21
  from parallel_wavegan.utils import load_model as vocoder_hifigan
22
-
23
  global encoder
24
  encoder = encoder_module
25
-
26
  print("Loading encoder model...")
27
  encoder.load_model(os.path.join(models_path, 'encoder.pt'))
28
-
29
  print("Loading synthesizer model...")
30
  synthesizer = Synthesizer(os.path.join(models_path, 'synthesizer.pt'))
31
-
32
  print("Loading HiFiGAN vocoder...")
33
  vocoder = vocoder_hifigan(os.path.join(models_path, 'vocoder_HiFiGAN.pkl'))
34
  vocoder.remove_weight_norm()
35
  vocoder = vocoder.eval().to('cuda' if torch.cuda.is_available() else 'cpu')
36
-
 
 
 
37
  return True
38
  except Exception as e:
39
  import traceback
40
  print(f"Error loading models: {traceback.format_exc()}")
41
  return False
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  @spaces.GPU(duration=120)
44
- def generate_speech(text, reference_audio=None):
45
  if not text or text.strip() == "":
46
  return None
47
-
48
  try:
49
  if reference_audio is None:
50
  ref_wav_path = sample_path
51
  else:
52
  ref_wav_path = os.path.join(results_path, "reference_audio.wav")
53
  sf.write(ref_wav_path, reference_audio[1], reference_audio[0])
54
-
55
  print(f"Using reference audio: {ref_wav_path}")
56
-
57
  wav = synthesizer.load_preprocess_wav(ref_wav_path)
58
-
59
  encoder_wav = encoder.preprocess_wav(wav)
60
  embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
61
-
62
- texts = [text]
63
- embeds = [embed] * len(texts)
64
- specs = synthesizer.synthesize_spectrograms(texts, embeds)
65
- spec = np.concatenate(specs, axis=1)
66
-
67
- x = torch.from_numpy(spec.T).to('cuda' if torch.cuda.is_available() else 'cpu')
68
-
69
- with torch.no_grad():
70
- wav = vocoder.inference(x)
71
-
72
- wav = wav.cpu().numpy()
73
- wav = wav / np.abs(wav).max() * 0.97
74
-
75
- output_filename = f"generated_{hash(text) % 10000}.wav"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  output_path = os.path.join(results_path, output_filename)
77
- sf.write(output_path, wav, synthesizer.sample_rate)
78
-
 
 
 
79
  return output_path
80
-
81
  except Exception as e:
82
  import traceback
83
  error_details = traceback.format_exc()
 
1
  import os
2
  import sys
3
+ import re
4
  import numpy as np
5
  import torch
6
  import soundfile as sf
7
  import spaces
8
  from config import models_path, results_path, sample_path, BASE_DIR
9
+ from sentence_splitter import PersianSentenceSplitter
10
+ from text_utils import convert_number_to_text
11
 
12
  encoder = None
13
  synthesizer = None
14
  vocoder = None
15
+ sentence_splitter = None
16
 
17
def load_models():
    """Load the encoder, synthesizer, HiFiGAN vocoder and sentence splitter
    into module globals.

    Returns:
        bool: True on success, False on any failure (traceback is printed).
    """
    global encoder, synthesizer, vocoder, sentence_splitter

    try:
        # Make the bundled pmt2 package importable before the imports below.
        sys.path.append(os.path.join(BASE_DIR, 'pmt2'))

        from encoder import inference as encoder_module
        from synthesizer.inference import Synthesizer
        from parallel_wavegan.utils import load_model as vocoder_hifigan

        # NOTE(review): redundant — 'encoder' is already declared global at
        # the top of the function; harmless duplicate declaration.
        global encoder
        encoder = encoder_module

        print("Loading encoder model...")
        encoder.load_model(os.path.join(models_path, 'encoder.pt'))

        print("Loading synthesizer model...")
        synthesizer = Synthesizer(os.path.join(models_path, 'synthesizer.pt'))

        print("Loading HiFiGAN vocoder...")
        vocoder = vocoder_hifigan(os.path.join(models_path, 'vocoder_HiFiGAN.pkl'))
        # Weight norm is a training-time aid; remove it for inference.
        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to('cuda' if torch.cuda.is_available() else 'cpu')

        # Splitter limits here (150/30) intentionally differ from the class
        # defaults (200/50) — tuned for this synthesizer.
        sentence_splitter = PersianSentenceSplitter(max_chars=150, min_chars=30)

        print("Models loaded successfully!")
        return True
    except Exception as e:
        import traceback
        print(f"Error loading models: {traceback.format_exc()}")
        return False
49
 
50
+
51
def normalize_text_for_synthesis(text: str) -> str:
    """Normalize Persian text right before synthesis.

    Normalizes Arabic letter forms, converts '_' to ZWNJ, collapses
    whitespace, and spells out digit runs (Persian, ASCII or Arabic-Indic,
    with optional thousands separators) as Persian words.
    """
    # Normalize Arabic kaf/yeh to Persian forms.
    text = text.replace('ك', 'ک').replace('ي', 'ی')

    # Underscore is used as a stand-in for the zero-width non-joiner.
    text = text.replace('_', '\u200c')

    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    # Digit runs, optionally grouped by ASCII comma, Arabic comma or the
    # Arabic thousands separator.
    number_pattern = r'[۰-۹0-9٠-٩]+(?:[,،٬][۰-۹0-9٠-٩]+)*'

    def replace_number(match):
        num_str = match.group(0)
        try:
            return convert_number_to_text(num_str)
        except Exception:
            # Narrowed from a bare `except:` — keep best-effort behavior
            # (leave the token unchanged) without swallowing SystemExit /
            # KeyboardInterrupt.
            return num_str

    text = re.sub(number_pattern, replace_number, text)

    return text
71
+
72
+
73
def synthesize_segment(text_segment: str, embed: np.ndarray) -> np.ndarray:
    """Synthesize a single text segment into a mono waveform.

    Returns the waveform as a 1-D numpy array, or None if synthesis of this
    segment fails (the traceback is printed).
    """
    try:
        text_segment = normalize_text_for_synthesis(text_segment)

        # One segment in, one mel spectrogram out.
        mel = synthesizer.synthesize_spectrograms([text_segment], [embed])[0]

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        mel_tensor = torch.from_numpy(mel.T).to(device)

        # Vocoder inference only — no gradients needed.
        with torch.no_grad():
            audio = vocoder.inference(mel_tensor)

        audio = audio.cpu().numpy()

        # Collapse any singleton channel dimension to 1-D.
        return audio.squeeze() if audio.ndim > 1 else audio

    except Exception as e:
        import traceback
        print(f"Error synthesizing segment '{text_segment[:50]}...': {traceback.format_exc()}")
        return None
96
+
97
+
98
def add_silence(duration_ms: int = 300) -> np.ndarray:
    """Return a silent float32 buffer of *duration_ms* milliseconds at the
    synthesizer's sample rate."""
    n_samples = int(synthesizer.sample_rate * duration_ms / 1000)
    return np.zeros(n_samples, dtype=np.float32)
102
+
103
+
104
  @spaces.GPU(duration=120)
105
+ def generate_speech(text, reference_audio=None, add_pauses: bool = True):
106
  if not text or text.strip() == "":
107
  return None
108
+
109
  try:
110
  if reference_audio is None:
111
  ref_wav_path = sample_path
112
  else:
113
  ref_wav_path = os.path.join(results_path, "reference_audio.wav")
114
  sf.write(ref_wav_path, reference_audio[1], reference_audio[0])
115
+
116
  print(f"Using reference audio: {ref_wav_path}")
117
+
118
  wav = synthesizer.load_preprocess_wav(ref_wav_path)
119
+
120
  encoder_wav = encoder.preprocess_wav(wav)
121
  embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
122
+
123
+ text_segments = sentence_splitter.split(text)
124
+
125
+ print(f"Split text into {len(text_segments)} segments:")
126
+ for i, segment in enumerate(text_segments, 1):
127
+ print(f" Segment {i}: {segment[:60]}{'...' if len(segment) > 60 else ''}")
128
+
129
+ audio_segments = []
130
+ silence = add_silence(300) if add_pauses else None # 300ms pause
131
+
132
+ for i, segment in enumerate(text_segments):
133
+ print(f"Processing segment {i+1}/{len(text_segments)}...")
134
+
135
+ segment_wav = synthesize_segment(segment, embed)
136
+
137
+ if segment_wav is not None:
138
+ segment_wav = segment_wav.flatten() if segment_wav.ndim > 1 else segment_wav
139
+ audio_segments.append(segment_wav)
140
+
141
+ if add_pauses and i < len(text_segments) - 1:
142
+ audio_segments.append(silence)
143
+ else:
144
+ print(f"Warning: Failed to synthesize segment {i+1}")
145
+
146
+ if not audio_segments:
147
+ print("Error: No audio segments were generated successfully")
148
+ return None
149
+
150
+ audio_segments = [seg.flatten() if seg.ndim > 1 else seg for seg in audio_segments]
151
+
152
+ final_wav = np.concatenate(audio_segments)
153
+
154
+ final_wav = final_wav / np.abs(final_wav).max() * 0.97
155
+
156
+ output_filename = f"generated_{abs(hash(text)) % 100000}.wav"
157
  output_path = os.path.join(results_path, output_filename)
158
+ sf.write(output_path, final_wav, synthesizer.sample_rate)
159
+
160
+ print(f"✓ Successfully generated speech: {output_path}")
161
+ print(f" Total duration: {len(final_wav) / synthesizer.sample_rate:.2f} seconds")
162
+
163
  return output_path
164
+
165
  except Exception as e:
166
  import traceback
167
  error_details = traceback.format_exc()
text_utils.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single digit characters (Persian and ASCII forms) to Persian words.
PERSIAN_DIGITS = {
    '۰': 'صفر', '۱': 'یک', '۲': 'دو', '۳': 'سه', '۴': 'چهار',
    '۵': 'پنج', '۶': 'شش', '۷': 'هفت', '۸': 'هشت', '۹': 'نه',
    '0': 'صفر', '1': 'یک', '2': 'دو', '3': 'سه', '4': 'چهار',
    '5': 'پنج', '6': 'شش', '7': 'هفت', '8': 'هشت', '9': 'نه'
}

# Persian words for teens, tens and hundreds, used by convert_three_digit.
PERSIAN_NUMBERS = {
    10: 'ده', 11: 'یازده', 12: 'دوازده', 13: 'سیزده', 14: 'چهارده',
    15: 'پانزده', 16: 'شانزده', 17: 'هفده', 18: 'هجده', 19: 'نوزده',
    20: 'بیست', 30: 'سی', 40: 'چهل', 50: 'پنجاه',
    60: 'شصت', 70: 'هفتاد', 80: 'هشتاد', 90: 'نود',
    100: 'صد', 200: 'دویست', 300: 'سیصد', 400: 'چهارصد', 500: 'پانصد',
    600: 'ششصد', 700: 'هفتصد', 800: 'هشتصد', 900: 'نهصد'
}


def convert_three_digit(num: int) -> str:
    """Convert 0 <= num <= 999 to Persian words ('' for 0)."""
    if num == 0:
        return ''

    if num < 10:
        return PERSIAN_DIGITS[str(num)]
    elif num < 20:
        return PERSIAN_NUMBERS[num]
    elif num < 100:
        tens = (num // 10) * 10
        ones = num % 10
        if ones == 0:
            return PERSIAN_NUMBERS[tens]
        return PERSIAN_NUMBERS[tens] + ' و ' + PERSIAN_DIGITS[str(ones)]
    else:
        hundreds = (num // 100) * 100
        remainder = num % 100
        if remainder == 0:
            return PERSIAN_NUMBERS[hundreds]
        return PERSIAN_NUMBERS[hundreds] + ' و ' + convert_three_digit(remainder)


def convert_number_to_text(num_str: str, phone_mode: bool = False) -> str:
    """Convert a digit string (Persian, Arabic-Indic or ASCII digits, with
    optional thousands separators) to Persian words.

    With phone_mode=True the digits are read out one by one. If the input
    cannot be parsed it is returned unchanged.
    """
    try:
        # Strip thousands separators: ASCII comma, Arabic comma '،' (which
        # the number regex in synthesis.py accepts but the original code
        # forgot to strip, making such numbers fail to int()), and the
        # Arabic thousands separator '٬'.
        num_str = num_str.replace(',', '').replace('،', '').replace('٬', '').replace(' ', '')

        # Normalize Persian AND Arabic-Indic digits to ASCII so that both
        # the PERSIAN_DIGITS lookups (phone_mode) and int() work uniformly.
        # (The original translated only the Persian forms, so phone_mode
        # raised KeyError on Arabic-Indic digits.)
        digit_map = str.maketrans('۰۱۲۳۴۵۶۷۸۹٠١٢٣٤٥٦٧٨٩', '0123456789' * 2)
        num_str = num_str.translate(digit_map)

        if phone_mode:
            return ' '.join(PERSIAN_DIGITS[d] for d in num_str if d.isdigit())

        num = int(num_str)

        if num == 0:
            return 'صفر'

        if num < 0:
            return 'منفی ' + convert_number_to_text(str(abs(num)))

        if num < 1000:
            return convert_three_digit(num)

        parts = []

        if num >= 1_000_000_000:
            billions = num // 1_000_000_000
            parts.append(convert_three_digit(billions) + ' میلیارد')
            num %= 1_000_000_000

        if num >= 1_000_000:
            millions = num // 1_000_000
            parts.append(convert_three_digit(millions) + ' میلیون')
            num %= 1_000_000

        if num >= 1000:
            thousands = num // 1000
            parts.append(convert_three_digit(thousands) + ' هزار')
            num %= 1000

        if num > 0:
            parts.append(convert_three_digit(num))

        return ' و '.join(parts)

    except (ValueError, KeyError):
        # Narrowed from a bare `except:`: unparseable input falls through
        # unchanged instead of swallowing arbitrary exceptions.
        return num_str