"""Sentence-mixing text-to-speech with a Gradio front end.

Speech is assembled by concatenating per-word or per-phoneme WAV recordings
stored under ``assets/characters/<category>/<character>/``.
"""

import logging
import os
import random
import re
import shutil
import sys
import uuid
import zipfile

import gradio as gr
import nltk
from g2p_en import G2p
from pydub import AudioSegment

logger = logging.getLogger(__name__)

# Pre-download required NLTK data
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

BASE_CHARACTERS_DIR = os.path.join("assets", "characters")
os.makedirs(BASE_CHARACTERS_DIR, exist_ok=True)


class TextToSpeech:
    """Build speech for one character folder out of word/phoneme WAV clips."""

    # Substitutions tried when a character has no recording for a phoneme:
    # the listed phonemes are looked up in order and joined together.
    PHONEME_MAPPING = {
        'AW': ['AE', 'OW'],
        'DH': ['D'],
        'EY': ['EH', 'IY'],
        'JH': ['CH'],
        'SH': ['CH'],
        'TH': ['D'],
        'ZH': ['CH'],
        'AE': ['AA'],
        'AO': ['AA', 'OW'],
        'ER': ['AA'],
        'IH': ['IY'],
        'OY': ['OW', 'Y', 'IY'],
        'UH': ['UW'],
        'AH': ['AA']
    }

    def __init__(self, character_folder):
        """Remember the character folder and set up G2P and audio settings.

        Args:
            character_folder: Directory holding ``<PHONEME>.wav`` files and an
                optional ``words/`` subdirectory of whole-word recordings.
        """
        self.character_folder = character_folder
        self.g2p = G2p()
        self.word_pause_ms = 1  # 0.001 seconds -> 1 ms
        self.fade_duration_ms = 10  # 0.010 seconds -> 10 ms

        # Target specs
        self.target_channels = 1
        self.target_rate = 44100

    def _pick_random_variant(self, base_path):
        """Return a random recording matching ``base_path`` or a ``_N`` variant.

        For ``.../AA.wav`` this considers ``AA.wav``, ``AA_1.wav``,
        ``AA_2.wav`` and so on (case-insensitive) in the same directory.

        Returns:
            The chosen file path, or ``None`` when the directory does not
            exist or contains no matching file.
        """
        directory = os.path.dirname(base_path)
        base_name = os.path.splitext(os.path.basename(base_path))[0]
        if not os.path.isdir(directory):
            return None

        pattern = re.compile(rf"^{re.escape(base_name)}(_\d+)?\.wav$", re.IGNORECASE)
        candidates = [
            os.path.join(directory, f)
            for f in os.listdir(directory)
            if pattern.match(f)
        ]
        return random.choice(candidates) if candidates else None

    def _normalize_audio(self, filepath):
        """Load a WAV file and convert it to the target channels/frame rate.

        Returns:
            The loaded ``AudioSegment``, or ``None`` if loading or conversion
            raised. Failure is deliberately non-fatal so that one bad clip
            does not abort a whole sentence, but it is logged.
        """
        try:
            audio = AudioSegment.from_wav(filepath)
            if audio.channels > 1:
                audio = audio.set_channels(self.target_channels)
            if audio.frame_rate != self.target_rate:
                audio = audio.set_frame_rate(self.target_rate)
            return audio
        except Exception as exc:
            logger.warning("Could not load audio clip %s: %s", filepath, exc)
            return None

    def _get_phoneme_data(self, phoneme):
        """Return audio for ``phoneme``, using mapped substitutes if needed.

        ``"AH0"`` is resolved to a random pick of ``"AA"`` or ``"AH"``. If the
        character has no recording for the phoneme, the phonemes listed in
        ``PHONEME_MAPPING`` are resolved recursively and joined with a short
        crossfade.

        Returns:
            An ``AudioSegment``, or ``None`` when nothing usable was found.
        """
        if phoneme == "AH0":
            chosen_fallback = random.choice(["AA", "AH"])
            return self._get_phoneme_data(chosen_fallback)

        base = os.path.join(self.character_folder, f"{phoneme}.wav")
        path = self._pick_random_variant(base)
        if path:
            return self._normalize_audio(path)

        if phoneme in self.PHONEME_MAPPING:
            combined_audio = None
            for sub_p in self.PHONEME_MAPPING[phoneme]:
                sub_audio = self._get_phoneme_data(sub_p)
                if sub_audio:
                    if combined_audio:
                        # The crossfade can never exceed either clip's length.
                        combined_audio = combined_audio.append(
                            sub_audio,
                            crossfade=min(
                                self.fade_duration_ms,
                                len(combined_audio),
                                len(sub_audio),
                            ),
                        )
                    else:
                        combined_audio = sub_audio
            return combined_audio
        return None

    def _silence(self, duration_ms):
        """Return silence of ``duration_ms`` at the target frame rate."""
        return AudioSegment.silent(duration=duration_ms, frame_rate=self.target_rate)

    def generate_audio_data(self, str_input):
        """Synthesize ``str_input`` into a single ``AudioSegment``.

        Each word uses a whole-word recording from ``words/`` when one exists
        and loads; otherwise it is spelled out phoneme by phoneme.
        Punctuation becomes a pause (400 ms for ``. ! ?``, 220 ms for
        ``, ;``) and a short pause follows every word. Adjacent non-pause
        clips are crossfaded.

        Returns:
            The assembled audio, or 100 ms of silence when the input yields
            no segments.
        """
        tokens = re.findall(r"[\w']+|[.,!?;]", str_input)
        raw_segments = []

        for token in tokens:
            if token in [".", "!", "?", ",", ";"]:
                dur_ms = 400 if token in [".", "!", "?"] else 220
                raw_segments.append({"audio": self._silence(dur_ms), "is_pause": True})
                continue

            word_wav = self._pick_random_variant(
                os.path.join(self.character_folder, "words", f"{token.upper()}.wav")
            )
            # If the word recording is missing OR fails to load, fall back to
            # phonemes instead of silently dropping the word.
            norm_word = self._normalize_audio(word_wav) if word_wav else None
            if norm_word:
                raw_segments.append({"audio": norm_word, "is_pause": False})
            else:
                phonemes = self.g2p(token)
                # Strip stress digits, except on "AH0" which is handled
                # specially by _get_phoneme_data.
                valid_ps = [re.sub(r'\d+', '', p) if p != "AH0" else p for p in phonemes]
                valid_ps = [p for p in valid_ps if re.match(r'[A-Z]+[0-9]*', p)]

                if valid_ps and valid_ps[-1] in ["AH", "AE", "AH0"]:
                    valid_ps[-1] = random.choice(["AA", "AH"])

                for p_clean in valid_ps:
                    seg_audio = self._get_phoneme_data(p_clean)
                    if seg_audio:
                        raw_segments.append({"audio": seg_audio, "is_pause": False})

            raw_segments.append(
                {"audio": self._silence(self.word_pause_ms), "is_pause": True}
            )

        if not raw_segments:
            return self._silence(100)

        final_audio = None
        prev_is_pause = True
        for segment in raw_segments:
            curr_audio = segment["audio"]
            if final_audio is None:
                final_audio = curr_audio
            else:
                # Apply crossfade only if neither side is a pause segment.
                fade_size = 0
                if not prev_is_pause and not segment["is_pause"]:
                    fade_size = min(
                        self.fade_duration_ms, len(final_audio), len(curr_audio)
                    )
                if fade_size > 0:
                    final_audio = final_audio.append(curr_audio, crossfade=fade_size)
                else:
                    final_audio += curr_audio
            prev_is_pause = segment["is_pause"]

        return final_audio

    def render_to_file(self, str_input, output_path):
        """Synthesize ``str_input`` and export it to ``output_path`` as WAV."""
        audio_segment = self.generate_audio_data(str_input)
        audio_segment.export(output_path, format="wav")


# --- Helper functions for Managing Categories & ZIP uploads ---

def get_hierarchy():
    """Scans the assets directory and returns structural mapping.

    Returns:
        A dict mapping each category directory name to the sorted list of
        its character subdirectories. Categories with no character
        directories are omitted.
    """
    categories = {}
    if not os.path.isdir(BASE_CHARACTERS_DIR):
        return categories

    for cat in sorted(os.listdir(BASE_CHARACTERS_DIR)):
        cat_p = os.path.join(BASE_CHARACTERS_DIR, cat)
        if os.path.isdir(cat_p):
            chars = [c for c in os.listdir(cat_p) if os.path.isdir(os.path.join(cat_p, c))]
            if chars:
                categories[cat] = sorted(chars)
    return categories


def handle_zip_upload(file_obj):
    """Unpacks zipped voice lines into the expected directory schema.

    Any extracted directory that directly contains ``.wav`` files is copied
    (with its whole subtree) to ``<BASE_CHARACTERS_DIR>/<category>/<name>``,
    replacing an existing character of the same name. The category is the
    directory's parent inside the archive, or ``"Uploaded"`` when the
    directory sits at the archive's top level.

    Args:
        file_obj: The uploaded file as delivered by ``gr.File``; either an
            object with a ``.name`` path attribute or a plain path string.

    Returns:
        A 3-tuple of (category dropdown update, character dropdown update,
        status message).
    """
    if file_obj is None:
        return gr.update(), gr.update(), "No file uploaded."

    # Accept both a tempfile-like wrapper and a bare path string.
    zip_path = getattr(file_obj, "name", file_obj)
    temp_extract = os.path.join("assets", f"temp_{uuid.uuid4().hex[:6]}")

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_extract)

        # Figure out internal structure and migrate valid directories
        for root, dirs, files in os.walk(temp_extract):
            # If directory contains wav files directly, treat it as a character folder
            if not any(f.lower().endswith('.wav') for f in files):
                continue

            # The character's own "words" folder is copied along with it by
            # copytree; prune it so it is not registered as a separate
            # character named "words".
            dirs[:] = [d for d in dirs if d.lower() != "words"]

            if root == temp_extract:
                # WAVs at the archive root: there is no folder name to use,
                # so name the character after the zip file itself.
                zip_stem = os.path.splitext(os.path.basename(str(zip_path)))[0]
                char_name = zip_stem or "Character"
                category_name = "Uploaded"
            else:
                char_name = os.path.basename(root)
                parent_name = os.path.basename(os.path.dirname(root))
                # If parent folder is just the root temp extraction layout,
                # assign a generic Category
                category_name = (
                    parent_name
                    if parent_name != os.path.basename(temp_extract)
                    else "Uploaded"
                )

            dest_dir = os.path.join(BASE_CHARACTERS_DIR, category_name, char_name)
            os.makedirs(os.path.dirname(dest_dir), exist_ok=True)
            if os.path.exists(dest_dir):
                shutil.rmtree(dest_dir)
            shutil.copytree(root, dest_dir)

        # Refresh configuration selections
        hierarchy = get_hierarchy()
        cats = list(hierarchy.keys())
        default_cat = cats[0] if cats else None
        default_chars = hierarchy[default_cat] if default_cat else []

        return (
            gr.update(choices=cats, value=default_cat),
            gr.update(choices=default_chars, value=default_chars[0] if default_chars else None),
            "Voice pack uploaded and cataloged successfully!"
        )
    except Exception as e:
        return gr.update(), gr.update(), f"Error processing file: {str(e)}"
    finally:
        # Always remove the scratch directory, including after a failure.
        shutil.rmtree(temp_extract, ignore_errors=True)


def update_characters(category):
    """Return a dropdown update listing the characters of ``category``."""
    hierarchy = get_hierarchy()
    chars = hierarchy.get(category, [])
    return gr.update(choices=chars, value=chars[0] if chars else None)


def update_profile_preview(category, character):
    """Return the character's ``profile.png`` path, or ``None`` if absent."""
    if not category or not character:
        return None
    profile_path = os.path.join(BASE_CHARACTERS_DIR, category, character, "profile.png")
    if os.path.exists(profile_path):
        return profile_path
    return None


def synthesize(category, character, text):
    """Render ``text`` with the selected character and return the WAV path.

    Raises:
        gr.Error: If no category/character is selected or the text is blank.
    """
    if not category or not character:
        raise gr.Error("Please ensure a valid Category and Character are active.")
    # Guard against None as well as whitespace-only input.
    if not text or not text.strip():
        raise gr.Error("Text field cannot be left blank.")

    char_path = os.path.join(BASE_CHARACTERS_DIR, category, character)
    tts = TextToSpeech(char_path)

    out_filename = f"output_{uuid.uuid4().hex[:8]}.wav"
    tts.render_to_file(text, out_filename)
    return out_filename


# --- Gradio UI Block Setup ---
initial_hierarchy = get_hierarchy()
initial_cats = list(initial_hierarchy.keys())
initial_chars = initial_hierarchy[initial_cats[0]] if initial_cats else []

with gr.Blocks(theme=gr.themes.Soft(primary_hue="amber", neutral_hue="slate")) as demo:
    gr.Markdown("# 🎙️ Sentence Mixing TTS Generator")
    gr.Markdown("An elegant web interface for sentence-mixing speech generation. Upload voice line assets or choose a character configuration to begin.")

    with gr.Row():
        with gr.Column(scale=1):
            # NOTE(review): confirm `circle` is accepted by gr.Image in the
            # installed Gradio version.
            profile_preview = gr.Image(
                value=update_profile_preview(initial_cats[0], initial_chars[0]) if initial_chars else None,
                label="Character Profile",
                height=220,
                width=220,
                interactive=False,
                circle=True
            )
            category_drop = gr.Dropdown(choices=initial_cats, value=initial_cats[0] if initial_cats else None, label="Voice Category")
            character_drop = gr.Dropdown(choices=initial_chars, value=initial_chars[0] if initial_chars else None, label="Character")

            category_drop.change(update_characters, inputs=category_drop, outputs=character_drop)
            character_drop.change(update_profile_preview, inputs=[category_drop, character_drop], outputs=profile_preview)

        with gr.Column(scale=2):
            input_text = gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Type your text sentence here...")
            submit_btn = gr.Button("📢 Speak / Generate", variant="primary")
            audio_output = gr.Audio(label="Synthesized Audio Output", type="filepath")

            submit_btn.click(synthesize, inputs=[category_drop, character_drop, input_text], outputs=audio_output)

    with gr.Accordion("⚙️ Upload New Voice Assets (.zip)", open=False):
        gr.Markdown("""
        ### Expected `.zip` Internal Structure
        You can pack folders into your zip file. For example:
        * `MyCharacter/AA.wav`, `MyCharacter/B.wav`, etc.
        * `MyCharacter/words/HELLO.wav` (Optional)
        * `MyCharacter/profile.png` (Optional round-cropped display icon)
        """)
        zip_uploader = gr.File(label="Choose Voice Zip File", file_types=[".zip"])
        upload_status = gr.Markdown(value="Waiting for file upload...")
        upload_btn = gr.Button("📦 Unpack & Register Voice Pack")

        upload_btn.click(
            handle_zip_upload,
            inputs=zip_uploader,
            outputs=[category_drop, character_drop, upload_status]
        )

if __name__ == "__main__":
    demo.launch()