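# NOTE: the next lines are residue from the web page this file was copied from,
# not part of the program; they are kept as comments so the module can parse.
# aeuhhh's picture
# Upload 2 files
# 71fd59e verified
# Raw
# History Blame Contribute Delete
# 11.5 kB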
import os
import re
import sys
import uuid
import zipfile
import shutil
import random
import nltk
import gradio as gr
from g2p_en import G2p
from pydub import AudioSegment
# Pre-download required NLTK data at import time.
# NOTE(review): presumably these tagger resources are what g2p_en needs — confirm.
# Only the second call passes quiet=True, so the first one logs its progress.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
# Root folder of the voice packs. get_hierarchy() reads its first-level
# sub-directories as categories and their sub-directories as characters.
BASE_CHARACTERS_DIR = os.path.join("assets", "characters")
# Create the folder on import so later directory scans do not need it to pre-exist.
os.makedirs(BASE_CHARACTERS_DIR, exist_ok=True)
class TextToSpeech:
    """Build speech for one character by stitching pre-recorded WAV clips.

    Each word is rendered either from a whole-word recording found in
    ``<character_folder>/words/<WORD>.wav`` or, failing that, from one clip
    per phoneme found in ``<character_folder>/<PHONEME>.wav``.  Any clip may
    have numbered variants (``AA_1.wav``, ``AA_2.wav`` ...); one is picked at
    random each time.
    """

    # Substitutions tried when a phoneme has no recording of its own: the
    # listed phonemes are looked up instead and joined in order.
    PHONEME_MAPPING = {
        'AW': ['AE', 'OW'], 'DH': ['D'], 'EY': ['EH', 'IY'], 'JH': ['CH'],
        'SH': ['CH'], 'TH': ['D'], 'ZH': ['CH'], 'AE': ['AA'],
        'AO': ['AA', 'OW'], 'ER': ['AA'], 'IH': ['IY'],
        'OY': ['OW', 'Y', 'IY'], 'UH': ['UW'], 'AH': ['AA']
    }

    # Punctuation tokens that become silence; sentence enders pause longer.
    _PAUSE_TOKENS = (".", "!", "?", ",", ";")
    _SENTENCE_END_TOKENS = (".", "!", "?")

    def __init__(self, character_folder):
        """Bind the synthesizer to the folder holding one character's clips."""
        self.character_folder = character_folder
        self.g2p = G2p()
        self.word_pause_ms = 1  # 0.001 seconds -> 1 ms
        self.fade_duration_ms = 10  # 0.010 seconds -> 10 ms
        # Target specs
        self.target_channels = 1
        self.target_rate = 44100

    @staticmethod
    def _has_audio(segment):
        """Return True when *segment* is a loaded clip with non-zero length."""
        return segment is not None and len(segment) > 0

    def _pick_random_variant(self, base_path):
        """Return a random existing variant of *base_path*, or None.

        For ``dir/NAME.wav`` the candidates are ``NAME.wav`` and
        ``NAME_<digits>.wav`` inside ``dir``, matched case-insensitively.
        """
        directory = os.path.dirname(base_path)
        base_name = os.path.splitext(os.path.basename(base_path))[0]
        if not os.path.isdir(directory):
            return None
        pattern = re.compile(rf"^{re.escape(base_name)}(_\d+)?\.wav$", re.IGNORECASE)
        candidates = [
            os.path.join(directory, entry)
            for entry in os.listdir(directory)
            if pattern.match(entry)
        ]
        return random.choice(candidates) if candidates else None

    def _normalize_audio(self, filepath):
        """Load *filepath* and convert it to the target channel count and rate.

        Returns None when the clip cannot be loaded or converted, so callers
        can fall back to another source.  The failure is reported on stderr
        instead of being swallowed silently.
        """
        try:
            audio = AudioSegment.from_wav(filepath)
            if audio.channels != self.target_channels:
                audio = audio.set_channels(self.target_channels)
            if audio.frame_rate != self.target_rate:
                audio = audio.set_frame_rate(self.target_rate)
            return audio
        except Exception as exc:  # best effort: one bad clip must not abort synthesis
            print(f"[TextToSpeech] Skipping unreadable clip {filepath!r}: {exc}", file=sys.stderr)
            return None

    def _get_phoneme_data(self, phoneme):
        """Return the audio for *phoneme*, or None when nothing is available.

        ``AH0`` is rendered as a random choice between ``AA`` and ``AH``.
        Otherwise a recording of the phoneme itself is preferred; if there is
        none (or it cannot be loaded) the substitutes listed in
        PHONEME_MAPPING are resolved recursively and joined with a short
        crossfade.
        """
        if phoneme == "AH0":
            return self._get_phoneme_data(random.choice(["AA", "AH"]))

        clip_path = self._pick_random_variant(
            os.path.join(self.character_folder, f"{phoneme}.wav")
        )
        if clip_path:
            clip = self._normalize_audio(clip_path)
            if clip is not None:
                return clip
            # Unreadable recording: fall through to the substitution table.

        if phoneme not in self.PHONEME_MAPPING:
            return None

        combined_audio = None
        for substitute in self.PHONEME_MAPPING[phoneme]:
            sub_audio = self._get_phoneme_data(substitute)
            if not self._has_audio(sub_audio):
                continue
            if combined_audio is None:
                combined_audio = sub_audio
            else:
                # The crossfade may not exceed either clip's length.
                fade = min(self.fade_duration_ms, len(combined_audio), len(sub_audio))
                combined_audio = combined_audio.append(sub_audio, crossfade=fade)
        return combined_audio

    def _phonemes_for(self, token):
        """Return the cleaned phoneme symbols that ``self.g2p`` yields for *token*."""
        phonemes = self.g2p(token)
        # Drop the trailing digits from every symbol except AH0, which gets
        # its own handling in _get_phoneme_data.
        valid_ps = [p if p == "AH0" else re.sub(r'\d+', '', p) for p in phonemes]
        # Keep only entries that start with capital letters.
        valid_ps = [p for p in valid_ps if re.match(r'[A-Z]+[0-9]*', p)]
        if valid_ps and valid_ps[-1] in ("AH", "AE", "AH0"):
            valid_ps[-1] = random.choice(["AA", "AH"])
        return valid_ps

    def _word_clips(self, token):
        """Return the list of clips that make up the spoken word *token*.

        A whole-word recording wins when one exists and loads; otherwise the
        word is assembled from phoneme clips.  Phonemes with no usable audio
        are skipped.
        """
        word_wav = self._pick_random_variant(
            os.path.join(self.character_folder, "words", f"{token.upper()}.wav")
        )
        if word_wav:
            whole_word = self._normalize_audio(word_wav)
            if self._has_audio(whole_word):
                return [whole_word]
            # The recording exists but is unusable: synthesize from phonemes
            # instead of silently dropping the word.
        clips = (self._get_phoneme_data(p) for p in self._phonemes_for(token))
        return [clip for clip in clips if self._has_audio(clip)]

    def generate_audio_data(self, str_input):
        """Synthesize *str_input* and return the resulting audio segment.

        Words are followed by ``word_pause_ms`` of silence; ``. ! ?`` insert
        400 ms and ``, ;`` insert 220 ms.  Adjacent non-pause clips are joined
        with a crossfade of at most ``fade_duration_ms``.  Input that yields
        no tokens produces 100 ms of silence.
        """
        tokens = re.findall(r"[\w']+|[.,!?;]", str_input)
        raw_segments = []  # (audio, is_pause) pairs in playback order
        for token in tokens:
            if token in self._PAUSE_TOKENS:
                dur_ms = 400 if token in self._SENTENCE_END_TOKENS else 220
                raw_segments.append((AudioSegment.silent(duration=dur_ms), True))
                continue
            raw_segments.extend((clip, False) for clip in self._word_clips(token))
            raw_segments.append((AudioSegment.silent(duration=self.word_pause_ms), True))

        if not raw_segments:
            return AudioSegment.silent(duration=100)

        final_audio = None
        prev_is_pause = True
        for curr_audio, is_pause in raw_segments:
            if final_audio is None:
                final_audio = curr_audio
            else:
                fade_size = 0
                # Apply crossfade only if neither side is a pause segment
                if not prev_is_pause and not is_pause:
                    fade_size = min(self.fade_duration_ms, len(final_audio), len(curr_audio))
                if fade_size > 0:
                    final_audio = final_audio.append(curr_audio, crossfade=fade_size)
                else:
                    final_audio += curr_audio
            prev_is_pause = is_pause
        return final_audio

    def render_to_file(self, str_input, output_path):
        """Synthesize *str_input* and write it to *output_path* as a WAV file."""
        audio_segment = self.generate_audio_data(str_input)
        audio_segment.export(output_path, format="wav")
# --- Helper functions for Managing Categories & ZIP uploads ---
def get_hierarchy():
    """Map each category folder to the sorted names of its character folders.

    Categories are the sub-directories of BASE_CHARACTERS_DIR, visited in
    sorted order; a category with no character sub-directory is left out.
    Returns an empty dict when the base directory does not exist.
    """
    mapping = {}
    if not os.path.isdir(BASE_CHARACTERS_DIR):
        return mapping
    for category in sorted(os.listdir(BASE_CHARACTERS_DIR)):
        category_dir = os.path.join(BASE_CHARACTERS_DIR, category)
        if not os.path.isdir(category_dir):
            continue
        members = sorted(
            entry
            for entry in os.listdir(category_dir)
            if os.path.isdir(os.path.join(category_dir, entry))
        )
        if members:
            mapping[category] = members
    return mapping
def handle_zip_upload(file_obj):
    """Unpack an uploaded voice-pack zip into BASE_CHARACTERS_DIR.

    Every extracted directory that directly contains ``.wav`` files is
    registered as a character folder (replacing an existing one of the same
    name).  Its parent directory names the category, or "Uploaded" when the
    character sits at the top of the archive.  Wavs placed directly at the
    archive root are registered under "Uploaded" using the zip's file name.

    Args:
        file_obj: the uploaded file, either an object exposing ``.name`` or a
            plain filesystem path; None when nothing was uploaded.

    Returns:
        A 3-tuple of (category dropdown update, character dropdown update,
        status message).  Failures are reported through the status message
        rather than raised.
    """
    if file_obj is None:
        return gr.update(), gr.update(), "No file uploaded."

    # Accept both a tempfile-like object and a plain path string.
    if isinstance(file_obj, (str, os.PathLike)):
        zip_path = os.fspath(file_obj)
    else:
        zip_path = file_obj.name
    temp_extract = os.path.join("assets", f"temp_{uuid.uuid4().hex[:6]}")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_extract)

        registered = 0
        # Figure out internal structure and migrate valid directories
        for root, dirs, files in os.walk(temp_extract):
            # Only directories holding wav files directly are character folders
            if not any(f.lower().endswith('.wav') for f in files):
                continue
            # A character's own "words" sub-folder is copied along with it and
            # must not be registered as a separate character, so do not descend.
            dirs[:] = [d for d in dirs if d.lower() != "words"]

            if root == temp_extract:
                # Flat archive: name the character after the zip, not the temp dir.
                char_name = os.path.splitext(os.path.basename(zip_path))[0] or "Uploaded"
                category_name = "Uploaded"
            else:
                char_name = os.path.basename(root)
                parent_name = os.path.basename(os.path.dirname(root))
                # A character directly under the extraction root has no category of its own
                category_name = parent_name if parent_name != os.path.basename(temp_extract) else "Uploaded"

            dest_dir = os.path.join(BASE_CHARACTERS_DIR, category_name, char_name)
            os.makedirs(os.path.dirname(dest_dir), exist_ok=True)
            if os.path.exists(dest_dir):
                shutil.rmtree(dest_dir)
            shutil.copytree(root, dest_dir)
            registered += 1

        if not registered:
            return gr.update(), gr.update(), "No .wav files found in the uploaded archive."

        # Refresh configuration selections
        hierarchy = get_hierarchy()
        cats = list(hierarchy.keys())
        default_cat = cats[0] if cats else None
        default_chars = hierarchy[default_cat] if default_cat else []
        return (
            gr.update(choices=cats, value=default_cat),
            gr.update(choices=default_chars, value=default_chars[0] if default_chars else None),
            "Voice pack uploaded and cataloged successfully!"
        )
    except Exception as e:
        return gr.update(), gr.update(), f"Error processing file: {str(e)}"
    finally:
        # Always remove the scratch directory, including after a failure.
        shutil.rmtree(temp_extract, ignore_errors=True)
def update_characters(category):
    """Return a dropdown update listing the characters of *category*.

    The first character becomes the selected value; an unknown or empty
    category yields no choices and a value of None.
    """
    available = get_hierarchy().get(category, [])
    selected = available[0] if available else None
    return gr.update(choices=available, value=selected)
def update_profile_preview(category, character):
    """Return the path of the character's ``profile.png``, or None.

    None is returned when either selection is empty or the image file does
    not exist under BASE_CHARACTERS_DIR/<category>/<character>/.
    """
    if not (category and character):
        return None
    candidate = os.path.join(BASE_CHARACTERS_DIR, category, character, "profile.png")
    return candidate if os.path.exists(candidate) else None
def synthesize(category, character, text):
    """Render *text* with the selected character's voice and return the WAV path.

    Args:
        category: name of the category folder under BASE_CHARACTERS_DIR.
        character: name of the character folder inside that category.
        text: the sentence(s) to speak.

    Returns:
        Path of the generated WAV file, written to the system temp directory.

    Raises:
        gr.Error: when no valid category/character is selected, when the
            character folder does not exist, or when the text is blank.
    """
    import tempfile  # local import: only needed to locate the output directory

    if not category or not character:
        raise gr.Error("Please ensure a valid Category and Character are active.")
    # The text may arrive as None as well as an empty string.
    if not text or not text.strip():
        raise gr.Error("Text field cannot be left blank.")
    char_path = os.path.join(BASE_CHARACTERS_DIR, category, character)
    # A missing folder would otherwise produce near-silent output with no explanation.
    if not os.path.isdir(char_path):
        raise gr.Error("Please ensure a valid Category and Character are active.")
    tts = TextToSpeech(char_path)
    # Write to the temp directory so generated clips do not pile up next to the app.
    out_filename = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4().hex[:8]}.wav")
    tts.render_to_file(text, out_filename)
    return out_filename
# --- Gradio UI Block Setup ---
# Snapshot of the voice packs on disk when the module loads; it seeds the
# dropdown choices and their default selections below.
initial_hierarchy = get_hierarchy()
initial_cats = list(initial_hierarchy.keys())
# Characters of the first category, or an empty list when no category exists.
initial_chars = initial_hierarchy[initial_cats[0]] if initial_cats else []
# NOTE(review): the nesting below was reconstructed from a flattened listing — confirm the intended layout.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="amber", neutral_hue="slate")) as demo:
    gr.Markdown("# 🎙️ Sentence Mixing TTS Generator")
    gr.Markdown("An elegant web interface for sentence-mixing speech generation. Upload voice line assets or choose a character configuration to begin.")
    with gr.Row():
        # Left column: character picture and the category/character selectors.
        with gr.Column(scale=1):
            profile_preview = gr.Image(
                value=update_profile_preview(initial_cats[0], initial_chars[0]) if initial_chars else None,
                label="Character Profile",
                height=220,
                width=220,
                interactive=False,
                # NOTE(review): confirm the installed Gradio version accepts the `circle` keyword.
                circle=True
            )
            category_drop = gr.Dropdown(choices=initial_cats, value=initial_cats[0] if initial_cats else None, label="Voice Category")
            character_drop = gr.Dropdown(choices=initial_chars, value=initial_chars[0] if initial_chars else None, label="Character")
            # Changing the category repopulates the character list ...
            category_drop.change(update_characters, inputs=category_drop, outputs=character_drop)
            # ... and changing the character refreshes the profile picture.
            character_drop.change(update_profile_preview, inputs=[category_drop, character_drop], outputs=profile_preview)
        # Right column: text input, trigger button and the generated audio.
        with gr.Column(scale=2):
            input_text = gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Type your text sentence here...")
            submit_btn = gr.Button("📢 Speak / Generate", variant="primary")
            audio_output = gr.Audio(label="Synthesized Audio Output", type="filepath")
            # synthesize() returns a file path, matching type="filepath" above.
            submit_btn.click(synthesize, inputs=[category_drop, character_drop, input_text], outputs=audio_output)
    # Collapsed section for uploading additional voice packs as a zip archive.
    with gr.Accordion("⚙️ Upload New Voice Assets (.zip)", open=False):
        gr.Markdown("""
### Expected `.zip` Internal Structure
You can pack folders into your zip file. For example:
* `MyCharacter/AA.wav`, `MyCharacter/B.wav`, etc.
* `MyCharacter/words/HELLO.wav` (Optional)
* `MyCharacter/profile.png` (Optional round-cropped display icon)
""")
        zip_uploader = gr.File(label="Choose Voice Zip File", file_types=[".zip"])
        upload_status = gr.Markdown(value="Waiting for file upload...")
        upload_btn = gr.Button("📦 Unpack & Register Voice Pack")
        # handle_zip_upload() returns updates for both dropdowns plus a status message.
        upload_btn.click(
            handle_zip_upload,
            inputs=zip_uploader,
            outputs=[category_drop, character_drop, upload_status]
        )
# Start the web UI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()