Spaces:

aeuhhh
/

SentenceMixsingGenerator

Runtime error

App Files Files Community

SentenceMixsingGenerator / app.py

aeuhhh

Upload 2 files

71fd59e verified 14 days ago

Raw

History Blame Contribute Delete

11.5 kB

	import os
	import re
	import sys
	import uuid
	import zipfile
	import shutil
	import random
	import nltk
	import gradio as gr
	from g2p_en import G2p
	from pydub import AudioSegment

	# Root folder that holds every voice pack, laid out as <category>/<character>/.
	# It is created eagerly so later directory scans never hit a missing path.
	BASE_CHARACTERS_DIR = os.path.join("assets", "characters")
	os.makedirs(BASE_CHARACTERS_DIR, exist_ok=True)

	# Fetch the NLTK part-of-speech tagger resources once, at import time,
	# instead of on the first synthesis request.
	nltk.download('averaged_perceptron_tagger')
	nltk.download('averaged_perceptron_tagger_eng', quiet=True)

	class TextToSpeech:
	    """Sentence-mixing synthesizer that stitches WAV clips from one character folder.

	    Each word is first looked up as a whole-word recording in
	    ``<character_folder>/words/<WORD>.wav``.  When no usable recording exists,
	    the word is converted to phoneme symbols with ``G2p`` and every phoneme is
	    looked up as ``<character_folder>/<PHONEME>.wav``.  Numbered variants such
	    as ``AA_2.wav`` are picked at random.
	    """

	    # Substitutes used when a phoneme has no recording of its own: the listed
	    # phonemes are looked up instead and joined in order.
	    PHONEME_MAPPING = {
	        'AW': ['AE', 'OW'], 'DH': ['D'], 'EY': ['EH', 'IY'], 'JH': ['CH'],
	        'SH': ['CH'], 'TH': ['D'], 'ZH': ['CH'], 'AE': ['AA'],
	        'AO': ['AA', 'OW'], 'ER': ['AA'], 'IH': ['IY'],
	        'OY': ['OW', 'Y', 'IY'], 'UH': ['UW'], 'AH': ['AA']
	    }

	    # A token is either a word (word characters and apostrophes) or a single
	    # pause mark.  The "|" must be an unescaped alternation; an escaped "\|"
	    # would only match a literal pipe and yield no tokens for normal text.
	    _TOKEN_RE = re.compile(r"[\w']+|[.,!?;]")
	    _LONG_PAUSE_MARKS = (".", "!", "?")
	    _SHORT_PAUSE_MARKS = (",", ";")

	    def __init__(self, character_folder):
	        """Bind the synthesizer to ``character_folder`` and set timing/format defaults."""
	        self.character_folder = character_folder
	        self.g2p = G2p()
	        self.word_pause_ms = 1  # 0.001 seconds -> 1 ms
	        self.fade_duration_ms = 10  # 0.010 seconds -> 10 ms

	        # Target specs every loaded clip is converted to
	        self.target_channels = 1
	        self.target_rate = 44100

	    def _pick_random_variant(self, base_path):
	        """Return a random existing variant of ``base_path``, or ``None``.

	        Variants are files in the same directory named ``<name>.wav`` or
	        ``<name>_<digits>.wav`` (matched case-insensitively).  ``None`` is
	        returned when the directory does not exist or nothing matches.
	        """
	        directory = os.path.dirname(base_path)
	        base_name = os.path.splitext(os.path.basename(base_path))[0]
	        if not os.path.isdir(directory):
	            return None
	        pattern = re.compile(rf"^{re.escape(base_name)}(_\d+)?\.wav$", re.IGNORECASE)
	        candidates = [os.path.join(directory, f) for f in os.listdir(directory) if pattern.match(f)]
	        return random.choice(candidates) if candidates else None

	    def _normalize_audio(self, filepath):
	        """Load a WAV file converted to the target channel count and sample rate.

	        Returns ``None`` when the file cannot be loaded or converted, so that a
	        single broken clip does not abort the whole synthesis.
	        """
	        try:
	            audio = AudioSegment.from_wav(filepath)
	            if audio.channels != self.target_channels:
	                audio = audio.set_channels(self.target_channels)
	            if audio.frame_rate != self.target_rate:
	                audio = audio.set_frame_rate(self.target_rate)
	            return audio
	        except Exception:
	            # Deliberate best-effort: callers treat None as "clip unavailable".
	            return None

	    def _get_phoneme_data(self, phoneme):
	        """Return the audio for ``phoneme``, or ``None`` when nothing is available.

	        ``"AH0"`` is replaced by a random choice of ``"AA"`` / ``"AH"``.  A
	        phoneme without its own recording is built from the substitutes in
	        ``PHONEME_MAPPING``, cross-faded together.
	        """
	        if phoneme == "AH0":
	            chosen_fallback = random.choice(["AA", "AH"])
	            return self._get_phoneme_data(chosen_fallback)

	        base = os.path.join(self.character_folder, f"{phoneme}.wav")
	        path = self._pick_random_variant(base)
	        if path:
	            return self._normalize_audio(path)

	        if phoneme not in self.PHONEME_MAPPING:
	            return None

	        combined_audio = None
	        for sub_p in self.PHONEME_MAPPING[phoneme]:
	            sub_audio = self._get_phoneme_data(sub_p)
	            if not sub_audio:
	                continue
	            if combined_audio:
	                # The crossfade may not exceed the length of either side.
	                fade = min(self.fade_duration_ms, len(combined_audio), len(sub_audio))
	                combined_audio = combined_audio.append(sub_audio, crossfade=fade)
	            else:
	                combined_audio = sub_audio
	        return combined_audio

	    @classmethod
	    def _tokenize(cls, text):
	        """Split ``text`` into word tokens and single pause-mark tokens, in order."""
	        return cls._TOKEN_RE.findall(text)

	    def _word_segments(self, token):
	        """Return the list of non-pause audio segments that speak ``token``.

	        A whole-word recording is preferred.  If none exists, or the one chosen
	        cannot be loaded, the word is assembled from phoneme clips instead.
	        """
	        word_wav = self._pick_random_variant(
	            os.path.join(self.character_folder, "words", f"{token.upper()}.wav")
	        )
	        if word_wav:
	            norm_word = self._normalize_audio(word_wav)
	            if norm_word:
	                return [{"audio": norm_word, "is_pause": False}]
	            # Unreadable word clip: fall through to phonemes rather than
	            # silently dropping the word.

	        phonemes = self.g2p(token)
	        # Strip stress digits, except on "AH0", which gets special handling in
	        # _get_phoneme_data.
	        valid_ps = [re.sub(r'\d+', '', p) if p != "AH0" else p for p in phonemes]
	        valid_ps = [p for p in valid_ps if re.match(r'[A-Z]+[0-9]*', p)]

	        if valid_ps and valid_ps[-1] in ["AH", "AE", "AH0"]:
	            valid_ps[-1] = random.choice(["AA", "AH"])

	        segments = []
	        for p_clean in valid_ps:
	            seg_audio = self._get_phoneme_data(p_clean)
	            if seg_audio:
	                segments.append({"audio": seg_audio, "is_pause": False})
	        return segments

	    def generate_audio_data(self, str_input):
	        """Synthesize ``str_input`` and return the resulting audio segment.

	        ``.``, ``!`` and ``?`` insert 400 ms of silence; ``,`` and ``;`` insert
	        220 ms.  Every word is followed by ``word_pause_ms`` of silence.
	        Adjacent non-pause segments are cross-faded.  Input without any token
	        yields 100 ms of silence.
	        """
	        raw_segments = []
	        for token in self._tokenize(str_input):
	            if token in self._LONG_PAUSE_MARKS or token in self._SHORT_PAUSE_MARKS:
	                dur_ms = 400 if token in self._LONG_PAUSE_MARKS else 220
	                raw_segments.append({"audio": AudioSegment.silent(duration=dur_ms), "is_pause": True})
	                continue

	            raw_segments.extend(self._word_segments(token))
	            raw_segments.append({"audio": AudioSegment.silent(duration=self.word_pause_ms), "is_pause": True})

	        if not raw_segments:
	            return AudioSegment.silent(duration=100)

	        final_audio = None
	        prev_is_pause = True
	        for segment in raw_segments:
	            curr_audio = segment["audio"]
	            if final_audio is None:
	                final_audio = curr_audio
	            elif prev_is_pause or segment["is_pause"]:
	                # Never crossfade into or out of silence.
	                final_audio += curr_audio
	            else:
	                fade_size = min(self.fade_duration_ms, len(final_audio), len(curr_audio))
	                if fade_size > 0:
	                    final_audio = final_audio.append(curr_audio, crossfade=fade_size)
	                else:
	                    final_audio += curr_audio
	            prev_is_pause = segment["is_pause"]

	        return final_audio

	    def render_to_file(self, str_input, output_path):
	        """Synthesize ``str_input`` and write it to ``output_path`` as a WAV file."""
	        audio_segment = self.generate_audio_data(str_input)
	        audio_segment.export(output_path, format="wav")


	# --- Helper functions for Managing Categories & ZIP uploads ---

	def get_hierarchy():
	    """Map each category folder to the sorted names of its character sub-folders.

	    Only directories are considered at both levels, and categories without any
	    character folder are left out.  An empty dict is returned when the base
	    characters directory does not exist.
	    """
	    if not os.path.isdir(BASE_CHARACTERS_DIR):
	        return {}

	    mapping = {}
	    for category in sorted(os.listdir(BASE_CHARACTERS_DIR)):
	        category_dir = os.path.join(BASE_CHARACTERS_DIR, category)
	        if not os.path.isdir(category_dir):
	            continue
	        characters = sorted(
	            name
	            for name in os.listdir(category_dir)
	            if os.path.isdir(os.path.join(category_dir, name))
	        )
	        if characters:
	            mapping[category] = characters
	    return mapping

	def register_character_folders(extract_root, dest_base, root_char_name="Uploaded"):
	    """Copy every character folder found under ``extract_root`` into ``dest_base``.

	    A character folder is a directory that directly contains at least one
	    ``.wav`` file.  It is copied, with everything beneath it, to
	    ``dest_base/<category>/<character>``, replacing an existing folder of the
	    same name.  The category is the name of the folder's parent, or
	    ``"Uploaded"`` when the folder sits directly in ``extract_root``.  WAV files
	    lying in ``extract_root`` itself become the character ``root_char_name`` in
	    category ``"Uploaded"``.

	    Returns the list of ``(category, character)`` pairs that were registered.
	    """
	    extract_root = os.path.normpath(extract_root)
	    registered = []
	    for root, dirs, files in os.walk(extract_root):
	        # Archives made on macOS carry a "__MACOSX" tree of "._*" metadata
	        # files that mirror the real names; it holds no audio.
	        dirs[:] = [d for d in dirs if d != "__MACOSX"]

	        if not any(f.lower().endswith('.wav') for f in files):
	            continue

	        # A character folder owns everything beneath it (e.g. its "words"
	        # sub-folder), so do not descend and register those as characters too.
	        dirs[:] = []

	        root = os.path.normpath(root)
	        parent = os.path.dirname(root)
	        if root == extract_root:
	            category_name, char_name = "Uploaded", root_char_name
	        elif parent == extract_root:
	            category_name, char_name = "Uploaded", os.path.basename(root)
	        else:
	            category_name, char_name = os.path.basename(parent), os.path.basename(root)

	        dest_dir = os.path.join(dest_base, category_name, char_name)
	        os.makedirs(os.path.dirname(dest_dir), exist_ok=True)
	        if os.path.exists(dest_dir):
	            shutil.rmtree(dest_dir)
	        shutil.copytree(root, dest_dir)
	        registered.append((category_name, char_name))
	    return registered


	def handle_zip_upload(file_obj):
	    """Unpack a zipped voice pack into the characters directory.

	    ``file_obj`` is the uploaded file: either an object exposing the path as
	    ``.name`` or a plain path string.  Returns a 3-tuple of updates for the
	    category dropdown, the character dropdown and the status text.  Failures
	    are reported through the status text instead of being raised.
	    """
	    if file_obj is None:
	        return gr.update(), gr.update(), "No file uploaded."

	    zip_path = getattr(file_obj, "name", file_obj)
	    temp_extract = os.path.join("assets", f"temp_{uuid.uuid4().hex[:6]}")
	    try:
	        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
	            zip_ref.extractall(temp_extract)

	        # Name used for a pack whose WAV files sit at the archive root.
	        pack_name = os.path.splitext(os.path.basename(str(zip_path)))[0] or "Uploaded"
	        registered = register_character_folders(temp_extract, BASE_CHARACTERS_DIR, pack_name)
	        if not registered:
	            return gr.update(), gr.update(), "No .wav files were found in the uploaded archive."

	        # Refresh configuration selections
	        hierarchy = get_hierarchy()
	        cats = list(hierarchy.keys())
	        default_cat = cats[0] if cats else None
	        default_chars = hierarchy[default_cat] if default_cat else []

	        return (
	            gr.update(choices=cats, value=default_cat),
	            gr.update(choices=default_chars, value=default_chars[0] if default_chars else None),
	            "Voice pack uploaded and cataloged successfully!"
	        )
	    except Exception as e:
	        return gr.update(), gr.update(), f"Error processing file: {str(e)}"
	    finally:
	        # Always remove the scratch directory, including after a failed or
	        # partial extraction.
	        shutil.rmtree(temp_extract, ignore_errors=True)

	def update_characters(category):
	    """Return a dropdown update listing the characters of ``category``.

	    The first character becomes the selected value; an unknown or empty
	    category produces an empty choice list with no selection.
	    """
	    available = get_hierarchy().get(category, [])
	    selected = available[0] if available else None
	    return gr.update(choices=available, value=selected)

	def update_profile_preview(category, character):
	    """Return the path of the character's ``profile.png``, or ``None``.

	    ``None`` is returned when either argument is empty or the image does not
	    exist on disk.
	    """
	    if category and character:
	        candidate = os.path.join(BASE_CHARACTERS_DIR, category, character, "profile.png")
	        return candidate if os.path.exists(candidate) else None
	    return None

	def synthesize(category, character, text):
	    """Render ``text`` in the selected character's voice and return the WAV path.

	    The file is written to the current working directory under a random
	    ``output_<hex>.wav`` name.

	    Raises:
	        gr.Error: if no category/character is selected, the text is blank, or
	            the selected character folder no longer exists on disk.
	    """
	    if not category or not character:
	        raise gr.Error("Please ensure a valid Category and Character are active.")
	    # Treat a missing (None) value like an empty string instead of crashing
	    # on ``None.strip()``.
	    if not (text or "").strip():
	        raise gr.Error("Text field cannot be left blank.")

	    char_path = os.path.join(BASE_CHARACTERS_DIR, category, character)
	    # Without this check a stale selection would produce silence-only audio,
	    # because every clip lookup in a missing folder simply finds nothing.
	    if not os.path.isdir(char_path):
	        raise gr.Error(f"Character folder not found: {category}/{character}")

	    tts = TextToSpeech(char_path)

	    out_filename = f"output_{uuid.uuid4().hex[:8]}.wav"
	    tts.render_to_file(text, out_filename)
	    return out_filename


	# --- Gradio UI Block Setup ---

	# Snapshot of the voice packs present at start-up; used for the initial
	# dropdown contents and the initial profile picture.
	initial_hierarchy = get_hierarchy()
	initial_cats = list(initial_hierarchy.keys())
	initial_chars = initial_hierarchy[initial_cats[0]] if initial_cats else []

	with gr.Blocks(theme=gr.themes.Soft(primary_hue="amber", neutral_hue="slate")) as demo:
	    gr.Markdown("# 🎙️ Sentence Mixing TTS Generator")
	    gr.Markdown("An elegant web interface for sentence-mixing speech generation. Upload voice line assets or choose a character configuration to begin.")

	    with gr.Row():
	        # Left column: character picker with profile picture
	        with gr.Column(scale=1):
	            # NOTE: gr.Image takes no ``circle`` keyword; passing one raises
	            # TypeError while the module is imported.  Profile images are
	            # expected to be supplied already round-cropped.
	            profile_preview = gr.Image(
	                value=update_profile_preview(initial_cats[0], initial_chars[0]) if initial_chars else None,
	                label="Character Profile",
	                height=220,
	                width=220,
	                interactive=False,
	            )

	            category_drop = gr.Dropdown(choices=initial_cats, value=initial_cats[0] if initial_cats else None, label="Voice Category")
	            character_drop = gr.Dropdown(choices=initial_chars, value=initial_chars[0] if initial_chars else None, label="Character")

	            # Changing the category repopulates the characters; changing the
	            # character refreshes the profile picture.
	            category_drop.change(update_characters, inputs=category_drop, outputs=character_drop)
	            character_drop.change(update_profile_preview, inputs=[category_drop, character_drop], outputs=profile_preview)

	        # Right column: text input and synthesized result
	        with gr.Column(scale=2):
	            input_text = gr.Textbox(label="Text to Synthesize", lines=6, placeholder="Type your text sentence here...")
	            submit_btn = gr.Button("📢 Speak / Generate", variant="primary")
	            audio_output = gr.Audio(label="Synthesized Audio Output", type="filepath")

	            submit_btn.click(synthesize, inputs=[category_drop, character_drop, input_text], outputs=audio_output)

	    with gr.Accordion("⚙️ Upload New Voice Assets (.zip)", open=False):
	        gr.Markdown("""
	        ### Expected `.zip` Internal Structure
	        You can pack folders into your zip file. For example:
	        * `MyCharacter/AA.wav`, `MyCharacter/B.wav`, etc.
	        * `MyCharacter/words/HELLO.wav` (Optional)
	        * `MyCharacter/profile.png` (Optional round-cropped display icon)
	        """)
	        zip_uploader = gr.File(label="Choose Voice Zip File", file_types=[".zip"])
	        upload_status = gr.Markdown(value="Waiting for file upload...")
	        upload_btn = gr.Button("📦 Unpack & Register Voice Pack")

	        # The handler returns updates for both dropdowns plus a status message.
	        upload_btn.click(
	            handle_zip_upload,
	            inputs=zip_uploader,
	            outputs=[category_drop, character_drop, upload_status]
	        )

	# Start the web server only when executed as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()