tts_PTBR

Running

App Files Files Community

tts_PTBR / app.py

elielsilva

Update app.py

c1758a5 verified 3 months ago

raw

history blame contribute delete

12.5 kB

	import os
	import re
	import uuid
	import subprocess
	import numpy as np
	import wave
	import gradio as gr
	from huggingface_hub import list_repo_files
	from kokoro import KPipeline
	from deep_translator import GoogleTranslator
	from pydub import AudioSegment
	from pydub.silence import split_on_silence

	# --- Configurações Iniciais ---

	# UI language name -> language code passed to GoogleTranslator(target=...)
	# by bulk_translate(). Both English variants share the same target code.
	language_map_local = {
	    "Brazilian Portuguese": "pt",
	    "American English": "en",
	    "British English": "en",
	    "Hindi": "hi",
	    "Spanish": "es",
	    "French": "fr",
	    "Italian": "it",
	    "Japanese": "ja",
	    "Mandarin Chinese": "zh-CN",
	}
	# Named pitch/speed combinations offered in the preset dropdown.
	# "Manual" carries None for both values; KOKORO_TTS_API() skips it so the
	# slider values are used as-is.
	VOICE_PRESETS = {
	    "Manual": {"pitch": None, "speed": None},
	    "Natural (Padrão)": {"pitch": 1.00, "speed": 1.00},
	    "Levemente Jovem": {"pitch": 1.06, "speed": 1.02},
	    "Jovem / ElevenLabs-like": {"pitch": 1.09, "speed": 1.05},
	    "Grave / Narrador": {"pitch": 0.92, "speed": 0.95},
	    "Muito Grave": {"pitch": 0.88, "speed": 0.92},
	    "Rápido Comercial": {"pitch": 1.03, "speed": 1.12},
	}


	# UI language name -> one-letter code. The same letter is used as the
	# KPipeline lang_code and as the voice-name prefix
	# (e.g. "Brazilian Portuguese" -> "p", matching voices such as "pf_dora").
	language_map = {
	    "Brazilian Portuguese": "p",
	    "American English": "a",
	    "British English": "b",
	    "Hindi": "h",
	    "Spanish": "e",
	    "French": "f",
	    "Italian": "i",
	    "Japanese": "j",
	    "Mandarin Chinese": "z",
	}

	# Mutable module state, rebound by update_pipeline().
	pipeline = None  # active KPipeline instance; None until first built
	last_used_language = "p"  # code update_pipeline() compares new requests against

	# Global list holding every loaded voice name (filled in by ui()).
	ALL_VOICES = []


	# --- Funções Auxiliares de Tradução e Texto ---

	def bulk_translate(text, target_language, chunk_size=500, MAX_ALLOWED_CHARACTERS=10000):
	    """Translate ``text`` into ``target_language`` in sentence-aligned chunks.

	    Args:
	        text: Source text.
	        target_language: UI language name; looked up in ``language_map_local``.
	        chunk_size: Soft upper bound on chunk length in characters. Sentences
	            are never split, so one long sentence can exceed it.
	        MAX_ALLOWED_CHARACTERS: Texts at least this long are not translated.

	    Returns:
	        The translated text. The original ``text`` is returned unchanged when
	        it is too long, the language is unknown, there is nothing to
	        translate, or the translator raises (a Gradio warning is shown).
	    """
	    if len(text) >= MAX_ALLOWED_CHARACTERS:
	        gr.Warning("[WARNING] Text too long — skipping translation.")
	        return text

	    lang_code = language_map_local.get(target_language)
	    if not lang_code:
	        return text

	    # Greedily pack whole sentences into chunks of roughly chunk_size chars.
	    chunks = []
	    current_chunk = ""
	    for sentence in re.split(r'(?<=[.!?])\s+', text):
	        if len(current_chunk) + len(sentence) <= chunk_size:
	            current_chunk += " " + sentence
	            continue
	        # Flush only non-empty chunks. When the very first sentence is longer
	        # than chunk_size the accumulator is still empty, and sending an
	        # empty string to the translator makes the whole translation fail.
	        finished = current_chunk.strip()
	        if finished:
	            chunks.append(finished)
	        current_chunk = sentence

	    tail = current_chunk.strip()
	    if tail:
	        chunks.append(tail)

	    if not chunks:
	        # Blank / whitespace-only input: nothing to translate.
	        return text

	    try:
	        # One translator instance is enough for every chunk.
	        translator = GoogleTranslator(target=lang_code)
	        translated_chunks = [translator.translate(chunk) for chunk in chunks]
	        return " ".join(translated_chunks).strip()
	    except Exception as e:
	        # Best effort: keep the app usable with the untranslated text.
	        gr.Warning(f"Translation failed: {e}")
	        return text

	def clean_text(text):
	    """Normalize raw input text before it is handed to the TTS pipeline.

	    En-dashes, hyphens, asterisks and hash signs become spaces; every
	    character that is not a word character, whitespace or one of
	    ``,.:;?!@'"()-`` is removed (this is what strips emoji); finally all
	    whitespace runs collapse to a single space.

	    Args:
	        text: Raw user text.

	    Returns:
	        The cleaned string with leading/trailing whitespace stripped.
	    """
	    # NOTE(review): the table used to contain an empty-string key.
	    # ``str.replace("", " ")`` inserts a space between every character
	    # ("abc" -> " a b c "), so that entry is dropped. If it stood for a
	    # character lost in transit, re-add that character explicitly.
	    replacements = {
	        "–": " ",
	        "-": " ",
	        "*": " ",
	        "#": " ",
	    }
	    for old, new in replacements.items():
	        text = text.replace(old, new)

	    emoji_pattern = re.compile(r'[^\w\s,.:;?!@\'"()-]', flags=re.UNICODE)
	    text = emoji_pattern.sub(r'', text)
	    return re.sub(r'\s+', ' ', text).strip()

	# --- Gerenciamento de Arquivos e Pipeline ---

	def create_audio_dir():
	    """Ensure ``kokoro_audio`` exists under the current working directory.

	    Returns:
	        The path of that directory.
	    """
	    target = os.path.join(os.getcwd(), "kokoro_audio")
	    os.makedirs(target, exist_ok=True)
	    return target

	# Output directory for generated audio, created at import time.
	temp_folder = create_audio_dir()

	def update_pipeline(Language):
	    """Make the global ``pipeline`` match the requested language.

	    The pipeline is rebuilt only when none exists yet or the language code
	    differs from ``last_used_language``. If building it fails, a Gradio
	    warning is shown and an American English ("a") pipeline is used instead.

	    Args:
	        Language: UI language name; unknown names map to "p".
	    """
	    global pipeline, last_used_language

	    requested = language_map.get(Language, "p")

	    # Nothing to do when a pipeline for this language is already loaded.
	    if pipeline is not None and requested == last_used_language:
	        return

	    try:
	        pipeline = KPipeline(lang_code=requested)
	        last_used_language = requested
	    except Exception:
	        gr.Warning(f"Error loading {Language}. Fallback to English.")
	        pipeline = KPipeline(lang_code="a")
	        last_used_language = "a"

	def get_voice_names(repo_id):
	    """Return the voice names found under ``voices/`` in a Hub repository.

	    Each name is the repo file path with ``voices/`` removed and the file
	    extension dropped.

	    Args:
	        repo_id: Repository id passed to ``list_repo_files``.

	    Returns:
	        A list of voice names, or a short hard-coded list of known voices
	        when the repository listing fails.
	    """
	    try:
	        return [
	            os.path.splitext(file.replace("voices/", ""))[0]
	            for file in list_repo_files(repo_id)
	            if file.startswith("voices/")
	        ]
	    except Exception:
	        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
	        # SystemExit still propagate.
	        # Manual fallback with a few known voices.
	        return ["pf_dora", "pm_alex","pm_santa", "af_bella", "af_sarah", "bf_isabella", "ff_siwis", "ef_dora", "jf_nezumi", "zf_xiaoni"]

	def filter_voices_by_language(language):
	    """Build the voice dropdown update for the selected language.

	    Voices are taken from the global ``ALL_VOICES`` and kept when their name
	    starts with the language's prefix from ``language_map`` ("a" when the
	    language is unknown). If no voice matches, every voice is offered.

	    Args:
	        language: UI language name.

	    Returns:
	        A ``gr.Dropdown`` carrying the choices and the preselected voice
	        (``None`` when no voice is available at all).
	    """
	    prefix = language_map.get(language, "a")

	    # e.g. prefix "p" keeps "pf_dora"
	    matching = [v for v in ALL_VOICES if v.startswith(prefix)]
	    choices = matching if matching else ALL_VOICES

	    # ALL_VOICES can be empty (e.g. before ui() has filled it); indexing
	    # [0] unconditionally would raise IndexError in that case.
	    selected = choices[0] if choices else None
	    return gr.Dropdown(choices=choices, value=selected)

	def tts_file_name(text, language):
	    """Build a unique ``.wav`` path inside ``temp_folder``.

	    The file stem is the first 20 characters of ``text`` reduced to
	    lowercase ASCII letters with spaces turned into underscores; when that
	    is empty the language name (spaces -> underscores) is used. An
	    8-character uppercase hex suffix from ``uuid4`` keeps names distinct.

	    Args:
	        text: Text being synthesized.
	        language: UI language name, used as the fallback stem.

	    Returns:
	        The output file path as a string.
	    """
	    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
	    slug = letters_only.lower().strip().replace(" ", "_")

	    if len(slug) > 0:
	        stem = slug[:20]
	    else:
	        stem = language.replace(" ", "_").strip()

	    suffix = uuid.uuid4().hex[:8].upper()
	    return f"{temp_folder}/{stem}_{suffix}.wav"

	# --- Processamento de Áudio ---

	def remove_silence_function(file_path,minimum_silence=50):
	    """Write a copy of a WAV file with its silent stretches cut out.

	    The audio is split with pydub's ``split_on_silence`` (min_silence_len=100,
	    silence_thresh=-45) and the resulting chunks are concatenated in order.

	    Args:
	        file_path: Path of the input WAV file.
	        minimum_silence: Forwarded as ``keep_silence`` to ``split_on_silence``.

	    Returns:
	        Path of the exported file: ``file_path`` with ".wav" replaced by
	        "_no_silence.wav".
	    """
	    audio_format = "wav"

	    # Load the file and cut it at the silent stretches.
	    source = AudioSegment.from_file(file_path, format=audio_format)
	    pieces = split_on_silence(
	        source,
	        min_silence_len=100,
	        silence_thresh=-45,
	        keep_silence=minimum_silence,
	    )

	    # Stitch the non-silent pieces back together, starting from empty audio.
	    stitched = sum(pieces, AudioSegment.empty())

	    output_path = file_path.replace(".wav", "_no_silence.wav")
	    stitched.export(output_path, format=audio_format)
	    return output_path

	def apply_ffmpeg_audio_fx(input_wav, pitch=1.09, loudnorm=True):
	    """Run the audio through ffmpeg's rubberband (and optional loudnorm) filter.

	    Args:
	        input_wav: Path of the input WAV file.
	        pitch: Value inserted into ``rubberband=pitch=...:formant=preserved``.
	        loudnorm: When true, ``loudnorm=I=-16:TP=-1.5:LRA=11`` is chained
	            after the pitch filter.

	    Returns:
	        Path of the processed file (``input_wav`` with ".wav" replaced by
	        "_fx.wav"), or ``input_wav`` itself if running ffmpeg fails; in
	        that case a Gradio warning is shown.
	    """
	    output_wav = input_wav.replace(".wav", "_fx.wav")

	    # Pitch shift with preserved formants always comes first.
	    filter_chain = [f"rubberband=pitch={pitch}:formant=preserved"]
	    if loudnorm:
	        filter_chain.append("loudnorm=I=-16:TP=-1.5:LRA=11")

	    command = ["ffmpeg", "-y", "-i", input_wav, "-af", ",".join(filter_chain), output_wav]

	    try:
	        subprocess.run(
	            command,
	            check=True,
	            stdout=subprocess.DEVNULL,
	            stderr=subprocess.DEVNULL,
	        )
	    except Exception as e:
	        gr.Warning(f"FFmpeg FX falhou: {e}")
	        return input_wav

	    return output_wav



	def generate_and_save_audio(text, Language, voice, speed, remove_silence, keep_silence_up_to,use_ffmpeg,pitch, use_loudnorm):
	    """Synthesize ``text`` to a WAV file and apply the optional post-processing.

	    The text is cleaned, the global pipeline is switched to ``Language``,
	    and every audio segment the pipeline yields is appended to one mono,
	    16-bit, 24000 Hz WAV file.

	    Args:
	        text: Text to speak.
	        Language: UI language name.
	        voice: Voice name passed to the pipeline.
	        speed: Speed value passed to the pipeline.
	        remove_silence: When true, run ``remove_silence_function`` on the result.
	        keep_silence_up_to: Seconds of silence to keep; converted to
	            milliseconds for ``remove_silence_function``.
	        use_ffmpeg: When true, run ``apply_ffmpeg_audio_fx`` on the result.
	        pitch: Pitch value for the ffmpeg step.
	        use_loudnorm: Whether the ffmpeg step also applies loudnorm.

	    Returns:
	        Path of the final audio file.
	    """
	    text = clean_text(text)
	    update_pipeline(Language)

	    # Generate audio
	    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
	    save_path = tts_file_name(text, Language)

	    with wave.open(save_path, 'wb') as wav_file:
	        wav_file.setnchannels(1)
	        wav_file.setsampwidth(2)
	        wav_file.setframerate(24000)

	        for result in generator:
	            audio = result.audio
	            if audio is None:
	                # Defensive: skip a segment that carries no audio instead of
	                # failing on ``None.numpy()``.
	                continue
	            samples = audio.numpy()
	            # Clip to [-1, 1] before scaling: without it, out-of-range
	            # samples overflow when cast to int16 and wrap around.
	            pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
	            wav_file.writeframes(pcm.tobytes())

	    final = save_path
	    if remove_silence:
	        keep_silence = int(keep_silence_up_to * 1000)
	        final = remove_silence_function(final, minimum_silence=keep_silence)

	    if use_ffmpeg:
	        final = apply_ffmpeg_audio_fx(
	            final,
	            pitch=pitch,
	            loudnorm=use_loudnorm,
	        )

	    return final

	# --- API Principal para a UI ---

	def KOKORO_TTS_API(text, Language, voice, speed, translate_text, remove_silence,use_ffmpeg, pitch, preset, use_loudnorm):
	    """Gradio event handler: turn the form values into a generated audio file.

	    Missing language/voice default to "Brazilian Portuguese"/"pf_dora".
	    The text is optionally translated, and any preset other than "Manual"
	    overrides the ``pitch`` and ``speed`` values coming from the sliders.

	    Returns:
	        The output path twice, one for each of the two outputs the handler
	        is wired to.
	    """
	    Language = Language or "Brazilian Portuguese"
	    voice = voice or "pf_dora"

	    if translate_text:
	        text = bulk_translate(text, Language, chunk_size=500)

	    # A named preset wins over the manual slider values.
	    if preset != "Manual" and preset in VOICE_PRESETS:
	        chosen = VOICE_PRESETS[preset]
	        preset_pitch = chosen["pitch"]
	        preset_speed = chosen["speed"]
	        if preset_pitch is not None:
	            pitch = preset_pitch
	        if preset_speed is not None:
	            speed = preset_speed

	    output_path = generate_and_save_audio(
	        text=text,
	        Language=Language,
	        voice=voice,
	        speed=speed,
	        remove_silence=remove_silence,
	        keep_silence_up_to=0.05,
	        use_ffmpeg=use_ffmpeg,
	        pitch=pitch,
	        use_loudnorm=use_loudnorm,
	    )
	    return output_path, output_path

	# --- Interface Gradio ---

	def toggle_autoplay(autoplay):
	    """Return an update for the output player with autoplay switched on/off.

	    The label repeats the one the player is created with in ``ui()``;
	    previously toggling autoplay relabelled the player 'Output Audio'.

	    Args:
	        autoplay: New state of the autoplay checkbox.

	    Returns:
	        A ``gr.Audio`` carrying the new autoplay setting.
	    """
	    return gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=autoplay)

	def ui():
	    """Build the Gradio Blocks interface and return it without launching it.

	    Side effect: fills the global ``ALL_VOICES`` with the voice names of
	    "hexgrad/Kokoro-82M".

	    Returns:
	        The ``gr.Blocks`` app.
	    """
	    global ALL_VOICES

	    # Load every voice a single time.
	    ALL_VOICES = get_voice_names("hexgrad/Kokoro-82M")

	    language_choices = list(language_map.keys())

	    # Initial selection: Brazilian Portuguese, preferring the "pf_dora" voice.
	    default_language = "Brazilian Portuguese"
	    default_prefix = language_map[default_language]
	    default_voices = [v for v in ALL_VOICES if v.startswith(default_prefix)]
	    if "pf_dora" in default_voices:
	        default_voice = "pf_dora"
	    elif default_voices:
	        default_voice = default_voices[0]
	    else:
	        default_voice = ALL_VOICES[0]

	    sample_sentence = "Olá! Hoje é um ótimo dia para estudar e aprender coisas novas."
	    example_rows = [
	        [sample_sentence, "Brazilian Portuguese", example_voice]
	        for example_voice in ("pf_dora", "pm_alex", "pm_santa")
	    ]

	    with gr.Blocks(title="Kokoro TTS") as demo:
	        gr.Markdown("## Kokoro TTS (Audio Only)")

	        with gr.Row():
	            # Left column: inputs and settings.
	            with gr.Column():
	                text_input = gr.Textbox(label='📝 Texto de Entrada', lines=3, placeholder="Digite seu texto aqui...")

	                with gr.Row():
	                    language_dd = gr.Dropdown(language_choices, label="🌍 Selecionar Idioma", value=default_language)

	                with gr.Row():
	                    # Starts out listing only the Portuguese voices.
	                    voice_dd = gr.Dropdown(default_voices, label="🎙️ Escolher Voz", value=default_voice)

	                with gr.Row():
	                    generate_btn = gr.Button('🚀 Gerar Áudio', variant='primary')

	                with gr.Accordion('🎛️ Configurações de Áudio', open=False):
	                    speed_slider = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='⚡️ Velocidade')
	                    translate_box = gr.Checkbox(value=False, label='🌐 Traduzir texto para o idioma selecionado')
	                    silence_box = gr.Checkbox(value=False, label='✂️ Remover Silêncio')
	                    ffmpeg_box = gr.Checkbox(value=False, label="🎵 Aplicar FFmpeg (Rubberband Pitch + Formant)")
	                    preset_dd = gr.Dropdown(
	                        choices=list(VOICE_PRESETS.keys()),
	                        value="Natural (Padrão)",
	                        label="🎙️ Preset de Voz (Estilo ElevenLabs)",
	                    )
	                    pitch_slider = gr.Slider(minimum=0.85, maximum=1.20, value=1.09, step=0.01, label="🎵 Pitch (Rubberband – Formant Preserved)")
	                    loudnorm_box = gr.Checkbox(
	                        value=True,
	                        label="🔊 Loudnorm (Volume profissional / ElevenLabs)",
	                    )

	            # Right column: results.
	            with gr.Column():
	                audio_out = gr.Audio(interactive=False, label='🔊 Áudio Gerado', autoplay=True)
	                file_out = gr.File(label='📥 Baixar Áudio')

	                with gr.Row():
	                    autoplay_box = gr.Checkbox(value=True, label='▶️ Autoplay')
	                    autoplay_box.change(toggle_autoplay, inputs=[autoplay_box], outputs=[audio_out])

	        # --- Events ---
	        # Changing the language refreshes the list of voices.
	        language_dd.change(filter_voices_by_language, inputs=[language_dd], outputs=[voice_dd])

	        # Order must match the KOKORO_TTS_API parameter order.
	        api_inputs = [
	            text_input,
	            language_dd,
	            voice_dd,
	            speed_slider,
	            translate_box,
	            silence_box,
	            ffmpeg_box,
	            pitch_slider,
	            preset_dd,
	            loudnorm_box,
	        ]
	        api_outputs = [audio_out, file_out]

	        text_input.submit(KOKORO_TTS_API, inputs=api_inputs, outputs=api_outputs)
	        generate_btn.click(KOKORO_TTS_API, inputs=api_inputs, outputs=api_outputs)

	        gr.Examples(examples=example_rows, inputs=[text_input, language_dd, voice_dd])

	    return demo

	if __name__ == "__main__":
	    # Build the Portuguese pipeline up front, before the UI is started.
	    print("Inicializando pipeline em Português...")
	    update_pipeline("Brazilian Portuguese")

	    # Build the interface, enable the request queue and start serving.
	    demo = ui()
	    queued_app = demo.queue()
	    queued_app.launch(show_api=False)