# TTS_Francais / app.py — French Parler-TTS demo (Hugging Face Space)
# Uploaded by Leteint (commit 208bc60, verified)
import spaces
import gradio as gr
import torch
import numpy as np
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
device = "cuda:0" if torch.cuda.is_available() else "cpu"
repo_id = "PHBJT/french_parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
SAMPLE_RATE = model.config.sampling_rate
SEED = 42
MAX_CHARS_PER_SEGMENT = 200
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = (
"A male voice speaks very fast with very clear audio, neutral tone, no background noise."
)
examples = [
[
"La voix humaine est un instrument de musique au-dessus de tous les autres.",
default_description,
],
[
"Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
"A male voice delivers a slightly expressive and animated speech with a moderate speed. "
"The recording features a low-pitch voice, creating a close-sounding audio experience.",
],
[
"La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
"A male voice provides a monotone yet slightly fast delivery, with a very close recording "
"that almost has no background noise.",
],
[
"Le progrès fait naître plus de besoins qu'il n'en satisfait.",
"A female voice, in a very poor recording quality, delivers slightly expressive and animated "
"words with a fast pace. There's a high level of background noise and a very distant-sounding "
"reverberation. The voice is slightly higher pitched than average.",
],
]
def preprocess_text(text):
"""Prétraite le texte pour le rendre compatible avec le modèle TTS."""
# Remplacer les apostrophes (curly et droites) par des espaces
text = text.replace("\u2019", " ").replace("\u2018", " ").replace("'", " ")
# Remplacer les caractères accentués par leur équivalent non accentué
accent_map = {
"À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a",
"É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e",
"Î": "I", "Ï": "I", "î": "i", "ï": "i",
"Ô": "O", "Ö": "O", "ô": "o", "ö": "o",
"Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u",
"Ÿ": "Y", "ÿ": "y",
"Ç": "C", "ç": "c",
"Œ": "OE", "œ": "oe",
"Æ": "AE", "æ": "ae",
}
for accented, plain in accent_map.items():
text = text.replace(accented, plain)
return text
def split_text(text, max_chars=MAX_CHARS_PER_SEGMENT):
"""Découpe le texte en segments de taille maximale, en coupant sur la ponctuation."""
if len(text) <= max_chars:
return [text]
segments = []
remaining = text
while remaining:
if len(remaining) <= max_chars:
segments.append(remaining.strip())
break
# Chercher un point de coupure dans la limite
chunk = remaining[:max_chars]
# Priorité : fin de phrase (. ! ?)
cut_pos = -1
for punct in ".!?":
pos = chunk.rfind(punct)
if pos > cut_pos:
cut_pos = pos
# Si pas trouvé, chercher ponctuation secondaire (; : ,)
if cut_pos == -1:
for punct in ";:,":
pos = chunk.rfind(punct)
if pos > cut_pos:
cut_pos = pos
# Si toujours pas trouvé, couper sur un espace
if cut_pos == -1:
cut_pos = chunk.rfind(" ")
# En dernier recours, couper à max_chars
if cut_pos == -1:
cut_pos = max_chars - 1
segment = remaining[: cut_pos + 1].strip()
if segment:
segments.append(segment)
remaining = remaining[cut_pos + 1 :].strip()
return segments
def trim_silence(audio_arr, threshold=0.005):
"""Supprime le silence au début et à la fin de l'audio."""
mask = np.abs(audio_arr) > threshold
if mask.any():
start = np.argmax(mask)
end = len(mask) - np.argmax(mask[::-1])
return audio_arr[start:end]
return audio_arr
@spaces.GPU # <- important pour ZeroGPU
def generate_segment(text_segment, description):
"""Génère l'audio pour un segment de texte."""
torch.manual_seed(SEED)
desc_inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
text_inputs = tokenizer(text_segment.strip(), return_tensors="pt").to(device)
generation = model.generate(
input_ids=desc_inputs.input_ids,
prompt_input_ids=text_inputs.input_ids,
attention_mask=desc_inputs.attention_mask,
prompt_attention_mask=text_inputs.attention_mask,
do_sample=True,
temperature=1.0,
max_new_tokens=1024,
)
audio_arr = generation.cpu().numpy().squeeze()
return trim_silence(audio_arr)
def gen_tts(text, description):
if not text.strip():
return None
text = preprocess_text(text)
segments = split_text(text)
audio_segments = []
for segment in segments:
audio_arr = generate_segment(segment, description)
audio_segments.append(audio_arr)
# Concaténer tous les segments audio
full_audio = np.concatenate(audio_segments)
return SAMPLE_RATE, full_audio
with gr.Blocks() as demo:
gr.HTML(
"""
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
French Parler-TTS 🗣️
</h1>
<p>Génération de voix française avec Parler-TTS Mini (ZeroGPU).</p>
</div>
"""
)
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
label="Texte d'entrée",
lines=3,
value=default_text,
)
description = gr.Textbox(
label="Description de la voix",
lines=3,
value=default_description,
)
run_button = gr.Button("Générer l'audio", variant="primary")
with gr.Column():
audio_out = gr.Audio(
label="Parler-TTS generation",
type="numpy",
)
run_button.click(
fn=gen_tts,
inputs=[input_text, description],
outputs=[audio_out],
queue=True,
)
gr.Examples(
examples=examples,
inputs=[input_text, description],
outputs=[audio_out],
fn=gen_tts,
cache_examples=False,
)
demo.queue()
demo.launch()