Spaces:
Runtime error
Runtime error
import spaces
import gradio as gr
import torch
import numpy as np
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Prefer the GPU when one is available; Parler-TTS inference is slow on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# French fine-tune of Parler-TTS Mini, pulled from the Hugging Face Hub.
repo_id = "PHBJT/french_parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

SAMPLE_RATE = model.config.sampling_rate  # output sample rate reported by the model config
SEED = 42  # fixed seed so repeated generations of the same text sound alike
MAX_CHARS_PER_SEGMENT = 200  # long inputs are split into chunks of at most this many chars

# Defaults shown in the UI text boxes.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = (
    "A male voice speaks very fast with very clear audio, neutral tone, no background noise."
)
# Clickable (input text, voice description) pairs for the Gradio Examples widget.
examples = [
    [
        "La voix humaine est un instrument de musique au-dessus de tous les autres.",
        default_description,
    ],
    [
        "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
        "A male voice delivers a slightly expressive and animated speech with a moderate speed. "
        "The recording features a low-pitch voice, creating a close-sounding audio experience.",
    ],
    [
        "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
        "A male voice provides a monotone yet slightly fast delivery, with a very close recording "
        "that almost has no background noise.",
    ],
    [
        "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
        "A female voice, in a very poor recording quality, delivers slightly expressive and animated "
        "words with a fast pace. There's a high level of background noise and a very distant-sounding "
        "reverberation. The voice is slightly higher pitched than average.",
    ],
]
# Single-pass character translation table, built once at import time.
# Apostrophes (curly and straight) become spaces; most accented characters are
# folded to unaccented equivalents.
# NOTE(review): "É" -> "é" and "È" -> "è" keep the accent and lowercase the
# letter, while lowercase "é"/"è" are left untouched — presumably the model
# handles those two vowels natively; confirm before "fixing" this asymmetry.
_CHAR_TABLE = str.maketrans({
    "\u2019": " ", "\u2018": " ", "'": " ",
    "À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a",
    "É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e",
    "Î": "I", "Ï": "I", "î": "i", "ï": "i",
    "Ô": "O", "Ö": "O", "ô": "o", "ö": "o",
    "Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u",
    "Ÿ": "Y", "ÿ": "y",
    "Ç": "C", "ç": "c",
    "Œ": "OE", "œ": "oe",
    "Æ": "AE", "æ": "ae",
})


def preprocess_text(text):
    """Normalize text so the TTS model can pronounce it.

    Replaces apostrophes with spaces and folds accented characters in a
    single ``str.translate`` pass instead of ~35 chained ``str.replace``
    calls; the character mapping is unchanged.

    Args:
        text: Raw input text (may be empty).

    Returns:
        The normalized text.
    """
    return text.translate(_CHAR_TABLE)
def _find_cut(window):
    """Return the best index in *window* to cut after, or -1 if none.

    Preference order: last sentence-final punctuation (. ! ?), then last
    secondary punctuation (; : ,), then last space.
    """
    for charset in (".!?", ";:,"):
        cut = max(window.rfind(ch) for ch in charset)
        if cut != -1:
            return cut
    return window.rfind(" ")


def split_text(text, max_chars=None):
    """Split *text* into segments of at most *max_chars* characters.

    Cuts preferentially on punctuation, then on spaces, and only hard-cuts
    mid-word as a last resort. Segments are stripped of surrounding
    whitespace; empty segments are dropped.

    Args:
        text: Text to split.
        max_chars: Maximum segment length. ``None`` (the default) resolves
            to ``MAX_CHARS_PER_SEGMENT`` at call time rather than binding
            the constant at import time.

    Returns:
        List of non-empty text segments (a single-element list when the
        input already fits).
    """
    if max_chars is None:
        max_chars = MAX_CHARS_PER_SEGMENT
    if len(text) <= max_chars:
        return [text]
    segments = []
    remaining = text
    while remaining:
        if len(remaining) <= max_chars:
            segments.append(remaining.strip())
            break
        cut = _find_cut(remaining[:max_chars])
        if cut == -1:
            # No punctuation or space in the window: hard cut at the limit.
            cut = max_chars - 1
        segment = remaining[: cut + 1].strip()
        if segment:  # skip pieces that strip down to nothing
            segments.append(segment)
        remaining = remaining[cut + 1 :].strip()
    return segments
def trim_silence(audio_arr, threshold=0.005):
    """Strip leading and trailing silence from a waveform.

    Samples whose absolute amplitude does not exceed *threshold* count as
    silence. An entirely silent signal is returned unchanged.
    """
    loud = np.flatnonzero(np.abs(audio_arr) > threshold)
    if loud.size == 0:
        return audio_arr
    return audio_arr[loud[0] : loud[-1] + 1]
@spaces.GPU  # required for ZeroGPU: without this decorator no GPU is attached and the call errors out
def generate_segment(text_segment, description):
    """Generate speech audio for one text segment.

    Args:
        text_segment: Text to synthesize (already preprocessed/split).
        description: Natural-language description of the target voice.

    Returns:
        1-D numpy array of audio samples with leading/trailing silence
        trimmed by ``trim_silence``.
    """
    # Re-seed before every segment so all segments of one request sample
    # the same voice characteristics.
    torch.manual_seed(SEED)
    desc_inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
    text_inputs = tokenizer(text_segment.strip(), return_tensors="pt").to(device)
    generation = model.generate(
        input_ids=desc_inputs.input_ids,
        prompt_input_ids=text_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_attention_mask=text_inputs.attention_mask,
        do_sample=True,
        temperature=1.0,
        max_new_tokens=1024,
    )
    audio_arr = generation.cpu().numpy().squeeze()
    return trim_silence(audio_arr)
def gen_tts(text, description):
    """Run the full pipeline: preprocess, split, synthesize, concatenate.

    Returns a ``(sample_rate, samples)`` tuple for ``gr.Audio``, or ``None``
    when the input text is blank.
    """
    if not text.strip():
        return None
    normalized = preprocess_text(text)
    rendered = [
        generate_segment(part, description) for part in split_text(normalized)
    ]
    # Stitch the per-segment audio back into one waveform.
    return SAMPLE_RATE, np.concatenate(rendered)
# ---------------------------------------------------------------------------
# Gradio UI: text + voice-description inputs on the left, audio output on the
# right, clickable examples underneath.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
            French Parler-TTS 🗣️
          </h1>
          <p>Génération de voix française avec Parler-TTS Mini (ZeroGPU).</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Texte d'entrée",
                lines=3,
                value=default_text,
            )
            description = gr.Textbox(
                label="Description de la voix",
                lines=3,
                value=default_description,
            )
            run_button = gr.Button("Générer l'audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
            )
    # Wire the button to the generation pipeline.
    run_button.click(
        fn=gen_tts,
        inputs=[input_text, description],
        outputs=[audio_out],
        queue=True,
    )
    gr.Examples(
        examples=examples,
        inputs=[input_text, description],
        outputs=[audio_out],
        fn=gen_tts,
        cache_examples=False,
    )

# Enable request queuing and start the app.
demo.queue()
demo.launch()