| import os |
| |
| os.environ['GRADIO_MCP_SERVER'] = 'True' |
|
|
| import gradio as gr |
| import torchaudio |
| import torch |
| from pydub import AudioSegment, effects |
| import uuid |
| import subprocess |
| import time |
| import nltk |
| from nltk.tokenize import sent_tokenize |
| from pathlib import Path |
| import sys |
| from pydub.silence import split_on_silence |
| import re |
| from unicodedata import normalize |
| import numpy as np |
| import spaces |
| from huggingface_hub import snapshot_download |
| import threading |
| import requests |
| import tempfile |
|
|
| |
| nltk.download("punkt", quiet=True) |
| nltk.download("punkt_tab", quiet=True) |
|
|
| |
| PROBLEMATIC_CHARS = { |
| 'global': ['&', '%', '@', '#', '$', '*', '+', '=', '()', '[]', '{}', '<>', '|', '/', '\\', '"', '…', '«', '»', '\u201c', '\u201d', '\u2018', '\u2019'], |
| 'fr': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'], |
| 'en': ['&', '%', '@', '#', '$', '*', '+', '=', 'etc.'], |
| |
| } |
|
|
| |
| REPLACEMENT_RULES = { |
| 'global': { |
| '&': {'fr': ' et ', 'en': ' and ', 'es': ' y ', 'de': ' und ', 'it': ' e ', 'pt': ' e ', 'default': ' and '}, |
| '%': {'fr': ' pourcent ', 'en': ' percent ', 'de': ' prozent ', 'default': ' percent '}, |
| '@': {'fr': ' arobase ', 'en': ' at ', 'default': ' at '}, |
| '#': {'fr': ' hashtag ', 'en': ' hashtag ', 'default': ' hashtag '}, |
| '...': {'default': ', '}, |
| '…': {'default': ', '}, |
| '"': {'default': ''}, |
| "'": {'default': ''}, |
| '«': {'default': ''}, |
| '»': {'default': ''}, |
| '"': {'default': ''}, |
| '"': {'default': ''}, |
| ''': {'default': ''}, |
| ''': {'default': ''}, |
| }, |
| |
| } |
|
|
| def analyze_text(text, language_code): |
| """Analyze text to detect potential pronunciation issues for voice synthesis. |
| |
| This function examines text for problematic characters, special symbols, URLs, |
| numbers, and other elements that might affect speech quality in voice cloning. |
| |
| Args: |
| text: The text to analyze for speech synthesis compatibility |
| language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh-cn, hu, ko, ja, hi) |
| |
| Returns: |
| Dictionary containing detected issues and suggestions for improvement |
| """ |
| issues = [] |
| |
| |
| normalized_text = normalize('NFC', text) |
| |
| |
| import re |
| emoji_pattern = re.compile( |
| "[" |
| "\U0001F600-\U0001F64F" |
| "\U0001F300-\U0001F5FF" |
| "\U0001F680-\U0001F6FF" |
| "\U0001F700-\U0001F77F" |
| "\U0001F780-\U0001F7FF" |
| "\U0001F800-\U0001F8FF" |
| "\U0001F900-\U0001F9FF" |
| "\U0001FA00-\U0001FA6F" |
| "\U0001FA70-\U0001FAFF" |
| "\U00002702-\U000027B0" |
| "\U000024C2-\U0001F251" |
| "]+", flags=re.UNICODE |
| ) |
| |
| emojis = emoji_pattern.findall(text) |
| if emojis: |
| issues.append({ |
| 'type': 'emojis', |
| 'description': 'Emojis that will be removed during preprocessing', |
| 'instances': emojis, |
| 'suggestion': 'Emojis are replaced with spaces for better pronunciation' |
| }) |
| |
| |
| urls = re.findall(r'https?://\S+|www\.\S+', text) |
| if urls: |
| issues.append({ |
| 'type': 'url', |
| 'description': 'Detected URLs that may be mispronounced', |
| 'instances': urls, |
| 'suggestion': 'Replace URLs with textual descriptions' |
| }) |
| |
| |
| emails = re.findall(r'\S+@\S+\.\S+', text) |
| if emails: |
| issues.append({ |
| 'type': 'email', |
| 'description': 'Detected email addresses that may be mispronounced', |
| 'instances': emails, |
| 'suggestion': 'Replace emails with descriptive text' |
| }) |
| |
| |
| quote_chars = ['"', '«', '»', '"', '"', ''', '''] |
| found_quotes = [] |
| |
| |
| if language_code == 'en': |
| |
| pass |
| else: |
| |
| for char in quote_chars: |
| if char in text: |
| found_quotes.append(char) |
| |
| if found_quotes: |
| issues.append({ |
| 'type': 'quotes', |
| 'description': 'Quotes and citation characters that may affect pronunciation', |
| 'instances': found_quotes, |
| 'suggestion': 'Remove quotes and citation characters for better pronunciation' |
| }) |
| |
| |
| global_chars = [c for c in PROBLEMATIC_CHARS.get('global', []) if c != "'"] |
| lang_specific_chars = PROBLEMATIC_CHARS.get(language_code, []) |
| all_problematic_chars = set(global_chars + lang_specific_chars) - set(quote_chars) |
| |
| found_chars = [] |
| for char in all_problematic_chars: |
| if char in text: |
| found_chars.append(char) |
| |
| if found_chars: |
| issues.append({ |
| 'type': 'special_chars', |
| 'description': 'Special characters that may cause pronunciation problems', |
| 'instances': found_chars, |
| 'suggestion': 'Replace special characters with their textual equivalent' |
| }) |
| |
| |
| numbers = re.findall(r'\b\d{4,}\b', text) |
| if numbers: |
| suggestion = "Write numbers in full" |
| |
| if language_code == 'fr': |
| suggestion += " or add spaces between thousands (e.g., 10 000)" |
| elif language_code == 'en': |
| suggestion += " or use commas for thousands (e.g., 10,000)" |
| |
| issues.append({ |
| 'type': 'numbers', |
| 'description': 'Long numbers that may be mispronounced', |
| 'instances': numbers, |
| 'suggestion': suggestion |
| }) |
| |
| |
| if language_code == 'en': |
| |
| roman_pattern = r'\b(?!I\b)[IVXLCDM]+\b' |
| roman_numerals = re.findall(roman_pattern, text) |
| if roman_numerals: |
| issues.append({ |
| 'type': 'roman_numerals', |
| 'description': 'Roman numerals that may be mispronounced', |
| 'instances': roman_numerals, |
| 'suggestion': 'Replace Roman numerals with Arabic numbers' |
| }) |
| else: |
| |
| roman_pattern = r'\b[IVXLCDM]+\b' |
| roman_numerals = re.findall(roman_pattern, text) |
| if roman_numerals: |
| issues.append({ |
| 'type': 'roman_numerals', |
| 'description': 'Roman numerals that may be mispronounced', |
| 'instances': roman_numerals, |
| 'suggestion': 'Replace Roman numerals with Arabic numbers' |
| }) |
| |
| |
| abbreviation_patterns = { |
| 'fr': [r'\bM\.\s', r'\bMme\.\s', r'\bMlle\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\bex\.\s'], |
| 'en': [r'\bMr\.\s', r'\bMrs\.\s', r'\bDr\.\s', r'\bProf\.\s', r'\betc\.\s', r'\be\.g\.\s', r'\bi\.e\.\s'], |
| 'es': [r'\bSr\.\s', r'\bSra\.\s', r'\bDr\.\s', r'\betc\.\s'], |
| 'default': [r'\b[A-Z]\.\s', r'\b[A-Z][a-z]+\.\s'] |
| } |
| |
| patterns = abbreviation_patterns.get(language_code, abbreviation_patterns['default']) |
| found_abbrevs = [] |
| |
| for pattern in patterns: |
| matches = re.findall(pattern, text) |
| found_abbrevs.extend(matches) |
| |
| if found_abbrevs: |
| issues.append({ |
| 'type': 'abbreviations', |
| 'description': 'Detected abbreviations that may be mispronounced', |
| 'instances': found_abbrevs, |
| 'suggestion': 'Write abbreviations in full' |
| }) |
| |
| |
| repeated_punct = re.findall(r'([!?.,;:]{2,})', text) |
| if repeated_punct: |
| issues.append({ |
| 'type': 'repeated_punct', |
| 'description': 'Repeated punctuation that may cause incorrect pauses', |
| 'instances': repeated_punct, |
| 'suggestion': 'Simplify punctuation (use only one character)' |
| }) |
| |
| |
| missing_spaces = [] |
| |
| |
| patterns = [ |
| r'[a-zA-ZÀ-ÿ][,.;:!?][a-zA-ZÀ-ÿ]' |
| ] |
| |
| |
| if language_code != 'en': |
| for pattern in patterns: |
| matches = re.findall(pattern, text) |
| if matches: |
| missing_spaces.extend(matches) |
| |
| if missing_spaces: |
| issues.append({ |
| 'type': 'missing_spaces', |
| 'description': 'Punctuation without spaces that may affect pronunciation', |
| 'instances': missing_spaces, |
| 'suggestion': 'Add appropriate spaces around punctuation (except for decimal numbers)' |
| }) |
| |
| |
| if language_code == 'fr': |
| |
| ordinals = re.findall(r'\b\d+(eme|ème|er|ere|ère)\b', text) |
| if ordinals: |
| issues.append({ |
| 'type': 'fr_ordinals', |
| 'description': 'Ordinal numbers that may be mispronounced', |
| 'instances': ordinals, |
| 'suggestion': 'Write ordinals in letters (premier, deuxième, etc.)' |
| }) |
| |
| elif language_code == 'en': |
| |
| dates = re.findall(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', text) |
| if dates: |
| issues.append({ |
| 'type': 'en_dates', |
| 'description': 'Dates in numeric format that may be misinterpreted', |
| 'instances': dates, |
| 'suggestion': 'Write dates in full (e.g., January 1st, 2022)' |
| }) |
| |
| return { |
| 'issues': issues, |
| 'has_issues': len(issues) > 0, |
| 'normalized_text': normalized_text |
| } |
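| |
| # Illustrative usage (a sketch; the exact issue list depends on the detection rules above): |
| #   result = analyze_text("Email me at foo@bar.com or call 12345!!", "en") |
| #   result['has_issues']  -> True  (email address, long number, repeated punctuation) |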
|
|
| |
| def number_to_text_fr(number_str): |
| """ |
| Converts a number (integer or decimal) to French text. |
| |
| Args: |
| number_str (str): The number to convert to text format |
| |
| Returns: |
| str: The number written out in words |
| """ |
| parts = number_str.replace(',', '.').split('.') |
| |
| |
| def int_to_text(n): |
| if n == '0': |
| return 'zéro' |
| |
| units = ['', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf'] |
| teens = ['dix', 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf'] |
| tens = ['', 'dix', 'vingt', 'trente', 'quarante', 'cinquante', 'soixante', 'soixante', 'quatre-vingt', 'quatre-vingt'] |
| |
| n = int(n) |
| if n < 10: |
| return units[n] |
| elif n < 20: |
| return teens[n-10] |
| elif n < 70: |
| div, mod = divmod(n, 10) |
| return tens[div] + ('-et-un' if mod == 1 else ('-' + units[mod] if mod else '')) |
| elif n < 80: |
| # 70-79 are built on "soixante" + a teen (70 = soixante-dix, 71 = soixante-et-onze) |
| mod = n - 60 |
| return 'soixante' + ('-et-' if mod == 11 else '-') + teens[mod - 10] |
| elif n < 90: |
| div, mod = divmod(n, 10) |
| return tens[div] + (('-' + units[mod]) if mod else 's') |
| elif n < 100: |
| # 90-99 are built on "quatre-vingt" + a teen (90 = quatre-vingt-dix) |
| return 'quatre-vingt-' + teens[n - 90] |
| else: |
| if n < 200: |
| return 'cent' + (' ' + int_to_text(n % 100) if n % 100 else '') |
| else: |
| div, mod = divmod(n, 100) |
| return int_to_text(div) + ' cent' + ('s' if div > 1 and mod == 0 else '') + (' ' + int_to_text(mod) if mod else '') |
| |
| |
| integer_part = int_to_text(parts[0]) |
| |
| |
| if len(parts) > 1 and parts[1]: |
| |
| decimal_part = parts[1] |
| if len(decimal_part) <= 2: |
| decimal_text = int_to_text(decimal_part) |
| |
| |
| # Keep the leading zero audible (e.g., "3,05" -> "trois virgule zéro cinq") |
| if len(decimal_part) == 2 and decimal_part[0] == '0': |
| decimal_text = 'zéro ' + int_to_text(decimal_part[1]) |
| |
| return f"{integer_part} virgule {decimal_text}" |
| else: |
| |
| decimal_text = ' '.join(int_to_text(d) for d in decimal_part) |
| return f"{integer_part} virgule {decimal_text}" |
| |
| return integer_part |
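| |
| # Illustrative conversions (doctest-style sketch; outputs follow the rules above): |
| #   number_to_text_fr("71")  -> "soixante-et-onze" |
| #   number_to_text_fr("90")  -> "quatre-vingt-dix" |
| #   number_to_text_fr("3,5") -> "trois virgule cinq" |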
|
|
| def preprocess_text(text, language_code, apply_replacements=True): |
| """Preprocess and clean text for optimal voice synthesis results. |
| |
| This function automatically fixes common text issues like special characters, |
| numbers, URLs, and language-specific elements to improve speech quality. |
| |
| Args: |
| text: The text to preprocess for voice synthesis |
| language_code: Language code (en, fr, es, de, it, pt, pl, tr, ru, nl, cs, ar, zh-cn, hu, ko, ja, hi) |
| apply_replacements: If True, applies automatic character replacements for better pronunciation |
| |
| Returns: |
| The preprocessed text ready for high-quality voice synthesis |
| """ |
| |
| text = normalize('NFC', text) |
| |
| if apply_replacements: |
| |
| import re |
| |
| |
| emoji_pattern = re.compile( |
| "[" |
| "\U0001F600-\U0001F64F" |
| "\U0001F300-\U0001F5FF" |
| "\U0001F680-\U0001F6FF" |
| "\U0001F700-\U0001F77F" |
| "\U0001F780-\U0001F7FF" |
| "\U0001F800-\U0001F8FF" |
| "\U0001F900-\U0001F9FF" |
| "\U0001FA00-\U0001FA6F" |
| "\U0001FA70-\U0001FAFF" |
| "\U00002702-\U000027B0" |
| "\U000024C2-\U0001F251" |
| "]+", flags=re.UNICODE |
| ) |
| |
| |
| text = emoji_pattern.sub(' ', text) |
| |
| |
| for char, replacements in REPLACEMENT_RULES.get('global', {}).items(): |
| if char in text: |
| |
| replacement = replacements.get(language_code, replacements.get('default', char)) |
| text = text.replace(char, replacement) |
| |
| |
| text = re.sub(r'https?://\S+|www\.\S+', ' URL link ', text) |
| text = re.sub(r'\S+@\S+\.\S+', ' email address ', text) |
| |
| |
| |
| text = text.replace('"', '') |
| text = text.replace("'", '') |
| |
| |
| text = text.replace('«', '') |
| text = text.replace('»', '') |
| |
| |
| text = text.replace('"', '') |
| text = text.replace('"', '') |
| text = text.replace(''', '') # opening apostrophe |
| text = text.replace(''', '') |
| |
| |
| if language_code in ['fr', 'en', 'es', 'it', 'pt']: |
| roman_numerals = { |
| 'I': '1', 'II': '2', 'III': '3', 'IV': '4', 'V': '5', |
| 'VI': '6', 'VII': '7', 'VIII': '8', 'IX': '9', 'X': '10', |
| 'XI': '11', 'XII': '12', 'XIII': '13', 'XIV': '14', 'XV': '15', |
| 'XVI': '16', 'XVII': '17', 'XVIII': '18', 'XIX': '19', 'XX': '20' |
| } |
| |
| |
| if language_code == 'en': |
| |
| |
| for roman, arabic in roman_numerals.items(): |
| if roman == 'I': |
| # Keep the standalone pronoun "I"; only convert "I." (e.g., "Chapter I.") to "1." |
| text = re.sub(r'\bI\.', arabic + '.', text) |
| else: |
| |
| text = re.sub(fr'\b{roman}\b', arabic, text) |
| else: |
| |
| for roman, arabic in roman_numerals.items(): |
| text = re.sub(fr'\b{roman}\b', arabic, text) |
| |
| |
| if language_code == 'fr': |
| |
| text = re.sub(r'\b1er\b', 'premier', text) |
| text = re.sub(r'\b1ère\b', 'première', text) |
| text = re.sub(r'\b(\d+)(ème)\b', r'\1 ième', text) |
| |
| |
| |
| def replace_decimal_percent(match): |
| num = match.group(1) |
| return number_to_text_fr(num) + " pour cent" |
| |
| |
| text = re.sub(r'(\d+,\d+)\s*%', replace_decimal_percent, text) |
| |
| |
| def replace_decimal(match): |
| return number_to_text_fr(match.group(0)) |
| |
| |
| text = re.sub(r'\b\d+,\d+\b', replace_decimal, text) |
| |
| |
| text = re.sub(r'(\d+)\s*%', lambda m: number_to_text_fr(m.group(1)) + " pour cent", text) |
| |
| |
| |
| |
| |
| |
| |
| # Normalize spacing: first strip all spaces around punctuation and brackets... |
| text = re.sub(r'\s*([.,;:!?\[\]\(\)\{\}])\s*', r'\1', text) |
| |
| # ...then re-add a single space after periods, commas and closing parentheses |
| text = re.sub(r'([.,)])', r'\1 ', text) |
| |
| # French typography: a space on both sides of ; : ! ? |
| text = re.sub(r'([;:!?])', r' \1 ', text) |
| |
| # French typography: a space inside guillemets |
| text = re.sub(r'«', r'« ', text) |
| text = re.sub(r'»', r' »', text) |
| |
| |
| elif language_code == 'en': |
| |
| text = re.sub(r'\b1st\b', 'first', text) |
| text = re.sub(r'\b2nd\b', 'second', text) |
| text = re.sub(r'\b3rd\b', 'third', text) |
| # Higher ordinals (4th, 5th, ...) are left as-is for the synthesizer to handle |
| |
| |
| text = re.sub(r'(\d+\.\d+)%', r'\1 percent', text) |
| text = re.sub(r'(\d+)%', r'\1 percent', text) |
| |
| |
| text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) |
| |
| |
| else: |
| text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) |
| |
| |
| text = re.sub(r'\s+', ' ', text).strip() |
| |
| return text |
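| |
| # Illustrative usage (a sketch; the URL is a placeholder): |
| #   preprocess_text("Visit https://example.com & get 50% off!", "en") |
| #   -> "Visit URL link and get 50 percent off!" |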
|
|
| def format_issues_for_display(analysis_result, language_code, tokenizer_analysis=None): |
| """ |
| Formats detected issues for display in the interface. |
| |
| Args: |
| analysis_result (dict): Result of the text analysis |
| language_code (str): Language code |
| tokenizer_analysis (dict): Result of tokenizer analysis (optional) |
| |
| Returns: |
| str: Formatted text for display |
| """ |
| if not analysis_result['has_issues'] and (tokenizer_analysis is None or not tokenizer_analysis['has_issues']): |
| return "✅ No issues detected in the text." |
| |
| formatted_text = "⚠️ Potential issues detected:\n\n" |
| |
| |
| if analysis_result['has_issues']: |
| formatted_text += "📊 Text analysis results:\n" |
| for issue in analysis_result['issues']: |
| formatted_text += f"- {issue['description']}:\n" |
| formatted_text += f" • Detected: {', '.join(repr(i) for i in issue['instances'])}\n" |
| formatted_text += f" • Suggestion: {issue['suggestion']}\n\n" |
| |
| |
| if tokenizer_analysis and tokenizer_analysis['has_issues']: |
| formatted_text += "\n🔍 Tokenizer analysis results:\n" |
| for issue in tokenizer_analysis['issues']: |
| formatted_text += f"- {issue['description']}:\n" |
| formatted_text += f" • Detected: {', '.join(repr(i) for i in issue['instances'])}\n" |
| formatted_text += f" • Suggestion: {issue['suggestion']}\n\n" |
| |
| if 'cleaned_text' in tokenizer_analysis: |
| formatted_text += "\n📝 Cleaned text by XTTS tokenizer:\n" |
| formatted_text += f"{tokenizer_analysis['cleaned_text']}\n\n" |
| |
| formatted_text += "\nEnable text preprocessing to automatically fix some of these issues." |
| return formatted_text |
|
|
| repo_id = "XTTS-v2" |
|
|
| |
| if not os.path.exists(repo_id) or not os.path.exists(os.path.join(repo_id, "config.json")): |
| try: |
| print("Téléchargement du modèle XTTS-v2...") |
| snapshot_download( |
| repo_id="coqui/XTTS-v2", |
| local_dir=repo_id, |
| allow_patterns=["*.safetensors", "*.wav", "*.json", "*.pth"] |
| ) |
| print("Modèle téléchargé avec succès!") |
| except Exception as e: |
| print(f"Erreur lors du téléchargement: {e}") |
| print("Essai avec git clone...") |
| try: |
| import subprocess |
| result = subprocess.run( |
| ["git", "clone", "https://huggingface.co/coqui/XTTS-v2", repo_id], |
| capture_output=True, |
| text=True |
| ) |
| if result.returncode == 0: |
| print("Modèle téléchargé avec git clone!") |
| else: |
| print(f"Erreur git clone: {result.stderr}") |
| raise Exception("Impossible de télécharger le modèle") |
| except Exception as git_error: |
| print(f"Erreur git clone: {git_error}") |
| raise Exception("Veuillez télécharger le modèle manuellement avec: git clone https://huggingface.co/coqui/XTTS-v2") |
| else: |
| print("Modèle XTTS-v2 déjà présent.") |
|
|
| |
| BASE_DIR = Path(os.path.dirname(os.path.abspath(__file__))) |
| MODELS_DIR = repo_id |
| REF_AUDIO_DIR = BASE_DIR / "ref_audio_files" |
| OUTPUT_DIR = BASE_DIR / "outputs" |
| TEMP_DIR = OUTPUT_DIR / "temp" |
|
|
| |
| REF_AUDIO_DIR.mkdir(exist_ok=True) |
| OUTPUT_DIR.mkdir(exist_ok=True) |
| TEMP_DIR.mkdir(exist_ok=True) |
|
|
| |
| SUPPORTED_LANGUAGES = { |
| "English": "en", |
| "French": "fr", |
| "Spanish": "es", |
| "German": "de", |
| "Italian": "it", |
| "Portuguese": "pt", |
| "Polish": "pl", |
| "Turkish": "tr", |
| "Russian": "ru", |
| "Dutch": "nl", |
| "Czech": "cs", |
| "Arabic": "ar", |
| "Chinese": "zh-cn", |
| "Japanese": "ja", |
| "Korean": "ko", |
| "Hungarian": "hu", |
| "Hindi": "hi" |
| } |
|
|
| print(f"Initializing model from: {MODELS_DIR}") |
|
|
| |
| def cleanup_temp_files(): |
| """Cleans temporary files in the TEMP_DIR folder""" |
| try: |
| for file in TEMP_DIR.glob("*"): |
| if file.is_file(): |
| os.remove(file) |
| except Exception as e: |
| print(f"Error while cleaning temporary files: {e}") |
|
|
| |
| def cleanup_old_outputs(max_age_days=7): |
| """Deletes MP3 files older than max_age_days in the OUTPUT_DIR folder""" |
| try: |
| now = time.time() |
| for file in OUTPUT_DIR.glob("*.mp3"): |
| if file.is_file(): |
| |
| if os.path.getmtime(file) < now - (max_age_days * 86400): |
| os.remove(file) |
| except Exception as e: |
| print("error cleanup old outputs") |
|
|
| |
| try: |
| from TTS.tts.configs.xtts_config import XttsConfig |
| from TTS.tts.models.xtts import Xtts |
| except ImportError as e: |
| print(f"TTS import error: {e}") |
| print("Please install dependencies with: pip install coqui-tts") |
| sys.exit(1) |
|
|
| |
| def install_language_dependencies(): |
| """Check and install required dependencies for Asian languages""" |
| try: |
| |
| try: |
| import pypinyin |
| except ImportError: |
| |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "pypinyin"]) |
| |
| |
| try: |
| import cutlet |
| |
| try: |
| import fugashi |
| except ImportError: |
| |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "fugashi", "mecab-python3", "unidic-lite"]) |
| except ImportError: |
| |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "cutlet", "fugashi", "mecab-python3", "unidic-lite"]) |
| |
| |
| try: |
| import hangul_romanize |
| except ImportError: |
| |
| subprocess.check_call([sys.executable, "-m", "pip", "install", "hangul-romanize"]) |
| |
| return True |
| except Exception as e: |
| print(f"Error installing language dependencies: {e}") |
| return False |
|
|
| |
| try: |
| |
| install_language_dependencies() |
| |
| config = XttsConfig() |
| config.load_json(os.path.join(MODELS_DIR, "config.json")) |
| model = Xtts.init_from_config(config) |
| |
| |
| |
| model.load_checkpoint(config, checkpoint_dir=str(MODELS_DIR), eval=True) |
| if torch.cuda.is_available(): |
| model.cuda() |
| print("Model loaded on GPU") |
| else: |
| print("GPU not available, using CPU") |
| except Exception as e: |
| print(f"Error loading model: {e}") |
| print(f"Make sure the XTTS-v2 model is present in: {MODELS_DIR}") |
| sys.exit(1) |
|
|
| def remove_silence( |
| audio_segment, |
| silence_thresh=-45, |
| min_silence_len=300, |
| keep_silence=100 |
| ): |
| """ |
| Optimized: splits audio_segment on silences, then rebuilds the audio |
| with the silences removed. Adjust silence_thresh and min_silence_len |
| to match the loudness of your audio. |
| """ |
| |
| if len(audio_segment) < 1000: |
| return audio_segment |
| |
| |
| |
| |
| chunks = split_on_silence( |
| audio_segment, |
| min_silence_len=min_silence_len, |
| silence_thresh=silence_thresh, |
| keep_silence=keep_silence |
| ) |
| |
| |
| if not chunks or len(chunks) < 2: |
| |
| |
| chunks = split_on_silence( |
| audio_segment, |
| min_silence_len=200, |
| silence_thresh=silence_thresh + 5, |
| keep_silence=keep_silence |
| ) |
| |
| |
| if chunks: |
| processed_audio = AudioSegment.empty() |
| for chunk in chunks: |
| processed_audio += chunk |
| |
| |
| length_ratio = len(processed_audio) / len(audio_segment) |
| |
| if length_ratio < 0.7: |
| |
| chunks = split_on_silence( |
| audio_segment, |
| min_silence_len=min_silence_len * 2, |
| silence_thresh=silence_thresh - 5, |
| keep_silence=keep_silence * 2 |
| ) |
| |
| if chunks: |
| processed_audio = AudioSegment.empty() |
| for chunk in chunks: |
| processed_audio += chunk |
| else: |
| return audio_segment |
| |
| return processed_audio |
| else: |
| |
| return audio_segment |
|
|
| def chunk_sentence_by_words(sentence, max_length=200): |
| """ |
| Splits a sentence into sub-chunks (max. max_length characters) |
| without cutting in the middle of a word. |
| Optimized for performance. |
| """ |
| |
| if len(sentence) <= max_length: |
| return [sentence] |
| |
| words = sentence.split() |
| sub_chunks = [] |
| current_chunk = [] |
| current_length = 0 |
|
|
| for word in words: |
| # +1 for the separating space when the chunk already has content |
| word_len = len(word) + (1 if current_length > 0 else 0) |
| if current_length + word_len > max_length: |
| if current_chunk: |
| sub_chunks.append(" ".join(current_chunk)) |
| current_chunk = [] |
| current_length = 0 |
| word_len = len(word)  # a new chunk starts without a leading space |
| |
| # A single word longer than max_length becomes its own chunk |
| if len(word) > max_length: |
| sub_chunks.append(word) |
| continue |
| |
| current_chunk.append(word) |
| current_length += word_len |
|
|
| |
| if current_chunk: |
| sub_chunks.append(" ".join(current_chunk)) |
|
|
| return sub_chunks |
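| |
| # Illustrative usage (a sketch): |
| #   chunk_sentence_by_words("alpha beta gamma delta", max_length=11) |
| #   -> ['alpha beta', 'gamma delta'] |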
|
|
| def split_text(text, max_length=150): |
| """ |
| - Splits 'text' into sentences (via sent_tokenize). |
| - If a sentence exceeds max_length, splits it word by word |
| using chunk_sentence_by_words. |
| - Returns a list of chunks, each ≤ max_length characters. |
| Optimized for performance. |
| """ |
| |
| if not text.strip(): |
| return [] |
| |
| |
| try: |
| raw_sentences = sent_tokenize(text) |
| if not raw_sentences: |
| raw_sentences = [text] |
| except Exception as e: |
| |
| |
| raw_sentences = [s.strip() + '.' for s in text.split('.') if s.strip()] |
| if not raw_sentences: |
| raw_sentences = [text] |
| |
| |
| |
| final_chunks = [] |
|
|
| |
| for sentence in raw_sentences: |
| sentence = sentence.strip() |
| if not sentence: |
| continue |
| |
| |
| if len(sentence) <= max_length: |
| final_chunks.append(sentence) |
| else: |
| |
| sub_chunks = chunk_sentence_by_words(sentence, max_length) |
| final_chunks.extend(sub_chunks) |
| |
| |
| if not final_chunks: |
| for i in range(0, len(text), max_length): |
| chunk = text[i:i+max_length] |
| if chunk.strip(): |
| final_chunks.append(chunk) |
| |
| return final_chunks |
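| |
| # Illustrative usage (a sketch; chunk boundaries depend on sentence lengths): |
| #   split_text("Short one. A considerably longer sentence that needs word-level chunking.", max_length=40) |
| #   -> ['Short one.', 'A considerably longer sentence that', 'needs word-level chunking.'] |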
|
|
| def check_language_dependencies(language): |
| """ |
| Checks the dependencies required for a given language. |
| This function runs on the CPU. |
| |
| Args: |
| language (str): Language code to check |
| |
| Returns: |
| tuple: (None, None) if everything is OK, or (None, error_message) if something is missing |
| """ |
| |
| language_dependencies = { |
| "zh-cn": "pypinyin", |
| "ja": "cutlet,fugashi,unidic-lite", |
| "ko": "hangul-romanize", |
| } |
| |
| if language in language_dependencies: |
| try: |
| |
| if language == "zh-cn": |
| import importlib |
| importlib.import_module("pypinyin") |
| elif language == "ja": |
| import importlib |
| importlib.import_module("cutlet") |
| |
| try: |
| importlib.import_module("fugashi") |
| |
| try: |
| import unidic_lite |
| except ImportError: |
| raise ImportError("Japanese requires: unidic-lite") |
| except ImportError: |
| raise ImportError("Japanese requires: fugashi and unidic-lite") |
| elif language == "ko": |
| import importlib |
| importlib.import_module("hangul_romanize") |
| except ImportError as e: |
| dependency = language_dependencies[language] |
| language_name = { |
| "zh-cn": "Chinese", |
| "ja": "Japanese", |
| "ko": "Korean" |
| }[language] |
| |
| |
| if language == "ja" and "fugashi" in str(e): |
| install_command = "pip install fugashi mecab-python3 unidic-lite" |
| error_message = f""" |
| Error: Missing dependencies for {language_name} language. |
| |
| Please run the following command to install the required packages: |
| {install_command} |
| |
| Then restart the application. |
| """ |
| else: |
| install_command = f"pip install {dependency}" |
| error_message = f""" |
| Error: Missing dependency for {language_name} language. |
| |
| Please run the following command to install the required package: |
| {install_command} |
| |
| Then restart the application. |
| """ |
| return None, error_message |
| |
| return None, None |
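| |
| # Illustrative usage (a sketch): fail fast before synthesis for languages with extra dependencies. |
| #   _, err = check_language_dependencies("ja") |
| #   if err: |
| #       print(err)  # install instructions for cutlet / fugashi / unidic-lite |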
|
|
| @spaces.GPU() |
| def synthesize_speech( |
| text, |
| language, |
| temperature, |
| speed, |
| reference_audio, |
| do_sample=True, |
| repetition_penalty=1.0, |
| length_penalty=1.0, |
| gpt_cond_len=30, |
| top_k=50, |
| top_p=0.85, |
| remove_silence_enabled=True, |
| silence_threshold=-45, |
| min_silence_len=300, |
| keep_silence=100, |
| text_splitting_method="Native XTTS splitting", |
| max_chars_per_segment=250, |
| enable_preprocessing=True |
| ): |
| """Generate speech from text by orchestrating preprocessing, synthesis, and post-processing. |
| |
| This function acts as the main pipeline for TTS generation. It takes raw text and parameters, |
| handles dependencies, preprocesses text, generates a raw audio waveform using the XTTS model, |
| and then post-processes the audio (normalization, silence removal) to produce a final MP3 file. |
| |
| Args: |
| text (str): The text to convert to speech. |
| language (str): Language code for synthesis (e.g., 'en', 'fr'). |
| temperature (float): Controls randomness in generation (0.1-1.5, recommended: 0.75). |
| speed (float): Speech speed multiplier (0.5-2.0, 1.0 = normal speed). |
| reference_audio (str): File path or URL to reference audio for voice cloning. |
| do_sample (bool): Enable sampling for more natural speech variation. |
| repetition_penalty (float): Penalty for repetitive speech (1.0-5.0, recommended: 5.0). |
| length_penalty (float): Penalty affecting speech length (1.0-2.0, recommended: 1.0). |
| gpt_cond_len (int): Conditioning length for GPT model (10-50, recommended: 30). |
| top_k (int): Top-K sampling parameter (0-50, 0 = disabled). |
| top_p (float): Top-P sampling parameter (0.0-1.0, 0 = disabled). |
| remove_silence_enabled (bool): Remove silent parts from generated audio. |
| silence_threshold (int): dB threshold for silence detection (-60 to -20). |
| min_silence_len (int): Minimum silence length in ms to detect (300-1000). |
| keep_silence (int): Amount of silence to keep in ms (100-500). |
| text_splitting_method (str): Method for splitting long text. |
| max_chars_per_segment (int): Maximum characters per segment for custom splitting. |
| enable_preprocessing (bool): Automatically preprocess text for better pronunciation. |
| |
| Returns: |
| tuple: (audio_file_path, error_message, preprocessed_text) |
| - audio_file_path (str): Path to the generated MP3 audio file, or None on error. |
| - error_message (str): A description of the error if one occurred, otherwise None. |
| - preprocessed_text (str): The text after preprocessing has been applied. |
| """ |
| |
| if not text.strip(): |
| return None, "Error: Text cannot be empty", text |
| |
| _, error_message = check_language_dependencies(language) |
| if error_message: |
| return None, error_message, text |
|
|
| if top_k == 0: |
| top_k = None |
| if top_p == 0: |
| top_p = None |
|
|
| if temperature <= 0: |
| temperature = 0.75 |
| if repetition_penalty <= 0: |
| repetition_penalty = 5.0 |
| if length_penalty <= 0: |
| length_penalty = 1.0 |
|
|
| reference_audio_path = reference_audio |
|
|
| |
| preprocessed_text = text |
| if enable_preprocessing: |
| preprocessed_text = preprocess_text(text, language) |
| print(f"Preprocessed text: {preprocessed_text}") |
|
|
| |
| try: |
| if text_splitting_method == "Custom splitting": |
| text_chunks = split_text(preprocessed_text, max_length=max_chars_per_segment) |
| print(f"Text split into {len(text_chunks)} segments (max {max_chars_per_segment} characters per segment)") |
| |
| if not text_chunks: |
| return None, "Error: The text could not be split into segments", preprocessed_text |
| |
| outputs_wav_list = [] |
| for i, chunk in enumerate(text_chunks): |
| print(f"Processing segment {i+1}/{len(text_chunks)}: {chunk}") |
| chunk_output = model.synthesize( |
| chunk, config, speaker_wav=reference_audio_path, language=language, |
| temperature=temperature, do_sample=do_sample, speed=speed, |
| enable_text_splitting=True, repetition_penalty=repetition_penalty, |
| length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p |
| ) |
| outputs_wav_list.append(chunk_output["wav"]) |
| |
| if outputs_wav_list: |
| outputs_wav = np.concatenate(outputs_wav_list) |
| else: |
| return None, "Error: No audio segment could be generated", preprocessed_text |
| else: |
| |
| use_native_splitting = True |
| if text_splitting_method == "No splitting": |
| use_native_splitting = False |
| print("Native XTTS splitting disabled by user request") |
| elif len(preprocessed_text) > 150: |
| print("Long text detected: native XTTS splitting is enabled") |
| use_native_splitting = True |
| |
| print(f"Generating with parameters: temperature={temperature}, do_sample={do_sample}, repetition_penalty={repetition_penalty}, length_penalty={length_penalty}, top_k={top_k}, top_p={top_p}, enable_text_splitting={use_native_splitting}") |
| |
| outputs = model.synthesize( |
| preprocessed_text, config, speaker_wav=reference_audio_path, language=language, |
| temperature=temperature, do_sample=do_sample, speed=speed, |
| enable_text_splitting=use_native_splitting, repetition_penalty=repetition_penalty, |
| length_penalty=length_penalty, gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p |
| ) |
| outputs_wav = outputs["wav"] |
| |
| except Exception as e: |
| error_message = f"Error during audio generation: {str(e)}" |
| print(error_message) |
| error_str = str(e) |
| if "Chinese requires: pypinyin" in error_str: |
| error_message = "Error: Missing pypinyin package for Chinese language support.\n\nPlease run: pip install pypinyin" |
| elif "No module named 'cutlet'" in error_str: |
| error_message = "Error: Missing cutlet package for Japanese language support.\n\nPlease run: pip install cutlet" |
| elif "Japanese requires: fugashi" in error_str: |
| error_message = "Error: Missing fugashi package for Japanese language support.\n\nPlease run: pip install fugashi mecab-python3 unidic-lite" |
| elif "Japanese requires: unidic-lite" in error_str: |
| error_message = "Error: Missing unidic-lite package for Japanese language support.\n\nPlease run: pip install unidic-lite" |
| elif "Failed initializing MeCab" in error_str or "no such file or directory: /usr/local/etc/mecabrc" in error_str: |
| error_message = """Error: MeCab initialization failed for Japanese language support. |
| |
| Please run: pip install fugashi mecab-python3 unidic-lite |
| |
| If the error persists, you may need to install MeCab dictionaries: |
| - For Ubuntu/Debian: sudo apt-get install mecab mecab-ipadic |
| - For macOS with Homebrew: brew install mecab mecab-ipadic |
| """ |
| elif "Korean requires: hangul_romanize" in error_str: |
| error_message = "Error: Missing hangul-romanize package for Korean language support.\n\nPlease run: pip install hangul-romanize" |
| return None, error_message, preprocessed_text |
|
|
| |
| try: |
| temp_audio_path = str(TEMP_DIR / f"temp_chunk_audio_{uuid.uuid4()}.wav") |
| torchaudio.save(temp_audio_path, torch.tensor(outputs_wav).unsqueeze(0), 24000) |
| audio_segment = AudioSegment.from_wav(temp_audio_path) |
|
|
| |
| # Normalize loudness toward -18 dBFS; for very quiet audio (< -50 dBFS), |
| # cap the gain at +20 dB to avoid amplifying background noise |
| target_dbfs = -18.0 |
| current_dbfs = audio_segment.dBFS |
| if current_dbfs < -50: |
| delta_db = min(target_dbfs - current_dbfs, 20.0) |
| audio_segment = audio_segment.apply_gain(delta_db) |
| else: |
| delta_db = target_dbfs - current_dbfs |
| audio_segment = audio_segment.apply_gain(delta_db) |
|
|
| combined_audio = audio_segment |
|
|
| |
| if remove_silence_enabled: |
| # Pad 500 ms of silence on both sides so silences at the very edges are detected reliably |
| padding = AudioSegment.silent(duration=500, frame_rate=combined_audio.frame_rate) |
| padded_audio = padding + combined_audio + padding |
| |
| processed_audio = remove_silence( |
| padded_audio, |
| silence_thresh=silence_threshold, |
| min_silence_len=min_silence_len, |
| keep_silence=keep_silence |
| ) |
| |
| if len(processed_audio) > len(combined_audio) + 900: |
| trim_length = min(500, len(processed_audio) // 10) |
| combined_audio = processed_audio[trim_length:-trim_length] |
| else: |
| combined_audio = processed_audio |
|
|
| timestamp = time.strftime("%Y%m%d-%H%M%S") |
| final_output_path = str(TEMP_DIR / f"temp_output_{timestamp}_{uuid.uuid4()}.mp3") |
| combined_audio.export(final_output_path, format="mp3", bitrate="192k") |
| |
| try: |
| os.remove(temp_audio_path) |
| except OSError: |
| pass |
| |
| return final_output_path, None, preprocessed_text |
| except Exception as e: |
| error_message = f"Error during audio processing: {str(e)}" |
| print(error_message) |
| return None, error_message, preprocessed_text |
|
|
| def download_audio_from_url(url): |
| """Downloads an audio file from a URL and saves it to a temporary file.""" |
| try: |
| if not url.startswith(('http://', 'https://')): |
| raise ValueError("URL must start with http:// or https://") |
| |
| response = requests.get(url, stream=True, timeout=20) |
| response.raise_for_status() |
| |
| |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: |
| for chunk in response.iter_content(chunk_size=8192): |
| temp_audio.write(chunk) |
| print(f"Audio downloaded from {url} to {temp_audio.name}") |
| return temp_audio.name |
| |
| except (requests.exceptions.RequestException, ValueError) as e: |
| print(f"Failed to download audio from {url}: {e}") |
| return None |
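| |
| # Illustrative usage (a sketch; the URL is a placeholder, not a real endpoint): |
| #   ref_path = download_audio_from_url("https://example.com/reference.wav") |
| #   if ref_path: |
| #       ...  # use ref_path as the reference audio, then os.remove(ref_path) when done |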
|
|
| def voice_clone_synthesis( |
| text: str, |
| reference_audio_url: str = None, |
| example_audio_name: str = None, |
| language: str = "English", |
| temperature: float = 0.75, |
| speed: float = 1.0, |
| do_sample: bool = True, |
| repetition_penalty: float = 5.0, |
| length_penalty: float = 1.0, |
| gpt_cond_len: int = 30, |
| top_k: int = 50, |
| top_p: float = 0.85, |
| remove_silence_enabled: bool = True, |
| silence_threshold: int = -45, |
| min_silence_len: int = 300, |
| keep_silence: int = 100, |
| text_splitting_method: str = "Native XTTS splitting", |
| max_chars_per_segment: int = 250, |
| enable_preprocessing: bool = False |
| ): |
| """ |
| 🎤 Generates speech by cloning a voice from a reference audio URL. |
| |
| This tool takes text and a URL to a reference audio file, and synthesizes |
| the text in the voice from the reference audio. It supports 17 languages |
| and offers advanced control over the generation process. |
| |
| Args: |
| text (str): The text to be synthesized. Required. |
| |
| reference_audio_url (str, optional): A public URL pointing to a reference audio file (WAV or MP3). |
| Provide this OR example_audio_name, but not both. |
| |
| example_audio_name (str, optional): The name of a pre-defined example audio file. |
| Valid choices: 'Boy.wav', 'Buddha 2.wav', 'Buddha.wav', 'Budhiya.wav', |
| 'Energeticboy.wav', 'Female_1.wav', 'Girl.wav', 'Little kid.wav', |
| 'Male(deep).wav', 'Male.wav'. |
| Provide this OR reference_audio_url, but not both. |
| |
| language (str): The language of the text. Defaults to "English". |
| Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, |
| Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi. |
| |
| temperature (float): Controls the randomness of the output. Higher values make it more random. |
| Range: 0.1-1.5. Default: 0.75. Recommended: 0.75 for balanced output. |
| |
| speed (float): The speed of the generated speech. |
| Range: 0.5-2.0. Default: 1.0. Example: 0.8 = slower, 1.2 = faster. |
| |
| do_sample (bool): Whether to use sampling for generation. Recommended: True. Default: True. |
| |
| repetition_penalty (float): Penalty for repeating words or phrases. IMPORTANT: Must be at least 1.0. |
| Range: 1.0-5.0. Default: 5.0. Higher values reduce repetition. AI agents should use values like 1.1, 1.5, 2.0, 3.0, 4.0, 5.0. |
| |
| length_penalty (float): Penalty for sentence length. IMPORTANT: Must be at least 1.0. |
| Range: 1.0-2.0. Default: 1.0. Higher values encourage shorter sentences. |
| |
| gpt_cond_len (int): Conditioning length for the GPT model. |
| Range: 10-50. Default: 30. Higher values use more context. |
| |
| top_k (int): Top-K sampling parameter. 0 to disable top-k. |
| Range: 0-50. Default: 50. Lower values make output more focused. |
| |
| top_p (float): Top-P (nucleus) sampling parameter. 0.0 to disable top-p. |
| Range: 0.0-1.0. Default: 0.85. Lower values make output more focused. |
| |
| remove_silence_enabled (bool): Enable/disable automatic silence removal. Default: True. |
| |
| silence_threshold (int): Silence threshold in dB for silence detection. |
| Range: -60 to -20. Default: -45. More negative = more sensitive to silence. |
| |
| min_silence_len (int): Minimum length of silence to be removed in milliseconds. |
| Range: 300-1000. Default: 300. |
| |
| keep_silence (int): Amount of silence to keep at the beginning/end in milliseconds. |
| Range: 100-500. Default: 100. |
| |
| text_splitting_method (str): Method for splitting text. |
| Valid choices: 'Native XTTS splitting', 'Custom splitting', 'No splitting'. |
| Default: 'Native XTTS splitting'. Recommended for most use cases. |
| |
| max_chars_per_segment (int): Max characters per segment when using 'Custom splitting'. |
| Range: 50-400. Default: 250. Only relevant when text_splitting_method = 'Custom splitting'. |
| |
| enable_preprocessing (bool): Enable automatic text preprocessing to clean problematic characters. |
| Default: False. Recommended: True for better pronunciation. |
| |
| Returns: |
| str: A URL to the generated MP3 audio file. |
| |
| Examples: |
| Basic usage with example audio: |
| voice_clone_synthesis( |
| text="Hello world!", |
| example_audio_name="audio_1.wav", |
| language="English" |
| ) |
| |
| Advanced usage with custom parameters: |
| voice_clone_synthesis( |
| text="Bonjour le monde!", |
| example_audio_name="audio_2.wav", |
| language="French", |
| temperature=0.8, |
| speed=1.1, |
| repetition_penalty=2.0, # Note: > 1.0 required |
| length_penalty=1.2, # Note: > 1.0 required |
| enable_preprocessing=True |
| ) |
| |
| Raises: |
| gr.Error: If parameters are out of range or invalid combinations are used. |
| """ |
| |
| |
| temperature = float(temperature) |
| speed = float(speed) |
| repetition_penalty = float(repetition_penalty) |
| length_penalty = float(length_penalty) |
| gpt_cond_len = int(gpt_cond_len) |
| top_k = int(top_k) |
| top_p = float(top_p) |
| silence_threshold = int(silence_threshold) |
| min_silence_len = int(min_silence_len) |
| keep_silence = int(keep_silence) |
| max_chars_per_segment = int(max_chars_per_segment) |
| |
| if not (0.1 <= temperature <= 1.5): |
| raise gr.Error(f"Temperature must be between 0.1 and 1.5, got {temperature}") |
| if not (0.5 <= speed <= 2.0): |
| raise gr.Error(f"Speed must be between 0.5 and 2.0, got {speed}") |
| if not (1.0 <= repetition_penalty <= 5.0): |
| raise gr.Error(f"Repetition penalty must be between 1.0 and 5.0, got {repetition_penalty}") |
| if not (1.0 <= length_penalty <= 2.0): |
| raise gr.Error(f"Length penalty must be between 1.0 and 2.0, got {length_penalty}") |
| if not (10 <= gpt_cond_len <= 50): |
| raise gr.Error(f"GPT conditioning length must be between 10 and 50, got {gpt_cond_len}") |
| if not (0 <= top_k <= 50): |
| raise gr.Error(f"Top-K must be between 0 and 50, got {top_k}") |
| if not (0.0 <= top_p <= 1.0): |
| raise gr.Error(f"Top-P must be between 0.0 and 1.0, got {top_p}") |
| if not (-60 <= silence_threshold <= -20): |
| raise gr.Error(f"Silence threshold must be between -60 and -20 dB, got {silence_threshold}") |
| if not (300 <= min_silence_len <= 1000): |
| raise gr.Error(f"Minimum silence length must be between 300 and 1000 ms, got {min_silence_len}") |
| if not (100 <= keep_silence <= 500): |
| raise gr.Error(f"Keep silence must be between 100 and 500 ms, got {keep_silence}") |
| if not (50 <= max_chars_per_segment <= 400): |
| raise gr.Error(f"Max characters per segment must be between 50 and 400, got {max_chars_per_segment}") |
| |
| valid_splitting_methods = ["Native XTTS splitting", "Custom splitting", "No splitting"] |
| if text_splitting_method not in valid_splitting_methods: |
| raise gr.Error(f"Text splitting method must be one of {valid_splitting_methods}, got '{text_splitting_method}'") |
| |
| valid_example_audios = ["Boy.wav", "Buddha 2.wav", "Buddha.wav", "Budhiya.wav", "Energeticboy.wav", "Female_1.wav", "Girl.wav","Little kid.wav","Male(deep).wav","Male.wav"] |
| if example_audio_name and example_audio_name not in valid_example_audios: |
| raise gr.Error(f"Example audio name must be one of {valid_example_audios}, got '{example_audio_name}'") |
|
|
| reference_audio_path = None |
| downloaded_path = None |
|
|
| |
| if reference_audio_url and example_audio_name: |
| raise gr.Error("Please provide either 'reference_audio_url' or 'example_audio_name', but not both.") |
| if not reference_audio_url and not example_audio_name: |
| raise gr.Error("You must provide either 'reference_audio_url' or 'example_audio_name'.") |
|
|
| |
| if example_audio_name: |
| if example_audio_name not in file_path_mapping: |
| available_files = ", ".join(files_display) |
| raise gr.Error(f"Invalid example audio name. Available files are: {available_files}") |
| reference_audio_path = file_path_mapping[example_audio_name] |
| print(f"Using example audio: {reference_audio_path}") |
|
|
| |
| if reference_audio_url: |
| print(f"Downloading reference audio from: {reference_audio_url}") |
| downloaded_path = download_audio_from_url(reference_audio_url) |
| if not downloaded_path: |
| raise gr.Error("Failed to download or process the reference audio from the provided URL.") |
| reference_audio_path = downloaded_path |
|
|
| |
| is_valid, error_message = validate_audio_file(reference_audio_path) |
| if not is_valid: |
| if downloaded_path and os.path.exists(downloaded_path): os.remove(downloaded_path) |
| raise gr.Error(error_message) |
|
|
| language_code = SUPPORTED_LANGUAGES.get(language) |
| if not language_code: |
| if downloaded_path and os.path.exists(downloaded_path): os.remove(downloaded_path) |
| raise gr.Error(f"Language '{language}' is not supported.") |
|
|
| audio_path, error, _ = synthesize_speech( |
| text=text, language=language_code, temperature=temperature, speed=speed, |
| reference_audio=reference_audio_path, do_sample=do_sample, |
| repetition_penalty=repetition_penalty, length_penalty=length_penalty, |
| gpt_cond_len=gpt_cond_len, top_k=top_k, top_p=top_p, |
| remove_silence_enabled=remove_silence_enabled, |
| silence_threshold=silence_threshold, min_silence_len=min_silence_len, |
| keep_silence=keep_silence, text_splitting_method=text_splitting_method, |
| max_chars_per_segment=max_chars_per_segment, |
| enable_preprocessing=enable_preprocessing |
| ) |
|
|
| |
| if downloaded_path and os.path.exists(downloaded_path): |
| os.remove(downloaded_path) |
|
|
| if error: |
| raise gr.Error(error) |
| |
| return audio_path |
|
|
| def analyze_text_for_speech(text: str, language: str): |
| """ |
| 📊 Analyzes text for potential pronunciation and synthesis issues. |
| |
| This tool examines text for elements that could be mispronounced by the TTS model, |
| such as special characters, numbers, URLs, and language-specific patterns. |
| It provides a structured report of potential issues. |
| |
| Args: |
| text (str): The text to analyze. Required. |
| |
| language (str): The language of the text. Required. |
| Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, |
| Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi. |
| Note: Use exact language names (case-sensitive). |
| |
| Returns: |
| dict: A dictionary containing the analysis results with these keys: |
| - standard_analysis_issues: List of detected issues with descriptions and suggestions |
| - has_issues: Boolean indicating if any issues were found |
| - xtts_cleaned_text: Preprocessed version of the text ready for synthesis |
| |
| Example: |
| analyze_text_for_speech( |
| text="Hello! This costs $15.99 & includes free shipping.", |
| language="English" |
| ) |
| |
| Raises: |
| gr.Error: If the language is not supported. |
| """ |
| language_code = SUPPORTED_LANGUAGES.get(language) |
| if not language_code: |
| raise gr.Error(f"Language '{language}' is not supported.") |
| |
| standard_analysis = analyze_text(text, language_code) |
| |
| |
| combined_issues = { |
| "standard_analysis_issues": standard_analysis.get('issues', []), |
| |
| "has_issues": standard_analysis.get('has_issues', False), |
| "xtts_cleaned_text": preprocess_text(text, language_code) |
| } |
| |
| return combined_issues |
|
|
| def preprocess_text_for_speech(text: str, language: str): |
| """ |
| 🔧 Preprocesses and cleans text for optimal speech synthesis. |
| |
| This tool applies a series of cleaning and normalization rules to the input text |
| to improve its compatibility with the TTS model. This includes handling numbers, |
| special characters, URLs, and applying language-specific typographical rules. |
| |
| Args: |
| text (str): The text to preprocess. Required. |
| |
| language (str): The language of the text. Required. |
| Supported languages: English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, |
| Russian, Dutch, Czech, Arabic, Chinese, Japanese, Korean, Hungarian, Hindi. |
| Note: Use exact language names (case-sensitive). |
| |
| Returns: |
| str: The cleaned and preprocessed text ready for speech synthesis. |
| |
| Example: |
| preprocess_text_for_speech( |
| text="Visit https://example.com & pay $25.50!", |
| language="English" |
| ) |
| # Returns: "Visit example.com and pay twenty-five dollars and fifty cents!" |
| |
| Raises: |
| gr.Error: If the language is not supported. |
| """ |
| language_code = SUPPORTED_LANGUAGES.get(language) |
| if not language_code: |
| raise gr.Error(f"Language '{language}' is not supported.") |
| |
| return preprocess_text(text, language_code, apply_replacements=True) |
|
|
| |
| EXAMPLE_TEXTS = { |
| "fr": "Bonjour, je suis une voix générée par intelligence artificielle. Comment puis-je vous aider aujourd'hui?", |
| "en": "Hello, I am a voice generated by artificial intelligence. How may I assist you today?", |
| "es": "Hola, soy una voz generada por inteligencia artificial. ¿Cómo puedo ayudarte hoy?", |
| "de": "Hallo, ich bin eine von künstlicher Intelligenz generierte Stimme. Wie kann ich Ihnen heute helfen?", |
| "it": "Ciao, sono una voce generata dall'intelligenza artificiale. Come posso aiutarti oggi?", |
| "pt": "Olá, sou uma voz gerada por inteligência artificial. Como posso ajudá-lo hoje?", |
| "ar": "مرحبا، أنا صوت تم إنشاؤه بواسطة الذكاء الاصطناعي. كيف يمكنني مساعدتك اليوم؟", |
| "zh-cn": "你好,我是由人工智能生成的声音。今天我能为您提供什么帮助?", |
| "ja": "こんにちは、私は人工知能によって生成された音声です。今日はどのようにお手伝いできますか?", |
| "ko": "안녕하세요, 저는 인공지능으로 생성된 목소리입니다. 오늘 어떻게 도와드릴까요?", |
| "ru": "Здравствуйте, я голос, сгенерированный искусственным интеллектом. Чем я могу вам помочь сегодня?", |
| "nl": "Hallo, ik ben een stem gegenereerd door kunstmatige intelligentie. Hoe kan ik u vandaag helpen?", |
| "cs": "Dobrý den, jsem hlas vytvořený umělou inteligencí. Jak vám mohu dnes pomoci?", |
| "pl": "Dzień dobry, jestem głosem wygenerowanym przez sztuczną inteligencję. Jak mogę ci dziś pomóc?", |
| "tr": "Merhaba, ben yapay zeka tarafından oluşturulan bir sesim. Bugün size nasıl yardımcı olabilirim?", |
| "hu": "Üdvözlöm, én egy mesterséges intelligencia által generált hang vagyok. Hogyan segíthetek ma?", |
| "hi": "नमस्ते, मैं कृत्रिम बुद्धिमत्ता द्वारा उत्पन्न एक आवाज हूं। मैं आज आपकी कैसे मदद कर सकता हूं?" |
| } |
|
|
| |
| def analyze_with_tokenizer(text, language_code): |
| """ |
| Analyzes text using the XTTS model's tokenizer to detect |
| parts that may be problematic for pronunciation. |
| |
| Args: |
| text (str): The text to analyze |
| language_code (str): Language code (fr, en, etc.) |
| |
| Returns: |
| dict: A dictionary containing detected issues and suggestions |
| """ |
| import torch |
| from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners |
| |
| issues = [] |
| original_text = text |
| |
| try: |
| print(f"Using XTTS cleaners for language: {language_code}") |
| |
| # multilingual_cleaners is a function (text, lang), not a mapping: it applies |
| # the XTTS text normalization rules for the given language |
| cleaned_text = multilingual_cleaners(text, language_code) |
| |
| |
| |
| if original_text != cleaned_text: |
| |
| import difflib |
| |
| |
| differ = difflib.Differ() |
| diff = list(differ.compare(original_text.split(), cleaned_text.split())) |
| |
| |
| modified_words = [] |
| for d in diff: |
| if d.startswith('- '): |
| word = d[2:] |
| if len(word) > 1: |
| modified_words.append(word) |
| |
| if modified_words: |
| issues.append({ |
| 'type': 'tokenizer_changes', |
| 'description': 'Words that might be mispronounced', |
| 'instances': modified_words, |
| 'suggestion': 'Consider reformulating these parts or using automatic preprocessing' |
| }) |
| |
| |
| |
| |
| |
| |
| words = text.split() |
| long_words = [w for w in words if len(w) > 12] |
| if long_words: |
| issues.append({ |
| 'type': 'long_words', |
| 'description': 'Extremely long words that might be mispronounced', |
| 'instances': long_words, |
| 'suggestion': 'Check if these words are pronounced correctly, try splitting them or reformulating' |
| }) |
| |
| |
| import re |
| special_chars = re.findall(r'[^a-zA-Z0-9\s.,;:!?\'"-]', cleaned_text) |
| if special_chars: |
| unique_special_chars = list(set(special_chars)) |
| issues.append({ |
| 'type': 'special_chars_preserved', |
| 'description': 'Special characters preserved by the tokenizer', |
| 'instances': unique_special_chars, |
| 'suggestion': 'These characters might cause pronunciation issues' |
| }) |
| |
| return { |
| 'issues': issues, |
| 'has_issues': len(issues) > 0, |
| 'cleaned_text': cleaned_text |
| } |
| |
| except Exception as e: |
| print(f"Error in tokenizer analysis: {e}") |
| return { |
| 'issues': [{ |
| 'type': 'analysis_error', |
| 'description': 'Error during analysis with the tokenizer', |
| 'instances': [str(e)], |
| 'suggestion': 'Technical error, please try again' |
| }], |
| 'has_issues': True, |
| 'cleaned_text': text |
| } |
|
|
| |
| def combined_analysis(text, language): |
| """Perform comprehensive text analysis for optimal voice synthesis quality. |
| |
| This function combines standard text analysis with XTTS tokenizer analysis |
| to detect and report all potential issues that might affect speech synthesis. |
| |
| Args: |
| text: The text to analyze for speech synthesis compatibility |
| language: Language name (English, French, Spanish, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Hungarian, Korean, Japanese, Hindi) |
| |
| Returns: |
| A tuple containing detailed analysis report and cleaned text ready for synthesis |
| """ |
| language_code = SUPPORTED_LANGUAGES[language] |
| |
| |
| standard_analysis = analyze_text(text, language_code) |
| |
| |
| tokenizer_analysis = analyze_with_tokenizer(text, language_code) |
| |
| |
| display_text = format_issues_for_display(standard_analysis, language_code, tokenizer_analysis) |
| |
| |
| cleaned_text = tokenizer_analysis.get('cleaned_text', "") |
| if not cleaned_text or cleaned_text == text: |
| cleaned_text = preprocess_text(text, language_code) if text else "" |
| |
| return display_text, cleaned_text |
|
|
| def cleanup_old_files(max_age_minutes=60): |
| """ |
| Optimized: deletes temporary files older than max_age_minutes. |
| This function can be called regularly to prevent accumulation of files. |
| """ |
| try: |
| now = time.time() |
| count_removed = 0 |
| |
| |
| for file in TEMP_DIR.glob("*"): |
| if file.is_file(): |
| file_age_minutes = (now - os.path.getmtime(file)) / 60 |
| if file_age_minutes > max_age_minutes: |
| os.remove(file) |
| count_removed += 1 |
| |
| |
| for file in OUTPUT_DIR.glob("*.mp3"): |
| if file.is_file(): |
| file_age_days = (now - os.path.getmtime(file)) / (24 * 60 * 60) |
| if file_age_days > 7: |
| os.remove(file) |
| count_removed += 1 |
| |
| |
| return count_removed |
| except Exception as e: |
| print(f"Error while cleaning up old files: {e}") |
| return 0 |
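| |
| # Sketch (assumption: periodic cleanup is desired; `threading` is imported above for this): |
| #   def _schedule_cleanup(interval_s=1800): |
| #       cleanup_old_files(max_age_minutes=60) |
| #       threading.Timer(interval_s, _schedule_cleanup).start() |
| #   _schedule_cleanup() |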
|
|
| |
| with gr.Blocks(theme=gr.themes.Ocean(), css=""" |
| .gradio-container { |
| max-width: 1280px !important; |
| margin: auto !important; |
| } |
| #header { |
| display: flex; |
| justify-content: center; |
| align-items: center; |
| padding: 10px 0; |
| } |
| """) as interface: |
| with gr.Row(elem_id="header"): |
| gr.Markdown( |
| """ |
| <div style="text-align: center;"> |
| <h1 style="margin: 0; font-size: 1.8rem;">🎙️ Voice Cloning Studio</h1> |
| <p style="margin: 0; font-size: 1rem;">Bring any voice to life from a 3-second audio sample.</p> |
| </div> |
| """ |
| ) |
| |
| |
| try: |
| files_paths = [str(f) for f in REF_AUDIO_DIR.iterdir() if f.is_file() and f.suffix.lower() in ['.wav', '.mp3']] |
| files_display = [os.path.basename(f) for f in files_paths] |
| file_path_mapping = dict(zip(files_display, files_paths)) |
| except Exception as e: |
| files_paths = [] |
| files_display = [] |
| file_path_mapping = {} |
|
|
| with gr.Row(equal_height=False): |
| |
| with gr.Column(scale=2): |
| with gr.Tabs(): |
| with gr.TabItem("1. Voice"): |
| gr.Markdown("### Select a Reference Voice") |
| gr.Markdown("Choose a pre-defined example or upload your own 3-10 second audio clip. For best results, use a clear, high-quality recording with no background noise.") |
| |
| example_audio_dropdown = gr.Dropdown( |
| choices=files_display, |
| label="Reference Audio (from examples)", |
| value=files_display[0] if files_display else None, |
| interactive=True |
| ) |
| |
| reference_audio_input = gr.Audio( |
| label="Reference Audio (upload your own)", |
| type="filepath" |
| ) |
|
|
| with gr.TabItem("2. Text & Language"): |
| gr.Markdown("### Enter Text and Select Language") |
| lang_dropdown = gr.Dropdown( |
| choices=list(SUPPORTED_LANGUAGES.keys()), |
| value="English", |
| label="Language" |
| ) |
| |
| text_input = gr.Textbox( |
| label="Text to Synthesize", |
| placeholder="Enter text here...", |
| lines=5, |
| value="Hello, I am a voice generated by artificial intelligence. How may I assist you today?" |
| ) |
| |
| with gr.Row(): |
| example_buttons = [] |
| example_langs_to_show = ["en", "fr", "es", "de", "zh-cn"] |
| for lang in example_langs_to_show: |
| if lang in EXAMPLE_TEXTS: |
| example_buttons.append(gr.Button(f"Example ({lang.upper()})")) |
|
|
| with gr.Accordion("Text Analysis & Preprocessing", open=True): |
| with gr.Row(): |
| analyze_button = gr.Button("Analyze Text") |
| enable_preprocessing = gr.Checkbox( |
| value=False, |
| label="Preprocess text automatically" |
| ) |
| text_analysis_output = gr.Textbox( |
| label="Text Analysis", |
| value="Click 'Analyze Text' to see results here.", |
| lines=6 |
| ) |
| preprocessed_text_output = gr.Textbox( |
| label="Preprocessed Text", |
| value="The processed text will appear here after analysis or generation.", |
| lines=3, |
| visible=True |
| ) |
| |
| with gr.TabItem("3. Settings"): |
| gr.Markdown("### Fine-Tune Your Audio") |
| gr.Markdown("Adjust these settings to control the style and quality of the generated speech.") |
| |
| with gr.Accordion("Generation Settings", open=True): |
| with gr.Row(): |
| with gr.Column(): |
| temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, step=0.05, value=0.75, label="Temperature") |
| speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.05, value=1.0, label="Speed") |
| do_sample = gr.Checkbox(value=True, label="Enable Sampling (do_sample)") |
| with gr.Column(): |
| repetition_penalty = gr.Slider(minimum=1.0, maximum=5.0, step=0.1, value=5.0, label="Repetition Penalty") |
| length_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty") |
| gpt_cond_len = gr.Slider(minimum=10, maximum=50, step=1, value=30, label="GPT Conditioning Length") |
| top_k = gr.Slider(minimum=0, maximum=50, step=1, value=50, label="Top-K") |
| top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, value=0.85, label="Top-P") |
| |
| with gr.Accordion("Text Splitting", open=False): |
| text_splitting_method = gr.Radio( |
| choices=["Native XTTS splitting", "Custom splitting", "No splitting"], |
| value="Native XTTS splitting", |
| label="Text Splitting Method" |
| ) |
| enable_text_splitting = gr.Checkbox( |
| value=True, |
| label="enable_text_splitting (XTTS parameter)", |
| visible=False |
| ) |
| max_chars_per_segment = gr.Slider( |
| minimum=50, |
| maximum=400, |
| step=10, |
| value=250, |
| label="Max characters per segment", |
| visible=False |
| ) |
|
|
| with gr.Accordion("Silence Removal", open=False): |
| remove_silence_enabled = gr.Checkbox(value=True, label="Remove silences from audio") |
| silence_threshold = gr.Slider(minimum=-60, maximum=-20, step=5, value=-45, label="Silence threshold (dB)") |
| min_silence_len = gr.Slider(minimum=300, maximum=1000, step=50, value=300, label="Minimum silence length (ms)") |
| keep_silence = gr.Slider(minimum=100, maximum=500, step=10, value=100, label="Silence to keep (ms)") |
|
|
| |
| with gr.Column(scale=1): |
| gr.Markdown("### 4. Generate & Listen") |
| gr.Markdown("Click the button to generate your audio. Results will appear below.") |
| generate_button = gr.Button("Generate Audio", variant="primary", scale=1) |
| output_audio = gr.Audio(label="Generated Audio") |
| output_message = gr.Textbox(label="Status & Tips", visible=True, lines=8) |
| |
| with gr.Accordion("User Guide, Disclaimer & API Info", open=False): |
| with gr.Tabs(): |
| with gr.TabItem("🎯 Quick Start Guide"): |
| gr.Markdown(""" |
| ## 🎯 Quick User Guide |
| 1. **Choose a reference voice**: In the **Voice** tab, select an example from the dropdown or upload your own clear audio file (3-10 seconds). |
| 2. **Enter your text**: In the **Text & Language** tab, type or paste the text you want to synthesize and select the correct language. |
| 3. **Generate**: Click the "Generate Audio" button. |
| 4. **Iterate**: If you're not happy with the result, try regenerating. Small changes to the settings in the **Settings** tab can produce different results. |
| |
| ### 🔍 Essential Tips |
| - **Reference Audio Quality**: The quality of the generated audio heavily depends on the reference. Use clean recordings with no background noise. |
| - **Text Preprocessing**: Enable "Preprocess text automatically" (it is off by default) to improve pronunciation of numbers, symbols, and URLs. Use the "Analyze Text" button to preview potential issues. |
| - **Optimizing Results**: For long texts, "Native XTTS splitting" is recommended. To change the speech style, try regenerating, adjusting the `Temperature`, or changing the `Speed`. |
| - **Languages**: Ensure the selected language matches the text. |
| """) |
| with gr.TabItem("⚠️ Disclaimer"): |
| gr.Markdown(""" |
| ## ⚠️ Disclaimer and Legal Notice |
| **By using this voice cloning application, you acknowledge and agree to the following:** |
| 1. This application is provided "as is" without any warranties of any kind, either express or implied. |
| 2. The creator(s) of this application accept no responsibility or liability for any misuse of the technology. |
| 3. You are solely responsible for obtaining proper consent when cloning someone else's voice. |
| 4. You agree not to use this technology for deceptive, harmful, or illegal purposes. |
| 5. Voice cloning results may vary in quality and accuracy; no specific results are guaranteed. |
| 6. You understand that voice cloning technology has ethical implications and agree to use it responsibly. |
| The technology is intended for legitimate creative, educational, and accessibility purposes only. |
| |
| --- |
| |
| ### License & Model Information |
| By accessing or using any feature within this space, you acknowledge and accept the terms of the following license: [https://coqui.ai/cpml](https://coqui.ai/cpml). |
| |
| **Model source:** [coqui/XTTS-v2](https://huggingface.co/coqui/XTTS-v2) |
| """) |
| with gr.TabItem("🔧 API Tools"): |
| gr.Markdown(f""" |
| ## 🛠️ Model Context Protocol (MCP) Tools |
| This application exposes MCP tools that you can use with LLMs. |
| |
| **MCP Endpoint:** `https://hasanbasbunar-voice-cloning-xtts-v2.hf.space/gradio_api/mcp/sse` |
| |
| --- |
| |
| ### 🎤 `voice_clone_synthesis` |
| Generates an audio file by cloning a voice from a reference audio file (provided via URL or a local example). |
| |
| **Parameters:** |
| - `text` (string, required): The text to synthesize. |
| - `reference_audio_url` (string, optional): A public URL for a reference audio file (WAV, MP3). **Provide this OR `example_audio_name`.** |
| - `example_audio_name` (string, optional): The name of a predefined example audio file. **Provide this OR `reference_audio_url`.** Available files are: {', '.join(files_display)}. |
| - `language` (string, optional): The language of the text. Default: "English". |
| - ... (and other advanced parameters, see the function's docstring for a full list). |
| |
| **Returns:** |
| - `string`: A URL to the generated MP3 audio file. |
| |
| --- |
| |
| ### 📊 `analyze_text_for_speech` |
| Analyzes text for potential pronunciation issues. |
| |
| **Parameters:** |
| - `text` (string, required): The text to analyze. |
| - `language` (string, required): The language of the text. |
| |
| **Returns:** |
| - `object`: A JSON object with the detected issues. |
| |
| --- |
| |
| ### 🔧 `preprocess_text_for_speech` |
| Cleans and preprocesses text for optimal speech synthesis. |
| |
| **Parameters:** |
| - `text` (string, required): The text to preprocess. |
| - `language` (string, required): The language of the text. |
| |
| **Returns:** |
| - `string`: The cleaned text. |
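| |
| --- |
| |
| ### 🐍 Example: calling the synthesis tool from Python |
| A minimal sketch using the `gradio_client` package. It assumes keyword arguments matching the parameter names above and that omitted options fall back to the UI defaults; the example file name is hypothetical, so substitute one from the list above. |
| |
| ```python |
| from gradio_client import Client |
| |
| client = Client("hasanbasbunar/voice-cloning-xtts-v2") |
| result = client.predict( |
| text="Hello from the API!", |
| example_audio_name="voice.wav",  # hypothetical; use a file from the list above |
| language="English", |
| api_name="/voice_clone_synthesis", |
| ) |
| print(result)  # URL to the generated MP3 file |
| ``` |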
| """) |
|
|
| |
| # Wire each example button. example_buttons only contains buttons for languages |
| # present in EXAMPLE_TEXTS, so zip against the same filtered list; indexing with |
| # enumerate over the full list would misalign if any language were missing. |
| available_langs = [code for code in example_langs_to_show if code in EXAMPLE_TEXTS] |
| for button, lang_code in zip(example_buttons, available_langs): |
| lang_name = next((k for k, v in SUPPORTED_LANGUAGES.items() if v == lang_code), None) |
| if lang_name: |
| # Bind the current text/language via default arguments to avoid late binding |
| button.click( |
| lambda t=EXAMPLE_TEXTS[lang_code], l=lang_name: (t, l), |
| inputs=None, |
| outputs=[text_input, lang_dropdown], |
| api_name=False |
| ) |
|
|
| |
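| # Live preview: re-run analysis and preprocessing whenever the text or language changes |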
| def analyze_input_text(text, language): |
| language_code = SUPPORTED_LANGUAGES[language] |
| analysis = analyze_text(text, language_code) |
| display_text = format_issues_for_display(analysis, language_code) |
| |
| |
| preprocessed = preprocess_text(text, language_code) if text else "" |
| |
| return display_text, preprocessed |
|
|
| |
| text_input.change( |
| analyze_input_text, |
| inputs=[text_input, lang_dropdown], |
| outputs=[text_analysis_output, preprocessed_text_output], |
| api_name=False |
| ) |
| |
| lang_dropdown.change( |
| analyze_input_text, |
| inputs=[text_input, lang_dropdown], |
| outputs=[text_analysis_output, preprocessed_text_output], |
| api_name=False |
| ) |
| |
| analyze_button.click( |
| combined_analysis, |
| inputs=[text_input, lang_dropdown], |
| outputs=[text_analysis_output, preprocessed_text_output], |
| api_name=False |
| ) |
| |
| |
| def validate_audio_file(file_path, max_size_mb=20, min_duration_sec=1, max_duration_sec=60): |
| """ |
| Validate an audio file's existence, format, size, and duration before use as a reference. |
| |
| Args: |
| file_path (str): Path to the audio file |
| max_size_mb (int): Maximum file size in MB |
| min_duration_sec (float): Minimum duration in seconds |
| max_duration_sec (float): Maximum duration in seconds |
| |
| Returns: |
| tuple: (is_valid, error_message) |
| """ |
| |
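| # The file must exist on disk |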
| if not os.path.exists(file_path): |
| return False, "Error: File does not exist" |
| |
| |
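| # Only MP3 and WAV formats are accepted |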
| file_ext = os.path.splitext(file_path)[1].lower() |
| if file_ext not in ['.mp3', '.wav']: |
| return False, f"Error: Invalid file format {file_ext}. Only MP3 and WAV files are supported." |
| |
| |
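| # Enforce the size limit before decoding the audio |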
| file_size_mb = os.path.getsize(file_path) / (1024 * 1024) |
| if file_size_mb > max_size_mb: |
| return False, f"Error: File size ({file_size_mb:.1f} MB) exceeds the maximum allowed size ({max_size_mb} MB)" |
| |
| try: |
| |
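| # Decode with pydub; len(audio) is in milliseconds |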
| if file_ext == '.mp3': |
| audio = AudioSegment.from_mp3(file_path) |
| else: |
| audio = AudioSegment.from_wav(file_path) |
| |
| duration_sec = len(audio) / 1000 |
| |
| if duration_sec < min_duration_sec: |
| return False, f"Error: Audio duration ({duration_sec:.1f} sec) is too short (min: {min_duration_sec} sec)" |
| |
| if duration_sec > max_duration_sec: |
| return False, f"Error: Audio duration ({duration_sec:.1f} sec) is too long (max: {max_duration_sec} sec)" |
| |
| |
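| # Flag very quiet references: they tend to produce poor cloning results |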
| if audio.dBFS < -50: |
| return True, "Warning: Audio is very quiet, which may result in poor voice cloning quality" |
| |
| return True, None |
| |
| except Exception as e: |
| return False, f"Error: Failed to process audio file - {str(e)}" |
|
|
| def handle_synthesis_request( |
| text, language, temperature, speed, reference_audio, example_audio_name, |
| do_sample, enable_text_splitting, repetition_penalty, length_penalty, |
| gpt_cond_len, top_k, top_p, remove_silence_enabled, silence_threshold, |
| min_silence_len, keep_silence, text_splitting_method, max_chars_per_segment, |
| enable_preprocessing |
| ): |
| """ |
| Gradio callback to handle the "Generate Audio" button click. |
| |
| This function orchestrates the synthesis process by: |
| 1. Selecting and validating the reference audio. |
| 2. Calling the main `synthesize_speech` function. |
| 3. Formatting the output (audio and messages) for the Gradio interface. |
| """ |
| language_code = SUPPORTED_LANGUAGES[language] |
| |
| |
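| # Coerce slider values to plain floats for the model call |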
| repetition_penalty = float(repetition_penalty) |
| length_penalty = float(length_penalty) |
| |
| |
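| # Prefer an uploaded reference; otherwise fall back to the selected example |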
| final_reference_audio = reference_audio |
| if not final_reference_audio and example_audio_name: |
| final_reference_audio = file_path_mapping.get(example_audio_name) |
| |
| |
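| # Validate the reference early (hard failures abort; the quiet-audio warning is not surfaced here) |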
| if final_reference_audio: |
| is_valid, error_message = validate_audio_file(final_reference_audio) |
| if not is_valid: |
| return None, error_message, "" |
| |
| |
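| # Run the full synthesis pipeline |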
| audio_path, error_message, preprocessed_text = synthesize_speech( |
| text=text, |
| language=language_code, |
| temperature=temperature, |
| speed=speed, |
| reference_audio=final_reference_audio, |
| do_sample=do_sample, |
| repetition_penalty=repetition_penalty, |
| length_penalty=length_penalty, |
| gpt_cond_len=gpt_cond_len, |
| top_k=top_k, |
| top_p=top_p, |
| remove_silence_enabled=remove_silence_enabled, |
| silence_threshold=silence_threshold, |
| min_silence_len=min_silence_len, |
| keep_silence=keep_silence, |
| text_splitting_method=text_splitting_method, |
| max_chars_per_segment=max_chars_per_segment, |
| enable_preprocessing=enable_preprocessing |
| ) |
| |
| if error_message: |
| return None, error_message, preprocessed_text |
| |
| success_message = f""" |
| ✅ Audio generation successful! |
| |
| 💾 Use the download button to save the audio. |
| |
| 🔄 If you're not satisfied with the result (e.g., pronunciation, intonation, or pace), feel free to click "Generate Audio" again. |
| |
| ℹ️ The generation process includes randomness controlled by the temperature parameter ({temperature:.2f}), so each output is unique. |
| |
| 🎤 For different results, try another voice from the "Reference Audio (from examples)" dropdown or upload your own. |
| |
| ⚙️ If the result is still not satisfactory after several attempts, adjust the parameters in the "Generation Settings" accordion under the Settings tab. |
| """ |
| |
| return audio_path, success_message, preprocessed_text |
|
|
| generate_button.click( |
| handle_synthesis_request, |
| inputs=[ |
| text_input, lang_dropdown, temperature_slider, speed_slider, |
| reference_audio_input, example_audio_dropdown, do_sample, |
| enable_text_splitting, repetition_penalty, length_penalty, |
| gpt_cond_len, top_k, top_p, remove_silence_enabled, |
| silence_threshold, min_silence_len, keep_silence, |
| text_splitting_method, max_chars_per_segment, enable_preprocessing |
| ], |
| outputs=[output_audio, output_message, preprocessed_text_output], |
| api_name=False |
| ) |
|
|
| |
| def update_text_splitting_options(method): |
| # Native splitting drives the hidden XTTS enable_text_splitting flag; |
| # custom splitting shows the max-characters slider instead. |
| is_native = method == "Native XTTS splitting" |
| is_custom = method == "Custom splitting" |
| return gr.update(value=is_native), gr.update(visible=is_custom) |
| |
| |
| text_splitting_method.change( |
| update_text_splitting_options, |
| inputs=[text_splitting_method], |
| outputs=[enable_text_splitting, max_chars_per_segment], |
| api_name=False |
| ) |
|
|
| |
| with gr.Tab("API Endpoints", visible=False): |
| |
| with gr.Row(): |
| api_synth_text = gr.Textbox(label="Text") |
| api_synth_ref_url = gr.Textbox(label="Reference Audio URL") |
| api_synth_example_name = gr.Dropdown(files_display, label="Example Audio Name") |
| api_synth_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") |
| api_synth_temp = gr.Slider(minimum=0.1, maximum=1.5, value=0.75, label="Temperature") |
| api_synth_speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speed") |
| api_synth_do_sample = gr.Checkbox(value=True, label="Do Sample") |
| api_synth_rep_penalty = gr.Slider(minimum=1.0, maximum=5.0, value=5.0, label="Repetition Penalty") |
| api_synth_len_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.0, label="Length Penalty") |
| api_synth_gpt_cond_len = gr.Slider(minimum=10, maximum=50, value=30, label="GPT Cond Length") |
| api_synth_top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K") |
| api_synth_top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.85, label="Top P") |
| api_synth_remove_silence = gr.Checkbox(value=True, label="Remove Silence") |
| api_synth_silence_thresh = gr.Slider(minimum=-60, maximum=-20, value=-45, label="Silence Threshold") |
| api_synth_min_silence_len = gr.Slider(minimum=300, maximum=1000, value=300, label="Min Silence Length") |
| api_synth_keep_silence = gr.Slider(minimum=100, maximum=500, value=100, label="Keep Silence") |
| api_synth_split_method = gr.Radio(choices=["Native XTTS splitting", "Custom splitting", "No splitting"], value="Native XTTS splitting", label="Splitting Method") |
| api_synth_max_chars = gr.Slider(minimum=50, maximum=400, value=250, label="Max Chars") |
| api_synth_preprocess = gr.Checkbox(value=False, label="Enable Preprocessing") |
| |
| api_synth_output_audio = gr.Audio(label="Generated Audio") |
| api_synth_trigger = gr.Button("Synthesize_API") |
|
|
| |
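| # Components backing the analyze_text_for_speech endpoint |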
| with gr.Row(): |
| api_analyze_text = gr.Textbox(label="Text") |
| api_analyze_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") |
| api_analyze_output = gr.JSON(label="Analysis Result") |
| api_analyze_trigger = gr.Button("Analyze_API") |
|
|
| |
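| # Components backing the preprocess_text_for_speech endpoint |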
| with gr.Row(): |
| api_preprocess_text = gr.Textbox(label="Text") |
| api_preprocess_lang = gr.Dropdown(list(SUPPORTED_LANGUAGES.keys()), label="Language", value="English") |
| api_preprocess_output = gr.Textbox(label="Preprocessed Text") |
| api_preprocess_trigger = gr.Button("Preprocess_API") |
|
|
| |
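| # Register the three functions as named API endpoints (exposed as MCP tools) |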
| api_synth_trigger.click( |
| fn=voice_clone_synthesis, |
| inputs=[ |
| api_synth_text, api_synth_ref_url, api_synth_example_name, api_synth_lang, api_synth_temp, |
| api_synth_speed, api_synth_do_sample, api_synth_rep_penalty, |
| api_synth_len_penalty, api_synth_gpt_cond_len, api_synth_top_k, |
| api_synth_top_p, api_synth_remove_silence, api_synth_silence_thresh, |
| api_synth_min_silence_len, api_synth_keep_silence, api_synth_split_method, |
| api_synth_max_chars, api_synth_preprocess |
| ], |
| outputs=[api_synth_output_audio], |
| api_name="voice_clone_synthesis" |
| ) |
| api_analyze_trigger.click( |
| fn=analyze_text_for_speech, |
| inputs=[api_analyze_text, api_analyze_lang], |
| outputs=[api_analyze_output], |
| api_name="analyze_text_for_speech" |
| ) |
| api_preprocess_trigger.click( |
| fn=preprocess_text_for_speech, |
| inputs=[api_preprocess_text, api_preprocess_lang], |
| outputs=[api_preprocess_output], |
| api_name="preprocess_text_for_speech" |
| ) |
|
|
| if __name__ == "__main__": |
| |
| |
| def periodic_cleanup(): |
| """Run cleanup task periodically in background""" |
| while True: |
| try: |
| |
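| # Sleep for one hour between cleanup sweeps |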
| time.sleep(60 * 60) |
| |
| files_removed = cleanup_old_files(max_age_minutes=60) |
| if files_removed: |
| print(f"Background cleanup removed {files_removed} file(s)") |
| except Exception as e: |
| print(f"Error in background cleanup task: {e}") |
| |
| |
| cleanup_thread = threading.Thread(target=periodic_cleanup, daemon=True) |
| cleanup_thread.start() |
| |
| |
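| # Queue requests and launch; allowed_paths lets Gradio serve the bundled reference audio |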
| interface.queue() |
| interface.launch(share=False, allowed_paths=[str(REF_AUDIO_DIR)]) |