# Hugging Face Spaces status at time of capture: Build error (fixed below)
# Standard library
import json
import os
import tempfile
from zipfile import ZipFile

# Third-party
import gradio as gr
from huggingface_hub import hf_hub_download
from TTS.api import TTS
from TTS.utils.synthesizer import Synthesizer
# Model registry.
# Each entry: [display name, checkpoint file, config file, HF repo id, optional speakers file].
# BUG FIX: the original read `MODEL_INFO = MODEL_INFO = [...]` (accidental
# double assignment); the long commented-out legacy registry was removed.
MODEL_INFO = [
    ["VITS Male Single Speaker", "checkpoint_61000.pth", "config.json", "saillab/ZabanZad_VITS_MAle"],
    ["VITS Female Single speaker", "best_model_15397.pth", "config.json", "saillab/ZabanZad_VITS_Female", "speakers1.pth"],
]

# Display names shown in the UI, in registry order.
MODEL_NAMES = [info[0] for info in MODEL_INFO]

# Inputs longer than this many characters are truncated before synthesis.
MAX_TXT_LEN = 400

# Hugging Face token for private repos; None when the env var is unset.
TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')

# Per-model download paths, keyed by display name.
model_files = {}
config_files = {}
speaker_files = {}

# Ready-to-use Synthesizer objects for each model, keyed by display name.
synthesizers = {}
def update_config_speakers_file_recursive(config_dict, speakers_path):
    """Point every nested ``speakers_file`` entry in *config_dict* at *speakers_path*.

    Mutates *config_dict* in place; nested dictionaries are visited recursively.
    """
    if "speakers_file" in config_dict:
        config_dict["speakers_file"] = speakers_path
    # Descend into any sub-dictionaries (e.g. model_args sections).
    for child in config_dict.values():
        if isinstance(child, dict):
            update_config_speakers_file_recursive(child, speakers_path)
def update_config_speakers_file(config_path, speakers_path):
    """Rewrite the JSON config at *config_path* so every nested ``speakers_file``
    entry points to *speakers_path*, saving the result back in place."""
    with open(config_path, 'r') as f:
        config = json.load(f)

    def _patch(node):
        # Walk nested dicts, rewriting each "speakers_file" entry in place.
        if "speakers_file" in node:
            node["speakers_file"] = speakers_path
        for child in node.values():
            if isinstance(child, dict):
                _patch(child)

    _patch(config)

    with open(config_path, 'w') as f:
        json.dump(config, f, indent=4)
# Download each model's assets from the Hub and build its Synthesizer.
# FIX: the original duplicated the whole Synthesizer(...) construction across
# an `if speaker_file:` / `elif speaker_file is None:` pair; a single call with
# `speaker_files.get(model_name, "")` is equivalent, since "" is Synthesizer's
# default for tts_speakers_file when no speakers file is supplied.
for info in MODEL_INFO:
    model_name, model_file, config_file, repo_name = info[:4]
    # Optional fifth entry: a speakers.pth file for multi-speaker models.
    speaker_file = info[4] if len(info) > 4 else None

    print(f"|> Downloading: {model_name}")
    model_files[model_name] = hf_hub_download(repo_id=repo_name, filename=model_file, use_auth_token=TOKEN)
    config_files[model_name] = hf_hub_download(repo_id=repo_name, filename=config_file, use_auth_token=TOKEN)

    if speaker_file:
        speaker_files[model_name] = hf_hub_download(repo_id=repo_name, filename=speaker_file, use_auth_token=TOKEN)
        # The downloaded config still references the repo's original speakers
        # path; rewrite it to the local download location.
        update_config_speakers_file(config_files[model_name], speaker_files[model_name])
        print(speaker_files[model_name])

    synthesizers[model_name] = Synthesizer(
        tts_checkpoint=model_files[model_name],
        tts_config_path=config_files[model_name],
        tts_speakers_file=speaker_files.get(model_name, ""),
        use_cuda=False,  # CPU-only Space
    )
def synthesize(text: str, model_name: str, speaker_name=None) -> str:
    """Synthesize *text* with the selected model and return the path of a temp WAV.

    Args:
        text: Input text; silently truncated to MAX_TXT_LEN characters.
        model_name: Key into the module-level ``synthesizers`` dict.
        speaker_name: Speaker id for multi-speaker models; None or "" selects
            the model's default speaker.

    Returns:
        Filesystem path of the generated .wav file.

    Raises:
        NameError: If *model_name* has no registered synthesizer.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cut off as it exceeded the {MAX_TXT_LEN} character limit.")

    # BUG FIX: `synthesizers[model_name]` raised KeyError before the None guard
    # could ever fire; .get() makes the NameError branch reachable.
    synthesizer = synthesizers.get(model_name)
    if synthesizer is None:
        raise NameError("Model not found")

    # BUG FIX: the original compared strings with `is ""` / `is not ""`, which
    # tests object identity, not equality (SyntaxWarning on CPython >= 3.8 and
    # unreliable in general). An empty tts_speakers_file means single-speaker.
    if not synthesizer.tts_speakers_file:
        wavs = synthesizer.tts(text)
    else:
        # Empty-string speaker selection falls back to the default (None),
        # matching the original's explicit "" -> None handling.
        wavs = synthesizer.tts(text, speaker_name=speaker_name or None)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name
def update_options(model_name):
    """Return speaker-dropdown choices for *model_name* ([] for single-speaker models)."""
    synthesizer = synthesizers[model_name]
    # BUG FIX: the original used `model_name is MODEL_NAMES[1]` — an identity
    # check that can be False for equal strings; compare with == instead.
    # NOTE(review): this hard-codes "registry entry 1 is the multi-speaker
    # model"; checking the synthesizer's speaker manager directly would be more
    # robust, but is kept as-is to preserve behavior.
    if model_name == MODEL_NAMES[1]:
        return synthesizer.tts_model.speaker_manager.speaker_names
    # Single-speaker model: no speaker options.
    return []
# Footer links for the Gradio "article" section (currently empty).
links = """
"""

# Build and launch the Gradio UI.
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Enter Text to Synthesize:", value="زین همرهان سست عناصر، دلم گرفت."),
        gr.Radio(label="Pick a Model", choices=MODEL_NAMES, value=MODEL_NAMES[0], type="value"),
        # BUG FIX: gr.Dropdown has no `default` keyword in Gradio 3.x (it was
        # renamed to `value`, as already used by the Radio above); passing
        # `default=` raises TypeError at startup and breaks the Space build.
        gr.Dropdown(label="Select Speaker", choices=update_options(MODEL_NAMES[1]), type="value", value=None),
    ],
    outputs=gr.Audio(label="Output", type='filepath'),
    # Examples include a speaker name so multi-speaker models also work.
    examples=[["زبان فارسی یکی از زبان های زنده و ارزشمند دنیاست که حدود ده درصد محتوای اینترنتی کل جهان به زبان فارسی است.", MODEL_NAMES[0], "speaker-0"],
              ["رسول پرویزی خودش را نقال میداند. با نقل قصههایی به شیرینی قند و شکر و البته طنز تلخی که در گوشهگوشه کتاب جای دارد. او از مردم شیراز حرف میزند. طبقهای ساده و بیآلایش. آنها در دورانی زندگی میکردند که کمتر کسی باسواد بود.", MODEL_NAMES[0], "speaker-0"]],
    title="VITS ZabanZad 😎",
    description="""
This demo is currently running **VITS** support Persian Language.
**VITS** is a text-to-speech model that translates written text into natural-sounding, human-like speech. It uses a combination of variational autoencoders and generative adversarial networks to create voice outputs that are realistic and can be customized for various applications, ranging from virtual assistants to audiobook narration.
This is the same model that powers our creator application [SAIL LAB UNH](https://github.com/UNHSAILLab/Persian-TTS).
Leave a star 🌟 on Github if you use and like our model!
Stand with us as we strive not only to shape the future of Persian Text-to-Speech technologies but also to create a more inclusive and accessible space for Persian speakers worldwide [Gofundme](https://www.gofundme.com/f/zabanzad-Persian-TTS).
""",
    article=links,
    live=False,
)
iface.launch()