"""Gradio app: upload a .docx file, pick a TTS voice, get a WAV voiceover."""

import os

# Kept ahead of the remaining imports, as in the original file, so the
# variable is already in the environment when TTS and its dependencies load.
# NOTE(review): presumably meant to stop numba from using its on-disk cache;
# confirm that numba actually honours this variable name.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import tempfile

import gradio as gr
from docx import Document
from TTS.api import TTS

# Available TTS models, keyed by the label shown in the voice dropdown.
VOICE_MODELS = {
    "Jenny (Expressive Female)": {
        "model_name": "tts_models/en/jenny/jenny",
        "multi_speaker": False,
    },
    "LJSpeech (Standard Female)": {
        "model_name": "tts_models/en/ljspeech/vits",
        "multi_speaker": False,
    },
    "VCTK (Multiple Speakers)": {
        "model_name": "tts_models/en/vctk/vits",
        "multi_speaker": True,
    },
}

# Voice preselected in the UI; must be one of the VOICE_MODELS keys.
DEFAULT_VOICE = "Jenny (Expressive Female)"

# Loaded TTS instances keyed by VOICE_MODELS label, so each model is
# constructed at most once per process.
MODEL_CACHE = {}


def load_tts_model(model_key):
    """Return the TTS instance for *model_key*, constructing it on first use.

    Args:
        model_key: A key of ``VOICE_MODELS``.

    Returns:
        The cached ``TTS`` object for that voice.

    Raises:
        KeyError: If *model_key* is not in ``VOICE_MODELS``.
    """
    tts = MODEL_CACHE.get(model_key)
    if tts is None:
        info = VOICE_MODELS[model_key]
        tts = TTS(model_name=info["model_name"], progress_bar=False, gpu=False)
        MODEL_CACHE[model_key] = tts
    return tts


def extract_speakers(model_key):
    """Return the speaker names for *model_key*, or ``[]`` if it has none.

    Voices not flagged ``multi_speaker`` return an empty list without
    loading the model.
    """
    if not VOICE_MODELS[model_key]["multi_speaker"]:
        return []
    # Guard against a model that reports no speaker list at all.
    return list(load_tts_model(model_key).speakers or [])


def _resolve_upload_path(doc_file):
    """Return the filesystem path for an uploaded file value.

    Accepts either an object exposing the path as ``.name`` (what the
    original code relied on) or a plain path string.
    """
    return getattr(doc_file, "name", doc_file)


def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
    """Synthesize the text of a .docx file to a WAV file.

    Only non-blank paragraphs from ``document.paragraphs`` are read; they
    are joined with newlines and passed to the model in a single call.

    Args:
        doc_file: The uploaded file, as a path string or an object with a
            ``.name`` path attribute.
        selected_voice: A key of ``VOICE_MODELS``.
        selected_speaker: Speaker name for multi-speaker voices. If empty,
            the first available speaker is used. Ignored for
            single-speaker voices.

    Returns:
        Path of the generated WAV file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If no file was uploaded, the document has no text, or a
            multi-speaker voice offers no speakers.
        KeyError: If *selected_voice* is not in ``VOICE_MODELS``.
    """
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    info = VOICE_MODELS[selected_voice]
    tts = load_tts_model(selected_voice)

    document = Document(_resolve_upload_path(doc_file))
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The uploaded document contains no text to read.")

    kwargs = {}
    if info["multi_speaker"]:
        speaker = selected_speaker
        if not speaker:
            # The dropdown may not have been touched yet; fall back to the
            # first speaker instead of passing None to the model.
            available = extract_speakers(selected_voice)
            if not available:
                raise gr.Error("No speakers are available for this voice.")
            speaker = available[0]
        kwargs["speaker"] = speaker

    # Reserve a unique path and close the handle before the model writes to
    # it, so the write does not compete with our own open file descriptor.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    return wav_path


def update_speaker_dropdown(voice_selection):
    """Return a dropdown update listing the speakers of *voice_selection*.

    The dropdown is hidden when the voice has no speakers; otherwise the
    first speaker is preselected.
    """
    speakers = extract_speakers(voice_selection)
    # gr.update is the component-agnostic form of the update call.
    return gr.update(
        choices=speakers,
        visible=bool(speakers),
        value=speakers[0] if speakers else None,
    )


with gr.Blocks() as interface:
    gr.Markdown(
        "# Realistic Voiceover from DOCX\n"
        "Upload a .docx and choose a voice to generate a WAV audio."
    )

    with gr.Row():
        # `type` is left at the library default; docx_to_wav accepts both a
        # path string and a file object exposing `.name`.
        docx_input = gr.File(label="Upload .docx File")
        voice_dropdown = gr.Dropdown(
            choices=list(VOICE_MODELS),
            value=DEFAULT_VOICE,
            label="Voice",
        )

    speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
    generate_button = gr.Button("Generate Speech")
    audio_output = gr.Audio(label="Download WAV", type="filepath")

    voice_dropdown.change(
        fn=update_speaker_dropdown,
        inputs=voice_dropdown,
        outputs=speaker_dropdown,
    )
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output,
    )

if __name__ == "__main__":
    interface.launch()