"""Gradio app that reads a .docx file aloud with a selectable Coqui TTS voice."""

import os
import tempfile

# Set before the TTS import below so the variable is already in the
# environment when that package (and whatever it pulls in) is loaded.
os.environ["NUMBA_DISABLE_CACHE"] = "1"

import gradio as gr  # noqa: E402
from docx import Document  # noqa: E402
from TTS.api import TTS  # noqa: E402

# Voices offered in the UI, keyed by the label shown in the dropdown.
# "multi_speaker" marks models that need a speaker id at synthesis time.
VOICE_MODELS = {
    "Jenny (Expressive Female)": {
        "model_name": "tts_models/en/jenny/jenny",
        "multi_speaker": False,
    },
    "LJSpeech (Standard Female)": {
        "model_name": "tts_models/en/ljspeech/vits",
        "multi_speaker": False,
    },
    "VCTK (Multiple Speakers)": {
        "model_name": "tts_models/en/vctk/vits",
        "multi_speaker": True,
    },
}

# Already-constructed TTS objects, keyed by the same labels as VOICE_MODELS,
# so each model is only built once per process.
MODEL_CACHE = {}


def load_tts_model(model_key):
    """Return the TTS object for ``model_key``, building it on first use.

    Args:
        model_key: A key of ``VOICE_MODELS``.

    Returns:
        The cached ``TTS`` instance for that voice (CPU only, no progress bar).

    Raises:
        KeyError: If ``model_key`` is not a key of ``VOICE_MODELS``.
    """
    if model_key in MODEL_CACHE:
        return MODEL_CACHE[model_key]

    info = VOICE_MODELS[model_key]
    tts = TTS(model_name=info["model_name"], progress_bar=False, gpu=False)
    MODEL_CACHE[model_key] = tts
    return tts


def extract_speakers(model_key):
    """Return the list of speaker ids selectable for ``model_key``.

    Single-speaker voices yield an empty list. The VCTK model uses a fixed
    short list so the model does not have to be loaded just to fill the
    dropdown; any other multi-speaker model is loaded and asked for its
    ``speakers`` attribute.
    """
    info = VOICE_MODELS[model_key]
    if not info["multi_speaker"]:
        return []

    if info["model_name"] == "tts_models/en/vctk/vits":
        return [
            "p225", "p226", "p227", "p228", "p229",
            "p230", "p231", "p232", "p233", "p234",
        ]

    tts = load_tts_model(model_key)
    # The attribute may be missing or None; always hand back a list.
    return list(getattr(tts, "speakers", None) or [])


def docx_to_wav(doc_file, selected_voice, selected_speaker=None):
    """Synthesize the text of an uploaded .docx file into a WAV file.

    Args:
        doc_file: The value delivered by the ``gr.File`` input: either a path
            string or an object exposing the path as ``.name``.
        selected_voice: A key of ``VOICE_MODELS``.
        selected_speaker: Speaker id; only used for multi-speaker voices.

    Returns:
        Path of the generated WAV file. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If no file was uploaded, a multi-speaker voice has no
            speaker selected, or the document has no non-blank paragraphs.
    """
    # Validate everything cheap first so a bad request never triggers a
    # model load.
    if doc_file is None:
        raise gr.Error("Please upload a .docx file first.")

    info = VOICE_MODELS[selected_voice]
    if info["multi_speaker"] and not selected_speaker:
        raise gr.Error("Please load and select a speaker for this voice.")

    # Accept both a plain path string and a file-like wrapper with `.name`.
    doc_path = getattr(doc_file, "name", doc_file)
    document = Document(doc_path)
    full_text = "\n".join(
        para.text for para in document.paragraphs if para.text.strip()
    )
    if not full_text:
        raise gr.Error("The document does not contain any text to read.")

    tts = load_tts_model(selected_voice)

    # Only the unique name is needed; the handle is closed immediately so
    # the TTS library can write to that path itself.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        wav_path = tmp_wav.name

    kwargs = {}
    if info["multi_speaker"]:
        kwargs["speaker"] = selected_speaker

    tts.tts_to_file(text=full_text, file_path=wav_path, **kwargs)
    return wav_path


def show_load_button(voice_selection):
    """Update the controls after the voice dropdown changes.

    Returns updates for, in order: the "Load Speakers" button (shown only for
    multi-speaker voices), the speaker dropdown (hidden until speakers are
    loaded), and the "Generate Speech" button (always visible; enabled right
    away for single-speaker voices, disabled for multi-speaker voices until
    a speaker list has been loaded).
    """
    multi_speaker = VOICE_MODELS[voice_selection]["multi_speaker"]
    return (
        gr.update(visible=multi_speaker),
        gr.update(visible=False),
        gr.update(visible=True, interactive=not multi_speaker),
    )


def load_and_show_speakers(voice_selection):
    """Fill and reveal the speaker dropdown for the selected voice.

    Returns updates for the speaker dropdown and the "Generate Speech"
    button. The button is only enabled when at least one speaker exists.
    """
    speakers = extract_speakers(voice_selection)
    if not speakers:
        return (
            gr.update(choices=[], value=None, visible=True),
            gr.update(visible=True, interactive=False),
        )
    return (
        gr.update(choices=speakers, visible=True, value=speakers[0]),
        gr.update(visible=True, interactive=True),
    )


with gr.Blocks() as interface:
    gr.Markdown(
        "# 🎤 Realistic Voiceover from DOCX\n"
        "Upload a `.docx` file, select a voice, and generate lifelike speech!"
    )

    # NOTE(review): the original indentation was lost, so which components
    # sat inside this Row is reconstructed - confirm the intended layout.
    with gr.Row():
        docx_input = gr.File(label="Upload .docx File", type="filepath")
        voice_dropdown = gr.Dropdown(
            choices=list(VOICE_MODELS.keys()),
            value="Jenny (Expressive Female)",
            label="Voice",
        )

    load_speakers_btn = gr.Button("🔍 Load Speakers", visible=False)
    speaker_dropdown = gr.Dropdown(choices=[], label="Speaker", visible=False)
    generate_button = gr.Button("🎧 Generate Speech", interactive=True)
    audio_output = gr.Audio(label="🔊 Download WAV", type="filepath")

    # Interactions
    voice_dropdown.change(
        fn=show_load_button,
        inputs=voice_dropdown,
        outputs=[load_speakers_btn, speaker_dropdown, generate_button],
    )
    load_speakers_btn.click(
        fn=load_and_show_speakers,
        inputs=voice_dropdown,
        outputs=[speaker_dropdown, generate_button],
    )
    generate_button.click(
        fn=docx_to_wav,
        inputs=[docx_input, voice_dropdown, speaker_dropdown],
        outputs=audio_output,
    )

if __name__ == "__main__":
    interface.launch()