Spaces:
Paused
Paused
| import os | |
| import gradio as gr | |
| from openai import OpenAI | |
| from playdiffusion import PlayDiffusion, InpaintInput, TTSInput, RVCInput | |
# Single PlayDiffusion instance, created at import time and shared by the
# inpaint, text-to-speech and voice-conversion handlers defined in this file.
inpainter = PlayDiffusion()
# Lazily created OpenAI client; stays None until the first ASR request.
_whisper_client = None


def get_whisper_client():
    """Return the shared OpenAI client, constructing it on first use.

    The API key is read from the ``OPENAI_API_KEY`` environment variable when
    the client is first built; every later call returns that same instance.
    """
    global _whisper_client
    if _whisper_client is not None:
        return _whisper_client
    _whisper_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return _whisper_client
def run_asr(audio):
    """Transcribe an audio file and return its text plus per-word timings.

    Args:
        audio: Path of the audio file to transcribe; it is opened in binary
            mode and uploaded to the transcription endpoint.

    Returns:
        A 3-tuple ``(text, text, word_times)``. The transcript text is
        returned twice because the UI wires the first two outputs to two
        separate text boxes. ``word_times`` is a list of dicts with the keys
        ``"word"``, ``"start"`` and ``"end"``.
    """
    whisper_client = get_whisper_client()
    # The context manager closes the file handle even when the API call
    # raises; the previous bare open() leaked one handle per request.
    with open(audio, "rb") as audio_file:
        transcript = whisper_client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="verbose_json",
            timestamp_granularities=["word"],
        )
    # NOTE(review): the SDK types ``words`` as optional; treat a missing list
    # as "no words" instead of failing on iteration -- confirm against the
    # installed openai version.
    word_times = [
        {"word": entry.word, "start": entry.start, "end": entry.end}
        for entry in (transcript.words or [])
    ]
    return transcript.text, transcript.text, word_times
def run_inpainter(input_text, output_text, word_times, audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
    """Run the shared inpainter on an audio clip and return its result.

    All sampling arguments are forwarded unchanged to ``InpaintInput``. The
    manual ``audio_token_syllable_ratio`` is only honoured when
    ``use_manual_ratio`` is truthy; otherwise ``None`` is passed in its place.

    Returns:
        Whatever ``inpainter.inpaint`` returns.
    """
    ratio = audio_token_syllable_ratio if use_manual_ratio else None
    request = InpaintInput(
        input_text=input_text,
        output_text=output_text,
        input_word_times=word_times,
        audio=audio,
        num_steps=num_steps,
        init_temp=init_temp,
        init_diversity=init_diversity,
        guidance=guidance,
        rescale=rescale,
        topk=topk,
        audio_token_syllable_ratio=ratio,
    )
    return inpainter.inpaint(request)
def run_inpainter_tts(input_text, voice_audio, num_steps, init_temp, init_diversity, guidance, rescale, topk, use_manual_ratio, audio_token_syllable_ratio):
    """Synthesize speech for ``input_text`` using ``voice_audio`` as the voice.

    All sampling arguments are forwarded unchanged to ``TTSInput``. The
    manual ``audio_token_syllable_ratio`` is only honoured when
    ``use_manual_ratio`` is truthy; otherwise ``None`` is passed in its place.

    Returns:
        Whatever ``inpainter.tts`` returns.
    """
    ratio = audio_token_syllable_ratio if use_manual_ratio else None
    request = TTSInput(
        output_text=input_text,
        voice=voice_audio,
        num_steps=num_steps,
        init_temp=init_temp,
        init_diversity=init_diversity,
        guidance=guidance,
        rescale=rescale,
        topk=topk,
        audio_token_syllable_ratio=ratio,
    )
    return inpainter.tts(request)
def toggle_ratio_input(use_manual):
    """Build a Gradio update that shows and enables a field per ``use_manual``.

    The same flag drives both ``visible`` and ``interactive``.
    """
    field_state = {"visible": use_manual, "interactive": use_manual}
    return gr.update(**field_state)
def create_advanced_options_accordion():
    """Create the collapsed "Advanced options" accordion and return its inputs.

    Must be called inside an active Gradio layout context, since the
    components attach to whatever container is currently open.

    Returns:
        An 8-tuple of components, in this order: sampling steps, initial
        temperature, initial diversity, guidance, guidance rescale, top-k,
        the "use manual ratio" checkbox, and the manual ratio number field.
    """
    with gr.Accordion("Advanced options", open=False):
        steps = gr.Slider(
            minimum=1, maximum=100, value=30, step=1,
            label="number of sampling steps codebook",
        )
        temperature = gr.Slider(
            minimum=0.5, maximum=10, value=1, step=0.1,
            label="Initial temperature",
        )
        diversity = gr.Slider(
            minimum=0, maximum=10, value=1, step=0.1,
            label="Initial diversity",
        )
        guidance = gr.Slider(
            minimum=0, maximum=10, value=0.5, step=0.1,
            label="guidance",
        )
        rescale = gr.Slider(
            minimum=0, maximum=1, value=0.7, step=0.1,
            label="guidance rescale factor",
        )
        topk = gr.Slider(
            minimum=1, maximum=10000, value=25, step=1,
            label="sampling from top-k logits",
        )

        gr.Markdown("#### Audio Token Syllable Ratio")
        gr.Markdown("*Automatic calculation (recommended) provides the best results in most cases.*")
        manual_toggle = gr.Checkbox(
            label="Use manual audio token syllable ratio",
            value=False,
        )
        # Starts hidden and read-only; the checkbox handler below reveals it.
        manual_ratio = gr.Number(
            label="Audio token syllable ratio (manual)",
            value=12.5,
            precision=2,
            minimum=5.0,
            maximum=25.0,
            visible=False,
            interactive=False,
        )

        manual_toggle.change(
            toggle_ratio_input,
            inputs=[manual_toggle],
            outputs=[manual_ratio],
        )

    controls = (
        steps, temperature, diversity,
        guidance, rescale, topk,
        manual_toggle, manual_ratio,
    )
    return controls
def speech_rvc(rvc_source_speech, rvc_target_voice):
    """Convert ``rvc_source_speech`` to the voice in ``rvc_target_voice``.

    Returns:
        Whatever ``inpainter.rvc`` returns for the assembled ``RVCInput``.
    """
    request = RVCInput(
        source_speech=rvc_source_speech,
        target_voice=rvc_target_voice,
    )
    return inpainter.rvc(request)
if __name__ == "__main__":
    # Build the three-tab demo (inpaint, text-to-speech, voice conversion)
    # and then serve it.
    with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
        gr.Markdown("## PlayDiffusion")

        # Tab 1: transcribe uploaded audio, edit the text, re-generate audio.
        with gr.Tab("Inpaint"):
            gr.Markdown("### Upload an audio file and run ASR to get the text.")
            gr.Markdown("### Then, specify the desired output text.")
            gr.Markdown("### Run the inpainter to generate the modified audio.")
            gr.Markdown("### Note: The model and demo are currently targeted for English.")

            inpaint_advanced_options = create_advanced_options_accordion()

            with gr.Row():
                audio_input = gr.Audio(
                    label="Upload audio to be modified",
                    sources=["upload", "microphone"],
                    type="filepath",
                )

            with gr.Row():
                asr_submit = gr.Button("Run ASR")

            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Input text from ASR", interactive=False)
                    text_output = gr.Textbox(label="Desired output text")
                with gr.Column():
                    word_times = gr.JSON(label="Word times from ASR")

            with gr.Row():
                inpainter_submit = gr.Button("Run Inpainter")

            with gr.Row():
                audio_output = gr.Audio(label="Output audio")

            # ASR fills both text boxes plus the word timings; the inpainter
            # then consumes those three values together with the source audio.
            asr_submit.click(
                fn=run_asr,
                inputs=[audio_input],
                outputs=[text_input, text_output, word_times],
            )
            inpainter_submit.click(
                fn=run_inpainter,
                inputs=[text_input, text_output, word_times, audio_input, *inpaint_advanced_options],
                outputs=[audio_output],
            )

        # Tab 2: synthesize speech from text using a reference voice clip.
        with gr.Tab("Text to Speech"):
            gr.Markdown("### Text to Speech")
            tts_advanced_options = create_advanced_options_accordion()
            tts_text = gr.Textbox(
                label="TTS Input",
                placeholder="Enter text to convert to speech",
                lines=2,
            )
            tts_voice = gr.Audio(
                label="Voice to use for TTS",
                sources=["upload", "microphone"],
                type="filepath",
            )
            tts_submit = gr.Button("Convert to Speech")
            tts_output = gr.Audio(label="Generated Speech")

            tts_submit.click(
                fn=run_inpainter_tts,
                inputs=[tts_text, tts_voice, *tts_advanced_options],
                outputs=[tts_output],
            )

        # Tab 3: re-voice a source recording with a target voice.
        with gr.Tab("Voice Conversion"):
            gr.Markdown("### Real Time Voice Conversion (works best for english)")
            rvc_source_speech = gr.Audio(
                label="Source Conversion Speech",
                sources=["upload", "microphone"],
                type="filepath",
            )
            rvc_target_voice = gr.Audio(
                label="Target Voice",
                sources=["upload", "microphone"],
                type="filepath",
            )
            rvc_submit = gr.Button("Real time Voice Conversion")
            rvc_output = gr.Audio(label="Converted Speech")

            rvc_submit.click(
                fn=speech_rvc,
                inputs=[rvc_source_speech, rvc_target_voice],
                outputs=[rvc_output],
            )

    demo.launch(share=True)