# imports
import os
import sys

# Install Whisper from GitHub at startup (a common workaround on Hugging Face Spaces)
os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import torch
import whisper

# The Whisper model used for ASR. Options are small, medium, large, and large-v2;
# large and large-v2 do not fit on the Hugging Face CPU tier.
model = whisper.load_model("medium")
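# Note: load_model() downloads the checkpoint on first run (the medium model is on the
# order of 1.5 GB), so the first start of the Space is slow.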
# A table to look up the language codes for all supported languages
language_id_lookup = {
    "Arabic"  : "ar",
    "English" : "en",
    "Chinese" : "zh",
    "Spanish" : "es",
    "Russian" : "ru",
    "French"  : "fr",
}
# Load mRASP2, the multilingual NMT model used for translation
os.system("git clone https://github.com/PANXiao1994/mRASP2.git")
os.system('mv -n mRASP2/* ./')
os.system("rm -rf mRASP2")
os.system("pip install -r requirements.txt")
os.system("git clone https://github.com/pytorch/fairseq")
os.system("cd fairseq; pip install ./; cd ..")
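# fairseq is installed from a source checkout (rather than PyPI) so that the
# fairseq_cli scripts stay available at a known path for the os.system calls below.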
model_name = "12e12d_last.pt"
# The checkpoint download is commented out; the .pt file is assumed to already be present in the Space repo.
# os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/" + model_name)
os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/bpe_vocab")
os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")
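# bpe_vocab is the shared fairseq dictionary covering all mRASP2 languages;
# codes.bpe.32000 holds the subword-nmt merge operations used to BPE-encode the input.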
# load tts (IMS-Toucan, currently disabled; Azure TTS is used instead)
# os.system("git clone https://github.com/Kyubyong/g2pC.git")
# os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
#            sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
#            sed -i 's/package_data={/# package_data={/g' setup.py; \
#            pip install ./; cd ..")
# os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
# sys.path.append('./IMS-Toucan')
# os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
# os.system("python run_model_downloader.py; cd ..")
# from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
# cwd = os.getcwd()
# os.chdir('./IMS-Toucan')
# tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
# os.chdir(cwd)
# Azure TTS configuration; the subscription key and region are read from
# environment variables inside predict() instead of being hard-coded here.
lang2voice = {
    "zh": "zh-CN-XiaoxiaoNeural",
    "ar": "ar-EG-SalmaNeural",
    "en": "en-US-JennyNeural",
    "es": "es-ES-AbrilNeural",
    "ru": "ru-RU-DariyaNeural",
    "fr": "fr-FR-AlainNeural",
}
# The predict function. audio, src_language, tgt_language, and mic_audio are passed in by
# Gradio and correspond, in order, to the components in the inputs=[] list of the
# gr.Interface at the bottom; the return values correspond to the outputs=[] list.
def predict(audio, src_language, tgt_language, mic_audio=None):

    # Prefer microphone audio if provided, otherwise fall back to the uploaded file
    if mic_audio is not None:
        input_audio = mic_audio
    elif audio is not None:
        input_audio = audio
    else:
        return "(please provide audio)", "", None
    # Use Whisper's own preprocessing: load the audio and pad/trim it to 30 seconds
    audio = whisper.load_audio(input_audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-mel spectrogram and move it to the model's device
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # If the model is supposed to detect the language, set src_language to None;
    # otherwise map the dropdown labels to language codes
    if src_language == "Detect Language":
        src_language = None
    else:
        src_language = language_id_lookup[src_language.split()[0]]
    tgt_language = language_id_lookup[tgt_language.split()[0]]
    # ASR: run the spectrogram through Whisper. decode() returns a DecodingResult with fields
    # such as audio_features, language, language_probs, tokens, text, avg_logprob,
    # no_speech_prob, temperature, and compression_ratio.
    # fp16 is only supported on GPU, so enable it conditionally.
    options = whisper.DecodingOptions(fp16=torch.cuda.is_available(), language=src_language)
    result = whisper.decode(model, mel, options)
    if src_language is None:
        src_language = result.language
    transcript = result.text
    # MT: write the transcript as the source-side file, and a language token as the
    # (dummy) target-side file
    with open("input." + src_language, 'w') as w:
        w.write(transcript)
    with open("input." + tgt_language, 'w') as w:
        w.write('LANG_TOK_' + src_language.upper())
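    # fairseq's preprocess step expects both a source and a target file, so the target side
    # is filled with a placeholder language token; the actual output language is selected at
    # decode time via --lang-prefix-tok below.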
    os.system("python fairseq/fairseq_cli/preprocess.py --dataset-impl raw \
        --srcdict bpe_vocab --tgtdict bpe_vocab --testpref input -s {} -t {}".format(
        src_language, tgt_language))
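    # With --dataset-impl raw, preprocess.py copies the text and the dictionaries into
    # ./data-bin, which interactive.py reads below.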
    # Translate with mRASP2; the source and target languages are filled in from the
    # dropdowns rather than hard-coded, so all supported language pairs work
    os.system("python fairseq/fairseq_cli/interactive.py ./data-bin \
                --user-dir mcolt \
                -s {} \
                -t {} \
                --skip-invalid-size-inputs-valid-test \
                --path {} \
                --max-tokens 1024 \
                --task translation_w_langtok \
                --lang-prefix-tok \"LANG_TOK_{}\" \
                --max-source-positions 1024 \
                --max-target-positions 1024 \
                --nbest 1 \
                --bpe subword_nmt \
                --bpe-codes codes.bpe.32000 \
                --post-process --tokenizer moses \
                --input input.{} | grep -E '[D]-[0-9]+' > output".format(
                    src_language, tgt_language, model_name, tgt_language.upper(), src_language))
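    # Each kept line has fairseq's hypothesis format "D-<id>\t<score>\t<detokenized text>",
    # where the text itself starts with a LANG_TOK_* prefix.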
    with open("output", 'r') as r:
        # Drop the D-<id> field, the score, and the LANG_TOK_* prefix, keeping only the text
        fields = r.readline().split('\t')
        translation = ' '.join(fields[2].split(' ')[1:]).strip()
    # tts
    # tts.set_language(tgt_language)
    # tts.read_to_file(text_list=[translation], file_location='output.wav')
    # Azure TTS: synthesize the translation into output.wav
    import azure.cognitiveservices.speech as speechsdk
    # The key and region are read from the environment (illustrative variable names;
    # set them as Space secrets) rather than hard-coded in the source
    speech_key = os.environ["AZURE_SPEECH_KEY"]
    service_region = os.environ.get("AZURE_SPEECH_REGION", "eastus")
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Note: the voice setting will not overwrite the voice element in input SSML.
    speech_config.speech_synthesis_voice_name = lang2voice[tgt_language]
    audio_config = speechsdk.audio.AudioOutputConfig(filename="output.wav")
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    speech_synthesizer.speak_text(translation)
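    # speak_text() blocks until synthesis finishes, so output.wav is complete
    # before it is returned below.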
    # Return the transcript, the translation, and the synthesized speech file
    return transcript, translation, "output.wav"
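# Minimal local sanity check (hypothetical file name; not part of the Gradio flow):
#   transcript, translation, wav = predict("sample.wav", "English", "French")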
title = "Demo for Whisper (ASR) -> mRASP2 (MT) -> Azure (TTS)"
description = """
<b>How to use:</b> Upload an audio file or record with the microphone. The audio is fed into OpenAI's Whisper
model for transcription, the transcript is translated into the target language with mRASP2, and the translation
is read aloud with Azure text-to-speech. If you ask the model to detect the input language, the language Whisper
identifies is used.
"""
# The Gradio interface
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="filepath"),
        gr.Dropdown(['Arabic',
                     'Chinese',
                     'English',
                     'Spanish',
                     'Russian',
                     'French',
                     'Detect Language'], value='English', label="Select the language of input"),
        gr.Dropdown(['Arabic',
                     'Chinese',
                     'English',
                     'Spanish',
                     'Russian',
                     'French'], value='English', label="Select the language of output"),
        gr.Audio(label="Record Speech", source="microphone", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Transcript"),
        gr.Text(label="Translation"),
        gr.Audio(label="Translation Speech")
    ],
    title=title,
    description=description,
).launch()