| import os |
| import gradio as gr |
| import whisper |
| import youtube_dl |
|
|
| |
# Install the Whisper package from its GitHub repository at startup.
# NOTE(review): `import whisper` at the top of this file runs before this
# command, so the install cannot help a process where that import already
# failed — confirm whether this step is still needed.
_WHISPER_INSTALL_COMMAND = "pip install git+https://github.com/openai/whisper.git"
os.system(_WHISPER_INSTALL_COMMAND)

# Load the "base" Whisper checkpoint once at import time; the inference
# functions below read this module-level `model`.
model = whisper.load_model("base")
|
|
| |
def inference_audio(audio):
    """Transcribe an audio file with the module-level Whisper ``model``.

    Args:
        audio: Path to the audio file to transcribe. ``None`` or an empty
            string means no audio was supplied.

    Returns:
        The decoded text, or ``""`` when no audio was supplied.
    """
    # A submit without a recording/upload arrives as None; bail out early
    # instead of letting whisper.load_audio fail on it.
    if not audio:
        return ""

    waveform = whisper.load_audio(audio)
    # pad_or_trim fits the clip to the model's fixed input window, so only
    # the leading window is decoded (30 s according to the Whisper README).
    waveform = whisper.pad_or_trim(waveform)
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Pass the detected language to the decoder instead of discarding the
    # detection result.
    _, probs = model.detect_language(mel)
    language = max(probs, key=probs.get)

    # fp16=False keeps decoding in full precision, as in the original code.
    options = whisper.DecodingOptions(language=language, fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
|
|
def inference_text(text):
    """Return ``text`` prefixed with the fixed result label.

    Args:
        text: The input string.

    Returns:
        The label string followed by ``text``.
    """
    label = "文本推斷結果:"
    # str.join, like `+`, raises TypeError when `text` is not a string.
    return "".join((label, text))
|
|
| |
def extract_text_from_youtube(url):
    """Download the audio of a YouTube video and transcribe it.

    The audio is fetched through the ``youtube_dl`` Python API into a
    private temporary directory, converted to WAV by the
    ``FFmpegExtractAudio`` post-processor, and passed to
    ``inference_audio``.

    Args:
        url: Address of the video. ``None`` or a blank string means no
            URL was supplied.

    Returns:
        The transcribed text, or ``""`` when no URL was supplied.

    Raises:
        Whatever ``youtube_dl`` raises when the download or the audio
        conversion fails; those errors are not swallowed here.
    """
    import tempfile  # local import keeps this block self-contained

    if not url or not url.strip():
        return ""

    # A fresh directory per call avoids clashes between concurrent requests
    # on a shared "temp.wav", and the context manager removes the files
    # even when the download or the transcription raises.
    with tempfile.TemporaryDirectory() as workdir:
        ydl_opts = {
            'format': 'bestaudio/best',
            # Only fetch the single video, even if the URL names a playlist.
            'noplaylist': True,
            'outtmpl': os.path.join(workdir, 'audio.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
        }
        # Download through the library rather than interpolating a stream
        # URL into a shell command: such URLs contain characters like "&"
        # that the shell would interpret.
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url.strip()])

        # The post-processor writes its result with a .wav extension.
        wav_path = os.path.join(workdir, 'audio.wav')
        return inference_audio(wav_path)
|
|
| |
# Combine the three interfaces into one tabbed app. Gradio's API for this is
# gr.TabbedInterface(interface_list, tab_names); the interfaces and their tab
# titles are paired by position.
tab_group = gr.TabbedInterface(
    [
        gr.Interface(
            inference_audio,
            inputs=gr.Audio(type="filepath"),
            outputs="text",
        ),
        gr.Interface(inference_text, inputs="text", outputs="text"),
        gr.Interface(
            extract_text_from_youtube,
            inputs=gr.Textbox(placeholder="輸入 YouTube 影片的網址"),
            outputs="text",
        ),
    ],
    tab_names=["音訊推斷", "文字推斷", "YouTube影片"],
)

# Start the web server for the tabbed app.
tab_group.launch()
|
|
|
|