ahishamm commited on
Commit
e71d9bf
·
1 Parent(s): 264a881

Upload 2 files

Browse files
Files changed (2) hide show
  1. readme.md +30 -0
  2. whisperAPITest.py +63 -0
readme.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Whisper API Test
2
+ The Python script uses the OpenAI Whisper model as the backend and the Gradio web-app framework as the frontend.
3
+
4
+ ### Prerequisite Packages
5
+ ```bash
6
+ pip install gradio
7
+ pip install git+https://github.com/openai/whisper.git
8
+ pip install -U pip setuptools wheel
9
+ pip install -U spacy
10
+ python -m spacy download en_core_web_sm
11
+ python -m spacy download xx_ent_wiki_sm
12
+ pip install spacy-fastlang
13
+ ```
14
+ FFmpeg must also be installed; use your system's package manager:
15
+ ```bash
16
+ # on Ubuntu or Debian
17
+ sudo apt update && sudo apt install ffmpeg
18
+
19
+ # on Arch Linux
20
+ sudo pacman -S ffmpeg
21
+
22
+ # on MacOS using Homebrew (https://brew.sh/)
23
+ brew install ffmpeg
24
+
25
+ # on Windows using Chocolatey (https://chocolatey.org/)
26
+ choco install ffmpeg
27
+
28
+ # on Windows using Scoop (https://scoop.sh/)
29
+ scoop install ffmpeg
30
+ ```
whisperAPITest.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import gradio as gr
3
+ import spacy_fastlang
4
+ import spacy
5
+ import seaborn as sns
6
+ import matplotlib.pyplot as plt
7
+ import pandas as pd
8
+ #import pymongo_get_database
9
# Load the small English spaCy pipeline and attach spacy-fastlang's
# 'language_detector' component so every processed Doc exposes its
# detected language via `doc._.language` (used per word in whisperbackend).
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('language_detector')
11
def find_frequency_and_percentage(items):
    """Count occurrences of each distinct item and print its percentage share.

    Args:
        items: A sequence of hashable items (here, per-word language codes).

    Returns:
        A ``(frequency, total_items)`` tuple where ``frequency`` maps each
        distinct item to its occurrence count and ``total_items`` is
        ``len(items)``.
    """
    total_items = len(items)
    frequency = {}
    for item in items:
        # dict.get(..., 0) replaces the verbose `if item in frequency` branch.
        frequency[item] = frequency.get(item, 0) + 1
    # Informational side effect: print each item's percentage of the total.
    # (With empty input the loop body never runs, so no division occurs.)
    for item, count in frequency.items():
        percentage = (count / total_items) * 100
        print(f"{item}: {percentage:.2f}%")
    return frequency, total_items
23
def whisperbackend(audiopath, audiopath2):
    """Transcribe and translate an audio file with Whisper.

    Args:
        audiopath: Filepath from the microphone input (may be None).
        audiopath2: Filepath from the file-upload input, used as a fallback
            when no microphone recording was provided.

    Returns:
        A ``(figure, text)`` tuple: a matplotlib Figure with a bar plot of
        the per-word language distribution, and a string containing the
        detected language, the transcription, and the English translation.
    """
    # Fall back to the uploaded file when nothing was recorded.
    if audiopath is None:
        audiopath = audiopath2
    # NOTE(review): the "large" model is reloaded on every call, which is
    # very slow — consider loading it once at module level.
    model = whisper.load_model("large")
    # Detect the dominant spoken language from a padded/trimmed 30 s
    # log-mel spectrogram of the clip.
    audio = whisper.load_audio(audiopath)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    # Full transcription in the original language.
    result = model.transcribe(audiopath)['text']
    # English translation; the source language is hard-coded to Arabic
    # rather than using the detected `lang` — TODO confirm this is intended.
    options = dict(language='ar', beam_size=5, best_of=5)
    translate_options = dict(task="translate", **options)
    translation = model.transcribe(audiopath, **translate_options)['text']
    # Detect a language for each transcribed word via the spaCy pipeline.
    result_lang = []
    for word in result.split():
        doc = nlp(word)
        result_lang.append(doc._.language)
    # `freq_map` maps language code -> word count (renamed from the
    # misleading `freq_list`; it is a dict, not a list).
    freq_map, total_items = find_frequency_and_percentage(result_lang)
    freq_df = pd.DataFrame({'Languages': freq_map.keys(),
                            'Percentages': freq_map.values()})
    # Convert raw counts into percentages of all words.
    freq_df['Percentages'] = freq_df['Percentages'].apply(
        lambda count: (count / total_items) * 100)
    # Human-readable names for the two expected languages.
    if lang == 'en':
        lang = 'English'
    elif lang == 'ar':
        lang = 'Arabic'
    # TODO: persistence of the translation is currently disabled.
    #pymongo_get_database.create_document(translation)
    fig = (sns.barplot(freq_df, x='Languages', y='Percentages')).get_figure()
    return fig, "The detected language is: \n"+lang+"\n Audio transcription: \n"+result+'\n'+translation
53
# Gradio front end: one tab gathers audio (microphone or upload) and shows
# the transcription/translation text; a second tab shows the per-word
# language-distribution plot produced by whisperbackend.
with gr.Blocks() as demo:
    with gr.Tab("Input & Translation"):
        mic_input = gr.Audio(source="microphone", type="filepath")
        file_input = gr.Audio(type="filepath")
        transcript_box = gr.Textbox()
        process_btn = gr.Button("Process Audio")
    with gr.Tab("Visualization"):
        lang_plot = gr.Plot(width=350, height=300)
    # Wire the button to the backend: both audio sources in, the plot and
    # the transcription text out.
    process_btn.click(
        whisperbackend,
        inputs=[mic_input, file_input],
        outputs=[lang_plot, transcript_box],
    )
demo.launch(share=True)