imansarraf commited on
Commit
c0b9248
·
verified ·
1 Parent(s): f4c2fda

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +70 -0
  2. requirements.txt +35 -0
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Gradio app prologue: third-party imports, RTL styling, and the two
# long-lived model objects shared by every request.
import gradio as gr
from iman.sad_tfpy10 import *  # provides Segmenter and filter_output (wildcard kept: names come from this package)
from autosub import SpeechRecognizer
from autosub import GOOGLE_SPEECH_API_KEY
import soundfile as sf
import io

# Right-to-left textarea styling for the Azerbaijani transcription output.
css = """
textarea { direction: rtl; text-align: right; font-family: Calibri, sans-serif; font-size: 16px;}
"""

# Google Speech recognizer configured for Azerbaijani ("az") at 16 kHz.
recognizer = SpeechRecognizer(
    language="az",
    rate=16000,
    api_key=GOOGLE_SPEECH_API_KEY,
    proxies=None,
)

# Speech/music/noise segmenter (CPU) used to locate speech regions in the
# uploaded audio before transcription.
seg = Segmenter(
    ffmpeg_path="ffmpeg",
    model_path="keras_speech_music_noise_cnn.hdf5",
    device="cpu",
    vad_type="vad",
)
def process_segment(args, sample_rate=16000):
    """Transcribe a single speech segment of the decoded signal.

    Parameters
    ----------
    args : tuple
        ``(segment, wav)`` where ``segment`` is a ``(start, stop)`` pair in
        seconds and ``wav`` is the full PCM signal.
    sample_rate : int, optional
        Samples per second of ``wav``.  Defaults to 16000, the rate the
        module-level ``recognizer`` was configured with; previously this
        value was hard-coded twice in the slicing arithmetic.

    Returns
    -------
    tuple
        ``(start, stop, transcription)`` where ``transcription`` is whatever
        the recognizer returns for the segment (may be ``None`` on failure).
    """
    segment, wav = args
    start, stop = segment
    # Slice the raw samples for this segment and wrap them as FLAC, the
    # container the speech API accepts.
    flac = pcm_to_flac(wav[int(start * sample_rate):int(stop * sample_rate)], sample_rate)
    tr_beamsearch_lm = recognizer(flac)
    return start, stop, tr_beamsearch_lm
def pcm_to_flac(pcm_data, sample_rate=16000):
    """Encode raw PCM samples as FLAC and return the encoded bytes.

    Parameters
    ----------
    pcm_data : array-like
        The PCM samples to encode.
    sample_rate : int, optional
        Samples per second of ``pcm_data`` (default 16000).

    Returns
    -------
    bytes
        The FLAC-encoded audio.
    """
    # Encode into an in-memory buffer; the context manager releases it
    # once the encoded bytes have been copied out.
    with io.BytesIO() as encoded:
        sf.write(encoded, pcm_data, sample_rate, format='FLAC')
        return encoded.getvalue()
def transcribe_audio(audio_file):
    """Segment an audio file into speech regions and transcribe each one.

    Parameters
    ----------
    audio_file : str
        Path to the uploaded/recorded audio file (Gradio ``filepath`` input).

    Returns
    -------
    str
        All segment transcriptions concatenated, each prefixed with a space
        and terminated by CRLF (matching the original output format).
    """
    # Locate speech regions; `wav` is the decoded PCM signal.
    isig, wav = seg(audio_file)
    isig = filter_output(
        isig,
        max_silence=0.5,
        ignore_small_speech_segments=0.1,
        max_speech_len=15,
        split_speech_bigger_than=20,
    )
    # Keep only the (start, stop) pairs from the labelled segment tuples.
    isig = [(a, b) for x, a, b, _, _ in isig]
    print(isig)

    results = [process_segment((segment, wav)) for segment in isig]

    # Build the transcript with join instead of quadratic `+=`, and replace
    # the original bare `except: pass` (which hid every error) with an
    # explicit check: the recognizer may return None (or another non-string)
    # for a failed segment, which is the only expected failure here.
    parts = []
    for start, stop, tr_beamsearch_lm in results:
        if not isinstance(tr_beamsearch_lm, str):
            continue
        parts.append(' ' + tr_beamsearch_lm + '\r\n')
        print(start)
        print(stop)
        print(''.join(parts))
    return ''.join(parts)
# Wire the transcription function into a simple Gradio UI: one audio input
# (file upload or microphone) and an editable RTL-styled text output.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(label="Transcription", elem_id="output-text", interactive=True),
    title="Azari Audio Transcription",
    description="Upload an audio file or record audio to get the transcription.",
    css=css,
)

# Start the web server for the app.
interface.launch()
requirements.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tensorflow
iman==1.0.24
scikit-image
imageio
pytextgrid
soundfile
numpy==1.24.0
cachetools==4.2.4
certifi==2021.10.8
chardet==4.0.0
charset-normalizer==2.0.6
google-api-core==2.1.0
google-api-python-client==2.24.0
google-auth==2.3.0
google-auth-httplib2==0.1.0
google-auth-oauthlib==0.4.6
googleapis-common-protos==1.53.0
httplib2==0.20.1
idna==3.2
oauthlib==3.1.1
progressbar==2.5
protobuf==3.18.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pyparsing==2.4.7
pysrt==1.1.2
requests==2.26.0
requests-oauthlib==1.3.0
rsa==4.7.2
six==1.16.0
uritemplate==3.0.1
urllib3==1.26.7
Nuitka
orderedset
zstandard