kamranferoz committed on
Commit
5bd4c84
·
1 Parent(s): 532c1b7

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. 00_a2t.py +122 -0
  2. README.md +3 -9
  3. requirements.txt +112 -0
00_a2t.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile
import time
import warnings

import gradio as gr
import whisper
from gtts import gTTS
from pyChatGPT import ChatGPT
from pydub import AudioSegment
9
+
10
+
11
+ warnings.filterwarnings("ignore")
12
+ model = whisper.load_model("base")
13
+
14
+
15
def transcribe_long_audio(model, audio_path, segment_length=30 * 1000):
    """Transcribe an audio file of arbitrary length with Whisper.

    The file is split into fixed-size chunks (Whisper decodes 30-second
    windows) and the per-chunk transcriptions are concatenated.

    Args:
        model: A loaded Whisper model.
        audio_path: Path to the uploaded audio file.
        segment_length: Chunk size in milliseconds (default 30 s).

    Returns:
        The concatenated transcription text (empty string for empty audio).
    """
    # from_file auto-detects the container format, so non-WAV uploads
    # (mp3, m4a, ...) work too; from_wav only handled WAV files.
    audio = AudioSegment.from_file(audio_path)

    transcription = ''
    for start in range(0, len(audio), segment_length):
        chunk = audio[start:start + segment_length]
        transcription += transcribe(model, chunk)

    return transcription
24
+
25
# Removed: leftover body of an earlier transcribe() draft. Only the `def`
# line had been commented out, so the body statements ran at module level
# and raised NameError on the undefined `audio_segment` at import time.
46
+
47
+
48
# Removed: second leftover draft of transcribe(). As above, the `def` line
# was commented out but the body was not, so it executed at import time and
# crashed with NameError on `audio_segment`.
73
+
74
+ # def transcribe(model, audio_segment):
75
+
76
def transcribe(model, audio_segment):
    """Transcribe a single pydub AudioSegment with Whisper.

    Args:
        model: A loaded Whisper model.
        audio_segment: A pydub AudioSegment chunk (<= ~30 s expected).

    Returns:
        The decoded text, or "" when the segment is missing or empty.
    """
    if audio_segment is None or len(audio_segment) == 0:
        print("No audio data received. Cannot proceed with transcription.")
        return ""

    # Write the segment to a real scratch file. mkstemp + explicit cleanup
    # is portable: NamedTemporaryFile(delete=True) cannot be reopened by
    # name on Windows while the handle is open, and the original code never
    # closed the handle at all.
    fd, temp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        audio_segment.export(temp_path, format="wav")

        loaded_audio = whisper.load_audio(temp_path)
        print("Audio Loaded")  # Debugging print statement

        # Whisper decodes fixed 30-second windows; pad or trim to fit.
        padded_trimmed_audio = whisper.pad_or_trim(loaded_audio)

        # make log-Mel spectrogram and move to the same device as the model
        mel = whisper.log_mel_spectrogram(padded_trimmed_audio).to(model.device)

        # decode the audio; fp16=False keeps decoding working on CPU-only hosts
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)
        print("Decoded Audio")

        result_text = result.text
        print("Transcription: ", result_text)
        return result_text
    finally:
        # Always remove the scratch file, even if export/decoding fails.
        os.remove(temp_path)
105
+
106
+
107
# Single text box in which the transcription result is displayed.
output_1 = gr.Textbox(label="Speech to Text")
108
+
109
+
110
def transcribe_wrapper(audio_path):
    """Gradio callback: run chunked transcription with the global model."""
    text = transcribe_long_audio(model, audio_path)
    return text
112
+
113
# Build and launch the UI. gr.Audio replaces the deprecated gr.inputs.Audio
# namespace, and allow_flagging takes the string "never" rather than the
# boolean False in Gradio 3.x.
gr.Interface(
    title='Voice to Text (KF)',
    fn=transcribe_wrapper,
    inputs=[
        gr.Audio(source="upload", type="filepath")
    ],
    outputs=[
        output_1
    ],
    live=True,
    allow_flagging="never",
).launch(share=True)
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: A2t
3
- emoji: 🐨
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 3.44.3
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: a2t
3
+ app_file: 00_a2t.py
 
 
4
  sdk: gradio
5
+ sdk_version: 3.42.0
 
 
6
  ---
 
 
requirements.txt ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.8.5
3
+ aiosignal==1.3.1
4
+ altair==5.1.1
5
+ anyio==3.7.1
6
+ async-timeout==4.0.3
7
+ attrs==23.1.0
8
+ beautifulsoup4==4.12.2
9
+ certifi==2023.7.22
10
+ charset-normalizer==3.2.0
11
+ click==8.1.7
12
+ contourpy==1.1.0
13
+ cycler==0.11.0
14
+ dacite==1.8.1
15
+ decorator==4.4.2
16
+ exceptiongroup==1.1.3
17
+ fastapi==0.103.1
18
+ ffmpy==0.3.1
19
+ filelock==3.12.3
20
+ fonttools==4.42.1
21
+ frozenlist==1.4.0
22
+ fsspec==2023.9.0
23
+ gradio==3.42.0
24
+ gradio_client==0.5.0
25
+ gTTS==2.3.2
26
+ h11==0.14.0
27
+ htmlmin==0.1.12
28
+ httpcore==0.17.3
29
+ httpx==0.24.1
30
+ huggingface-hub==0.16.4
31
+ idna==3.4
32
+ ImageHash==4.3.1
33
+ imageio==2.31.3
34
+ imageio-ffmpeg==0.4.9
35
+ importlib-resources==6.0.1
36
+ Jinja2==3.1.2
37
+ joblib==1.3.2
38
+ jsonschema==4.19.0
39
+ jsonschema-specifications==2023.7.1
40
+ kiwisolver==1.4.5
41
+ llvmlite==0.40.1
42
+ markdownify==0.11.6
43
+ MarkupSafe==2.1.3
44
+ matplotlib==3.7.2
45
+ more-itertools==10.1.0
46
+ moviepy==1.0.3
47
+ mpmath==1.3.0
48
+ multidict==6.0.4
49
+ multimethod==1.9.1
50
+ networkx==3.1
51
+ numba==0.57.1
52
+ numpy==1.23.5
53
+ openai==0.28.0
54
+ openai-whisper @ git+https://github.com/openai/whisper.git@e8622f9afc4eba139bf796c210f5c01081000472
55
+ orjson==3.9.5
56
+ outcome==1.2.0
57
+ packaging==23.1
58
+ pandas==2.0.3
59
+ pandas-profiling==3.6.6
60
+ patsy==0.5.3
61
+ phik==0.12.3
62
+ Pillow==10.0.0
63
+ proglog==0.1.10
64
+ pyChatGPT==0.4.3.3
65
+ pydantic==1.10.12
66
+ pydub==0.25.1
67
+ pyparsing==3.0.9
68
+ PySocks==1.7.1
69
+ python-dateutil==2.8.2
70
+ python-dotenv==1.0.0
71
+ python-multipart==0.0.6
72
+ pytube3==9.6.4
73
+ pytz==2023.3
74
+ PyWavelets==1.4.1
75
+ PyYAML==6.0.1
76
+ referencing==0.30.2
77
+ regex==2023.8.8
78
+ requests==2.31.0
79
+ rpds-py==0.10.2
80
+ scipy==1.11.2
81
+ seaborn==0.12.2
82
+ selenium==4.12.0
83
+ semantic-version==2.10.0
84
+ six==1.16.0
85
+ sniffio==1.3.0
86
+ sortedcontainers==2.4.0
87
+ soupsieve==2.5
88
+ SpeechRecognition==3.10.0
89
+ starlette==0.27.0
90
+ statsmodels==0.14.0
91
+ sympy==1.12
92
+ tangled-up-in-unicode==0.2.0
93
+ tiktoken==0.3.3
94
+ toolz==0.12.0
95
+ torch==2.0.1
96
+ tqdm==4.66.1
97
+ trio==0.22.2
98
+ trio-websocket==0.10.4
99
+ typeguard==2.13.3
100
+ typing_extensions==4.7.1
101
+ tzdata==2023.3
102
+ undetected-chromedriver==3.5.3
103
+ urllib3==2.0.4
104
+ uvicorn==0.23.2
105
+ visions==0.7.5
106
+ websockets==11.0.3
107
+ whisper==1.1.10
108
+ wordcloud==1.9.2
109
+ wsproto==1.2.0
110
+ yarl==1.9.2
111
+ ydata-profiling==4.5.1
112
+ zipp==3.16.2