Spaces:
Runtime error
Runtime error
Commit ·
5bd4c84
1
Parent(s): 532c1b7
Upload folder using huggingface_hub
Browse files- 00_a2t.py +122 -0
- README.md +3 -9
- requirements.txt +112 -0
00_a2t.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import whisper
import gradio as gr
import time  # NOTE(review): not referenced anywhere in this file — confirm before removing
from pyChatGPT import ChatGPT  # NOTE(review): not referenced in this file — leftover from a chat demo?
import warnings
from gtts import gTTS  # NOTE(review): not referenced in this file (text-to-speech helper)
import tempfile
from pydub import AudioSegment


# Suppress all library warnings (whisper/torch emit FP16 and deprecation
# notices) so they do not flood the Space logs.
warnings.filterwarnings("ignore")

# Load the Whisper "base" checkpoint once at module import time; every
# request served by the Gradio app reuses this single model instance.
model = whisper.load_model("base")
| 15 |
+
def transcribe_long_audio(model, audio_path, segment_length=30 * 1000):
    """Transcribe a long WAV file by splitting it into fixed-size chunks.

    Parameters
    ----------
    model : whisper.Whisper
        Loaded Whisper model, passed through to ``transcribe``.
    audio_path : str
        Path to a WAV file (``AudioSegment.from_wav`` is used, so other
        container formats will fail — TODO confirm uploads are always WAV).
    segment_length : int, optional
        Chunk size in milliseconds (pydub slices by milliseconds).
        Defaults to 30 s, matching Whisper's native decoding window.

    Returns
    -------
    str
        Concatenation of the per-chunk transcriptions.
    """
    audio = AudioSegment.from_wav(audio_path)

    # Collect per-chunk transcriptions in a list and join once, instead of
    # repeated string `+=` (quadratic when the audio has many chunks).
    pieces = [
        transcribe(model, audio[start:start + segment_length])
        for start in range(0, len(audio), segment_length)
    ]
    return ''.join(pieces)
| 24 |
+
|
| 25 |
+
# NOTE(review): two superseded drafts of transcribe() used to live here with
# only their `def` lines commented out, which left their bodies executing as
# top-level statements at import time and referencing the undefined name
# `audio_segment` — a NameError that crashes the app on startup (consistent
# with the Space's "Runtime error" status).  The dead drafts are removed;
# the working implementation of transcribe() follows below.
| 76 |
+
def transcribe(model, audio_segment):
    """Transcribe one pydub ``AudioSegment`` chunk with Whisper.

    The chunk is written to a temporary WAV file because
    ``whisper.load_audio`` expects a file path, not an in-memory segment.

    Parameters
    ----------
    model : whisper.Whisper
        Loaded Whisper model; the mel tensor is moved to ``model.device``.
    audio_segment : pydub.AudioSegment or None
        Audio chunk to transcribe.  ``None`` or empty input yields ``""``.

    Returns
    -------
    str
        Decoded transcription text, or ``""`` when there is no audio.
    """
    # Guard clause: nothing to transcribe.
    if audio_segment is None or len(audio_segment) == 0:
        print("No audio data received. Cannot proceed with transcription.")
        return ""

    # Use a context manager so the temporary WAV is always closed and
    # deleted, even if export/load/decode raises (the original leaked the
    # NamedTemporaryFile handle).  suffix=".wav" helps ffmpeg pick the
    # right demuxer on platforms that sniff by extension.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_file:
        audio_segment.export(temp_file.name, format="wav")

        loaded_audio = whisper.load_audio(temp_file.name)
        print("Audio Loaded")  # Debugging print statement

        # Pad or trim the waveform to Whisper's 30-second window.
        padded_trimmed_audio = whisper.pad_or_trim(loaded_audio)

        # Log-Mel spectrogram on the same device as the model.
        mel = whisper.log_mel_spectrogram(padded_trimmed_audio).to(model.device)

        # fp16=False keeps decoding working on CPU-only hosts.
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)
        print("Decoded Audio")  # Debugging print statement

    result_text = result.text
    print("Transcription: ", result_text)
    return result_text
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# Output component shown to the user.
output_1 = gr.Textbox(label="Speech to Text")


def transcribe_wrapper(audio_path):
    """Gradio callback: adapt ``(audio_path) -> transcription``.

    With ``live=True`` Gradio fires the callback before any file is
    uploaded, passing ``None``; returning ``""`` in that case avoids
    crashing ``AudioSegment.from_wav(None)`` inside transcribe_long_audio.
    """
    if audio_path is None:
        return ""
    return transcribe_long_audio(model, audio_path)


# Build and launch the UI.  gr.Audio replaces the deprecated gr.inputs.Audio
# namespace, and allow_flagging takes the string "never" (not False) in the
# pinned Gradio 3.x (sdk_version 3.42.0 per requirements.txt / README).
gr.Interface(
    title='Voice to Text (KF)',
    fn=transcribe_wrapper,
    inputs=[
        gr.Audio(source="upload", type="filepath")
    ],
    outputs=[
        output_1
    ],
    live=True,
    allow_flagging="never",
).launch(share=True)
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 3.
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: a2t
|
| 3 |
+
app_file: 00_a2t.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
+
sdk_version: 3.42.0
|
|
|
|
|
|
|
| 6 |
---
|
|
|
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==23.2.1
|
| 2 |
+
aiohttp==3.8.5
|
| 3 |
+
aiosignal==1.3.1
|
| 4 |
+
altair==5.1.1
|
| 5 |
+
anyio==3.7.1
|
| 6 |
+
async-timeout==4.0.3
|
| 7 |
+
attrs==23.1.0
|
| 8 |
+
beautifulsoup4==4.12.2
|
| 9 |
+
certifi==2023.7.22
|
| 10 |
+
charset-normalizer==3.2.0
|
| 11 |
+
click==8.1.7
|
| 12 |
+
contourpy==1.1.0
|
| 13 |
+
cycler==0.11.0
|
| 14 |
+
dacite==1.8.1
|
| 15 |
+
decorator==4.4.2
|
| 16 |
+
exceptiongroup==1.1.3
|
| 17 |
+
fastapi==0.103.1
|
| 18 |
+
ffmpy==0.3.1
|
| 19 |
+
filelock==3.12.3
|
| 20 |
+
fonttools==4.42.1
|
| 21 |
+
frozenlist==1.4.0
|
| 22 |
+
fsspec==2023.9.0
|
| 23 |
+
gradio==3.42.0
|
| 24 |
+
gradio_client==0.5.0
|
| 25 |
+
gTTS==2.3.2
|
| 26 |
+
h11==0.14.0
|
| 27 |
+
htmlmin==0.1.12
|
| 28 |
+
httpcore==0.17.3
|
| 29 |
+
httpx==0.24.1
|
| 30 |
+
huggingface-hub==0.16.4
|
| 31 |
+
idna==3.4
|
| 32 |
+
ImageHash==4.3.1
|
| 33 |
+
imageio==2.31.3
|
| 34 |
+
imageio-ffmpeg==0.4.9
|
| 35 |
+
importlib-resources==6.0.1
|
| 36 |
+
Jinja2==3.1.2
|
| 37 |
+
joblib==1.3.2
|
| 38 |
+
jsonschema==4.19.0
|
| 39 |
+
jsonschema-specifications==2023.7.1
|
| 40 |
+
kiwisolver==1.4.5
|
| 41 |
+
llvmlite==0.40.1
|
| 42 |
+
markdownify==0.11.6
|
| 43 |
+
MarkupSafe==2.1.3
|
| 44 |
+
matplotlib==3.7.2
|
| 45 |
+
more-itertools==10.1.0
|
| 46 |
+
moviepy==1.0.3
|
| 47 |
+
mpmath==1.3.0
|
| 48 |
+
multidict==6.0.4
|
| 49 |
+
multimethod==1.9.1
|
| 50 |
+
networkx==3.1
|
| 51 |
+
numba==0.57.1
|
| 52 |
+
numpy==1.23.5
|
| 53 |
+
openai==0.28.0
|
| 54 |
+
openai-whisper @ git+https://github.com/openai/whisper.git@e8622f9afc4eba139bf796c210f5c01081000472
|
| 55 |
+
orjson==3.9.5
|
| 56 |
+
outcome==1.2.0
|
| 57 |
+
packaging==23.1
|
| 58 |
+
pandas==2.0.3
|
| 59 |
+
pandas-profiling==3.6.6
|
| 60 |
+
patsy==0.5.3
|
| 61 |
+
phik==0.12.3
|
| 62 |
+
Pillow==10.0.0
|
| 63 |
+
proglog==0.1.10
|
| 64 |
+
pyChatGPT==0.4.3.3
|
| 65 |
+
pydantic==1.10.12
|
| 66 |
+
pydub==0.25.1
|
| 67 |
+
pyparsing==3.0.9
|
| 68 |
+
PySocks==1.7.1
|
| 69 |
+
python-dateutil==2.8.2
|
| 70 |
+
python-dotenv==1.0.0
|
| 71 |
+
python-multipart==0.0.6
|
| 72 |
+
pytube3==9.6.4
|
| 73 |
+
pytz==2023.3
|
| 74 |
+
PyWavelets==1.4.1
|
| 75 |
+
PyYAML==6.0.1
|
| 76 |
+
referencing==0.30.2
|
| 77 |
+
regex==2023.8.8
|
| 78 |
+
requests==2.31.0
|
| 79 |
+
rpds-py==0.10.2
|
| 80 |
+
scipy==1.11.2
|
| 81 |
+
seaborn==0.12.2
|
| 82 |
+
selenium==4.12.0
|
| 83 |
+
semantic-version==2.10.0
|
| 84 |
+
six==1.16.0
|
| 85 |
+
sniffio==1.3.0
|
| 86 |
+
sortedcontainers==2.4.0
|
| 87 |
+
soupsieve==2.5
|
| 88 |
+
SpeechRecognition==3.10.0
|
| 89 |
+
starlette==0.27.0
|
| 90 |
+
statsmodels==0.14.0
|
| 91 |
+
sympy==1.12
|
| 92 |
+
tangled-up-in-unicode==0.2.0
|
| 93 |
+
tiktoken==0.3.3
|
| 94 |
+
toolz==0.12.0
|
| 95 |
+
torch==2.0.1
|
| 96 |
+
tqdm==4.66.1
|
| 97 |
+
trio==0.22.2
|
| 98 |
+
trio-websocket==0.10.4
|
| 99 |
+
typeguard==2.13.3
|
| 100 |
+
typing_extensions==4.7.1
|
| 101 |
+
tzdata==2023.3
|
| 102 |
+
undetected-chromedriver==3.5.3
|
| 103 |
+
urllib3==2.0.4
|
| 104 |
+
uvicorn==0.23.2
|
| 105 |
+
visions==0.7.5
|
| 106 |
+
websockets==11.0.3
|
| 107 |
+
# NOTE(review): removed "whisper==1.1.10" — that PyPI project is unrelated to
# OpenAI Whisper and installs a conflicting `whisper` module that can shadow
# the `openai-whisper` git pin above, breaking `whisper.load_model` at runtime.
|
| 108 |
+
wordcloud==1.9.2
|
| 109 |
+
wsproto==1.2.0
|
| 110 |
+
yarl==1.9.2
|
| 111 |
+
ydata-profiling==4.5.1
|
| 112 |
+
zipp==3.16.2
|