|
|
import os
import subprocess

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
|
|
|
|
|
def audio_preproccessing():
    """Denoise "Audio.wav" in place using the DeepFilterNet3 CLI.

    Runs the external ``deepFilter`` tool, which writes its result to
    "Audio_DeepFilterNet3.wav", then moves that file back over
    "Audio.wav".  (The function name keeps the historical
    "preproccessing" spelling because ``transcribe_audio`` calls it
    by this name.)

    Raises:
        FileNotFoundError: if the ``deepFilter`` executable is not installed.
        subprocess.CalledProcessError: if ``deepFilter`` exits non-zero.
    """
    # An argument list with shell=False avoids the quoting/injection
    # pitfalls of the previous os.system() shell-string call, and
    # check=True surfaces tool failure immediately instead of letting
    # the rename below fail with a confusing FileNotFoundError.
    subprocess.run(["deepFilter", "Audio.wav"], check=True)
    # os.replace, unlike os.rename, also overwrites an existing
    # "Audio.wav" on Windows.
    os.replace("Audio_DeepFilterNet3.wav", "Audio.wav")
    print("Audio Preprocessing Done!")
|
|
|
|
|
def transcribe_audio(audio_file_path, input_language, denoising, music=None):
    """Transcribe an audio file to text via the Google Web Speech API.

    Args:
        audio_file_path: Path to the uploaded/recorded audio file.
        input_language: Language tag passed to the recognizer
            (e.g. "en-US", "fa-IR", "ar-SA").
        denoising: "Yes" to run the DeepFilterNet denoiser on the audio
            first; any other value skips denoising.
        music: Unused placeholder.  It now defaults to None because the
            Gradio interface supplies only three inputs — without the
            default every call raised TypeError.  TODO confirm whether a
            music-handling mode was intended here.

    Returns:
        The recognized text, or a human-readable error message when the
        audio could not be understood or the API request failed.
    """
    # Let pydub pick the right decoder from the file extension.
    file_ext = os.path.splitext(audio_file_path)[1][1:]

    audio = AudioSegment.from_file(audio_file_path, format=file_ext)

    # Pad both ends with 2.5 s of silence so speech at the very edges is
    # not clipped and adjust_for_ambient_noise() has a quiet sample to
    # calibrate on.
    padding = AudioSegment.silent(duration=2500)
    audio = padding + audio + padding

    # Normalize everything to a WAV file; sr.AudioFile and the external
    # deepFilter tool both expect "Audio.wav".
    audio.export("Audio.wav", format="wav")
    file_path2 = "Audio.wav"

    if denoising == 'Yes':
        audio_preproccessing()

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path2) as source:
            recognizer.adjust_for_ambient_noise(source)
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data, language=input_language)
        return text
    except sr.UnknownValueError:
        return "Could not understand the audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"
|
|
|
|
|
|
|
|
# --- Gradio UI wiring: audio in, transcribed text out. ---
audio_input = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Audio Input",
)
language_input = gr.Dropdown(
    choices=["fa-IR", "en-US", "ar-SA"],
    label="Choose the right language:",
)
denoise_input = gr.Dropdown(
    choices=["No", "Yes"],
    label="Need Denoising?",
)
transcript_output = gr.Textbox(label="Transcription results", lines=10)

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[audio_input, language_input, denoise_input],
    outputs=transcript_output,
    title="Speech-to-Text Service",
    description="Upload or record audio and get transcription using our STT service.",
)

iface.launch()