"""Gradio speech-to-text app: upload or record audio, optionally denoise it
with the DeepFilterNet CLI, then transcribe via Google's speech API."""

import os
import subprocess

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment


def audio_preproccessing():
    """Denoise ``Audio.wav`` in place using the external ``deepFilter`` CLI.

    The tool writes its output as ``Audio_DeepFilterNet3.wav``; that file is
    renamed back over ``Audio.wav``. Raises CalledProcessError if the
    denoiser exits non-zero (previously os.system ignored failures).
    """
    # List-argv form avoids shell quoting/injection issues (was os.system).
    subprocess.run(["deepFilter", "Audio.wav"], check=True)
    os.rename("Audio_DeepFilterNet3.wav", "Audio.wav")
    print("Audio Preprocessing Done!")


def transcribe_audio(audio_file_path, input_language, denoising, music='No'):
    """Transcribe an audio file with Google's speech recognition.

    Args:
        audio_file_path: Path to the uploaded/recorded audio file.
        input_language: Language code passed to Google (e.g. "en-US", "fa-IR").
        denoising: "Yes" to run the DeepFilterNet denoiser first.
        music: Reserved "Yes"/"No" flag, currently unused. Defaults to "No"
            because the matching Gradio input is disabled, so the UI only
            supplies three arguments — without the default every call raised
            TypeError.

    Returns:
        The transcription text, or a human-readable error message string.
    """
    # Detect the container format from the file extension, e.g. "mp3", "wav".
    file_ext = os.path.splitext(audio_file_path)[1][1:]

    # Pad with 2.5 s of silence on both ends so the recognizer does not clip
    # speech at the very start or end of the clip.
    audio = AudioSegment.from_file(audio_file_path, format=file_ext)
    silence_pad = AudioSegment.silent(duration=2500)
    audio = silence_pad + audio + silence_pad
    audio.export("Audio.wav", format="wav")
    file_path2 = "Audio.wav"

    if denoising == 'Yes':
        audio_preproccessing()

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(file_path2) as source:
            recognizer.adjust_for_ambient_noise(source)
            audio_data = recognizer.record(source)
            # Google Web Speech API call (requires network access).
            text = recognizer.recognize_google(audio_data, language=input_language)
            return text
    except sr.UnknownValueError:
        return "Could not understand the audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"


# Gradio interface: exactly three inputs are enabled, matching the three
# required parameters of transcribe_audio (the "music" flag stays disabled).
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input"),
        gr.Dropdown(choices=["fa-IR", "en-US", "ar-SA"], label="Choose the right language:"),
        gr.Dropdown(choices=["No", "Yes"], label="Need Denoising?"),
    ],
    outputs=gr.Textbox(label="Transcription results", lines=10),
    title="Speech-to-Text Service",
    description="Upload or record audio and get transcription using our STT service.",
)

if __name__ == "__main__":
    iface.launch()