File size: 2,079 Bytes
36c3392
 
 
c3a5d25
36c3392
6513d4f
 
 
 
 
9321315
3854fab
 
6513d4f
36c3392
3854fab
36c3392
 
6513d4f
 
36c3392
9321315
 
 
6513d4f
 
 
36c3392
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9321315
3854fab
6513d4f
 
9321315
 
36c3392
c3a5d25
c0db9ad
3854fab
 
36c3392
 
2338311
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
import os

def audio_preproccessing():
    """Denoise ``Audio.wav`` in place using the ``deepFilter`` command-line tool.

    Runs ``deepFilter`` on ``Audio.wav`` (relative to the current working
    directory), then moves ``Audio_DeepFilterNet3.wav`` over ``Audio.wav``
    and prints a completion message.

    Raises:
        OSError: If the ``deepFilter`` executable cannot be started, or if
            ``Audio_DeepFilterNet3.wav`` does not exist after the tool has
            run (for example because the tool failed).
    """
    # Imported locally so the function is self-contained with respect to the
    # module's import block.
    import subprocess

    # Argument list instead of a shell string: no shell is spawned and the
    # file name is passed verbatim on every platform (the single quotes used
    # previously are only interpreted by POSIX shells).
    # The exit status is deliberately not checked here; a failed run shows up
    # as a missing output file in the os.replace call below.
    subprocess.run(["deepFilter", "Audio.wav"], check=False)

    # os.replace overwrites an existing destination on all platforms, whereas
    # os.rename raises FileExistsError on Windows when "Audio.wav" exists.
    os.replace("Audio_DeepFilterNet3.wav", "Audio.wav")
    print("Audio Preprocessing Done!")
    
def transcribe_audio(audio_file_path, input_language, denoising, music="No"):
    """Transcribe an audio file with the Google speech recognition API.

    The input is decoded with pydub, padded with 2.5 seconds of silence on
    both sides, written to ``Audio.wav`` in the current working directory,
    optionally denoised, and finally passed to ``recognize_google``.

    Args:
        audio_file_path: Path of the audio file to transcribe. ``None`` or an
            empty string produces an error message instead of an exception.
        input_language: Language code handed to the recognizer
            (e.g. ``"en-US"``, ``"fa-IR"``).
        denoising: ``"Yes"`` runs ``audio_preproccessing`` on the padded file
            before recognition; any other value skips that step.
        music: Currently unused. Defaults to ``"No"`` so the function can be
            called with only the first three arguments.

    Returns:
        The recognized text, or a human-readable error message when no audio
        was supplied, the speech could not be understood, or the recognition
        request failed.
    """
    # No file was supplied (e.g. the form was submitted without audio).
    if not audio_file_path:
        return "No audio file provided"

    # Auto-detect the container format from the file extension ("mp3", "wav", ...).
    file_ext = os.path.splitext(audio_file_path)[1][1:]

    # Pad both ends with silence and write the result as WAV.
    # NOTE: the output name is fixed because audio_preproccessing operates on
    # "Audio.wav"; overlapping calls would therefore share this file.
    wav_path = "Audio.wav"
    audio = AudioSegment.from_file(audio_file_path, format=file_ext)
    silence_padding = AudioSegment.silent(duration=2500)  # milliseconds
    padded_audio = silence_padding + audio + silence_padding
    padded_audio.export(wav_path, format="wav")

    if denoising == "Yes":
        audio_preproccessing()

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(wav_path) as source:
            # NOTE(review): this call reads from the start of the stream
            # before record() runs; presumably the leading silence padding is
            # there to absorb it - confirm against the library docs.
            recognizer.adjust_for_ambient_noise(source)
            audio_data = recognizer.record(source)

        # Google API call
        return recognizer.recognize_google(audio_data, language=input_language)
    except sr.UnknownValueError:
        return "Could not understand the audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"

# Gradio interface
# Widgets are created in the positional order in which their values are
# passed to transcribe_audio: audio file path, language code, denoising flag.
# NOTE: transcribe_audio also declares a `music` parameter; no widget is
# supplied for it here.
_audio_input = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Audio Input",
)
_language_input = gr.Dropdown(
    choices=["fa-IR", "en-US", "ar-SA"],
    label="Choose the right language:",
)
_denoising_input = gr.Dropdown(choices=["No", "Yes"], label="Need Denoising?")
_result_output = gr.Textbox(label="Transcription results", lines=10)

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[_audio_input, _language_input, _denoising_input],
    outputs=_result_output,
    title="Speech-to-Text Service",
    description="Upload or record audio and get transcription using our STT service.",
)

# Start the web app as soon as the module is executed.
iface.launch()