dmcartor committed on
Commit
ca4ed36
·
verified ·
1 Parent(s): 9eecb2d

Updated app.py to allow LID functionality

Browse files
Files changed (1) hide show
  1. app.py +24 -11
app.py CHANGED
@@ -1,15 +1,28 @@
1
  import gradio as gr
2
  from transformers import pipeline
 
3
 
4
- # Use a pipeline as a high-level helper for automatic speech recognition
5
- pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
6
 
7
- # Define the function for ASR
8
def transcribe(audio):
    """Run ASR on *audio* and return the recognized text."""
    # Recorded audio arrives as a tuple; keep only the payload element
    # (presumably the waveform/data part — confirm against gr.Audio docs).
    payload = audio[1] if isinstance(audio, tuple) else audio
    return pipe(payload)["text"]
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Retain the ChatInterface setup from the existing app.py
15
  from huggingface_hub import InferenceClient
@@ -51,10 +64,10 @@ def respond(
51
  # Create the ASR interface with a label and functionality for both file upload and direct recording
52
  asr_interface = gr.Interface(
53
  fn=transcribe,
54
- inputs=gr.Audio(source="upload", type="filepath", label="Upload or record audio"),
55
  outputs="text",
56
- title="ASR Transcription",
57
- description="Upload an audio file or record audio directly and get the transcription."
58
  )
59
 
60
  # Retain the ChatInterface setup from the existing app.py
@@ -83,4 +96,4 @@ with gr.Blocks() as demo:
83
  chat_interface.render()
84
 
85
  if __name__ == "__main__":
86
- demo.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
3
+ import whisper
4
 
5
+ # Load the Whisper model
6
+ model = whisper.load_model("large")
7
 
8
+ # Define the function for ASR with language detection
9
def transcribe(audio):
    """Transcribe an audio file and report the detected spoken language.

    Args:
        audio: Filesystem path to the uploaded or recorded audio clip
            (gr.Audio is configured with type="filepath").

    Returns:
        A string with the detected language code followed by the
        transcription, e.g. "Detected language: en\n\nTranscription: ...".
    """
    # BUG FIX: the previous pad_or_trim + single decode() path only ever
    # transcribed the first 30 seconds of the clip.  Whisper's high-level
    # transcribe() windows long audio internally and performs language
    # detection on the first window, so one call covers both needs.
    result = model.transcribe(audio)
    detected_language = result["language"]

    return f"Detected language: {detected_language}\n\nTranscription: {result['text']}"
26
 
27
  # Retain the ChatInterface setup from the existing app.py
28
  from huggingface_hub import InferenceClient
 
64
# Build the ASR tab: a single audio input (file upload or microphone
# recording, handed to the handler as a file path) mapped to a text output.
asr_interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload or record audio"),
    outputs="text",
    title="ASR Transcription with Language Detection",
    description="Upload an audio file or record audio directly to get the transcription and detected language.",
)
72
 
73
  # Retain the ChatInterface setup from the existing app.py
 
96
  chat_interface.render()
97
 
98
# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()