dmcartor committed on
Commit
ca4ed36
·
verified ·
1 Parent(s): 9eecb2d

Updated app.py to allow LID functionality

Browse files
Files changed (1) hide show
  1. app.py +24 -11
app.py CHANGED
@@ -1,15 +1,28 @@
1
  import gradio as gr
2
  from transformers import pipeline
 
3
 
4
- # Use a pipeline as a high-level helper for automatic speech recognition
5
- pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
6
 
7
- # Define the function for ASR
8
def transcribe(audio):
    """Run ASR on *audio* and return the recognized text."""
    # Recorded audio arrives as a tuple; keep only the payload element
    # (presumably the waveform/data part — confirm against gr.Audio docs).
    payload = audio[1] if isinstance(audio, tuple) else audio
    return pipe(payload)["text"]
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Retain the ChatInterface setup from the existing app.py
15
  from huggingface_hub import InferenceClient
@@ -51,10 +64,10 @@ def respond(
51
  # Create the ASR interface with a label and functionality for both file upload and direct recording
52
  asr_interface = gr.Interface(
53
  fn=transcribe,
54
- inputs=gr.Audio(source="upload", type="filepath", label="Upload or record audio"),
55
  outputs="text",
56
- title="ASR Transcription",
57
- description="Upload an audio file or record audio directly and get the transcription."
58
  )
59
 
60
  # Retain the ChatInterface setup from the existing app.py
@@ -83,4 +96,4 @@ with gr.Blocks() as demo:
83
  chat_interface.render()
84
 
85
  if __name__ == "__main__":
86
- demo.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
3
+ import whisper
4
 
5
+ # Load the Whisper model
6
+ model = whisper.load_model("large")
7
 
8
+ # Define the function for ASR with language detection
9
def transcribe(audio):
    """Transcribe an audio file and report the detected spoken language.

    Args:
        audio: Filesystem path to the uploaded or recorded audio clip
            (gr.Audio is configured with type="filepath").

    Returns:
        A string with the detected language code followed by the
        transcription, e.g. "Detected language: en\n\nTranscription: ...".
    """
    # BUG FIX: the previous pad_or_trim + single decode() path only ever
    # transcribed the first 30 seconds of the clip.  Whisper's high-level
    # transcribe() windows long audio internally and performs language
    # detection on the first window, so one call covers both needs.
    result = model.transcribe(audio)
    detected_language = result["language"]

    return f"Detected language: {detected_language}\n\nTranscription: {result['text']}"
26
 
27
  # Retain the ChatInterface setup from the existing app.py
28
  from huggingface_hub import InferenceClient
 
64
# Build the ASR tab: a single audio input (file upload or microphone
# recording, handed to the handler as a file path) mapped to a text output.
asr_interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload or record audio"),
    outputs="text",
    title="ASR Transcription with Language Detection",
    description="Upload an audio file or record audio directly to get the transcription and detected language.",
)
72
 
73
  # Retain the ChatInterface setup from the existing app.py
 
96
  chat_interface.render()
97
 
98
# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()