Spaces:

ajchri5
/

Assignment-2-IT164_ajchri5

Runtime error

App Files Files Community

ajchri5 commited on Nov 20, 2024

Commit

89c1bb5

verified ·

1 Parent(s): e40d9da

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -22

app.py CHANGED Viewed

@@ -1,31 +1,47 @@
 import re
 import gradio as gr
 from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
 import torch
 import numpy as np
 # Load Whisper model for transcription
 whisper_model_name = "openai/whisper-large"
 processor = WhisperProcessor.from_pretrained(whisper_model_name)
 model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
-# Initialize the language detection model
 lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 # Function to transcribe audio to text using Whisper model
 def transcribe_audio(audio_file):
     # Check if audio_file is a list (Gradio returns a list when multiple clips are recorded)
     if isinstance(audio_file, list):
-        audio = np.concatenate(audio_file)  # Concatenate the list of arrays into a single 1D array
     else:
         audio = np.array(audio_file)  # Ensure it's a 1D array
-    # Ensure the shape is 1D (if the shape is (2, N), we flatten it)
-    if len(audio.shape) > 1:
-        audio = audio.flatten()
     # Prepare input features for Whisper (sampling rate should be 16000 for Whisper)
-    input_features = processor(audio, return_tensors="pt", sampling_rate=16000)
     # Generate transcription
     generated_ids = model.generate(input_features["input_features"])
@@ -33,35 +49,58 @@ def transcribe_audio(audio_file):
     return transcription
-# Function to detect the language of the transcription using zero-shot classification
-def detect_language(text):
-    result = lang_detect_model(text, candidate_labels=["en", "fr", "es", "de", "it", "pt", "zh", "ja", "ar", "hi"])
-    return result['labels'][0], result['scores'][0]  # Return the detected language and score
 # Cleanup function to remove filler words and clean the transcription
 def cleanup_text(text):
-    # Remove filler words like "uh", "um", etc.
     text = re.sub(r'\b(uh|um|like|you know|so|actually|basically)\b', '', text, flags=re.IGNORECASE)
-    # Remove extra spaces
-    text = re.sub(r'\s+', ' ', text)
-    # Strip leading and trailing spaces
     text = text.strip()
-    # Capitalize the first letter
     text = text.capitalize()
     return text
-# Main function to process the audio and detect language
 def process_audio(audio_file):
     try:
         transcription = transcribe_audio(audio_file)  # Transcribe audio to text
         if not transcription.strip():  # If transcription is empty or just whitespace
             raise ValueError("Transcription is empty.")
-        lang, score = detect_language(transcription)  # Detect the language of the transcription
         cleaned_text = cleanup_text(transcription)  # Clean up the transcription
-        return cleaned_text, lang, score  # Return cleaned transcription, language, and confidence score
     except Exception as e:
         # If any error occurs, return the error message
         return f"Error: {str(e)}", "", ""

+!pip install git+https://github.com/speechbrain/speechbrain.git@develop
 import re
 import gradio as gr
 from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
 import torch
 import numpy as np
+import torchaudio
+from speechbrain.inference.classifiers import EncoderClassifier
 # Load Whisper model for transcription
 whisper_model_name = "openai/whisper-large"
 processor = WhisperProcessor.from_pretrained(whisper_model_name)
 model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+# Initialize the language detection model (using zero-shot classification for language detection)
 lang_detect_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+# Load the SpeechBrain language ID model
+language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
 # Function to transcribe audio to text using Whisper model
 def transcribe_audio(audio_file):
+    """
+    Function to transcribe audio to text using Whisper model.
+    Handles both file input and live audio input.
+    """
     # Check if audio_file is a list (Gradio returns a list when multiple clips are recorded)
     if isinstance(audio_file, list):
+        # Ensure all elements in the list are of the same length before concatenating
+        audio = np.concatenate([np.array(a) for a in audio_file if a is not None])
     else:
         audio = np.array(audio_file)  # Ensure it's a 1D array
+    # If audio is stereo (2D array with shape (2, N)), mix the channels by averaging them
+    if audio.ndim > 1:
+        audio = audio.mean(axis=0)  # Mix the stereo channels into a mono signal
+    # Ensure the audio is a 1D array (e.g., [N])
+    if audio.ndim != 1:
+        raise ValueError("The audio input must be a 1D array (mono).")
     # Prepare input features for Whisper (sampling rate should be 16000 for Whisper)
+    input_features = processor(audio, return_tensors="pt", sampling_rate=48000)
     # Generate transcription
     generated_ids = model.generate(input_features["input_features"])
     return transcription
+# Function to detect language using SpeechBrain's language ID model
+def detect_language_speechbrain(audio_file):
+    # Load the audio using torchaudio
+    signal, sample_rate = torchaudio.load(audio_file)
+    # Use SpeechBrain to classify the language of the audio
+    prediction = language_id.classify_batch(signal)
+    # Extract the language ISO code and its confidence
+    language = prediction[3][0]  # Extracted language
+    confidence = prediction[1].exp()  # Linear scale of confidence
+    return language, confidence.item()
 # Cleanup function to remove filler words and clean the transcription
 def cleanup_text(text):
+    """
+    Function to clean the transcription text by removing filler words, unnecessary spaces,
+    non-alphabetic characters, and ensuring proper capitalization.
+    """
+    # Step 1: Remove filler words like "uh", "um", etc.
     text = re.sub(r'\b(uh|um|like|you know|so|actually|basically)\b', '', text, flags=re.IGNORECASE)
+    # Step 2: Remove unwanted characters (e.g., non-alphabetical characters except punctuation)
+    text = re.sub(r'[^a-zA-Z0-9\s,.\'?!]', '', text)
+    # Step 3: Remove extra spaces and ensure proper spacing around punctuation
+    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
+    text = re.sub(r'\s([?.!.,])', r'\1', text)  # Remove space before punctuation
+    # Step 4: Normalize the whitespace (remove leading/trailing spaces)
     text = text.strip()
+    # Step 5: Capitalize the first letter of the transcription
     text = text.capitalize()
     return text
+# Main function to process the audio, transcribe it, and detect the language
 def process_audio(audio_file):
     try:
         transcription = transcribe_audio(audio_file)  # Transcribe audio to text
         if not transcription.strip():  # If transcription is empty or just whitespace
             raise ValueError("Transcription is empty.")
+        # Detect language using SpeechBrain's model
+        language, confidence = detect_language_speechbrain(audio_file)
         cleaned_text = cleanup_text(transcription)  # Clean up the transcription
+        return cleaned_text, language, confidence  # Return cleaned transcription, language, and confidence score
     except Exception as e:
         # If any error occurs, return the error message
         return f"Error: {str(e)}", "", ""