Spaces:

adiitya29
/

Multilingual-ASR

Running

App Files Files Community

adiitya29 committed 7 days ago

Commit

bddec1e

1 Parent(s): 8dced3a

Initialized project directories, added requirements, and implemented core Gradio UI with lazy-loaded Wav2Vec2 inference

Browse files

Files changed (5) hide show

app.py +4 -9
app/asr_model.py +47 -5
app/audio_processing.py +17 -3
app/history.py +48 -4
app/language_detection.py +16 -5

app.py CHANGED Viewed

@@ -1,18 +1,18 @@
 import gradio as gr
 from app.asr_model import load_model, transcribe_audio
-from app.language_detection import detect_language
 from app.history import save_to_history, export_history
 def process_audio(audio_path):
     if audio_path is None:
         return "No audio uploaded.", "Unknown"
-    # Optional: Detect Language
-    lang = detect_language(audio_path)
     # Transcribe Speech
     transcript = transcribe_audio(audio_path)
     # Save History
     save_to_history(audio_path, transcript, lang)
@@ -42,10 +42,5 @@ def create_ui():
     return demo
 if __name__ == "__main__":
-    # Pre-load model on start
-    print("Loading model...")
-    load_model()
-    print("Model loaded. Starting UI...")
     demo = create_ui()
     demo.launch()

 import gradio as gr
 from app.asr_model import load_model, transcribe_audio
+from app.language_detection import detect_language_from_text
 from app.history import save_to_history, export_history
 def process_audio(audio_path):
     if audio_path is None:
         return "No audio uploaded.", "Unknown"
     # Transcribe Speech
     transcript = transcribe_audio(audio_path)
+    # Detect Language from transcript
+    lang = detect_language_from_text(transcript)
     # Save History
     save_to_history(audio_path, transcript, lang)
     return demo
 if __name__ == "__main__":
     demo = create_ui()
     demo.launch()

app/asr_model.py CHANGED Viewed

@@ -1,15 +1,57 @@
-# This module handles the loading and inferencing of the Wav2Vec model
-def load_model():
     """
     Loads the Hugging Face Wav2Vec model and processor.
-    For Apple Silicon, we can utilize MPS (Metal Performance Shaders) later.
     """
-    pass
 def transcribe_audio(audio_filepath: str) -> str:
     """
     Takes an audio filepath, processes it, and runs it through the Wav2Vec model
     to return a text transcription.
     """
-    return "This is a placeholder transcription. Model integration is pending."

import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from app.audio_processing import load_and_resample

# Module-level cache: populated by load_model() and reused by every
# transcribe_audio() call so the checkpoint is loaded once per process.
processor = None
model = None

# Sample rate (Hz) the audio is resampled to before it is fed to the model.
_TARGET_SAMPLE_RATE = 16000


def _select_device() -> str:
    """Pick the inference device, preferring MPS, then CUDA, then CPU."""
    # torch.backends.mps is missing on older torch builds; look it up
    # defensively instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def load_model(model_name: str = "facebook/wav2vec2-base-960h"):
    """
    Loads the Hugging Face Wav2Vec model and processor into the module globals.

    Defaulting to English base model. For multilingual, consider models like:
    - 'facebook/mms-1b-all'
    - 'jonatasgrosman/wav2vec2-large-xlsr-53-english' (or other languages)

    Args:
        model_name (str): Checkpoint identifier passed to ``from_pretrained``.

    Raises:
        Whatever ``from_pretrained`` raises when the checkpoint cannot be
        loaded. In that case the previously cached processor/model pair (if
        any) is left untouched.
    """
    global processor, model
    print(f"Loading model {model_name}...")

    # Load into locals first and publish both globals together, so a failure
    # half-way through never leaves a processor from one checkpoint paired
    # with a model from another.
    loaded_processor = Wav2Vec2Processor.from_pretrained(model_name)
    loaded_model = Wav2Vec2ForCTC.from_pretrained(model_name)

    # Move to GPU if available (MPS for Apple Silicon); "cpu" is a no-op.
    loaded_model.to(_select_device())

    processor, model = loaded_processor, loaded_model


def transcribe_audio(audio_filepath: str) -> str:
    """
    Takes an audio filepath, processes it, and runs it through the Wav2Vec model
    to return a text transcription.

    The model is loaded lazily on first use. A failure while loading the model
    propagates to the caller; any failure during audio loading or inference is
    reported as an "Error during transcription: ..." string instead.

    Args:
        audio_filepath (str): Path to the audio file to transcribe.

    Returns:
        str: The lower-cased transcription, or an error message string.
    """
    if model is None or processor is None:
        load_model()

    try:
        # 1. Load and resample audio to the rate the model is fed with
        speech = load_and_resample(audio_filepath, target_sr=_TARGET_SAMPLE_RATE)

        # 2. Prepare inputs
        inputs = processor(
            speech,
            sampling_rate=_TARGET_SAMPLE_RATE,
            return_tensors="pt",
            padding=True,
        )

        # Move inputs to the same device as model
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 3. Perform inference (no gradients needed)
        with torch.no_grad():
            logits = model(**inputs).logits

        # 4. Decode the output: most likely token per frame, then to text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription.lower()
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return f"Error during transcription: {str(e)}"

app/audio_processing.py CHANGED Viewed

@@ -1,7 +1,21 @@
-# This module handles audio preprocessing using libraries like librosa
-def load_and_resample(audio_filepath: str, target_sr: int = 16000):
     """
     Loads an audio file and resamples it to the target sample rate (default 16kHz for Wav2Vec).
     """
-    pass

+import librosa
+import numpy as np
def load_and_resample(audio_filepath: str, target_sr: int = 16000) -> np.ndarray:
    """
    Loads an audio file and resamples it to the target sample rate (default 16kHz for Wav2Vec).

    Args:
        audio_filepath (str): Path to the audio file.
        target_sr (int): The sample rate required by the model.

    Returns:
        np.ndarray: The audio time series.

    Raises:
        RuntimeError: If the file cannot be loaded or resampled. The original
            exception is attached as ``__cause__``.
    """
    try:
        # librosa automatically resamples if sr is provided
        speech, _ = librosa.load(audio_filepath, sr=target_sr)
    except Exception as e:
        # Chain explicitly so the underlying decoder/IO error stays visible
        # in tracebacks instead of being reduced to its message.
        raise RuntimeError(f"Error processing audio file {audio_filepath}: {e}") from e
    return speech

app/history.py CHANGED Viewed

@@ -1,13 +1,57 @@
-# This module manages saving transcriptions to history and exporting them
 def save_to_history(audio_filepath: str, transcript: str, language: str):
     """
-    Saves the transcription data to a local JSON or CSV file in the data/ directory.
     """
-    pass
 def export_history(format: str = "csv"):
     """
     Exports the saved history into a downloadable format.
     """
-    pass

import json
import os
import datetime
import csv

HISTORY_FILE = "data/history.json"

# Keys written for every history entry; also the CSV export column order.
_HISTORY_FIELDS = ("timestamp", "audio_file", "language", "transcript")


def _read_history() -> list:
    """Return the entries stored in HISTORY_FILE.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the file is not valid JSON or is not a JSON list.
    """
    with open(HISTORY_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError(f"{HISTORY_FILE} does not contain a JSON list")
    return data


def save_to_history(audio_filepath: str, transcript: str, language: str):
    """
    Saves the transcription data to a local JSON file in the data/ directory.

    The whole history is re-read, extended with one entry and rewritten. If
    the existing file is unreadable or malformed, a message is printed and the
    history restarts from this entry.

    Args:
        audio_filepath (str): Path of the transcribed audio; only its base
            name is stored.
        transcript (str): The transcription text.
        language (str): The detected language label.
    """
    history_dir = os.path.dirname(HISTORY_FILE)
    if history_dir:  # os.makedirs("") raises, so skip when there is no directory part
        os.makedirs(history_dir, exist_ok=True)

    entry = {
        "timestamp": datetime.datetime.now().isoformat(),
        "audio_file": os.path.basename(audio_filepath),
        "language": language,
        "transcript": transcript,
    }

    history = []
    if os.path.exists(HISTORY_FILE):
        try:
            history = _read_history()
        except (OSError, ValueError) as e:
            # Best effort: never block a new transcription on a bad file,
            # but say so instead of discarding the old contents silently.
            print(f"Existing history could not be read and will be replaced: {e}")

    history.append(entry)

    # Explicit UTF-8 and ensure_ascii=False keep non-English transcripts
    # readable in the file regardless of the platform's default encoding.
    with open(HISTORY_FILE, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=4, ensure_ascii=False)


def export_history(format: str = "csv"):
    """
    Exports the saved history into a downloadable format.

    Only CSV is produced; the ``format`` argument is accepted for interface
    compatibility and is currently ignored.

    Returns:
        The path to the exported file, or None when there is no history file
        or the export fails (the failure is printed).
    """
    if not os.path.exists(HISTORY_FILE):
        return None

    export_path = f"data/export_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    try:
        history = _read_history()
        # UTF-8 so transcripts in any script can be written on every platform.
        with open(export_path, "w", newline='', encoding="utf-8") as f:
            # extrasaction="ignore": an entry with unexpected keys should not
            # abort the whole export; missing keys are written as empty cells.
            writer = csv.DictWriter(f, fieldnames=_HISTORY_FIELDS, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(row for row in history if isinstance(row, dict))
        return export_path
    except Exception as e:
        # UI boundary: report and signal failure with None rather than raising.
        print(f"Failed to export history: {e}")
        return None

app/language_detection.py CHANGED Viewed

@@ -1,8 +1,19 @@
-# This module handles language detection logic
-def detect_language(audio_filepath: str) -> str:
     """
-    Optional feature to detect the spoken language in the audio file.
-    Could use a separate small classification model or an API.
     """
-    return "English (Placeholder)"

+from langdetect import detect, DetectorFactory
+# Ensure consistent results
+DetectorFactory.seed = 0
def detect_language_from_text(text: str) -> str:
    """
    Detects language based on the transcribed text.

    Args:
        text (str): The transcript to classify.

    Returns:
        str: The language code reported by langdetect, typically ISO 639-1
        (e.g., 'en', 'es', 'fr'); langdetect also reports a few
        region-qualified codes such as 'zh-cn'. Returns "Unknown" when the
        input is not a string, has fewer than two non-whitespace-trimmed
        characters, or detection fails.
    """
    # Guard before calling .strip(): None or other non-string input must not
    # raise outside the try block below.
    if not isinstance(text, str) or len(text.strip()) < 2:
        return "Unknown"

    try:
        return detect(text)
    except Exception:
        # Detection is optional metadata; any failure (e.g. text with no
        # usable language features) degrades to "Unknown".
        return "Unknown"