# transcribe-web / app.py
# Author: tanujasarma — commit b86558b (verified): "Update app.py"
import gradio as gr
import torch
import librosa
import numpy as np
import noisereduce as nr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# -------------------------------
# Load model
# -------------------------------
# Assamese fine-tuned XLSR-53 checkpoint, fetched from the Hugging Face Hub
# on first run (network I/O), then served from the local cache.
MODEL_ID = "infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese"
# Prefer GPU when available; inference falls back to CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model {MODEL_ID} on {DEVICE}...")
# The processor bundles the feature extractor (raw waveform -> model inputs)
# and the tokenizer used later for CTC decoding.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE)
print("✅ Model loaded.")
# -------------------------------
# Audio Preprocessing Function
# -------------------------------
def preprocess_audio(audio_path):
    """
    Clean up an audio file before transcription.

    Steps:
      1. Decode and resample to 16 kHz mono (the rate the model expects).
      2. Strip leading/trailing silence.
      3. Suppress stationary background noise.
      4. Peak-normalize so the loudest sample has magnitude 1.0.

    Returns a ``(samples, sample_rate)`` tuple where ``samples`` is a
    1-D float array.
    """
    waveform, rate = librosa.load(audio_path, sr=16000, mono=True)
    waveform, _ = librosa.effects.trim(waveform)
    cleaned = nr.reduce_noise(y=waveform, sr=rate)
    peak = np.max(np.abs(cleaned))
    if peak > 0:  # guard against division by zero on all-silent input
        cleaned = cleaned / peak
    return cleaned, rate
# -------------------------------
# Transcription Function
# -------------------------------
def transcribe(audio_path):
    """
    Run the full pipeline on one uploaded file: preprocess the audio,
    feed it through the Wav2Vec2 CTC model, and greedy-decode the logits.

    Returns the decoded text, or an error string so the Gradio UI can
    display the failure instead of crashing.
    """
    try:
        speech, rate = preprocess_audio(audio_path)
        batch = processor(speech, sampling_rate=rate, return_tensors="pt", padding=True)
        batch = {key: tensor.to(DEVICE) for key, tensor in batch.items()}
        with torch.no_grad():
            logits = model(**batch).logits
        # Greedy CTC decoding: highest-probability token at each frame.
        token_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(token_ids)[0]
    except Exception as e:
        # Best-effort: surface any failure (bad file, decode error) in the UI.
        return f"❌ Error: {str(e)}"
# -------------------------------
# Gradio Interface
# -------------------------------
# NOTE: `demo` is the conventional module-level name that hosting platforms
# (e.g. Hugging Face Spaces) look for — do not rename it.
demo = gr.Interface(
    fn=transcribe,  # one audio filepath in -> one transcription string out
    inputs=gr.Audio(type="filepath", label="Upload audio (wav/mp3)"),
    outputs="text",
    title="Assamese Transcription by Tanuja and Kritika",
    description="Upload an audio file (16kHz recommended). The model will transcribe it to Assamese."
)

# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()