# transcribe-web / app.py
# Author: tanujasarma — commit b86558b (verified): "Update app.py"
import gradio as gr
import torch
import librosa
import numpy as np
import noisereduce as nr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
# -------------------------------
# Load model
# -------------------------------
# Assamese fine-tuned XLSR-53 checkpoint, fetched from the Hugging Face Hub
# on first run (network I/O), then served from the local cache.
MODEL_ID = "infinitejoy/Wav2Vec2-Large-XLSR-53-Assamese"
# Prefer GPU when available; inference falls back to CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model {MODEL_ID} on {DEVICE}...")
# The processor bundles the feature extractor (raw waveform -> model inputs)
# and the tokenizer used later for CTC decoding.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).to(DEVICE)
print("✅ Model loaded.")
# -------------------------------
# Audio Preprocessing Function
# -------------------------------
def preprocess_audio(audio_path):
    """
    Clean up an audio file before transcription.

    Steps:
      1. Decode and resample to 16 kHz mono (the rate the model expects).
      2. Strip leading/trailing silence.
      3. Suppress stationary background noise.
      4. Peak-normalize so the loudest sample has magnitude 1.0.

    Returns a ``(samples, sample_rate)`` tuple where ``samples`` is a
    1-D float array.
    """
    waveform, rate = librosa.load(audio_path, sr=16000, mono=True)
    waveform, _ = librosa.effects.trim(waveform)
    cleaned = nr.reduce_noise(y=waveform, sr=rate)
    peak = np.max(np.abs(cleaned))
    if peak > 0:  # guard against division by zero on all-silent input
        cleaned = cleaned / peak
    return cleaned, rate
# -------------------------------
# Transcription Function
# -------------------------------
def transcribe(audio_path):
    """
    Run the full pipeline on one uploaded file: preprocess the audio,
    feed it through the Wav2Vec2 CTC model, and greedy-decode the logits.

    Returns the decoded text, or an error string so the Gradio UI can
    display the failure instead of crashing.
    """
    try:
        speech, rate = preprocess_audio(audio_path)
        batch = processor(speech, sampling_rate=rate, return_tensors="pt", padding=True)
        batch = {key: tensor.to(DEVICE) for key, tensor in batch.items()}
        with torch.no_grad():
            logits = model(**batch).logits
        # Greedy CTC decoding: highest-probability token at each frame.
        token_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(token_ids)[0]
    except Exception as e:
        # Best-effort: surface any failure (bad file, decode error) in the UI.
        return f"❌ Error: {str(e)}"
# -------------------------------
# Gradio Interface
# -------------------------------
# NOTE: `demo` is the conventional module-level name that hosting platforms
# (e.g. Hugging Face Spaces) look for — do not rename it.
demo = gr.Interface(
    fn=transcribe,  # one audio filepath in -> one transcription string out
    inputs=gr.Audio(type="filepath", label="Upload audio (wav/mp3)"),
    outputs="text",
    title="Assamese Transcription by Tanuja and Kritika",
    description="Upload an audio file (16kHz recommended). The model will transcribe it to Assamese."
)

# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()