MultilingualASR / app.py
rverma0631's picture
Update app.py
1034cbe verified
import gradio as gr
import torch
import torchaudio
from transformers import AutoModel
from huggingface_hub import login
import os
# Authenticate with the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable; on HF Spaces it is
# expected to be configured in the Space settings.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("✅ Authenticated with Hugging Face")
else:
    # No login is attempted; hf_token stays None for the rest of the module.
    print("⚠️ HF_TOKEN not found. Make sure to add it in Space settings.")
# Select the compute device: prefer the GPU when CUDA is usable, otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
# Fetch the IndicConformer checkpoint from the Hub.
# trust_remote_code=True lets transformers run the model code shipped in the
# repository; the token is forwarded explicitly to the download call.
print("Loading IndicConformer model...")
indic_asr_model = AutoModel.from_pretrained(
    "ai4bharat/indic-conformer-600m-multilingual",
    token=hf_token,
    trust_remote_code=True,
)
# Only a CUDA device needs an explicit transfer; on CPU the model is used as loaded.
if device == "cuda":
    indic_asr_model = indic_asr_model.to(device)
print("Model loaded successfully")
def transcribe_audio(audio_file, language):
    """Transcribe an audio file with the IndicConformer model.

    Args:
        audio_file: Path to the audio file to transcribe, or ``None`` when
            no audio was supplied.
        language: Language identifier forwarded to the model. Leading and
            trailing whitespace is ignored.

    Returns:
        The transcription returned by the model, or a human-readable message
        prefixed with "❌" when the input is invalid or transcription fails.
        Errors are reported through the return value, never raised.
    """
    if audio_file is None:
        return "❌ No audio file provided"

    # Normalise once so the value that passed validation is exactly the value
    # handed to the model (previously the unstripped text was forwarded).
    language = language.strip() if language else ""
    if not language:
        return "❌ Please specify a language"

    # NOTE(review): the upstream model card examples appear to pass short
    # language codes (e.g. "hi"); confirm that full names such as "hindi"
    # are accepted by the model.
    try:
        waveform, sample_rate = torchaudio.load(audio_file)

        # A zero-length file would otherwise fail deep inside resampling or
        # the model with an opaque error.
        if waveform.numel() == 0:
            return "❌ Audio file is empty"

        # Down-mix multi-channel audio to a single channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # The model is fed 16 kHz audio; resample anything else.
        if sample_rate != 16000:
            resample = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resample(waveform)

        if device == "cuda":
            waveform = waveform.to(device)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            transcription = indic_asr_model(waveform, language, "ctc")
    except Exception as e:
        # UI boundary: keep the app alive and show the message to the user,
        # but leave the full traceback in the server log for debugging.
        import traceback

        traceback.print_exc()
        return f"❌ Error: {str(e)}"

    return transcription if transcription else "❌ Transcription failed"
# Build the Gradio interface: inputs (audio, language, button) in the left
# column, the transcription output in the right column.
with gr.Blocks(title="Speech Recognition") as app:
    gr.Markdown("# 🎤 Multilingual Speech Recognition")
    gr.Markdown("Upload audio and specify language (e.g., 'sanskrit', 'hindi', 'tamil')")

    with gr.Row():
        with gr.Column():
            # type="filepath" makes Gradio hand the callback a path string.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            language_input = gr.Textbox(
                label="Language",
                placeholder="e.g., sanskrit, hindi, tamil"
            )
            transcribe_btn = gr.Button("🚀 Transcribe", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Transcription", lines=10)

    # Wire the button to the transcription callback.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_input],
        outputs=output
    )
if __name__ == "__main__":
    # Serve on every network interface, port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    app.launch(**launch_options)