"""Gradio app that transcribes uploaded audio with the IndicConformer model.

At import time the module logs in to Hugging Face (when ``HF_TOKEN`` is set),
loads the ASR model, and builds the Gradio UI. Running the file as a script
launches the web server on port 7860.
"""

import os
import traceback

import gradio as gr
import torch
import torchaudio
from huggingface_hub import login
from transformers import AutoModel

# Authenticate with Hugging Face.
# The token is read from the HF_TOKEN environment variable (configured as a
# secret in the Space settings).
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("✅ Authenticated with Hugging Face")
else:
    print("⚠️ HF_TOKEN not found. Make sure to add it in Space settings.")

# Initialize device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load IndicConformer model
print("Loading IndicConformer model...")
indic_asr_model = AutoModel.from_pretrained(
    "ai4bharat/indic-conformer-600m-multilingual",
    trust_remote_code=True,  # the model ships its own modelling code
    token=hf_token,  # Pass token explicitly
)
if device == "cuda":
    indic_asr_model = indic_asr_model.to(device)
print("Model loaded successfully")


def transcribe_audio(audio_file, language):
    """Transcribe an audio file with the IndicConformer model.

    Args:
        audio_file: Path to the audio file (Gradio passes a filepath), or
            ``None`` when nothing was uploaded.
        language: Language identifier forwarded to the model. Surrounding
            whitespace is ignored.
            NOTE(review): the UI suggests names such as 'hindi'; the model may
            expect language codes such as 'hi' -- confirm against the model
            card.

    Returns:
        The transcription produced by the model, or a human-readable message
        starting with "❌" when input is missing or transcription fails.
    """
    if audio_file is None:
        return "❌ No audio file provided"

    if not language or not language.strip():
        return "❌ Please specify a language"

    # Pass the same normalized value that was validated above.
    language = language.strip()

    try:
        # Load audio
        wav, sr = torchaudio.load(audio_file)

        # Convert to mono if the file has more than one channel
        if wav.shape[0] > 1:
            wav = torch.mean(wav, dim=0, keepdim=True)

        # Resample to 16kHz if needed
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            wav = resampler(wav)

        # Move to device
        if device == "cuda":
            wav = wav.to(device)

        # Transcribe; gradients are never needed for inference.
        with torch.no_grad():
            transcription = indic_asr_model(wav, language, "ctc")

        return transcription if transcription else "❌ Transcription failed"

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # request, but keep the full traceback in the server logs.
        traceback.print_exc()
        return f"❌ Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Speech Recognition") as app:
    gr.Markdown("# 🎤 Multilingual Speech Recognition")
    gr.Markdown("Upload audio and specify language (e.g., 'sanskrit', 'hindi', 'tamil')")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            language_input = gr.Textbox(
                label="Language",
                placeholder="e.g., sanskrit, hindi, tamil",
            )
            transcribe_btn = gr.Button("🚀 Transcribe", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Transcription", lines=10)

    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_input],
        outputs=output,
    )

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)