# NOTE: extraction residue from the Hugging Face Spaces file viewer (not part of the program):
# Space status "Sleeping"; file size 2,798 bytes; commits 5294440 and 1034cbe; line-number gutter 1-89.
import gradio as gr
import torch
import torchaudio
from transformers import AutoModel
from huggingface_hub import login
import os
# Authenticate with the Hugging Face Hub.
# On HF Spaces, a secret named HF_TOKEN added in the Space settings is
# exposed to the app as an environment variable.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("✅ Authenticated with Hugging Face")
else:
    # Do not abort: start-up continues without credentials.
    print("⚠️ HF_TOKEN not found. Make sure to add it in Space settings.")
# Select the compute device: use the GPU whenever CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
# Load the IndicConformer model once at start-up so every request reuses it.
print("Loading IndicConformer model...")
indic_asr_model = AutoModel.from_pretrained(
    "ai4bharat/indic-conformer-600m-multilingual",
    trust_remote_code=True,  # allow the custom model code hosted in the repo to run
    token=hf_token,  # pass the token explicitly
)
# Only move the model when a GPU was selected.
if device == "cuda":
    indic_asr_model = indic_asr_model.to(device)
print("Model loaded successfully")
def transcribe_audio(audio_file, language):
    """Transcribe an audio file with the IndicConformer model.

    Args:
        audio_file: Path of the audio file to transcribe, or None when
            nothing was uploaded.
        language: Language identifier forwarded to the model. Surrounding
            whitespace is ignored.

    Returns:
        The model's transcription, or a message starting with "❌" when an
        input is missing, the model returns nothing, or an exception is
        raised while loading or transcribing the audio.
    """
    if audio_file is None:
        return "❌ No audio file provided"

    # Normalise once so the value that is validated is the value that is used.
    language = language.strip() if language else ""
    if not language:
        return "❌ Please specify a language"

    try:
        wav, sr = torchaudio.load(audio_file)

        # Down-mix multi-channel audio to mono by averaging the channels.
        if wav.shape[0] > 1:
            wav = torch.mean(wav, dim=0, keepdim=True)

        # Resample to 16 kHz if needed.
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            wav = resampler(wav)

        if device == "cuda":
            wav = wav.to(device)

        # NOTE(review): the UI placeholder suggests full names ("hindi"),
        # while the upstream model card examples appear to use codes such as
        # "hi" -- confirm which form the model accepts.
        transcription = indic_asr_model(wav, language, "ctc")
        return transcription if transcription else "❌ Transcription failed"
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        return f"❌ Error: {e}"
# Build the Gradio interface: inputs on the left, transcription on the right.
with gr.Blocks(title="Speech Recognition") as app:
    gr.Markdown("# 🎤 Multilingual Speech Recognition")
    gr.Markdown("Upload audio and specify language (e.g., 'sanskrit', 'hindi', 'tamil')")
    with gr.Row():
        with gr.Column():
            # type="filepath" hands the callback a path on disk.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            language_input = gr.Textbox(
                label="Language",
                placeholder="e.g., sanskrit, hindi, tamil",
            )
            # NOTE(review): the leading "π" looks like the remains of a
            # mis-decoded emoji; the original glyph is unknown -- restore it.
            transcribe_btn = gr.Button("π Transcribe", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Transcription", lines=10)
    # Event listeners must be registered inside the Blocks context.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_input],
        outputs=output,
    )
if __name__ == "__main__":
    # Listen on all network interfaces, port 7860.
    app.launch(server_name="0.0.0.0", server_port=7860)