File size: 2,798 Bytes
5294440
 
 
 
1034cbe
 
 
 
 
 
 
 
 
 
 
5294440
 
 
 
 
 
 
 
 
1034cbe
 
5294440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gradio as gr
import torch
import torchaudio
from transformers import AutoModel
from huggingface_hub import login
import os

# Authenticate with Hugging Face.
# The token is read from the HF_TOKEN environment variable (in a Space it is
# expected to be configured in the Space settings).
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    print("✅ Authenticated with Hugging Face")
else:
    # Not fatal at this point: startup continues and from_pretrained() below
    # simply receives token=None.
    print("⚠️ HF_TOKEN not found. Make sure to add it in Space settings.")

# Initialize device: use the GPU when one is visible to torch, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load IndicConformer model once at import time so every request reuses it.
print("Loading IndicConformer model...")
indic_asr_model = AutoModel.from_pretrained(
    "ai4bharat/indic-conformer-600m-multilingual",
    trust_remote_code=True,  # allows the repository's custom model code to run
    token=hf_token,  # Pass token explicitly
)
if device == "cuda":
    indic_asr_model = indic_asr_model.to(device)
print("Model loaded successfully")

def transcribe_audio(audio_file, language):
    """Transcribe an audio file with the module-level IndicConformer model.

    Args:
        audio_file: Path of the audio file to load, or ``None`` when nothing
            was uploaded.
        language: Language identifier forwarded to the model. Surrounding
            whitespace is removed and the value is lower-cased before use.

    Returns:
        The model's transcription, or a message starting with "❌" when the
        input is missing, the model returns an empty result, or any
        exception is raised while loading or transcribing.
    """
    if audio_file is None:
        return "❌ No audio file provided"

    # Normalize once so the value that passes validation is exactly the value
    # the model receives (previously " hi " passed the check but was
    # forwarded with its whitespace intact).
    language = (language or "").strip().lower()
    if not language:
        return "❌ Please specify a language"

    # NOTE(review): the published usage example for this checkpoint passes a
    # short code such as "hi"; confirm whether full names like "hindi" (as
    # the UI hint suggests) are accepted, or map names to codes here.

    try:
        # Load audio
        wav, sr = torchaudio.load(audio_file)

        # Convert to mono if stereo by averaging the channels
        if wav.shape[0] > 1:
            wav = torch.mean(wav, dim=0, keepdim=True)

        # Resample to 16kHz if needed
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            wav = resampler(wav)

        # Move to device
        if device == "cuda":
            wav = wav.to(device)

        # Transcribe; gradients are never needed for inference, so skip
        # autograd bookkeeping.
        with torch.no_grad():
            transcription = indic_asr_model(wav, language, "ctc")

        return transcription if transcription else "❌ Transcription failed"

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        # the request handler.
        return f"❌ Error: {str(e)}"

# Create Gradio interface: inputs on the left, transcription on the right.
with gr.Blocks(title="Speech Recognition") as app:
    gr.Markdown("# 🎤 Multilingual Speech Recognition")
    gr.Markdown("Upload audio and specify language (e.g., 'sanskrit', 'hindi', 'tamil')")

    with gr.Row():
        with gr.Column():
            # type="filepath" makes the callback receive a path on disk
            # rather than raw samples.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            language_input = gr.Textbox(
                label="Language",
                placeholder="e.g., sanskrit, hindi, tamil",
            )
            transcribe_btn = gr.Button("🚀 Transcribe", variant="primary")

        with gr.Column():
            output = gr.Textbox(label="Transcription", lines=10)

    # Wire the button: (audio path, language text) -> transcription text.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_input],
        outputs=output,
    )

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    app.launch(server_name="0.0.0.0", server_port=7860)