File size: 2,591 Bytes
09a9b03
b1133a6
cb0a694
 
 
 
c48d761
cb0a694
 
c48d761
 
 
 
 
 
 
 
 
5d13a75
c48d761
5d13a75
c48d761
5d13a75
c48d761
 
 
 
 
 
 
 
 
 
 
 
 
 
09a9b03
c48d761
 
c838225
 
 
 
c48d761
c838225
 
 
 
 
 
 
c48d761
c838225
c48d761
c838225
c48d761
 
c838225
c48d761
 
 
c838225
 
 
 
 
 
cb0a694
 
 
c48d761
 
 
 
 
 
 
cb0a694
 
c48d761
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Gradio for Multi ASR
import gradio as gr
import torch
import soundfile as sf
from transformers import pipeline

# Device setup: prefer the first CUDA GPU when available, otherwise CPU.
# The string form ("cuda:0" / "cpu") is accepted by transformers' pipeline().
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Pipeline cache: maps language name -> loaded ASR pipeline, so each model
# is downloaded/initialized at most once per process (see get_pipeline).
pipelines = {}

def get_pipeline(language):
    """Return a cached ASR pipeline for *language*, loading it on first use.

    Args:
        language: One of "English", "Hindi", "Tamil", "Malayalam".

    Returns:
        A transformers automatic-speech-recognition pipeline for the
        requested language, cached in the module-level ``pipelines`` dict.

    Raises:
        ValueError: If *language* is not one of the supported choices.
    """
    if language in pipelines:
        return pipelines[language]

    # Map each supported language to its Hugging Face model checkpoint.
    model_names = {
        "English": "openai/whisper-small",
        "Hindi": "vasista22/whisper-hindi-small",
        "Tamil": "vasista22/whisper-tamil-small",
        "Malayalam": "vrclc/W2V2-BERT-withLM-Malayalam-Studio",
    }
    try:
        model_name = model_names[language]
    except KeyError:
        # Suppress the KeyError context: callers only care about the
        # unsupported-language condition, not the lookup mechanics.
        raise ValueError("Unsupported language") from None

    print(f"[INFO] Loading model for {language}: {model_name}")
    pipelines[language] = pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=device,
        # Split long recordings into 10 s chunks so memory stays bounded.
        chunk_length_s=10,
    )
    return pipelines[language]

# Transcription code with error debugging
def transcribe(audio, language):
    """Transcribes speech from an audio file based on selected language."""
    try:
        if audio is None:
            return "Please record or upload an audio file."

        print(f"[DEBUG] Language: {language}")
        print(f"[DEBUG] Received audio: {audio}")

        audio_path = audio if isinstance(audio, str) else audio.get("name", None)
        if audio_path is None:
            return "Could not read audio file."

        audio_data, sample_rate = sf.read(audio_path)
        print(f"[DEBUG] Sample rate: {sample_rate}, shape: {audio_data.shape}")

        pipe = get_pipeline(language)

        # Prepare input format for transformers pipeline
        input_data = {"array": audio_data, "sampling_rate": sample_rate}

        result = pipe(input_data)["text"]
        print(f"[DEBUG] Transcription: {result}")
        return result

    except Exception as e:
        import traceback
        print("[ERROR] Exception during transcription:")
        traceback.print_exc()
        return f"Error: {str(e)}"

# Build the Gradio UI: one audio input (mic or file upload, delivered as a
# filesystem path per type="filepath"), one language dropdown, and a textbox
# showing the transcription returned by transcribe().
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or Upload Audio"),
        gr.Dropdown(choices=["English", "Hindi", "Tamil", "Malayalam"], label="Select Language")
    ],
    outputs=gr.Textbox(label="Transcribed Text"),
    title="Multilingual Speech Recognition",
    description="Select a language and provide speech input to get transcription."
)

# Start the local Gradio server (blocks until the app is stopped).
iface.launch()