import gradio as gr
from transformers import pipeline
# ==============================
# CONFIG
# ==============================
MODEL_PATH = "Sabbir772/BNWCH"  # fine-tuned Whisper checkpoint on the HF Hub
LANGUAGE = "bn"  # Bengali
TASK = "transcribe"  # speech-to-text (not "translate")

# ==============================
# LOAD MODEL
# ==============================
print(f"🚀 Loading model from {MODEL_PATH} ...")
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    chunk_length_s=30,  # You can tune this for performance
    device=-1,  # Use CPU on Spaces unless GPU is enabled
)
# Force Bengali transcription prompts so decoding never drifts into
# language auto-detection or translation mode.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=LANGUAGE, task=TASK
)
print("✅ Model loaded successfully!\n")
# ==============================
# DEFINE INFERENCE FUNCTION
# ==============================
def transcribe(audio):
    """Transcribe a Gradio audio input to Bangla text.

    Args:
        audio: ``(sample_rate, numpy_data)`` tuple from ``gr.Audio(type="numpy")``,
            or ``None`` when the user submitted nothing.

    Returns:
        The stripped transcription string, or a placeholder message when no
        audio was provided.
    """
    if audio is None:
        return "No audio provided."
    # Local import keeps this fix self-contained; numpy is already a
    # transitive dependency of transformers/gradio.
    import numpy as np

    # Gradio passes (sample_rate, data); data is typically int16 PCM.
    sr, data = audio
    if np.issubdtype(data.dtype, np.integer):
        # Integer PCM -> float32 in [-1, 1], which the feature extractor expects.
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)
    if data.ndim > 1:
        data = data.mean(axis=1)  # downmix stereo to mono
    # BUG FIX: pass the real sample rate. Feeding the bare array made the
    # pipeline assume its default rate (16 kHz), garbling audio recorded at
    # any other rate (browser mics commonly record at 44.1/48 kHz).
    result = pipe({"sampling_rate": sr, "raw": data})["text"]
    return result.strip()
# ==============================
# DEFINE GRADIO INTERFACE
# ==============================
title = "Bangla Whisper ASR (Chittagong Dialect)"
# NOTE: the original emoji were mojibake ("ποΈ", "π§", "π"); restored to
# plausible glyphs. Gradio renders the description as Markdown/HTML.
description = (
    "🎙️ Upload or record audio to transcribe Bangla (Chittagong dialect) speech "
    "using fine-tuned Whisper model. <br><br>"
    "Model: **Sabbir772/BNWCH**"
)
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎧 Input Audio"),
    outputs=gr.Textbox(label="📝 Transcription", placeholder="Model output will appear here..."),
    title=title,
    description=description,
    allow_flagging="never",  # deprecated alias of flagging_mode in Gradio 5.x, kept for compatibility
)
# ==============================
# LAUNCH APP
# ==============================
# Guarded so importing this module (e.g. by a Spaces runner) does not
# start a second server.
if __name__ == "__main__":
    demo.launch()