File size: 1,741 Bytes
f149168
9812441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import streamlit as st
import torch
import tempfile
import os
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Fine-tuned Whisper checkpoint on the Hugging Face Hub; downloaded (and
# cached locally by transformers) the first time the app loads the model.
MODEL_NAME = "chiyo123/whisper-small-tonga"

@st.cache_resource
def load_model_and_processor():
    """Download (or reuse the local cache of) the fine-tuned Whisper model.

    Decorated with ``st.cache_resource`` so Streamlit loads the weights
    once per process instead of on every script rerun.

    Returns:
        tuple: ``(WhisperProcessor, WhisperForConditionalGeneration)`` with
        the model switched to eval mode for inference.
    """
    whisper_processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
    whisper_model.eval()  # disable dropout etc. for deterministic inference
    return whisper_processor, whisper_model

# Loaded once per process (cached by @st.cache_resource across reruns).
processor, model = load_model_and_processor()

# Streamlit UI
st.title("🗣️ Custom Whisper Transcriber")
st.write("Upload an audio file and transcribe it using your fine-tuned Whisper model.")

uploaded_file = st.file_uploader("Upload audio", type=["mp3", "wav", "flac", "m4a"])
language = st.text_input("Target language code (e.g., loz, bemba, en)", value="loz")

if uploaded_file:
    # Preserve the upload's real extension so torchaudio selects the right
    # decoder (the original always wrote ".wav", which can break decoding
    # of mp3/flac/m4a uploads depending on the backend).
    suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name

    try:
        # Load and preprocess audio: downmix to mono, resample to the
        # 16 kHz rate Whisper was trained on.
        speech_array, sampling_rate = torchaudio.load(tmp_path)
        if speech_array.dim() > 1 and speech_array.size(0) > 1:
            # Stereo/multichannel input: average the channels. A bare
            # .squeeze() would leave a (channels, time) tensor that the
            # processor cannot interpret as a single waveform.
            speech_array = speech_array.mean(dim=0, keepdim=True)
        speech_array = torchaudio.functional.resample(
            speech_array, orig_freq=sampling_rate, new_freq=16000
        )
        input_features = processor(
            speech_array.squeeze(), return_tensors="pt", sampling_rate=16000
        ).input_features

        # Generate: forced decoder ids pin the output language and the
        # "transcribe" task (as opposed to "translate").
        with st.spinner("Transcribing..."):
            forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        st.subheader("📄 Transcription")
        st.success(transcription)
    finally:
        # Always delete the temp file — the original skipped cleanup (and
        # leaked the file) whenever loading or transcription raised.
        os.remove(tmp_path)