File size: 2,356 Bytes
1ca5d7d
628c2f9
1ca5d7d
4a76d1d
83ec9a0
1ca5d7d
4a76d1d
1ca5d7d
628c2f9
1ca5d7d
 
4a76d1d
1ca5d7d
360d8a8
8c7cb50
 
628c2f9
 
 
a3e165e
1ca5d7d
4a76d1d
83ec9a0
677d4e5
83ec9a0
 
 
 
1ca5d7d
83ec9a0
 
 
1ca5d7d
 
 
677d4e5
 
83ec9a0
 
1ca5d7d
 
 
 
677d4e5
1ca5d7d
 
 
 
83ec9a0
1ca5d7d
83ec9a0
677d4e5
83ec9a0
1ca5d7d
83ec9a0
 
4a76d1d
677d4e5
4a76d1d
 
360d8a8
4a76d1d
1ca5d7d
 
4a76d1d
 
6a624d9
1ca5d7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from transformers import Wav2Vec2Processor, HubertForCTC
import torch
import gradio as gr
import librosa
import numpy as np
import os

# Load the processor and the fine-tuned HuBERT CTC model from the Hugging Face Hub.
# NOTE: downloads the weights on first run, so network access is required here.
processor = Wav2Vec2Processor.from_pretrained("Athallah11/hubert-javanase-jasper-finetuned")
model = HubertForCTC.from_pretrained("Athallah11/hubert-javanase-jasper-finetuned")
model.eval()  # inference mode: disables dropout / training-only behavior

# Preprocess audio: scale integer PCM to [-1, 1], down-mix to mono, resample to 16 kHz
def preprocess(audio, sr):
    """Prepare a raw audio array for the HuBERT processor.

    Args:
        audio: numpy array, 1-D mono or 2-D (frames, channels); integer PCM
            (e.g. int16 from the Gradio microphone) or floating point.
        sr: sampling rate of ``audio`` in Hz.

    Returns:
        1-D floating-point array at 16 kHz.
    """
    # Integer PCM -> float32 in [-1, 1]. The original cast alone left int16
    # samples at up to +/-32767, far outside the range the model expects.
    if not np.issubdtype(audio.dtype, np.floating):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    # Down-mix multi-channel (stereo) recordings to a single channel.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    return audio

# Main transcription entry point wired to the Gradio interface.
def transcribe(audio):
    """Transcribe Javanese speech to lowercase text.

    Accepts either a ``(sample_rate, ndarray)`` tuple (microphone input) or a
    filepath string (uploaded file). Returns the transcription, or an
    Indonesian error message string on failure.
    """
    try:
        # Normalize the two input formats Gradio may deliver.
        if isinstance(audio, tuple):
            sr, samples = audio
        elif isinstance(audio, str):
            samples, sr = librosa.load(audio, sr=None)
            os.remove(audio)  # clean up the uploaded temp file
        else:
            return "Format audio tidak dikenali."

        # Reject clips shorter than one second.
        if len(samples) / sr < 1.0:
            return "Audio terlalu pendek, coba rekam minimal 1 detik."

        signal = preprocess(samples, sr)

        # Peak-normalize so volume is uniform across recordings.
        peak = np.max(np.abs(signal))
        if peak > 0:
            signal = signal / peak

        # Tokenize, run the model, and greedy-decode the CTC output.
        inputs = processor(signal, return_tensors="pt", sampling_rate=16000, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)[0]

        return text.lower()

    except Exception as e:
        return f"Terjadi kesalahan: {str(e)}"

# Gradio UI: one audio input (microphone recording or file upload) -> transcribed text.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="Rekam atau Unggah Audio"),
    outputs="text",
    title="Jasper - ASR Bahasa Jawa",
    description="Model ASR Bahasa Jawa fine-tuned dengan HuBERT\n\nTips:\n- Pastikan suara direkam dengan jelas.\n- Rekam minimal 1 detik untuk hasil optimal."
)

if __name__ == "__main__":
    iface.launch()  # start the local Gradio server when run as a script