# jasper / app.py
# Athallah11 - Update app.py (commit 6a624d9, verified)
from transformers import Wav2Vec2Processor, HubertForCTC
import torch
import gradio as gr
import librosa
import numpy as np
import os
# Load the processor and the model from the Hugging Face Hub.
# Both must come from the same checkpoint, so the id is defined once.
MODEL_ID = "Athallah11/hubert-javanase-jasper-finetuned"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = HubertForCTC.from_pretrained(MODEL_ID)
model.eval()  # the model is only used for inference in this file
# Preprocess audio: convert to mono float32 and resample to 16 kHz
def preprocess(audio, sr):
    """Return ``audio`` as a mono float32 waveform sampled at 16 kHz.

    Args:
        audio: Waveform samples as a NumPy array (or array-like). A
            multi-dimensional array is treated as ``(samples, channels)``
            and averaged down to a single channel.
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        A ``float32`` array. Integer PCM input is cast without rescaling,
        so amplitude normalization is left to the caller.
    """
    # Always cast: float64 input would otherwise slip through unchanged.
    audio = np.asarray(audio, dtype=np.float32)

    if audio.ndim > 1:
        # Assumes the (samples, channels) layout used by Gradio's numpy
        # audio. Without the downmix, resampling would run along the channel
        # axis and downstream code would receive a 2-D array.
        audio = audio.mean(axis=1)

    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    return audio
# Main transcription function
def transcribe(audio):
    """Transcribe a recording and return the text in lowercase.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple or a ``str`` path
            to an audio file. A file passed by path is deleted once it has
            been read (or has failed to load).

    Returns:
        The lowercase transcription, or a user-facing message when the
        input type is not recognized, the clip is shorter than one second,
        or an exception is raised while processing.
    """
    try:
        # Detect the input audio format
        if isinstance(audio, tuple):
            sr, audio_data = audio
        elif isinstance(audio, str):
            try:
                audio_data, sr = librosa.load(audio, sr=None)
            finally:
                # Remove the uploaded temporary file even when decoding
                # fails, so broken uploads do not pile up on disk. Cleanup
                # is best-effort: a failed delete must not discard audio
                # that was loaded successfully.
                try:
                    os.remove(audio)
                except OSError:
                    pass
        else:
            return "Format audio tidak dikenali."

        # Reject recordings shorter than one second
        duration = len(audio_data) / sr
        if duration < 1.0:
            return "Audio terlalu pendek, coba rekam minimal 1 detik."

        input_values = preprocess(audio_data, sr)

        # Peak-normalize so the volume is uniform; all-zero input is left
        # untouched to avoid dividing by zero.
        peak = np.max(np.abs(input_values))
        if peak > 0:
            input_values = input_values / peak

        # Feature extraction and prediction
        inputs = processor(input_values, return_tensors="pt", sampling_rate=16000, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription.lower()
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Terjadi kesalahan: {e}"
# Gradio UI: one audio input (microphone or upload) mapped to a text output
iface = gr.Interface(
    title="Jasper - ASR Bahasa Jawa",
    description=(
        "Model ASR Bahasa Jawa fine-tuned dengan HuBERT\n\n"
        "Tips:\n"
        "- Pastikan suara direkam dengan jelas.\n"
        "- Rekam minimal 1 detik untuk hasil optimal."
    ),
    fn=transcribe,
    inputs=gr.Audio(
        label="Rekam atau Unggah Audio",
        type="numpy",
        sources=["microphone", "upload"],
    ),
    outputs="text",
)

# Start the web app only when this file is run as a script
if __name__ == "__main__":
    iface.launch()