# Hugging Face Space "Jasper" — Javanese ASR demo (app.py)
from transformers import Wav2Vec2Processor, HubertForCTC
import torch
import gradio as gr
import librosa
import numpy as np
import os
# Load the processor and the fine-tuned HuBERT CTC model from the Hugging Face Hub.
processor = Wav2Vec2Processor.from_pretrained("Athallah11/hubert-javanase-jasper-finetuned")
model = HubertForCTC.from_pretrained("Athallah11/hubert-javanase-jasper-finetuned")
# Inference only: switch off dropout and other training-mode behavior.
model.eval()
# Preprocess audio: convert to float32 and resample to 16 kHz.
def preprocess(audio, sr):
    """Return *audio* as a float32 array sampled at 16 kHz.

    Integer PCM (e.g. int16 from the Gradio microphone) is scaled into
    [-1.0, 1.0] by the dtype's positive max; float input (e.g. float64
    from ``librosa.load``) is cast to float32 so the model always sees
    a consistent dtype.  The caller peak-normalizes afterwards, so the
    scaling here does not change the end-to-end result.
    """
    if np.issubdtype(audio.dtype, np.integer):
        # Divide by the dtype's max so int16 32767 maps to ~1.0.
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        # Previously float64 passed through uncast; cast for consistency.
        audio = audio.astype(np.float32)
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    return audio
# Main transcription function.
def transcribe(audio):
    """Transcribe Javanese speech to lowercase text.

    Accepts either a ``(sample_rate, np.ndarray)`` tuple (Gradio
    ``type="numpy"``) or a filepath string for uploads.  Returns the
    transcription, or an Indonesian error message string on failure —
    the broad except keeps the UI responsive instead of crashing.
    """
    try:
        # Detect which input format Gradio handed us.
        if isinstance(audio, tuple):
            sr, audio_data = audio
        elif isinstance(audio, str):
            audio_data, sr = librosa.load(audio, sr=None)
            os.remove(audio)  # delete the uploaded temp file
        else:
            return "Format audio tidak dikenali."
        # Gradio numpy audio can be stereo, shape (n_samples, n_channels);
        # the model expects mono, so average the channels — TODO confirm
        # channel layout against the deployed Gradio version.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        # Reject recordings shorter than one second.
        durasi = len(audio_data) / sr
        if durasi < 1.0:
            return "Audio terlalu pendek, coba rekam minimal 1 detik."
        input_values = preprocess(audio_data, sr)
        # Peak-normalize so loudness is uniform across recordings.
        max_abs = np.max(np.abs(input_values))
        if max_abs > 0:
            input_values = input_values / max_abs
        # Tokenize, run the model, and greedy-decode the CTC output.
        inputs = processor(input_values, return_tensors="pt", sampling_rate=16000, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription.lower()
    except Exception as e:
        return f"Terjadi kesalahan: {str(e)}"
# Gradio UI: microphone or file upload in, transcription text out.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy", label="Rekam atau Unggah Audio"),
    outputs="text",
    title="Jasper - ASR Bahasa Jawa",
    description="Model ASR Bahasa Jawa fine-tuned dengan HuBERT\n\nTips:\n- Pastikan suara direkam dengan jelas.\n- Rekam minimal 1 detik untuk hasil optimal."
)

# Launch only when executed as a script, not on import
# (fixes the stray '|' table-cell residue that trailed iface.launch()).
if __name__ == "__main__":
    iface.launch()