"""Gradio demo for Javanese speech recognition with a fine-tuned HuBERT CTC model."""

import os

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import HubertForCTC, Wav2Vec2Processor

# Hugging Face repository that provides both the processor and the model.
MODEL_ID = "Athallah11/hubert-javanase-jasper-finetuned"
# Sample rate the audio is resampled to and that is declared to the processor.
TARGET_SAMPLE_RATE = 16000
# Recordings shorter than this (in seconds) are rejected before inference.
MIN_DURATION_SECONDS = 1.0

# Load the processor and the model once at import time; eval() switches the
# model to inference mode.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = HubertForCTC.from_pretrained(MODEL_ID)
model.eval()


def preprocess(audio: np.ndarray, sr: int) -> np.ndarray:
    """Return *audio* as a mono floating-point signal at TARGET_SAMPLE_RATE.

    Args:
        audio: Raw samples, either 1-D ``(samples,)`` or 2-D
            ``(samples, channels)`` as delivered by ``gr.Audio(type="numpy")``.
        sr: Sample rate of *audio* in Hz.

    Returns:
        A 1-D floating-point array sampled at ``TARGET_SAMPLE_RATE``.
    """
    if not np.issubdtype(audio.dtype, np.floating):
        audio = audio.astype(np.float32)
    if audio.ndim > 1:
        # Down-mix to mono first: librosa.resample works along the last axis,
        # which for (samples, channels) input would be the channel axis, and
        # the processor would treat a 2-D array as a batch.
        audio = audio.mean(axis=1)
    if sr != TARGET_SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
    return audio


def transcribe(audio) -> str:
    """Transcribe a recording and return the lower-cased text.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple or a path to an audio
            file. A file given by path is deleted after it has been loaded.

    Returns:
        The lower-cased transcription, or a user-facing message when the input
        type is not recognised, the audio is shorter than
        ``MIN_DURATION_SECONDS``, or any exception occurs during processing.
    """
    try:
        # Detect the input format.
        if isinstance(audio, tuple):
            sr, audio_data = audio
        elif isinstance(audio, str):
            audio_data, sr = librosa.load(audio, sr=None)
            try:
                os.remove(audio)  # delete the temporary uploaded file
            except OSError:
                # Cleanup is best-effort: the samples are already in memory,
                # so a failed delete must not abort the transcription.
                pass
        else:
            return "Format audio tidak dikenali."

        # Reject recordings that are too short.
        duration = len(audio_data) / sr
        if duration < MIN_DURATION_SECONDS:
            return "Audio terlalu pendek, coba rekam minimal 1 detik."

        input_values = preprocess(audio_data, sr)

        # Peak-normalise so the loudest sample has magnitude 1; skipped for
        # all-zero audio to avoid dividing by zero.
        max_abs = np.max(np.abs(input_values))
        if max_abs > 0:
            input_values = input_values / max_abs

        # Extract features and run greedy (argmax) CTC decoding.
        inputs = processor(
            input_values,
            return_tensors="pt",
            sampling_rate=TARGET_SAMPLE_RATE,
            padding=True,
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription.lower()

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"Terjadi kesalahan: {str(e)}"


# Gradio UI
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Rekam atau Unggah Audio",
    ),
    outputs="text",
    title="Jasper - ASR Bahasa Jawa",
    description="Model ASR Bahasa Jawa fine-tuned dengan HuBERT\n\nTips:\n- Pastikan suara direkam dengan jelas.\n- Rekam minimal 1 detik untuk hasil optimal.",
)

if __name__ == "__main__":
    iface.launch()