# Hugging Face Space (status when this file was captured: Sleeping)
import logging
import os

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2Processor, HubertForCTC
# Hugging Face Hub repository that provides both the processor and the
# fine-tuned HuBERT CTC model used for transcription.
_CHECKPOINT = "Athallah11/hubert-javanase-jasper-finetuned"

processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = HubertForCTC.from_pretrained(_CHECKPOINT)
# Switch the model to evaluation mode; this script only runs inference.
model.eval()
# Preprocess audio: downmix to mono, convert to float32 and resample to 16 kHz
def preprocess(audio, sr, target_sr=16000):
    """Return *audio* as a mono float32 waveform sampled at *target_sr*.

    Args:
        audio: Array-like waveform. A 1-D array is treated as mono. A 2-D
            array is assumed to be laid out as ``(samples, channels)`` (the
            layout Gradio uses for multi-channel audio) and is averaged
            across channels.
        sr: Sample rate of *audio* in Hz.
        target_sr: Sample rate to resample to, 16 kHz by default.

    Returns:
        A float32 ``numpy`` array. The sample values are not rescaled, so
        integer PCM input keeps its original magnitude.
    """
    audio = np.asarray(audio)
    if audio.ndim == 2:
        # Downmix before resampling: librosa.resample works on the last
        # axis, which for (samples, channels) input would be the channel
        # axis rather than time.
        audio = audio.mean(axis=1)
    # Always hand float32 downstream (integer PCM and float64 alike).
    audio = audio.astype(np.float32, copy=False)
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    return audio
# Logger for failures that are otherwise only reported to the user as text.
_LOGGER = logging.getLogger(__name__)


# Main transcription function
def transcribe(audio):
    """Transcribe one audio clip and return the text in lower case.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple or a ``str`` path to
            an audio file. A path is decoded with ``librosa.load`` and the
            file is then deleted, since it is treated as a temporary upload.

    Returns:
        The lower-cased transcription, or a user-facing message when the
        input type is not recognised, the clip is shorter than one second,
        or any exception is raised while processing. Exceptions are logged
        with their traceback and never propagate to the caller.
    """
    try:
        # Detect the input audio format.
        if isinstance(audio, tuple):
            sr, audio_data = audio
        elif isinstance(audio, str):
            try:
                audio_data, sr = librosa.load(audio, sr=None)
            finally:
                # Delete the uploaded temporary file even when decoding
                # fails, so unreadable uploads do not accumulate on disk.
                os.remove(audio)
        else:
            return "Format audio tidak dikenali."

        # Reject recordings shorter than one second.
        duration_s = len(audio_data) / sr
        if duration_s < 1.0:
            return "Audio terlalu pendek, coba rekam minimal 1 detik."

        input_values = preprocess(audio_data, sr)

        # Peak-normalise so clips reach the model at a comparable level.
        # This only rescales the signal; it does not remove noise. An
        # all-zero clip is left untouched to avoid dividing by zero.
        max_abs = np.max(np.abs(input_values))
        if max_abs > 0:
            input_values = input_values / max_abs

        # Build model inputs, run inference without gradients, and decode
        # the highest-scoring token at every frame.
        inputs = processor(
            input_values,
            return_tensors="pt",
            sampling_rate=16000,
            padding=True,
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription.lower()
    except Exception as e:
        # UI boundary: show the message to the user, but keep the full
        # traceback in the logs so the failure can be diagnosed.
        _LOGGER.exception("Transcription failed")
        return f"Terjadi kesalahan: {e}"
# Gradio UI: one audio input (microphone or upload) wired to transcribe(),
# with the result shown as plain text.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",
        label="Rekam atau Unggah Audio",
    ),
    outputs="text",
    title="Jasper - ASR Bahasa Jawa",
    description=(
        "Model ASR Bahasa Jawa fine-tuned dengan HuBERT\n\n"
        "Tips:\n"
        "- Pastikan suara direkam dengan jelas.\n"
        "- Rekam minimal 1 detik untuk hasil optimal."
    ),
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()