Spaces:
Running
Running
| from pathlib import Path | |
| from typing import Any | |
| import librosa | |
| import numpy as np | |
| import tensorflow as tf | |
| # ============================================================ | |
| # AUDIO CONFIGURATION | |
| # Harus sama dengan preprocessing saat training | |
| # ============================================================ | |
# Audio front-end settings. These must stay identical to the preprocessing
# used when the model was trained, otherwise inference features will not
# match what the network expects.

# Time-domain framing of the input audio.
SAMPLE_RATE = 16_000                        # samples per second (Hz)
DURATION = 2.0                              # clip length in seconds
NUM_SAMPLES = int(SAMPLE_RATE * DURATION)   # samples per clip

# Cepstral feature sizes.
N_MFCC = 40     # number of MFCC coefficients kept per frame
N_MELS = 64     # number of mel filterbank bins

# STFT parameters, all expressed in samples.
FRAME_LENGTH = 512   # analysis window length
FRAME_STEP = 160     # hop between consecutive windows
FFT_LENGTH = 512     # FFT size
| # ============================================================ | |
| # LOAD DAN POTONG AUDIO MENJADI CLIP | |
| # ============================================================ | |
def load_audio_clips(
    file_path: str | Path
) -> list[np.ndarray]:
    """Load an audio file and split it into fixed-length clips.

    The file is decoded with librosa as mono audio resampled to
    ``SAMPLE_RATE``, then cut into consecutive, non-overlapping clips of
    ``NUM_SAMPLES`` samples. A final clip that is shorter than
    ``NUM_SAMPLES`` is right-padded with zeros.

    With the module's 16 kHz / 2 s configuration:
        1 s of audio  -> 1 clip
        2 s of audio  -> 1 clip
        5 s of audio  -> 3 clips
        60 s of audio -> 30 clips

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A list of independent 1-D ``float32`` arrays, each exactly
        ``NUM_SAMPLES`` long.

    Raises:
        ValueError: If the decoded audio contains no samples.
    """
    waveform, _ = librosa.load(str(file_path), sr=SAMPLE_RATE, mono=True)
    waveform = waveform.astype(np.float32)

    total_samples = len(waveform)
    if total_samples == 0:
        raise ValueError("Audio kosong atau tidak dapat dibaca.")

    # Ceiling division: a partially filled trailing clip still counts.
    clip_count = -(-total_samples // NUM_SAMPLES)

    # Copy the signal into a zero-filled buffer whose length is a whole
    # number of clips; the untouched tail is the zero padding.
    padded = np.zeros(clip_count * NUM_SAMPLES, dtype=np.float32)
    padded[:total_samples] = waveform

    # One row per clip; copy each row so clips do not share memory.
    return [row.copy() for row in padded.reshape(clip_count, NUM_SAMPLES)]
| # ============================================================ | |
| # PREPROCESS SATU CLIP AUDIO | |
| # ============================================================ | |
def preprocess_audio_clip(
    audio_clip: np.ndarray
) -> dict[str, tf.Tensor]:
    """Turn one audio clip into the two tensors fed to the model.

    The clip is expected to be exactly ``NUM_SAMPLES`` long (2 seconds at
    16 kHz); this function does not check the length itself.

    Args:
        audio_clip: 1-D array of audio samples for a single clip.

    Returns:
        A dict with two entries:
            "waveform_input": the raw samples with batch and channel axes
                added, shape (1, 32000, 1) for a full-length clip.
            "mfcc_input": normalized MFCCs, shape
                (1, 40, time_frames, 1).
    """
    samples = tf.convert_to_tensor(audio_clip, dtype=tf.float32)

    # ---- Waveform branch: (batch, samples, channel) ----
    waveform_input = samples[tf.newaxis, ..., tf.newaxis]

    # ---- MFCC branch ----
    # Pad half an FFT window on both sides so frames are centered, the
    # same way the training pipeline did it.
    half_window = FFT_LENGTH // 2
    centered = tf.pad(samples, paddings=[[half_window, half_window]])

    magnitude = tf.abs(
        tf.signal.stft(
            centered,
            frame_length=FRAME_LENGTH,
            frame_step=FRAME_STEP,
            fft_length=FFT_LENGTH,
        )
    )
    power = tf.square(magnitude)

    # Project the linear-frequency power spectrum onto the mel scale.
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=N_MELS,
        num_spectrogram_bins=FFT_LENGTH // 2 + 1,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=80.0,
        upper_edge_hertz=7600.0,
    )
    # The small offset keeps the log finite for silent frames.
    log_mel = tf.math.log(tf.matmul(power, mel_filterbank) + 1e-6)

    # Keep the first N_MFCC coefficients, then reorder to (mfcc, time).
    coefficients = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)
    coefficients = tf.transpose(coefficients[:, :N_MFCC])

    # Normalize with a single mean/std taken over the whole clip.
    center = tf.reduce_mean(coefficients)
    spread = tf.math.reduce_std(coefficients)
    normalized = (coefficients - center) / (spread + 1e-6)

    # (batch, mfcc, time, channel)
    mfcc_input = normalized[tf.newaxis, ..., tf.newaxis]

    return {
        "waveform_input": waveform_input,
        "mfcc_input": mfcc_input,
    }
| # ============================================================ | |
| # PREDIKSI SATU CLIP | |
| # ============================================================ | |
def predict_single_clip(
    model: tf.keras.Model,
    audio_clip: np.ndarray,
    threshold: float
) -> dict[str, Any]:
    """Classify a single 2-second audio clip as real or fake.

    The clip is preprocessed, passed through the model in inference mode,
    and the model output is converted to probabilities with a softmax over
    the last axis. Output index 0 is treated as "real" and index 1 as
    "fake".

    Args:
        model: Keras model called with the dict produced by
            ``preprocess_audio_clip``.
        audio_clip: Samples of one clip.
        threshold: The clip is labelled "fake" when its fake probability
            is greater than or equal to this value.

    Returns:
        A dict with "prediction" ("fake" or "real"), "probability_real"
        and "probability_fake" (Python floats).
    """
    features = preprocess_audio_clip(audio_clip=audio_clip)
    scores = model(features, training=False)

    # Only the first batch row is read; preprocessing builds a batch of 1.
    class_probabilities = tf.nn.softmax(scores, axis=-1).numpy()[0]
    probability_real = float(class_probabilities[0])
    probability_fake = float(class_probabilities[1])

    if probability_fake >= threshold:
        predicted_label = "fake"
    else:
        predicted_label = "real"

    return {
        "prediction": predicted_label,
        "probability_real": probability_real,
        "probability_fake": probability_fake,
    }
| # ============================================================ | |
| # PREDIKSI AUDIO UTUH BERDASARKAN MAYORITAS CLIP | |
| # ============================================================ | |
def predict_audio(
    model: tf.keras.Model,
    file_path: str | Path,
    threshold: float = 0.60
) -> dict[str, Any]:
    """Classify a whole audio file by majority vote over 2-second clips.

    The file is split into clips, every clip is classified on its own, and
    the final label is the one assigned to the majority of clips. When the
    number of "fake" and "real" clips is equal, the mean fake probability
    across all clips is compared against ``threshold`` to break the tie.

    Args:
        model: Keras model used to score each clip.
        file_path: Path of the audio file to analyse.
        threshold: Fake-probability cut-off in [0.0, 1.0], used both for
            the per-clip label and for the tie-breaker.

    Returns:
        A dict with the final "prediction", the "decision_method", the
        rounded "threshold", "clip_duration_seconds", the clip counts
        ("total_clips", "real_clips", "fake_clips"), the average
        probabilities, and "clips": one entry per clip with its index,
        time window, label and probabilities. Probabilities are rounded
        to 6 decimals in the output only.

    Raises:
        ValueError: If ``threshold`` is outside [0.0, 1.0], or if the
            audio file decodes to zero samples.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(
            "Threshold harus berada pada rentang 0.0 sampai 1.0."
        )

    clips = load_audio_clips(file_path=file_path)

    clip_results = []
    # Unrounded probabilities are kept separately so the averages and the
    # tie-breaker are not computed from display-rounded values.
    raw_probabilities_real = []
    raw_probabilities_fake = []

    for clip_index, clip in enumerate(clips, start=1):
        result = predict_single_clip(
            model=model,
            audio_clip=clip,
            threshold=threshold
        )
        raw_probabilities_real.append(result["probability_real"])
        raw_probabilities_fake.append(result["probability_fake"])

        clip_results.append({
            "clip_index": clip_index,
            # Nominal clip window; for a zero-padded last clip the end
            # can lie beyond the real end of the audio.
            "start_second": round((clip_index - 1) * DURATION, 2),
            "end_second": round(clip_index * DURATION, 2),
            "prediction": result["prediction"],
            "probability_real": round(result["probability_real"], 6),
            "probability_fake": round(result["probability_fake"], 6),
        })

    total_clips = len(clip_results)
    fake_clips = sum(
        result["prediction"] == "fake"
        for result in clip_results
    )
    real_clips = total_clips - fake_clips

    average_probability_fake = float(np.mean(raw_probabilities_fake))
    average_probability_real = float(np.mean(raw_probabilities_real))

    # Final label by majority of clips.
    if fake_clips > real_clips:
        final_prediction = "fake"
    elif real_clips > fake_clips:
        final_prediction = "real"
    else:
        # Tie: fall back to the mean fake probability.
        final_prediction = (
            "fake"
            if average_probability_fake >= threshold
            else "real"
        )

    return {
        "prediction": final_prediction,
        "decision_method": "majority_vote",
        "threshold": round(float(threshold), 4),
        "clip_duration_seconds": DURATION,
        "total_clips": total_clips,
        "real_clips": real_clips,
        "fake_clips": fake_clips,
        "average_probability_real": round(average_probability_real, 6),
        "average_probability_fake": round(average_probability_fake, 6),
        "clips": clip_results,
    }