from pathlib import Path
from typing import Any

import librosa
import numpy as np
import tensorflow as tf


# ============================================================
# AUDIO CONFIGURATION
# Must stay identical to the preprocessing used during training
# ============================================================

SAMPLE_RATE = 16000
DURATION = 2.0
NUM_SAMPLES = int(SAMPLE_RATE * DURATION)

N_MFCC = 40
N_MELS = 64

FRAME_LENGTH = 512
FRAME_STEP = 160
FFT_LENGTH = 512


# ============================================================
# LOAD AUDIO AND SPLIT IT INTO CLIPS
# ============================================================

def load_audio_clips(file_path: str | Path) -> list[np.ndarray]:
    """Load an audio file and split it into fixed-length clips.

    The file is loaded through ``librosa.load`` with ``sr=SAMPLE_RATE`` and
    ``mono=True``, then cut into consecutive, non-overlapping windows of
    ``NUM_SAMPLES`` samples (``DURATION`` seconds). A final window shorter
    than ``NUM_SAMPLES`` is right-padded with zeros.

    Examples:
        1 second of audio   -> 1 clip
        2 seconds of audio  -> 1 clip
        5 seconds of audio  -> 3 clips
        60 seconds of audio -> 30 clips

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A list of float32 arrays, each of length ``NUM_SAMPLES``.

    Raises:
        ValueError: If the loaded audio contains no samples.
    """
    audio, _ = librosa.load(str(file_path), sr=SAMPLE_RATE, mono=True)
    audio = audio.astype(np.float32)

    if len(audio) == 0:
        raise ValueError("Audio kosong atau tidak dapat dibaca.")

    clips = []
    for start_index in range(0, len(audio), NUM_SAMPLES):
        clip = audio[start_index:start_index + NUM_SAMPLES]

        # Zero-pad the trailing clip when it is shorter than a full window.
        if len(clip) < NUM_SAMPLES:
            padding_size = NUM_SAMPLES - len(clip)
            clip = np.pad(clip, pad_width=(0, padding_size), mode="constant")

        clips.append(clip.astype(np.float32))

    return clips


# ============================================================
# PREPROCESS A SINGLE AUDIO CLIP
# ============================================================

def preprocess_audio_clip(audio_clip: np.ndarray) -> dict[str, tf.Tensor]:
    """Turn one audio clip into the two tensors fed to the model.

    The clip is expected to be a 1-D waveform of exactly ``NUM_SAMPLES``
    samples (2 seconds at 16 kHz); this function does not check that.

    Args:
        audio_clip: 1-D waveform samples.

    Returns:
        A dict with two entries::

            {
                "waveform_input": shape (1, 32000, 1),
                "mfcc_input": shape (1, 40, time_frames, 1)
            }

        The shapes above hold for a ``NUM_SAMPLES``-long clip.
    """
    audio_tensor = tf.convert_to_tensor(audio_clip, dtype=tf.float32)

    # ========================================================
    # WAVEFORM INPUT
    # Shape: (batch, samples, channel)
    # ========================================================
    waveform_input = tf.expand_dims(audio_tensor, axis=-1)
    waveform_input = tf.expand_dims(waveform_input, axis=0)

    # ========================================================
    # MFCC INPUT
    # ========================================================

    # Manual center padding (zeros on both sides) so framing matches the
    # training pipeline.
    pad = FFT_LENGTH // 2
    audio_centered = tf.pad(audio_tensor, paddings=[[pad, pad]])

    stft = tf.signal.stft(
        audio_centered,
        frame_length=FRAME_LENGTH,
        frame_step=FRAME_STEP,
        fft_length=FFT_LENGTH,
    )

    spectrogram = tf.abs(stft)
    power_spectrogram = tf.square(spectrogram)

    num_spectrogram_bins = FFT_LENGTH // 2 + 1
    mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=N_MELS,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=80.0,
        upper_edge_hertz=7600.0,
    )

    mel_spectrogram = tf.matmul(power_spectrogram, mel_weight_matrix)

    # The small offset keeps the logarithm finite for silent frames.
    log_mel_spectrogram = tf.math.log(mel_spectrogram + 1e-6)

    mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)

    # Keep the first N_MFCC (40) coefficients.
    mfcc = mfcc[:, :N_MFCC]

    # Shape: (mfcc, time)
    mfcc = tf.transpose(mfcc)

    # Normalize over the whole clip (single global mean / std).
    mean = tf.reduce_mean(mfcc)
    std = tf.math.reduce_std(mfcc)
    mfcc = (mfcc - mean) / (std + 1e-6)

    # Shape: (batch, mfcc, time, channel)
    mfcc_input = tf.expand_dims(mfcc, axis=-1)
    mfcc_input = tf.expand_dims(mfcc_input, axis=0)

    return {
        "waveform_input": waveform_input,
        "mfcc_input": mfcc_input,
    }


# ============================================================
# PREDICT A SINGLE CLIP
# ============================================================

def predict_single_clip(
    model: tf.keras.Model,
    audio_clip: np.ndarray,
    threshold: float,
) -> dict[str, Any]:
    """Classify one 2-second audio clip as real or fake.

    Model output convention:
        class 0 = real
        class 1 = fake

    A softmax is applied to the model output here, so the model is assumed
    to return raw logits (TODO confirm against the trained model).

    Args:
        model: Keras model called with the dict produced by
            ``preprocess_audio_clip``.
        audio_clip: Waveform of the clip to classify.
        threshold: The clip is labelled "fake" when the fake probability is
            greater than or equal to this value, otherwise "real".

    Returns:
        A dict with keys ``prediction``, ``probability_real`` and
        ``probability_fake``.
    """
    inputs = preprocess_audio_clip(audio_clip=audio_clip)

    logits = model(inputs, training=False)

    # Only the first batch element is used; exactly one clip is fed in.
    probabilities = tf.nn.softmax(logits, axis=-1).numpy()[0]

    probability_real = float(probabilities[0])
    probability_fake = float(probabilities[1])

    predicted_label = "fake" if probability_fake >= threshold else "real"

    return {
        "prediction": predicted_label,
        "probability_real": probability_real,
        "probability_fake": probability_fake,
    }


# ============================================================
# PREDICT A WHOLE AUDIO FILE BY MAJORITY VOTE OVER ITS CLIPS
# ============================================================

def predict_audio(
    model: tf.keras.Model,
    file_path: str | Path,
    threshold: float = 0.60,
) -> dict[str, Any]:
    """Classify an audio file by majority vote over its 2-second clips.

    The file is split into clips, every clip is classified on its own, and
    the label with the most clips wins. When both labels have the same
    number of clips, the mean fake probability is compared with
    ``threshold`` as the tie-breaker.

    Args:
        model: Keras model passed on to ``predict_single_clip``.
        file_path: Path of the audio file to analyse.
        threshold: Fake-probability cut-off in the range [0.0, 1.0].

    Returns:
        A dict holding the final ``prediction``, the vote counts, the mean
        probabilities (rounded to 6 decimals) and the per-clip results under
        ``clips``. ``end_second`` of the last clip refers to the padded
        window and may exceed the real audio length.

    Raises:
        ValueError: If ``threshold`` is outside [0.0, 1.0], or if the audio
            is empty (raised by ``load_audio_clips``).
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(
            "Threshold harus berada pada rentang 0.0 sampai 1.0."
        )

    clips = load_audio_clips(file_path=file_path)

    clip_results = []
    # Unrounded probabilities are kept separately so the averages and the
    # tie-breaker are not affected by the 6-decimal rounding used for
    # reporting.
    raw_probabilities_real = []
    raw_probabilities_fake = []

    for clip_index, clip in enumerate(clips, start=1):
        result = predict_single_clip(
            model=model,
            audio_clip=clip,
            threshold=threshold,
        )

        raw_probabilities_real.append(result["probability_real"])
        raw_probabilities_fake.append(result["probability_fake"])

        clip_results.append({
            "clip_index": clip_index,
            "start_second": round((clip_index - 1) * DURATION, 2),
            "end_second": round(clip_index * DURATION, 2),
            "prediction": result["prediction"],
            "probability_real": round(result["probability_real"], 6),
            "probability_fake": round(result["probability_fake"], 6),
        })

    total_clips = len(clip_results)
    fake_clips = sum(
        result["prediction"] == "fake" for result in clip_results
    )
    real_clips = total_clips - fake_clips

    average_probability_fake = float(np.mean(raw_probabilities_fake))
    average_probability_real = float(np.mean(raw_probabilities_real))

    # Final result is decided by the majority of clips.
    if fake_clips > real_clips:
        final_prediction = "fake"
    elif real_clips > fake_clips:
        final_prediction = "real"
    else:
        # Tie-breaker when real and fake clip counts are equal.
        final_prediction = (
            "fake" if average_probability_fake >= threshold else "real"
        )

    return {
        "prediction": final_prediction,
        "decision_method": "majority_vote",
        "threshold": round(float(threshold), 4),
        "clip_duration_seconds": DURATION,
        "total_clips": total_clips,
        "real_clips": real_clips,
        "fake_clips": fake_clips,
        "average_probability_real": round(average_probability_real, 6),
        "average_probability_fake": round(average_probability_fake, 6),
        "clips": clip_results,
    }