# AudioCapsDetectorV2 / inference.py
# Syahhh01's picture
# Update inference.py
# e67bbef verified
from pathlib import Path
from typing import Any
import librosa
import numpy as np
import tensorflow as tf
# ============================================================
# AUDIO CONFIGURATION
# Must match the preprocessing used during training
# ============================================================
# Sampling rate in Hz; audio is resampled to this rate when loaded.
SAMPLE_RATE = 16000
# Length of one analysis clip, in seconds.
DURATION = 2.0
# Number of samples in one clip (SAMPLE_RATE * DURATION = 32000).
NUM_SAMPLES = int(SAMPLE_RATE * DURATION)
# Number of MFCC coefficients kept per frame.
N_MFCC = 40
# Number of mel bins in the mel filterbank.
N_MELS = 64
# STFT window length in samples (512 / 16000 Hz = 32 ms).
FRAME_LENGTH = 512
# STFT hop size in samples (160 / 16000 Hz = 10 ms).
FRAME_STEP = 160
# FFT size; also determines the manual center padding (FFT_LENGTH // 2 per side).
FFT_LENGTH = 512
# ============================================================
# LOAD AUDIO AND SPLIT IT INTO CLIPS
# ============================================================
def load_audio_clips(
    file_path: str | Path
) -> list[np.ndarray]:
    """
    Load an audio file and split it into fixed-length clips.

    The file is decoded with librosa, resampled to SAMPLE_RATE and mixed
    down to mono. The waveform is then cut into consecutive,
    non-overlapping windows of NUM_SAMPLES samples. A final window that
    is shorter than NUM_SAMPLES is right-padded with zeros.

    Examples (with 2-second clips):
        1 second of audio   -> 1 clip
        2 seconds of audio  -> 1 clip
        5 seconds of audio  -> 3 clips
        60 seconds of audio -> 30 clips

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A list of float32 arrays, each of length NUM_SAMPLES.

    Raises:
        ValueError: If the decoded waveform contains no samples.
    """
    waveform, _ = librosa.load(str(file_path), sr=SAMPLE_RATE, mono=True)
    waveform = waveform.astype(np.float32)

    total_samples = len(waveform)
    if total_samples == 0:
        raise ValueError("Audio kosong atau tidak dapat dibaca.")

    segments = []
    offset = 0
    while offset < total_samples:
        segment = waveform[offset:offset + NUM_SAMPLES]

        # Only the last window can come up short; fill the tail with zeros.
        missing = NUM_SAMPLES - len(segment)
        if missing > 0:
            segment = np.pad(
                segment,
                pad_width=(0, missing),
                mode="constant",
            )

        segments.append(segment.astype(np.float32))
        offset += NUM_SAMPLES

    return segments
# ============================================================
# PREPROCESS A SINGLE AUDIO CLIP
# ============================================================
def preprocess_audio_clip(
    audio_clip: np.ndarray
) -> dict[str, tf.Tensor]:
    """
    Turn one audio clip into the two tensors fed to the model.

    The clip is expected to be a 1-D waveform of exactly 2 seconds
    (NUM_SAMPLES samples).

    Args:
        audio_clip: 1-D waveform for a single clip.

    Returns:
        A dict with:
            "waveform_input": shape (1, 32000, 1)
            "mfcc_input":     shape (1, 40, time_frames, 1)
    """
    samples = tf.convert_to_tensor(audio_clip, dtype=tf.float32)

    # --------------------------------------------------------
    # Waveform branch -> (batch, samples, channel)
    # --------------------------------------------------------
    waveform_input = tf.expand_dims(
        tf.expand_dims(samples, axis=-1),
        axis=0,
    )

    # --------------------------------------------------------
    # MFCC branch
    # --------------------------------------------------------
    # Zero-pad half an FFT window on each side so frames are centered;
    # this is intended to mirror the training-time preprocessing.
    half_window = FFT_LENGTH // 2
    padded_samples = tf.pad(
        samples,
        paddings=[[half_window, half_window]],
    )

    magnitude = tf.abs(
        tf.signal.stft(
            padded_samples,
            frame_length=FRAME_LENGTH,
            frame_step=FRAME_STEP,
            fft_length=FFT_LENGTH,
        )
    )
    power = tf.square(magnitude)

    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=N_MELS,
        num_spectrogram_bins=FFT_LENGTH // 2 + 1,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=80.0,
        upper_edge_hertz=7600.0,
    )

    # Small offset keeps the log finite for silent (all-zero) frames.
    log_mel = tf.math.log(tf.matmul(power, mel_filterbank) + 1e-6)

    # Keep the first N_MFCC coefficients, then flip to (mfcc, time).
    coefficients = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)
    coefficients = tf.transpose(coefficients[:, :N_MFCC])

    # Normalize with a single mean/std computed over the whole matrix.
    centered = coefficients - tf.reduce_mean(coefficients)
    normalized = centered / (tf.math.reduce_std(coefficients) + 1e-6)

    # -> (batch, mfcc, time, channel)
    mfcc_input = tf.expand_dims(
        tf.expand_dims(normalized, axis=-1),
        axis=0,
    )

    return {
        "waveform_input": waveform_input,
        "mfcc_input": mfcc_input,
    }
# ============================================================
# PREDICT A SINGLE CLIP
# ============================================================
def predict_single_clip(
    model: tf.keras.Model,
    audio_clip: np.ndarray,
    threshold: float
) -> dict[str, Any]:
    """
    Classify one 2-second audio clip as real or fake.

    The model output is treated as logits and passed through a softmax
    here. Class layout:
        class 0 = real
        class 1 = fake

    Args:
        model: Model called with the dict built by preprocess_audio_clip.
        audio_clip: Waveform of a single clip.
        threshold: The clip is labelled "fake" when the fake probability
            is greater than or equal to this value.

    Returns:
        A dict with "prediction" ("fake" or "real"), "probability_real"
        and "probability_fake" (unrounded floats).
    """
    model_inputs = preprocess_audio_clip(audio_clip=audio_clip)
    raw_output = model(model_inputs, training=False)

    # The batch holds a single clip, so take row 0 of the softmax output.
    class_scores = tf.nn.softmax(raw_output, axis=-1).numpy()[0]
    real_score = float(class_scores[0])
    fake_score = float(class_scores[1])

    if fake_score >= threshold:
        label = "fake"
    else:
        label = "real"

    return {
        "prediction": label,
        "probability_real": real_score,
        "probability_fake": fake_score,
    }
# ============================================================
# PREDICT A WHOLE AUDIO FILE BY MAJORITY VOTE OVER ITS CLIPS
# ============================================================
def predict_audio(
    model: tf.keras.Model,
    file_path: str | Path,
    threshold: float = 0.60
) -> dict[str, Any]:
    """
    Classify a whole audio file by majority vote over its 2-second clips.

    The file is split into clips, each clip is classified on its own,
    and the final label is the one assigned to the majority of clips.
    If the number of "fake" and "real" clips is equal, the mean fake
    probability across all clips is compared against ``threshold`` as a
    tie-breaker.

    Averages and the tie-breaker are computed from the unrounded
    per-clip probabilities; rounding to 6 decimals is applied only to
    the values placed in the returned report.

    Args:
        model: Model used to classify each clip.
        file_path: Path of the audio file to analyse.
        threshold: Fake-probability cut-off in [0.0, 1.0], used for each
            clip and for the tie-breaker.

    Returns:
        A dict with the final "prediction", "decision_method",
        "threshold", "clip_duration_seconds", clip counts
        ("total_clips", "real_clips", "fake_clips"), the rounded
        averages ("average_probability_real",
        "average_probability_fake") and "clips", a list of per-clip
        results. Each clip's "start_second"/"end_second" are nominal
        clip boundaries; for a zero-padded final clip, "end_second" may
        exceed the real audio length.

    Raises:
        ValueError: If ``threshold`` is outside [0.0, 1.0] (NaN is also
            rejected), or if the audio contains no samples.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(
            "Threshold harus berada pada rentang 0.0 sampai 1.0."
        )

    clips = load_audio_clips(file_path=file_path)

    clip_results = []
    # Unrounded probabilities are kept separately so that the aggregate
    # statistics do not inherit the display rounding applied below.
    raw_probabilities_real = []
    raw_probabilities_fake = []

    for clip_index, clip in enumerate(clips, start=1):
        result = predict_single_clip(
            model=model,
            audio_clip=clip,
            threshold=threshold,
        )
        raw_probabilities_real.append(result["probability_real"])
        raw_probabilities_fake.append(result["probability_fake"])

        clip_results.append({
            "clip_index": clip_index,
            "start_second": round((clip_index - 1) * DURATION, 2),
            "end_second": round(clip_index * DURATION, 2),
            "prediction": result["prediction"],
            "probability_real": round(result["probability_real"], 6),
            "probability_fake": round(result["probability_fake"], 6),
        })

    total_clips = len(clip_results)
    fake_clips = sum(
        result["prediction"] == "fake"
        for result in clip_results
    )
    real_clips = total_clips - fake_clips

    average_probability_fake = float(np.mean(raw_probabilities_fake))
    average_probability_real = float(np.mean(raw_probabilities_real))

    # Final result is the label held by the majority of clips.
    if fake_clips > real_clips:
        final_prediction = "fake"
    elif real_clips > fake_clips:
        final_prediction = "real"
    else:
        # Tie: fall back to the mean fake probability across all clips.
        final_prediction = (
            "fake"
            if average_probability_fake >= threshold
            else "real"
        )

    return {
        "prediction": final_prediction,
        "decision_method": "majority_vote",
        "threshold": round(float(threshold), 4),
        "clip_duration_seconds": DURATION,
        "total_clips": total_clips,
        "real_clips": real_clips,
        "fake_clips": fake_clips,
        "average_probability_real": round(average_probability_real, 6),
        "average_probability_fake": round(average_probability_fake, 6),
        "clips": clip_results,
    }