Syahhh01 committed on
Commit
e67bbef
·
verified ·
1 Parent(s): 368e1c4

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +237 -54
inference.py CHANGED
@@ -23,45 +23,91 @@ FRAME_STEP = 160
23
  FFT_LENGTH = 512
24
 
25
 
26
- def preprocess_single_audio(
 
 
 
 
27
  file_path: str | Path
28
- ) -> dict[str, tf.Tensor]:
29
  """
30
- Load dan preprocess satu file audio.
 
31
 
32
- Returns:
33
- {
34
- "waveform_input": shape (1, 32000, 1),
35
- "mfcc_input": shape (1, 40, time_frames, 1)
36
- }
37
- """
38
 
39
- file_path = str(file_path)
 
 
 
 
 
40
 
41
- # Load audio, ubah menjadi mono, lalu resample ke 16 kHz
42
  audio, _ = librosa.load(
43
- file_path,
44
  sr=SAMPLE_RATE,
45
  mono=True
46
  )
47
 
48
- audio = audio.astype(np.float32)
49
-
50
- # Potong atau tambahkan padding agar panjang audio tepat 2 detik
51
- if len(audio) > NUM_SAMPLES:
52
- audio = audio[:NUM_SAMPLES]
53
 
54
- elif len(audio) < NUM_SAMPLES:
55
- padding_size = NUM_SAMPLES - len(audio)
 
 
56
 
57
- audio = np.pad(
58
- audio,
59
- pad_width=(0, padding_size),
60
- mode="constant"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  )
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  audio_tensor = tf.convert_to_tensor(
64
- audio,
65
  dtype=tf.float32
66
  )
67
 
@@ -84,7 +130,7 @@ def preprocess_single_audio(
84
  # MFCC INPUT
85
  # ========================================================
86
 
87
- # Center padding manual agar sama seperti pipeline training
88
  pad = FFT_LENGTH // 2
89
 
90
  audio_centered = tf.pad(
@@ -99,17 +145,26 @@ def preprocess_single_audio(
99
  fft_length=FFT_LENGTH
100
  )
101
 
102
- spectrogram = tf.abs(stft)
103
- power_spectrogram = tf.square(spectrogram)
 
 
 
 
 
104
 
105
- num_spectrogram_bins = FFT_LENGTH // 2 + 1
 
 
106
 
107
- mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
108
- num_mel_bins=N_MELS,
109
- num_spectrogram_bins=num_spectrogram_bins,
110
- sample_rate=SAMPLE_RATE,
111
- lower_edge_hertz=80.0,
112
- upper_edge_hertz=7600.0
 
 
113
  )
114
 
115
  mel_spectrogram = tf.matmul(
@@ -128,12 +183,19 @@ def preprocess_single_audio(
128
  # Ambil 40 koefisien MFCC
129
  mfcc = mfcc[:, :N_MFCC]
130
 
131
- # Ubah shape dari (time, mfcc) menjadi (mfcc, time)
132
- mfcc = tf.transpose(mfcc)
 
 
133
 
134
  # Normalisasi MFCC
135
- mean = tf.reduce_mean(mfcc)
136
- std = tf.math.reduce_std(mfcc)
 
 
 
 
 
137
 
138
  mfcc = (
139
  (mfcc - mean)
@@ -157,28 +219,25 @@ def preprocess_single_audio(
157
  }
158
 
159
 
160
- def predict_audio(
 
 
 
 
161
  model: tf.keras.Model,
162
- file_path: str | Path,
163
- threshold: float = 0.60
164
  ) -> dict[str, Any]:
165
  """
166
- Melakukan prediksi terhadap satu file audio.
167
 
168
  Model output:
169
  class 0 = real
170
  class 1 = fake
171
-
172
- Threshold diterapkan pada probability_fake.
173
  """
174
 
175
- if not 0.0 <= threshold <= 1.0:
176
- raise ValueError(
177
- "Threshold harus berada pada rentang 0.0 sampai 1.0."
178
- )
179
-
180
- inputs = preprocess_single_audio(
181
- file_path=file_path
182
  )
183
 
184
  logits = model(
@@ -207,7 +266,131 @@ def predict_audio(
207
 
208
  return {
209
  "prediction": predicted_label,
210
- "threshold": round(float(threshold), 4),
211
- "probability_real": round(probability_real, 6),
212
- "probability_fake": round(probability_fake, 6)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  }
 
23
  FFT_LENGTH = 512
24
 
25
 
26
+ # ============================================================
27
+ # LOAD DAN POTONG AUDIO MENJADI CLIP
28
+ # ============================================================
29
+
def load_audio_clips(
    file_path: str | Path
) -> list[np.ndarray]:
    """
    Load an audio file and split it into fixed-length clips.

    The file is decoded with librosa as mono audio resampled to
    SAMPLE_RATE, then cut into consecutive, non-overlapping windows of
    NUM_SAMPLES samples. A final window shorter than NUM_SAMPLES is
    zero-padded at its end so every returned clip has the same length.

    The original notes describe this as 16 kHz audio in 2-second clips,
    i.e. 1 s -> 1 clip, 2 s -> 1 clip, 5 s -> 3 clips, 60 s -> 30 clips
    (the constants themselves are defined elsewhere in the module).

    Args:
        file_path: Location of the audio file to read.

    Returns:
        List of float32 arrays, each NUM_SAMPLES long, in playback order.

    Raises:
        ValueError: If the decoded audio contains no samples.
    """
    waveform, _ = librosa.load(str(file_path), sr=SAMPLE_RATE, mono=True)
    waveform = waveform.astype(np.float32)

    total_samples = len(waveform)
    if total_samples == 0:
        raise ValueError("Audio kosong atau tidak dapat dibaca.")

    segments = []
    offset = 0

    while offset < total_samples:
        segment = waveform[offset:offset + NUM_SAMPLES]

        # Only the trailing window can come up short; fill it with zeros.
        missing = NUM_SAMPLES - len(segment)
        if missing > 0:
            segment = np.pad(
                segment,
                pad_width=(0, missing),
                mode="constant"
            )

        segments.append(segment.astype(np.float32))
        offset += NUM_SAMPLES

    return segments
90
+
91
+
92
+ # ============================================================
93
+ # PREPROCESS SATU CLIP AUDIO
94
+ # ============================================================
95
+
96
+ def preprocess_audio_clip(
97
+ audio_clip: np.ndarray
98
+ ) -> dict[str, tf.Tensor]:
99
+ """
100
+ Preprocess satu clip audio berdurasi tepat 2 detik.
101
+
102
+ Returns:
103
+ {
104
+ "waveform_input": shape (1, 32000, 1),
105
+ "mfcc_input": shape (1, 40, time_frames, 1)
106
+ }
107
+ """
108
+
109
  audio_tensor = tf.convert_to_tensor(
110
+ audio_clip,
111
  dtype=tf.float32
112
  )
113
 
 
130
  # MFCC INPUT
131
  # ========================================================
132
 
133
+ # Center padding manual agar sama seperti training
134
  pad = FFT_LENGTH // 2
135
 
136
  audio_centered = tf.pad(
 
145
  fft_length=FFT_LENGTH
146
  )
147
 
148
+ spectrogram = tf.abs(
149
+ stft
150
+ )
151
+
152
+ power_spectrogram = tf.square(
153
+ spectrogram
154
+ )
155
 
156
+ num_spectrogram_bins = (
157
+ FFT_LENGTH // 2 + 1
158
+ )
159
 
160
+ mel_weight_matrix = (
161
+ tf.signal.linear_to_mel_weight_matrix(
162
+ num_mel_bins=N_MELS,
163
+ num_spectrogram_bins=num_spectrogram_bins,
164
+ sample_rate=SAMPLE_RATE,
165
+ lower_edge_hertz=80.0,
166
+ upper_edge_hertz=7600.0
167
+ )
168
  )
169
 
170
  mel_spectrogram = tf.matmul(
 
183
  # Ambil 40 koefisien MFCC
184
  mfcc = mfcc[:, :N_MFCC]
185
 
186
+ # Shape: (mfcc, time)
187
+ mfcc = tf.transpose(
188
+ mfcc
189
+ )
190
 
191
  # Normalisasi MFCC
192
+ mean = tf.reduce_mean(
193
+ mfcc
194
+ )
195
+
196
+ std = tf.math.reduce_std(
197
+ mfcc
198
+ )
199
 
200
  mfcc = (
201
  (mfcc - mean)
 
219
  }
220
 
221
 
222
+ # ============================================================
223
+ # PREDIKSI SATU CLIP
224
+ # ============================================================
225
+
226
+ def predict_single_clip(
227
  model: tf.keras.Model,
228
+ audio_clip: np.ndarray,
229
+ threshold: float
230
  ) -> dict[str, Any]:
231
  """
232
+ Prediksi terhadap satu clip audio berdurasi 2 detik.
233
 
234
  Model output:
235
  class 0 = real
236
  class 1 = fake
 
 
237
  """
238
 
239
+ inputs = preprocess_audio_clip(
240
+ audio_clip=audio_clip
 
 
 
 
 
241
  )
242
 
243
  logits = model(
 
266
 
267
  return {
268
  "prediction": predicted_label,
269
+ "probability_real": probability_real,
270
+ "probability_fake": probability_fake
271
+ }
272
+
273
+
274
+ # ============================================================
275
+ # PREDIKSI AUDIO UTUH BERDASARKAN MAYORITAS CLIP
276
+ # ============================================================
277
+
def predict_audio(
    model: tf.keras.Model,
    file_path: str | Path,
    threshold: float = 0.60
) -> dict[str, Any]:
    """
    Classify a whole audio file by majority vote over its clips.

    The file is split with load_audio_clips, every clip is scored with
    predict_single_clip, and the final label is whichever of "fake" /
    "real" was predicted for more clips. When both labels occur equally
    often, the mean fake probability across clips is compared with
    `threshold` (>= threshold -> "fake", otherwise "real").

    Args:
        model: Keras model handed unchanged to predict_single_clip.
        file_path: Audio file to analyse.
        threshold: Value in [0.0, 1.0]. Used here for the tie-breaker and
            forwarded to predict_single_clip for each clip.

    Returns:
        Dict with the final "prediction", the fixed "decision_method"
        ("majority_vote"), the rounded "threshold",
        "clip_duration_seconds", the clip counts ("total_clips",
        "real_clips", "fake_clips"), the rounded mean probabilities
        ("average_probability_real", "average_probability_fake") and
        "clips", a list with one entry per clip (1-based index, start/end
        second, label and rounded probabilities).

    Raises:
        ValueError: If `threshold` lies outside [0.0, 1.0]. Errors raised
            by load_audio_clips (e.g. for empty audio) propagate as well.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(
            "Threshold harus berada pada rentang 0.0 sampai 1.0."
        )

    clips = load_audio_clips(
        file_path=file_path
    )

    clip_results = []

    # Unrounded probabilities are kept separately so the averages (and the
    # tie-breaker that depends on them) are not computed from values that
    # were already rounded for display.
    raw_probabilities_real = []
    raw_probabilities_fake = []

    for clip_index, clip in enumerate(clips, start=1):
        result = predict_single_clip(
            model=model,
            audio_clip=clip,
            threshold=threshold
        )

        raw_probabilities_real.append(result["probability_real"])
        raw_probabilities_fake.append(result["probability_fake"])

        clip_results.append({
            "clip_index": clip_index,
            # Nominal window boundaries; the last clip may be zero-padded,
            # so its end can exceed the real audio length.
            "start_second": round((clip_index - 1) * DURATION, 2),
            "end_second": round(clip_index * DURATION, 2),
            "prediction": result["prediction"],
            "probability_real": round(result["probability_real"], 6),
            "probability_fake": round(result["probability_fake"], 6)
        })

    total_clips = len(clip_results)

    fake_clips = sum(
        result["prediction"] == "fake"
        for result in clip_results
    )
    real_clips = total_clips - fake_clips

    average_probability_fake = float(np.mean(raw_probabilities_fake))
    average_probability_real = float(np.mean(raw_probabilities_real))

    # Final label from the clip majority.
    if fake_clips > real_clips:
        final_prediction = "fake"
    elif real_clips > fake_clips:
        final_prediction = "real"
    else:
        # Equal votes: fall back to the mean fake probability.
        final_prediction = (
            "fake"
            if average_probability_fake >= threshold
            else "real"
        )

    return {
        "prediction": final_prediction,
        "decision_method": "majority_vote",
        "threshold": round(float(threshold), 4),
        "clip_duration_seconds": DURATION,
        "total_clips": total_clips,
        "real_clips": real_clips,
        "fake_clips": fake_clips,
        "average_probability_real": round(average_probability_real, 6),
        "average_probability_fake": round(average_probability_fake, 6),
        "clips": clip_results
    }