syahh-coder committed on
Commit
368e1c4
·
1 Parent(s): d966fa6

Deploy Capst

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.keras filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the FastAPI inference service (served by uvicorn on port 7860).
FROM python:3.11-slim

# Extra system libraries so that librosa can read a variety of audio formats.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 && \
    rm -rf /var/lib/apt/lists/*

# Hugging Face Docker Spaces run with user ID 1000
RUN useradd -m -u 1000 user

USER user

# ~/.local/bin is put on PATH because pip runs as the non-root user from here on.
# PYTHONUNBUFFERED=1 makes print() output show up in the container logs immediately.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONUNBUFFERED=1

WORKDIR $HOME/app

# requirements.txt is copied on its own first so the dependency layer is cached
# independently of application source changes.
COPY --chown=user requirements.txt .

RUN pip install --no-cache-dir \
    --upgrade pip && \
    pip install --no-cache-dir \
    -r requirements.txt

# Application code and the model file.
COPY --chown=user . .

# Must match the --port passed to uvicorn below.
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,9 @@
1
  ---
2
- title: AudioCapsDetectorV2
3
- emoji: 💻
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
 
7
  pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Deepfake Audio Detection API
3
+ emoji: 🎙️
4
  colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
+ ---
 
 
app.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+ from pathlib import Path
3
+ from tempfile import NamedTemporaryFile
4
+ from typing import Annotated
5
+
6
+ import tensorflow as tf
7
+ from fastapi import (
8
+ FastAPI,
9
+ File,
10
+ Form,
11
+ HTTPException,
12
+ UploadFile
13
+ )
14
+
15
+ from custom_layers import (
16
+ AdaptiveAvgPool1D,
17
+ AdaptiveAvgPool2D
18
+ )
19
+
20
+ from inference import predict_audio
21
+
22
+
23
+ # ============================================================
24
+ # CONFIGURATION
25
+ # ============================================================
26
+
27
# Keras model file; a relative path, so it is resolved against the
# working directory of the server process.
MODEL_PATH = Path("best_torchlike_mfcc_waveform_model.keras")

# Lower-cased filename suffixes accepted by the /predict endpoint.
ALLOWED_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}

# Upload size limit, kept in MB for error messages and in bytes for checks.
MAX_FILE_SIZE_MB = 20
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 ** 2
45
+
46
# Process-wide model handle: assigned by the lifespan handler at startup and
# reset to None on shutdown; the route handlers read it to check readiness.
model: tf.keras.Model | None = None
47
+
48
+
49
+ # ============================================================
50
+ # LOAD MODEL ON STARTUP
51
+ # ============================================================
52
+
53
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the Keras model before the app serves requests; drop it afterwards.

    The loaded model is stored in the module-level ``model`` variable.

    Raises:
        FileNotFoundError: if ``MODEL_PATH`` does not exist at startup.
    """
    global model

    # Fail fast with a clear message instead of a loader error further down.
    model_file_present = MODEL_PATH.exists()
    if not model_file_present:
        raise FileNotFoundError(f"Model tidak ditemukan: {MODEL_PATH}")

    print("Loading model...")

    # The saved model references these custom layers by name.
    layer_registry = {
        "AdaptiveAvgPool1D": AdaptiveAvgPool1D,
        "AdaptiveAvgPool2D": AdaptiveAvgPool2D,
    }
    # compile=False: the model is loaded without compiling it.
    model = tf.keras.models.load_model(
        MODEL_PATH,
        custom_objects=layer_registry,
        compile=False,
    )

    print("Model loaded successfully.")

    # The application runs while this context manager is suspended here.
    yield

    model = None
78
+
79
+
80
+ # ============================================================
81
+ # FASTAPI APP
82
+ # ============================================================
83
+
84
# The ASGI application object; `lifespan` loads the model at startup and
# releases it at shutdown.
app = FastAPI(
    title="Deepfake Audio Detection API",
    description=(
        "REST API untuk mendeteksi audio real atau fake "
        "menggunakan model MFCC + Waveform."
    ),
    version="1.0.0",
    lifespan=lifespan
)
93
+
94
+
95
+ # ============================================================
96
+ # ROUTES
97
+ # ============================================================
98
+
99
@app.get("/")
def root():
    # Landing endpoint: returns a static description of the service and
    # pointers to the docs and prediction routes.
    # (Documented with comments rather than a docstring so the generated
    # OpenAPI description of this route stays unchanged.)
    service_info = {
        "message": "Deepfake Audio Detection API",
        "status": "running",
        "docs": "/docs",
        "predict_endpoint": "/predict",
        "default_threshold": 0.60,
    }
    return service_info
108
+
109
+
110
@app.get("/health")
def health():
    # Readiness probe: reports whether the module-level model has been loaded.
    # (Documented with comments rather than a docstring so the generated
    # OpenAPI description of this route stays unchanged.)
    is_loaded = model is not None

    if is_loaded:
        status_text = "healthy"
    else:
        status_text = "model_not_loaded"

    return {
        "status": status_text,
        "model_loaded": is_loaded,
    }
120
+
121
+
122
@app.post("/predict")
async def predict(
    file: Annotated[
        UploadFile,
        File(
            description=(
                "File audio dengan format WAV, MP3, "
                "FLAC, OGG, atau M4A."
            )
        )
    ],
    threshold: Annotated[
        float,
        Form(
            ge=0.0,
            le=1.0,
            description=(
                "Audio dianggap fake jika probability_fake "
                "lebih besar atau sama dengan threshold."
            )
        )
    ] = 0.60
):
    """
    Prediksi apakah audio termasuk real atau fake.

    Default threshold:
    0.60

    Threshold dapat diubah pada setiap request.
    """
    # NOTE: the docstring above is published by FastAPI as the route's
    # OpenAPI description, so its text is kept as-is.
    #
    # Handler contract:
    #   file      -- uploaded audio; its filename suffix must be in
    #                ALLOWED_EXTENSIONS.
    #   threshold -- form field in [0.0, 1.0]; forwarded to predict_audio.
    #   returns   -- {"filename": <original name>, **predict_audio result}.
    #   raises    -- HTTPException 503 (model not loaded), 400 (unsupported
    #                suffix, empty file, ValueError during inference),
    #                413 (upload larger than MAX_FILE_SIZE_BYTES),
    #                500 (any other inference failure).

    # Local import: keeps this handler self-contained without touching the
    # module's import block (fastapi is already a dependency of this file).
    from fastapi.concurrency import run_in_threadpool

    if model is None:
        raise HTTPException(
            status_code=503,
            detail="Model belum siap digunakan."
        )

    original_filename = file.filename or "uploaded_audio.wav"
    suffix = Path(original_filename).suffix.lower()

    if suffix not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=(
                "Format audio tidak didukung. "
                "Gunakan WAV, MP3, FLAC, OGG, atau M4A."
            )
        )

    # Read the upload in bounded chunks so an oversized request is rejected
    # as soon as it crosses the limit instead of being buffered whole in RAM.
    chunk_size = 1024 * 1024
    temp_path: Path | None = None

    try:
        bytes_received = 0

        with NamedTemporaryFile(
            delete=False,
            suffix=suffix
        ) as temp_file:
            # Record the path first so `finally` can clean up even if the
            # copy loop below aborts.
            temp_path = Path(temp_file.name)

            while chunk := await file.read(chunk_size):
                bytes_received += len(chunk)

                if bytes_received > MAX_FILE_SIZE_BYTES:
                    raise HTTPException(
                        status_code=413,
                        detail=(
                            f"Ukuran file terlalu besar. "
                            f"Maksimal {MAX_FILE_SIZE_MB} MB."
                        )
                    )

                temp_file.write(chunk)

        if bytes_received == 0:
            raise HTTPException(
                status_code=400,
                detail="File audio kosong."
            )

        # predict_audio is synchronous (audio decoding + model forward pass);
        # running it in the thread pool keeps the event loop free to serve
        # other requests while inference is in progress.
        result = await run_in_threadpool(
            predict_audio,
            model=model,
            file_path=temp_path,
            threshold=threshold
        )

        return {
            "filename": original_filename,
            **result
        }

    except HTTPException:
        # Validation errors raised above must reach the client unchanged
        # rather than being rewrapped as a 500 by the generic handler.
        raise

    except ValueError as error:
        raise HTTPException(
            status_code=400,
            detail=str(error)
        ) from error

    except Exception as error:
        raise HTTPException(
            status_code=500,
            detail=f"Inference gagal: {str(error)}"
        ) from error

    finally:
        # The temp file was created with delete=False, so remove it here.
        if (
            temp_path is not None
            and temp_path.exists()
        ):
            temp_path.unlink()
best_torchlike_mfcc_waveform_model.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31fff975bbb95599f0d8c87ad44cd5798e1621ce499905bca4754fdacea53ec9
3
+ size 13272680
custom_layers.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tensorflow as tf
2
+
3
+
4
class AdaptiveAvgPool1D(tf.keras.layers.Layer):
    """Resize the middle axis of a rank-3 tensor to a fixed length.

    NOTE: despite the name, the implementation uses bilinear
    ``tf.image.resize`` rather than average pooling.

    Args:
        output_size: target length of the resized axis.
    """

    def __init__(self, output_size, **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size

    def call(self, inputs):
        """Return ``inputs`` with axis 1 resized to ``output_size``.

        ``inputs`` is expected to be laid out as (batch, time, channels).
        """
        # (batch, time, channels) -> (batch, channels, time)
        channels_first = tf.transpose(inputs, [0, 2, 1])

        # Add a trailing singleton axis so the tensor looks like an image
        # batch: (batch, channels, time, 1).
        image_like = tf.expand_dims(channels_first, axis=-1)

        # Keep the "height" (channels) as is; only the time axis is resized.
        target_size = [tf.shape(image_like)[1], self.output_size]
        resized = tf.image.resize(
            image_like,
            size=target_size,
            method="bilinear",
        )

        # Drop the singleton axis: (batch, channels, output_size).
        without_singleton = tf.squeeze(resized, axis=-1)

        # Back to (batch, output_size, channels).
        return tf.transpose(without_singleton, [0, 2, 1])

    def get_config(self):
        """Return the layer config including ``output_size`` for serialization."""
        return {
            **super().get_config(),
            "output_size": self.output_size,
        }
53
+
54
+
55
class AdaptiveAvgPool2D(tf.keras.layers.Layer):
    """Resize the two spatial axes of a rank-4 tensor to a fixed size.

    NOTE: despite the name, the implementation uses bilinear
    ``tf.image.resize`` rather than average pooling.

    Args:
        output_size: value passed as ``size`` to ``tf.image.resize``.
    """

    def __init__(self, output_size, **kwargs):
        super().__init__(**kwargs)
        self.output_size = output_size

    def call(self, inputs):
        """Return ``inputs`` resized to ``output_size``.

        ``inputs`` is expected to be laid out as
        (batch, height, width, channels).
        """
        resized = tf.image.resize(
            inputs,
            size=self.output_size,
            method="bilinear",
        )
        return resized

    def get_config(self):
        """Return the layer config including ``output_size`` for serialization."""
        return {
            **super().get_config(),
            "output_size": self.output_size,
        }
inference.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ import librosa
5
+ import numpy as np
6
+ import tensorflow as tf
7
+
8
+
9
+ # ============================================================
10
+ # AUDIO CONFIGURATION
11
+ # Harus sama dengan preprocessing saat training
12
+ # ============================================================
13
+
14
# Target sample rate in Hz; audio is resampled to this rate when loaded.
SAMPLE_RATE = 16000
# Fixed clip length in seconds; audio is truncated or zero-padded to it.
DURATION = 2.0
# Number of samples in one fixed-length clip.
NUM_SAMPLES = int(SAMPLE_RATE * DURATION)

# Number of MFCC coefficients kept per frame.
N_MFCC = 40
# Number of mel bins used for the mel filterbank.
N_MELS = 64

# STFT parameters, in samples: window length, hop between frames, FFT size.
FRAME_LENGTH = 512
FRAME_STEP = 160
FFT_LENGTH = 512
24
+
25
+
26
def preprocess_single_audio(
    file_path: str | Path
) -> dict[str, tf.Tensor]:
    """Load one audio file and build the two input tensors for the model.

    The audio is loaded as mono at ``SAMPLE_RATE``, forced to exactly
    ``NUM_SAMPLES`` samples, and turned into a raw-waveform tensor and a
    normalized MFCC tensor.

    Args:
        file_path: path of the audio file to load.

    Returns:
        A dict with two entries:
            "waveform_input": shape (1, NUM_SAMPLES, 1)
            "mfcc_input": shape (1, N_MFCC, time_frames, 1)
    """
    # Decode, down-mix to mono and resample in a single librosa call.
    samples, _ = librosa.load(
        str(file_path),
        sr=SAMPLE_RATE,
        mono=True,
    )
    samples = samples.astype(np.float32)

    # Force a fixed length: cut anything past NUM_SAMPLES, then right-pad
    # with zeros if the clip is still too short.
    samples = samples[:NUM_SAMPLES]
    shortfall = NUM_SAMPLES - len(samples)
    if shortfall > 0:
        samples = np.pad(
            samples,
            pad_width=(0, shortfall),
            mode="constant",
        )

    signal = tf.convert_to_tensor(samples, dtype=tf.float32)

    # --------------------------------------------------------
    # Waveform branch: (samples,) -> (1, samples, 1)
    # --------------------------------------------------------
    waveform_input = tf.expand_dims(
        tf.expand_dims(signal, axis=-1),
        axis=0,
    )

    # --------------------------------------------------------
    # MFCC branch
    # --------------------------------------------------------

    # Manual centre padding (half an FFT window on each side) so framing
    # matches the training pipeline.
    half_window = FFT_LENGTH // 2
    centred_signal = tf.pad(
        signal,
        paddings=[[half_window, half_window]],
    )

    # Power spectrogram: squared magnitude of the STFT.
    stft_frames = tf.signal.stft(
        centred_signal,
        frame_length=FRAME_LENGTH,
        frame_step=FRAME_STEP,
        fft_length=FFT_LENGTH,
    )
    power = tf.square(tf.abs(stft_frames))

    # Project the linear-frequency bins onto the mel scale.
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=N_MELS,
        num_spectrogram_bins=FFT_LENGTH // 2 + 1,
        sample_rate=SAMPLE_RATE,
        lower_edge_hertz=80.0,
        upper_edge_hertz=7600.0,
    )
    mel_power = tf.matmul(power, mel_filterbank)

    # Small offset keeps the log finite for silent frames.
    log_mel = tf.math.log(mel_power + 1e-6)

    # Keep the first N_MFCC coefficients and flip (time, mfcc) -> (mfcc, time).
    coefficients = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)
    coefficients = tf.transpose(coefficients[:, :N_MFCC])

    # Normalize with the mean/std taken over the whole MFCC matrix.
    centre = tf.reduce_mean(coefficients)
    spread = tf.math.reduce_std(coefficients)
    normalized = (coefficients - centre) / (spread + 1e-6)

    # (mfcc, time) -> (1, mfcc, time, 1)
    mfcc_input = tf.expand_dims(
        tf.expand_dims(normalized, axis=-1),
        axis=0,
    )

    return {
        "waveform_input": waveform_input,
        "mfcc_input": mfcc_input,
    }
158
+
159
+
160
def predict_audio(
    model: tf.keras.Model,
    file_path: str | Path,
    threshold: float = 0.60
) -> dict[str, Any]:
    """Classify a single audio file as "real" or "fake".

    The model output is passed through a softmax; index 0 is read as the
    "real" probability and index 1 as the "fake" probability. The file is
    labelled "fake" when the fake probability is at least ``threshold``.

    Args:
        model: loaded Keras model, called with the preprocessed inputs.
        file_path: path of the audio file to classify.
        threshold: decision threshold applied to the fake probability.

    Returns:
        A dict with "prediction", "threshold", "probability_real" and
        "probability_fake" (values rounded for readability).

    Raises:
        ValueError: if ``threshold`` is outside the range 0.0 to 1.0.
    """
    threshold_in_range = 0.0 <= threshold <= 1.0
    if not threshold_in_range:
        raise ValueError(
            "Threshold harus berada pada rentang 0.0 sampai 1.0."
        )

    features = preprocess_single_audio(file_path=file_path)

    # Inference mode: training=False is passed through to the model call.
    raw_scores = model(features, training=False)

    # Softmax over the class axis, then take the single item of the batch.
    class_probabilities = tf.nn.softmax(raw_scores, axis=-1).numpy()[0]

    probability_real = float(class_probabilities[0])
    probability_fake = float(class_probabilities[1])

    if probability_fake >= threshold:
        predicted_label = "fake"
    else:
        predicted_label = "real"

    return {
        "prediction": predicted_label,
        "threshold": round(float(threshold), 4),
        "probability_real": round(probability_real, 6),
        "probability_fake": round(probability_fake, 6),
    }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ python-multipart
4
+ tensorflow-cpu
5
+ librosa
6
+ numpy
7
+ soundfile