Spaces:

Syahhh01
/

AudioCapsDetectorV2

Running

App Files Files Community

syahh-coder committed about 14 hours ago

Commit

368e1c4

1 Parent(s): d966fa6

Deploy Capst

Browse files

Files changed (8) hide show

.gitattributes +1 -0
Dockerfile +31 -0
README.md +5 -6
app.py +233 -0
best_torchlike_mfcc_waveform_model.keras +3 -0
custom_layers.py +75 -0
inference.py +213 -0
requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.keras filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# Image for the Deepfake Audio Detection API (FastAPI app served by uvicorn).
+FROM python:3.11-slim
+# Extra system libraries so that librosa can read a variety of audio formats
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ffmpeg \
+        libsndfile1 && \
+    rm -rf /var/lib/apt/lists/*
+# Hugging Face Docker Spaces run with user ID 1000
+RUN useradd -m -u 1000 user
+USER user
+# /home/user/.local/bin is put on PATH because pip runs as the non-root user below.
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH \
+    PYTHONUNBUFFERED=1
+WORKDIR $HOME/app
+# requirements.txt is copied on its own, before the rest of the sources,
+# so the dependency install is a separate layer from the application code.
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir \
+    --upgrade pip && \
+    pip install --no-cache-dir \
+    -r requirements.txt
+COPY --chown=user . .
+# 7860 is the same port declared as app_port in the Space README.
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,9 @@
 ---
-title: AudioCapsDetectorV2
-emoji: 💻
 colorFrom: blue
-colorTo: blue
 sdk: docker
 pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Deepfake Audio Detection API
+emoji: 🎙️
 colorFrom: blue
+colorTo: purple
 sdk: docker
+app_port: 7860
 pinned: false
+---

app.py ADDED Viewed

	@@ -0,0 +1,233 @@

+import asyncio
+from contextlib import asynccontextmanager
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Annotated
+
+import tensorflow as tf
+from fastapi import (
+    FastAPI,
+    File,
+    Form,
+    HTTPException,
+    UploadFile
+)
+
+from custom_layers import (
+    AdaptiveAvgPool1D,
+    AdaptiveAvgPool2D
+)
+from inference import predict_audio
+# ============================================================
+# CONFIGURATION
+# ============================================================
+# Serialized Keras model file; a relative path, so it is resolved against the
+# process working directory.
+MODEL_PATH = Path("best_torchlike_mfcc_waveform_model.keras")
+# File extensions accepted by /predict (the handler lower-cases the suffix
+# before checking membership).
+ALLOWED_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
+# Upload size cap, expressed in megabytes and derived in bytes.
+MAX_FILE_SIZE_MB = 20
+MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
+# Set by the lifespan handler at startup and reset to None on shutdown.
+model: tf.keras.Model | None = None
+# ============================================================
+# LOAD MODEL ON STARTUP
+# ============================================================
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load the Keras model before the app serves requests; drop it afterwards.
+
+    Assigns the module-level ``model`` global, yields while the application
+    runs, then resets the global to ``None``.
+
+    Raises:
+        FileNotFoundError: if ``MODEL_PATH`` does not exist at startup.
+    """
+    global model
+    # Fail fast with a clear message instead of letting the loader raise.
+    model_file_present = MODEL_PATH.exists()
+    if not model_file_present:
+        missing_message = f"Model tidak ditemukan: {MODEL_PATH}"
+        raise FileNotFoundError(missing_message)
+    print("Loading model...")
+    # The saved model references these custom layers by name.
+    layer_registry = {
+        "AdaptiveAvgPool1D": AdaptiveAvgPool1D,
+        "AdaptiveAvgPool2D": AdaptiveAvgPool2D,
+    }
+    # compile=False: the model is only called for inference in this service.
+    model = tf.keras.models.load_model(
+        MODEL_PATH, custom_objects=layer_registry, compile=False
+    )
+    print("Model loaded successfully.")
+    yield
+    model = None
+# ============================================================
+# FASTAPI APP
+# ============================================================
+# Application instance; the lifespan handler loads the model at startup.
+app = FastAPI(
+    lifespan=lifespan,
+    version="1.0.0",
+    title="Deepfake Audio Detection API",
+    description="REST API untuk mendeteksi audio real atau fake menggunakan model MFCC + Waveform.",
+)
+# ============================================================
+# ROUTES
+# ============================================================
+@app.get("/")
+def root():
+    # Landing payload: service name, status, and pointers to the docs and the
+    # prediction endpoint. Documented with comments rather than a docstring
+    # because FastAPI publishes a route's docstring in the OpenAPI schema.
+    return dict(
+        message="Deepfake Audio Detection API",
+        status="running",
+        docs="/docs",
+        predict_endpoint="/predict",
+        default_threshold=0.60,
+    )
+@app.get("/health")
+def health():
+    # Reports whether the module-level model has been loaded. Documented with
+    # comments rather than a docstring because FastAPI publishes a route's
+    # docstring in the OpenAPI schema.
+    is_loaded = model is not None
+    if is_loaded:
+        state = "healthy"
+    else:
+        state = "model_not_loaded"
+    return {"status": state, "model_loaded": is_loaded}
+@app.post("/predict")
+async def predict(
+    file: Annotated[
+        UploadFile,
+        File(
+            description=(
+                "File audio dengan format WAV, MP3, "
+                "FLAC, OGG, atau M4A."
+            )
+        )
+    ],
+    threshold: Annotated[
+        float,
+        Form(
+            ge=0.0,
+            le=1.0,
+            description=(
+                "Audio dianggap fake jika probability_fake "
+                "lebih besar atau sama dengan threshold."
+            )
+        )
+    ] = 0.60
+):
+    """
+    Prediksi apakah audio termasuk real atau fake.
+    Default threshold:
+        0.60
+    Threshold dapat diubah pada setiap request.
+    """
+    # NOTE: the docstring above is deliberately left byte-identical; FastAPI
+    # publishes a route's docstring as the operation description in the
+    # OpenAPI schema, so it is user-facing text.
+    #
+    # Flow: validate the upload (extension, non-empty, size cap), spill it to
+    # a temporary file, run inference in a worker thread, and always remove
+    # the temporary file afterwards.
+    # Responses: 503 when the model is not loaded; 400 for an unsupported
+    # extension, an empty file, or a ValueError raised by inference; 413 for
+    # an oversized file; 500 for any other inference failure.
+    if model is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Model belum siap digunakan."
+        )
+    original_filename = file.filename or "uploaded_audio.wav"
+    suffix = Path(original_filename).suffix.lower()
+    if suffix not in ALLOWED_EXTENSIONS:
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                "Format audio tidak didukung. "
+                "Gunakan WAV, MP3, FLAC, OGG, atau M4A."
+            )
+        )
+    # Read at most one byte past the limit: enough to detect an oversized
+    # upload without ever holding more than the cap (plus one byte) in memory.
+    file_content = await file.read(MAX_FILE_SIZE_BYTES + 1)
+    if not file_content:
+        raise HTTPException(
+            status_code=400,
+            detail="File audio kosong."
+        )
+    if len(file_content) > MAX_FILE_SIZE_BYTES:
+        raise HTTPException(
+            status_code=413,
+            detail=(
+                f"Ukuran file terlalu besar. "
+                f"Maksimal {MAX_FILE_SIZE_MB} MB."
+            )
+        )
+    temp_path: Path | None = None
+    try:
+        with NamedTemporaryFile(
+            delete=False,
+            suffix=suffix
+        ) as temp_file:
+            # Record the path BEFORE writing so the `finally` clause can still
+            # remove the file if the write itself fails (delete=False means
+            # nothing else cleans it up).
+            temp_path = Path(temp_file.name)
+            temp_file.write(file_content)
+        # Audio decoding and model inference are blocking, CPU-bound work;
+        # running them in a worker thread keeps the event loop free to serve
+        # other requests (e.g. /health) in the meantime.
+        result = await asyncio.to_thread(
+            predict_audio,
+            model=model,
+            file_path=temp_path,
+            threshold=threshold
+        )
+        return {
+            "filename": original_filename,
+            **result
+        }
+    except ValueError as error:
+        raise HTTPException(
+            status_code=400,
+            detail=str(error)
+        ) from error
+    except Exception as error:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Inference gagal: {str(error)}"
+        ) from error
+    finally:
+        # missing_ok avoids the exists()/unlink() race of a separate check.
+        if temp_path is not None:
+            temp_path.unlink(missing_ok=True)

best_torchlike_mfcc_waveform_model.keras ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31fff975bbb95599f0d8c87ad44cd5798e1621ce499905bca4754fdacea53ec9
+size 13272680

custom_layers.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import tensorflow as tf
+class AdaptiveAvgPool1D(tf.keras.layers.Layer):
+    """Resize the time axis of a (batch, time, channels) tensor to a fixed length.
+
+    Despite the name, the resampling is done with ``tf.image.resize`` using
+    bilinear interpolation rather than by averaging bins.
+    NOTE(review): presumably this mirrors the layer used when the saved model
+    was trained -- confirm before changing the interpolation.
+
+    Args:
+        output_size: target length of the time axis.
+    """
+    def __init__(self, output_size, **kwargs):
+        super().__init__(**kwargs)
+        self.output_size = output_size
+    def call(self, inputs):
+        # (batch, time, channels) -> (batch, channels, time)
+        channels_first = tf.transpose(inputs, perm=[0, 2, 1])
+        # Add a trailing axis so the tensor is 4-D for tf.image.resize:
+        # (batch, channels, time, 1)
+        as_image = tf.expand_dims(channels_first, axis=-1)
+        # Keep the channel axis as-is and resample only the time axis.
+        target_size = [tf.shape(as_image)[1], self.output_size]
+        resized = tf.image.resize(
+            as_image, size=target_size, method="bilinear"
+        )
+        # (batch, channels, output_size, 1) -> (batch, channels, output_size)
+        without_extra_axis = tf.squeeze(resized, axis=-1)
+        # Back to (batch, output_size, channels)
+        return tf.transpose(without_extra_axis, perm=[0, 2, 1])
+    def get_config(self):
+        # Include output_size so the layer can be rebuilt from its config.
+        return {**super().get_config(), "output_size": self.output_size}
+class AdaptiveAvgPool2D(tf.keras.layers.Layer):
+    """Resize a (batch, height, width, channels) tensor to a fixed spatial size.
+
+    Despite the name, the resampling is done with ``tf.image.resize`` using
+    bilinear interpolation rather than by averaging bins.
+    NOTE(review): presumably this mirrors the layer used when the saved model
+    was trained -- confirm before changing the interpolation.
+
+    Args:
+        output_size: target size handed to ``tf.image.resize`` as ``size``.
+    """
+    def __init__(self, output_size, **kwargs):
+        super().__init__(**kwargs)
+        self.output_size = output_size
+    def call(self, inputs):
+        # inputs: (batch, height, width, channels)
+        target_hw = self.output_size
+        resized = tf.image.resize(inputs, size=target_hw, method="bilinear")
+        return resized
+    def get_config(self):
+        # Include output_size so the layer can be rebuilt from its config.
+        return {**super().get_config(), "output_size": self.output_size}

inference.py ADDED Viewed

	@@ -0,0 +1,213 @@

+from pathlib import Path
+from typing import Any
+import librosa
+import numpy as np
+import tensorflow as tf
+# ============================================================
+# AUDIO CONFIGURATION
+# Must be identical to the preprocessing used during training
+# ============================================================
+# Sample rate (Hz) the audio is resampled to when loaded.
+SAMPLE_RATE = 16000
+# Clip length in seconds; audio is truncated or zero-padded to this duration.
+DURATION = 2.0
+# Fixed number of samples per clip (32000 with the values above).
+NUM_SAMPLES = int(SAMPLE_RATE * DURATION)
+# Number of MFCC coefficients kept per frame.
+N_MFCC = 40
+# Number of mel bins in the mel filterbank.
+N_MELS = 64
+# STFT window length, hop size and FFT size, all in samples.
+FRAME_LENGTH = 512
+FRAME_STEP = 160
+FFT_LENGTH = 512
+def preprocess_single_audio(
+    file_path: str | Path
+) -> dict[str, tf.Tensor]:
+    """
+    Load and preprocess a single audio file.
+
+    The audio is loaded as mono at SAMPLE_RATE, truncated or zero-padded to
+    exactly NUM_SAMPLES, and turned into two model inputs: the raw waveform
+    and a globally normalized MFCC matrix.
+
+    Args:
+        file_path: path of the audio file to load.
+
+    Returns:
+        {
+            "waveform_input": shape (1, 32000, 1),
+            "mfcc_input": shape (1, 40, time_frames, 1)
+        }
+        NOTE(review): with the current constants time_frames should be 201
+        (1 + 32000 // 160) -- confirm against the model's input spec.
+    """
+    file_path = str(file_path)
+    # Load the audio, convert it to mono, then resample to 16 kHz
+    audio, _ = librosa.load(
+        file_path,
+        sr=SAMPLE_RATE,
+        mono=True
+    )
+    audio = audio.astype(np.float32)
+    # Truncate or zero-pad at the end so the audio is exactly 2 seconds long
+    if len(audio) > NUM_SAMPLES:
+        audio = audio[:NUM_SAMPLES]
+    elif len(audio) < NUM_SAMPLES:
+        padding_size = NUM_SAMPLES - len(audio)
+        audio = np.pad(
+            audio,
+            pad_width=(0, padding_size),
+            mode="constant"
+        )
+    audio_tensor = tf.convert_to_tensor(
+        audio,
+        dtype=tf.float32
+    )
+    # ========================================================
+    # WAVEFORM INPUT
+    # Shape: (batch, samples, channel)
+    # ========================================================
+    waveform_input = tf.expand_dims(
+        audio_tensor,
+        axis=-1
+    )
+    waveform_input = tf.expand_dims(
+        waveform_input,
+        axis=0
+    )
+    # ========================================================
+    # MFCC INPUT
+    # ========================================================
+    # Manual centre padding (FFT_LENGTH // 2 zeros on each side) so the
+    # framing matches the training pipeline
+    pad = FFT_LENGTH // 2
+    audio_centered = tf.pad(
+        audio_tensor,
+        paddings=[[pad, pad]]
+    )
+    stft = tf.signal.stft(
+        audio_centered,
+        frame_length=FRAME_LENGTH,
+        frame_step=FRAME_STEP,
+        fft_length=FFT_LENGTH
+    )
+    # Magnitude, then power spectrogram: (time, FFT_LENGTH // 2 + 1)
+    spectrogram = tf.abs(stft)
+    power_spectrogram = tf.square(spectrogram)
+    num_spectrogram_bins = FFT_LENGTH // 2 + 1
+    # Mel filterbank covering 80 Hz to 7600 Hz
+    mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
+        num_mel_bins=N_MELS,
+        num_spectrogram_bins=num_spectrogram_bins,
+        sample_rate=SAMPLE_RATE,
+        lower_edge_hertz=80.0,
+        upper_edge_hertz=7600.0
+    )
+    # Project onto the mel scale: (time, N_MELS)
+    mel_spectrogram = tf.matmul(
+        power_spectrogram,
+        mel_weight_matrix
+    )
+    # The small offset keeps the logarithm finite where the energy is zero
+    log_mel_spectrogram = tf.math.log(
+        mel_spectrogram + 1e-6
+    )
+    mfcc = tf.signal.mfccs_from_log_mel_spectrograms(
+        log_mel_spectrogram
+    )
+    # Keep the first 40 MFCC coefficients
+    mfcc = mfcc[:, :N_MFCC]
+    # Change the shape from (time, mfcc) to (mfcc, time)
+    mfcc = tf.transpose(mfcc)
+    # Normalize the MFCCs with a single mean/std computed over the whole
+    # matrix (not per coefficient); the epsilon guards against a zero std
+    mean = tf.reduce_mean(mfcc)
+    std = tf.math.reduce_std(mfcc)
+    mfcc = (
+        (mfcc - mean)
+        / (std + 1e-6)
+    )
+    # Shape: (batch, mfcc, time, channel)
+    mfcc_input = tf.expand_dims(
+        mfcc,
+        axis=-1
+    )
+    mfcc_input = tf.expand_dims(
+        mfcc_input,
+        axis=0
+    )
+    return {
+        "waveform_input": waveform_input,
+        "mfcc_input": mfcc_input
+    }
+def predict_audio(
+    model: tf.keras.Model,
+    file_path: str | Path,
+    threshold: float = 0.60
+) -> dict[str, Any]:
+    """
+    Run a prediction on a single audio file.
+
+    The model output is passed through a softmax; index 0 is read as the
+    "real" class and index 1 as the "fake" class. The threshold is applied
+    to the fake probability.
+
+    Args:
+        model: loaded Keras model, called with the preprocessed inputs.
+        file_path: path of the audio file to classify.
+        threshold: the clip is labelled "fake" when probability_fake is
+            greater than or equal to this value.
+
+    Returns:
+        Dict with "prediction" ("fake" or "real"), "threshold" (rounded to
+        4 decimals), and "probability_real" / "probability_fake" (rounded
+        to 6 decimals).
+
+    Raises:
+        ValueError: if threshold is outside the range 0.0 to 1.0.
+    """
+    threshold_in_range = 0.0 <= threshold <= 1.0
+    if not threshold_in_range:
+        raise ValueError(
+            "Threshold harus berada pada rentang 0.0 sampai 1.0."
+        )
+    features = preprocess_single_audio(file_path=file_path)
+    raw_scores = model(features, training=False)
+    # Softmax over the class axis, then take the only item in the batch.
+    class_probs = tf.nn.softmax(raw_scores, axis=-1).numpy()[0]
+    probability_real = float(class_probs[0])
+    probability_fake = float(class_probs[1])
+    if probability_fake >= threshold:
+        predicted_label = "fake"
+    else:
+        predicted_label = "real"
+    return {
+        "prediction": predicted_label,
+        "threshold": round(float(threshold), 4),
+        "probability_real": round(probability_real, 6),
+        "probability_fake": round(probability_fake, 6),
+    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+fastapi
+uvicorn[standard]
+python-multipart
+tensorflow-cpu
+librosa
+numpy
+soundfile