atharvaballa committed on
Commit
7c86e01
·
1 Parent(s): 0386864

Update audio processing logic to support multiple formats

Browse files
Files changed (3) hide show
  1. app.py +16 -6
  2. audio_backend.py +36 -9
  3. audio_utils.py +37 -4
app.py CHANGED
@@ -34,8 +34,18 @@ def analyze_image(image):
34
  # AUDIO LOGIC (UNCHANGED)
35
  # =========================
36
  def analyze_audio(audio_path):
37
- label, confidence = predict_audio(audio_path)
38
-
 
 
 
 
 
 
 
 
 
 
39
  if label == "fake":
40
  if confidence >= 90:
41
  risk = '<span class="material-icons">error</span> High likelihood of deepfake'
@@ -51,7 +61,7 @@ def analyze_audio(audio_path):
51
  else:
52
  risk = '<span class="material-icons">help_outline</span> Uncertain – needs review'
53
 
54
- return label.capitalize(), f"{confidence} %", risk
55
 
56
 
57
  # =========================
@@ -85,7 +95,6 @@ with gr.Blocks(css="style.css") as demo:
85
  - Audio: WAV, MP3, FLAC, M4A, OGG formats (clear speech preferred)
86
  """)
87
 
88
-
89
  gr.Markdown("""
90
  ### How to use
91
  1. Select a detection mode using the tabs above.
@@ -173,17 +182,18 @@ with gr.Blocks(css="style.css") as demo:
173
  aud_pred = gr.Text(label="Prediction")
174
  aud_conf = gr.Text(label="Confidence")
175
  aud_risk = gr.HTML(label="Risk Assessment")
 
176
 
177
  aud_submit.click(
178
  fn=analyze_audio,
179
  inputs=audio_input,
180
- outputs=[aud_pred, aud_conf, aud_risk]
181
  )
182
 
183
  aud_clear.click(
184
  fn=lambda: (None, "", ""),
185
  inputs=None,
186
- outputs=[audio_input, aud_pred, aud_conf]
187
  )
188
 
189
 
 
34
  # AUDIO LOGIC (UNCHANGED)
35
  # =========================
36
  def analyze_audio(audio_path):
37
+ label, confidence, spec_img, error = predict_audio(audio_path)
38
+
39
+ # ---------- Error handling ----------
40
+ if error is not None:
41
+ return (
42
+ "Error",
43
+ "-",
44
+ f'<span class="material-icons">error</span> {error}',
45
+ None
46
+ )
47
+
48
+ # ---------- Risk logic ----------
49
  if label == "fake":
50
  if confidence >= 90:
51
  risk = '<span class="material-icons">error</span> High likelihood of deepfake'
 
61
  else:
62
  risk = '<span class="material-icons">help_outline</span> Uncertain – needs review'
63
 
64
+ return label.capitalize(), f"{confidence} %", risk, spec_img
65
 
66
 
67
  # =========================
 
95
  - Audio: WAV, MP3, FLAC, M4A, OGG formats (clear speech preferred)
96
  """)
97
 
 
98
  gr.Markdown("""
99
  ### How to use
100
  1. Select a detection mode using the tabs above.
 
182
  aud_pred = gr.Text(label="Prediction")
183
  aud_conf = gr.Text(label="Confidence")
184
  aud_risk = gr.HTML(label="Risk Assessment")
185
+ aud_spec = gr.Image(label="Audio Spectrogram (Model Input)",height=280)
186
 
187
  aud_submit.click(
188
  fn=analyze_audio,
189
  inputs=audio_input,
190
+ outputs=[aud_pred, aud_conf, aud_risk, aud_spec]
191
  )
192
 
193
  aud_clear.click(
194
  fn=lambda: (None, "", ""),
195
  inputs=None,
196
+ outputs=[audio_input, aud_pred, aud_conf, aud_risk, aud_spec]
197
  )
198
 
199
 
audio_backend.py CHANGED
@@ -2,22 +2,49 @@ import tensorflow as tf
2
  import numpy as np
3
  from audio_utils import audio_to_spectrogram
4
 
 
 
 
5
  MODEL_PATH = "models/audio_vit_savedmodel"
6
 
7
  model = tf.saved_model.load(MODEL_PATH)
8
  infer = model.signatures["serving_default"]
9
 
10
 
11
- def predict_audio(wav_file):
12
- spec_img = audio_to_spectrogram(wav_file)
 
 
 
 
 
 
 
 
 
13
 
14
- x = spec_img.astype("float32") / 255.0
15
- x = np.expand_dims(x, axis=0)
 
16
 
17
- preds = infer(tf.constant(x))
18
- prob = list(preds.values())[0].numpy()[0][0]
19
 
20
- label = "Fake" if prob >= 0.5 else "Real"
21
- confidence = prob * 100
22
 
23
- return label, round(confidence, 2), spec_img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  from audio_utils import audio_to_spectrogram
4
 
5
+ # ======================================================
6
+ # MODEL LOAD
7
+ # ======================================================
8
  MODEL_PATH = "models/audio_vit_savedmodel"
9
 
10
  model = tf.saved_model.load(MODEL_PATH)
11
  infer = model.signatures["serving_default"]
12
 
13
 
14
+ # ======================================================
15
+ # PREDICTION FUNCTION (UI-SAFE)
16
+ # ======================================================
17
def predict_audio(audio_file_path):
    """Run the deepfake-audio classifier on one uploaded clip (UI-safe).

    Args:
        audio_file_path: Path to the audio file to analyze.

    Returns:
        A 4-tuple ``(label, confidence, spec_img, error)``:
        - label: ``"real"`` or ``"fake"`` (``None`` on failure)
        - confidence: percentage rounded to 2 decimals (``None`` on failure)
        - spec_img: the spectrogram image fed to the model (``None`` on failure)
        - error: ``None`` on success, otherwise a user-facing message
    """
    try:
        # Preprocess: decode the clip into the model's spectrogram input.
        spec_img = audio_to_spectrogram(audio_file_path)

        # Scale to [0, 1] and add a leading batch axis of size 1.
        batch = np.expand_dims(spec_img.astype("float32") / 255.0, axis=0)

        # Run the SavedModel signature; it exposes a single output tensor.
        outputs = infer(tf.constant(batch))
        prob = next(iter(outputs.values())).numpy()[0][0]

        verdict = "fake" if prob >= 0.5 else "real"
        return verdict, round(prob * 100, 2), spec_img, None

    except ValueError as ve:
        # Expected errors (short audio, invalid input)
        return None, None, None, str(ve)

    except Exception:
        # Unexpected errors (decoding/model issues)
        return None, None, None, (
            "Unable to process the audio file. "
            "Please upload a clear audio clip in WAV, MP3, FLAC, or M4A format."
        )
audio_utils.py CHANGED
@@ -2,19 +2,47 @@ import librosa
2
  import numpy as np
3
  import cv2
4
 
 
 
 
5
  SR = 16000
6
  DURATION = 4.0
 
 
7
  N_MELS = 192
8
  N_FFT = 2048
9
  HOP_LENGTH = 160
10
  IMG_SIZE = 224
11
 
12
 
13
- def audio_to_spectrogram(wav_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- y, _ = librosa.load(wav_path, sr=SR)
16
  y, _ = librosa.effects.trim(y, top_db=30)
17
 
 
 
 
 
 
18
  target = int(SR * DURATION)
19
 
20
  if len(y) < target:
@@ -23,18 +51,23 @@ def audio_to_spectrogram(wav_path):
23
  else:
24
  y = y[:target]
25
 
 
26
  mel = librosa.feature.melspectrogram(
27
  y=y,
28
  sr=SR,
29
  n_fft=N_FFT,
30
  hop_length=HOP_LENGTH,
31
- n_mels=N_MELS
 
32
  )
33
 
34
  logmel = librosa.power_to_db(mel, ref=np.max)
35
 
36
- logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min())
 
 
37
 
 
38
  img = (logmel * 255).astype(np.uint8)
39
  img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
40
  img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
 
2
  import numpy as np
3
  import cv2
4
 
5
+ # ======================================================
6
+ # CONFIG (DO NOT CHANGE – must match training)
7
+ # ======================================================
8
  SR = 16000
9
  DURATION = 4.0
10
+ MIN_DURATION = 1.5
11
+
12
  N_MELS = 192
13
  N_FFT = 2048
14
  HOP_LENGTH = 160
15
  IMG_SIZE = 224
16
 
17
 
18
+ def audio_to_spectrogram(audio_path):
19
+ """
20
+ Universal audio preprocessing:
21
+ Supports WAV, MP3, FLAC, M4A, OGG
22
+ Internally converts everything to:
23
+ - mono
24
+ - 16 kHz
25
+ - fixed duration
26
+ """
27
+
28
+ # -------- Load audio (format-agnostic) --------
29
+ try:
30
+ y, _ = librosa.load(
31
+ audio_path,
32
+ sr=SR, # force 16 kHz
33
+ mono=True # force mono
34
+ )
35
+ except Exception as e:
36
+ raise RuntimeError(f"Audio decoding failed: {e}")
37
 
38
+ # -------- Trim silence --------
39
  y, _ = librosa.effects.trim(y, top_db=30)
40
 
41
+ # -------- Reject very short clips --------
42
+ if len(y) < int(MIN_DURATION * SR):
43
+ raise ValueError("Audio too short for reliable analysis")
44
+
45
+ # -------- Fix duration --------
46
  target = int(SR * DURATION)
47
 
48
  if len(y) < target:
 
51
  else:
52
  y = y[:target]
53
 
54
+ # -------- Log-mel spectrogram --------
55
  mel = librosa.feature.melspectrogram(
56
  y=y,
57
  sr=SR,
58
  n_fft=N_FFT,
59
  hop_length=HOP_LENGTH,
60
+ n_mels=N_MELS,
61
+ power=2.0
62
  )
63
 
64
  logmel = librosa.power_to_db(mel, ref=np.max)
65
 
66
+ # -------- Normalize (stable) --------
67
+ logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-6)
68
+ logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min() + 1e-8)
69
 
70
+ # -------- Convert to image --------
71
  img = (logmel * 255).astype(np.uint8)
72
  img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
73
  img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)