atharvaballa committed on
Commit
7c86e01
·
1 Parent(s): 0386864

Update audio processing logic to support multiple formats

Browse files
Files changed (3) hide show
  1. app.py +16 -6
  2. audio_backend.py +36 -9
  3. audio_utils.py +37 -4
app.py CHANGED
@@ -34,8 +34,18 @@ def analyze_image(image):
34
  # AUDIO LOGIC (UNCHANGED)
35
  # =========================
36
  def analyze_audio(audio_path):
37
- label, confidence = predict_audio(audio_path)
38
-
 
 
 
 
 
 
 
 
 
 
39
  if label == "fake":
40
  if confidence >= 90:
41
  risk = '<span class="material-icons">error</span> High likelihood of deepfake'
@@ -51,7 +61,7 @@ def analyze_audio(audio_path):
51
  else:
52
  risk = '<span class="material-icons">help_outline</span> Uncertain – needs review'
53
 
54
- return label.capitalize(), f"{confidence} %", risk
55
 
56
 
57
  # =========================
@@ -85,7 +95,6 @@ with gr.Blocks(css="style.css") as demo:
85
  - Audio: WAV, MP3, FLAC, M4A, OGG formats (clear speech preferred)
86
  """)
87
 
88
-
89
  gr.Markdown("""
90
  ### How to use
91
  1. Select a detection mode using the tabs above.
@@ -173,17 +182,18 @@ with gr.Blocks(css="style.css") as demo:
173
  aud_pred = gr.Text(label="Prediction")
174
  aud_conf = gr.Text(label="Confidence")
175
  aud_risk = gr.HTML(label="Risk Assessment")
 
176
 
177
  aud_submit.click(
178
  fn=analyze_audio,
179
  inputs=audio_input,
180
- outputs=[aud_pred, aud_conf, aud_risk]
181
  )
182
 
183
  aud_clear.click(
184
  fn=lambda: (None, "", ""),
185
  inputs=None,
186
- outputs=[audio_input, aud_pred, aud_conf]
187
  )
188
 
189
 
 
34
  # AUDIO LOGIC (UNCHANGED)
35
  # =========================
36
  def analyze_audio(audio_path):
37
+ label, confidence, spec_img, error = predict_audio(audio_path)
38
+
39
+ # ---------- Error handling ----------
40
+ if error is not None:
41
+ return (
42
+ "Error",
43
+ "-",
44
+ f'<span class="material-icons">error</span> {error}',
45
+ None
46
+ )
47
+
48
+ # ---------- Risk logic ----------
49
  if label == "fake":
50
  if confidence >= 90:
51
  risk = '<span class="material-icons">error</span> High likelihood of deepfake'
 
61
  else:
62
  risk = '<span class="material-icons">help_outline</span> Uncertain – needs review'
63
 
64
+ return label.capitalize(), f"{confidence} %", risk, spec_img
65
 
66
 
67
  # =========================
 
95
  - Audio: WAV, MP3, FLAC, M4A, OGG formats (clear speech preferred)
96
  """)
97
 
 
98
  gr.Markdown("""
99
  ### How to use
100
  1. Select a detection mode using the tabs above.
 
182
  aud_pred = gr.Text(label="Prediction")
183
  aud_conf = gr.Text(label="Confidence")
184
  aud_risk = gr.HTML(label="Risk Assessment")
185
+ aud_spec = gr.Image(label="Audio Spectrogram (Model Input)",height=280)
186
 
187
  aud_submit.click(
188
  fn=analyze_audio,
189
  inputs=audio_input,
190
+ outputs=[aud_pred, aud_conf, aud_risk, aud_spec]
191
  )
192
 
193
  aud_clear.click(
194
  fn=lambda: (None, "", ""),
195
  inputs=None,
196
+ outputs=[audio_input, aud_pred, aud_conf, aud_risk, aud_spec]
197
  )
198
 
199
 
audio_backend.py CHANGED
@@ -2,22 +2,49 @@ import tensorflow as tf
2
  import numpy as np
3
  from audio_utils import audio_to_spectrogram
4
 
 
 
 
5
  MODEL_PATH = "models/audio_vit_savedmodel"
6
 
7
  model = tf.saved_model.load(MODEL_PATH)
8
  infer = model.signatures["serving_default"]
9
 
10
 
11
- def predict_audio(wav_file):
12
- spec_img = audio_to_spectrogram(wav_file)
 
 
 
 
 
 
 
 
 
13
 
14
- x = spec_img.astype("float32") / 255.0
15
- x = np.expand_dims(x, axis=0)
 
16
 
17
- preds = infer(tf.constant(x))
18
- prob = list(preds.values())[0].numpy()[0][0]
19
 
20
- label = "Fake" if prob >= 0.5 else "Real"
21
- confidence = prob * 100
22
 
23
- return label, round(confidence, 2), spec_img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  from audio_utils import audio_to_spectrogram
4
 
5
+ # ======================================================
6
+ # MODEL LOAD
7
+ # ======================================================
8
  MODEL_PATH = "models/audio_vit_savedmodel"
9
 
10
  model = tf.saved_model.load(MODEL_PATH)
11
  infer = model.signatures["serving_default"]
12
 
13
 
14
+ # ======================================================
15
+ # PREDICTION FUNCTION (UI-SAFE)
16
+ # ======================================================
17
def predict_audio(audio_file_path):
    """Run the deepfake-audio classifier on one uploaded clip (UI-safe).

    Args:
        audio_file_path: Path to the audio file to analyze.

    Returns:
        A 4-tuple ``(label, confidence, spec_img, error)``:
        - label: ``"real"`` or ``"fake"`` (``None`` on failure)
        - confidence: percentage rounded to 2 decimals (``None`` on failure)
        - spec_img: the spectrogram image fed to the model (``None`` on failure)
        - error: ``None`` on success, otherwise a user-facing message
    """
    try:
        # Preprocess: decode the clip into the model's spectrogram input.
        spec_img = audio_to_spectrogram(audio_file_path)

        # Scale to [0, 1] and add a leading batch axis of size 1.
        batch = np.expand_dims(spec_img.astype("float32") / 255.0, axis=0)

        # Run the SavedModel signature; it exposes a single output tensor.
        outputs = infer(tf.constant(batch))
        prob = next(iter(outputs.values())).numpy()[0][0]

        verdict = "fake" if prob >= 0.5 else "real"
        return verdict, round(prob * 100, 2), spec_img, None

    except ValueError as ve:
        # Expected errors (short audio, invalid input)
        return None, None, None, str(ve)

    except Exception:
        # Unexpected errors (decoding/model issues)
        return None, None, None, (
            "Unable to process the audio file. "
            "Please upload a clear audio clip in WAV, MP3, FLAC, or M4A format."
        )
audio_utils.py CHANGED
@@ -2,19 +2,47 @@ import librosa
2
  import numpy as np
3
  import cv2
4
 
 
 
 
5
  SR = 16000
6
  DURATION = 4.0
 
 
7
  N_MELS = 192
8
  N_FFT = 2048
9
  HOP_LENGTH = 160
10
  IMG_SIZE = 224
11
 
12
 
13
- def audio_to_spectrogram(wav_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- y, _ = librosa.load(wav_path, sr=SR)
16
  y, _ = librosa.effects.trim(y, top_db=30)
17
 
 
 
 
 
 
18
  target = int(SR * DURATION)
19
 
20
  if len(y) < target:
@@ -23,18 +51,23 @@ def audio_to_spectrogram(wav_path):
23
  else:
24
  y = y[:target]
25
 
 
26
  mel = librosa.feature.melspectrogram(
27
  y=y,
28
  sr=SR,
29
  n_fft=N_FFT,
30
  hop_length=HOP_LENGTH,
31
- n_mels=N_MELS
 
32
  )
33
 
34
  logmel = librosa.power_to_db(mel, ref=np.max)
35
 
36
- logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min())
 
 
37
 
 
38
  img = (logmel * 255).astype(np.uint8)
39
  img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
40
  img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
 
2
  import numpy as np
3
  import cv2
4
 
5
+ # ======================================================
6
+ # CONFIG (DO NOT CHANGE – must match training)
7
+ # ======================================================
8
  SR = 16000
9
  DURATION = 4.0
10
+ MIN_DURATION = 1.5
11
+
12
  N_MELS = 192
13
  N_FFT = 2048
14
  HOP_LENGTH = 160
15
  IMG_SIZE = 224
16
 
17
 
18
+ def audio_to_spectrogram(audio_path):
19
+ """
20
+ Universal audio preprocessing:
21
+ Supports WAV, MP3, FLAC, M4A, OGG
22
+ Internally converts everything to:
23
+ - mono
24
+ - 16 kHz
25
+ - fixed duration
26
+ """
27
+
28
+ # -------- Load audio (format-agnostic) --------
29
+ try:
30
+ y, _ = librosa.load(
31
+ audio_path,
32
+ sr=SR, # force 16 kHz
33
+ mono=True # force mono
34
+ )
35
+ except Exception as e:
36
+ raise RuntimeError(f"Audio decoding failed: {e}")
37
 
38
+ # -------- Trim silence --------
39
  y, _ = librosa.effects.trim(y, top_db=30)
40
 
41
+ # -------- Reject very short clips --------
42
+ if len(y) < int(MIN_DURATION * SR):
43
+ raise ValueError("Audio too short for reliable analysis")
44
+
45
+ # -------- Fix duration --------
46
  target = int(SR * DURATION)
47
 
48
  if len(y) < target:
 
51
  else:
52
  y = y[:target]
53
 
54
+ # -------- Log-mel spectrogram --------
55
  mel = librosa.feature.melspectrogram(
56
  y=y,
57
  sr=SR,
58
  n_fft=N_FFT,
59
  hop_length=HOP_LENGTH,
60
+ n_mels=N_MELS,
61
+ power=2.0
62
  )
63
 
64
  logmel = librosa.power_to_db(mel, ref=np.max)
65
 
66
+ # -------- Normalize (stable) --------
67
+ logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-6)
68
+ logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min() + 1e-8)
69
 
70
+ # -------- Convert to image --------
71
  img = (logmel * 255).astype(np.uint8)
72
  img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
73
  img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)