# Provenance: Hugging Face upload by itssKarthiii, commit 6b408d7 ("Upload 70 files").
"""
Audio Forensics Analyzer for deepfake detection.
Extracts low-level audio features that help distinguish
AI-generated speech from human speech.
"""
from typing import Optional

import numpy as np
from scipy import signal
from scipy.fft import fft

from app.utils.logger import get_logger
logger = get_logger(__name__)
class AudioForensicsAnalyzer:
    """
    Forensic analysis of audio to detect AI-generated patterns.

    Analyzes spectral characteristics, pitch stability, and silence patterns
    to identify artifacts typical of neural vocoders.
    """

    def __init__(self, sample_rate: int = 16000) -> None:
        """Initialize analyzer with the expected input sample rate in Hz."""
        self.sample_rate = sample_rate

    def analyze(self, audio_array: np.ndarray) -> dict:
        """
        Perform comprehensive forensic analysis on audio.

        Args:
            audio_array: Normalized audio samples (16kHz, mono)

        Returns:
            Dictionary with forensic metrics and AI likelihood indicators

        Raises:
            ValueError: If audio_array contains no samples.
        """
        if audio_array.size == 0:
            # Fail fast with a clear message; previously an empty input
            # crashed with an opaque IndexError inside the spectral analysis.
            raise ValueError("audio_array must contain at least one sample")
        logger.debug("Starting forensic audio analysis")
        # Compute all forensic features
        spectral = self._analyze_spectral(audio_array)
        temporal = self._analyze_temporal(audio_array)
        pitch = self._analyze_pitch_stability(audio_array)
        energy = self._analyze_energy_patterns(audio_array)
        # Combine into forensic report
        forensics = {
            "spectral": spectral,
            "temporal": temporal,
            "pitch": pitch,
            "energy": energy,
            "ai_indicators": self._compute_ai_indicators(spectral, temporal, pitch, energy),
        }
        logger.debug("Forensic analysis complete", indicators=forensics["ai_indicators"])
        return forensics

    def _analyze_spectral(self, audio: np.ndarray) -> dict:
        """Analyze spectral characteristics (centroid, flatness, rolloff, bandwidth)."""
        # Compute FFT magnitude; keep only the positive-frequency half.
        n = len(audio)
        fft_vals = np.abs(fft(audio))[:n // 2]
        freqs = np.fft.fftfreq(n, 1 / self.sample_rate)[:n // 2]
        # Spectral centroid (center of mass of spectrum).
        # The +1e-10 terms throughout guard against division by zero on silence.
        spectral_centroid = np.sum(freqs * fft_vals) / (np.sum(fft_vals) + 1e-10)
        # Spectral flatness (noise-like vs tonal); geometric/arithmetic mean ratio.
        # AI often has higher flatness in certain bands.
        geometric_mean = np.exp(np.mean(np.log(fft_vals + 1e-10)))
        arithmetic_mean = np.mean(fft_vals) + 1e-10
        spectral_flatness = geometric_mean / arithmetic_mean
        # Spectral rolloff: frequency below which 85% of the cumulative FFT
        # magnitude is contained (magnitude, not power/energy).
        cumsum = np.cumsum(fft_vals)
        rolloff_idx = np.searchsorted(cumsum, 0.85 * cumsum[-1])
        spectral_rolloff = freqs[min(rolloff_idx, len(freqs) - 1)]
        # Spectral bandwidth: magnitude-weighted spread around the centroid.
        spectral_bandwidth = np.sqrt(
            np.sum(((freqs - spectral_centroid) ** 2) * fft_vals) / (np.sum(fft_vals) + 1e-10)
        )
        return {
            "centroid_hz": round(float(spectral_centroid), 2),
            "flatness": round(float(spectral_flatness), 4),
            "rolloff_hz": round(float(spectral_rolloff), 2),
            "bandwidth_hz": round(float(spectral_bandwidth), 2),
        }

    def _analyze_temporal(self, audio: np.ndarray) -> dict:
        """Analyze temporal characteristics (ZCR, RMS, energy variance, silence)."""
        # Zero crossing rate (how often the signal crosses zero, per sample).
        zero_crossings = np.sum(np.abs(np.diff(np.sign(audio)))) / 2
        zcr = zero_crossings / len(audio)
        # RMS energy of the whole clip.
        rms = np.sqrt(np.mean(audio ** 2))
        # Short-time energy variance (humans have more frame-to-frame variation).
        frame_size = int(0.025 * self.sample_rate)  # 25ms frames
        hop_size = int(0.010 * self.sample_rate)    # 10ms hop
        energies = []
        for i in range(0, len(audio) - frame_size, hop_size):
            frame = audio[i:i + frame_size]
            energies.append(np.sum(frame ** 2))
        energy_variance = np.var(energies) if energies else 0
        # Silence ratio (AI often has different silence patterns).
        # Threshold is relative to the clip's own peak amplitude.
        silence_threshold = 0.01 * np.max(np.abs(audio))
        silence_samples = np.sum(np.abs(audio) < silence_threshold)
        silence_ratio = silence_samples / len(audio)
        return {
            "zero_crossing_rate": round(float(zcr), 6),
            "rms_energy": round(float(rms), 6),
            "energy_variance": round(float(energy_variance), 8),
            "silence_ratio": round(float(silence_ratio), 4),
        }

    def _analyze_pitch_stability(self, audio: np.ndarray) -> dict:
        """
        Analyze pitch stability via frame-wise autocorrelation.

        AI-generated speech often has unnaturally stable pitch.
        Humans have natural pitch variations (jitter).
        """
        frame_size = int(0.030 * self.sample_rate)  # 30ms frames
        hop_size = int(0.010 * self.sample_rate)    # 10ms hop
        pitches = []
        for i in range(0, len(audio) - frame_size, hop_size):
            frame = audio[i:i + frame_size]
            # Autocorrelation; keep the non-negative-lag half.
            corr = np.correlate(frame, frame, mode='full')
            corr = corr[len(corr) // 2:]
            # Skip the initial decay: search from the first rising lag onward.
            d = np.diff(corr)
            rising = np.where(d > 0)[0]
            if len(rising) > 0:
                first_rise = rising[0]
                peak = np.argmax(corr[first_rise:]) + first_rise
                # Require the peak to be reasonably strong relative to lag 0
                # before trusting it as a pitch period.
                if peak > 0 and corr[peak] > 0.3 * corr[0]:
                    pitch = self.sample_rate / peak
                    if 50 < pitch < 500:  # Human voice range
                        pitches.append(pitch)
        if len(pitches) < 2:
            # Not enough voiced frames to measure variation.
            return {
                "mean_pitch_hz": 0,
                "pitch_std": 0,
                "pitch_stability": 1.0,  # Unknown = assume stable
                "jitter": 0,
            }
        pitches = np.array(pitches)
        mean_pitch = np.mean(pitches)
        pitch_std = np.std(pitches)
        # Pitch stability (inverse of relative variation) - high = AI-like.
        pitch_stability = 1.0 / (1.0 + pitch_std / (mean_pitch + 1e-10))
        # Jitter (frame-to-frame pitch variation, normalized) - low = AI-like.
        jitter = np.mean(np.abs(np.diff(pitches))) / (mean_pitch + 1e-10)
        return {
            "mean_pitch_hz": round(float(mean_pitch), 2),
            "pitch_std": round(float(pitch_std), 4),
            "pitch_stability": round(float(pitch_stability), 4),
            "jitter": round(float(jitter), 6),
        }

    def _analyze_energy_patterns(self, audio: np.ndarray) -> dict:
        """Analyze energy envelope patterns (roughness, peak consistency, range)."""
        # Amplitude envelope via the analytic signal (Hilbert transform).
        analytic_signal = signal.hilbert(audio)
        envelope = np.abs(analytic_signal)
        # Envelope smoothness (AI is often smoother): mean absolute first difference.
        envelope_diff = np.abs(np.diff(envelope))
        envelope_roughness = np.mean(envelope_diff)
        # Attack/decay characteristics: find amplitude peaks above 10% of max.
        peaks, _ = signal.find_peaks(envelope, height=0.1 * np.max(envelope))
        if len(peaks) > 1:
            # Measure consistency of peak heights (AI is more consistent).
            peak_heights = envelope[peaks]
            peak_consistency = 1.0 - (np.std(peak_heights) / (np.mean(peak_heights) + 1e-10))
        else:
            # Too few peaks to judge; use a neutral value.
            peak_consistency = 0.5
        return {
            "envelope_roughness": round(float(envelope_roughness), 6),
            "peak_consistency": round(float(peak_consistency), 4),
            "dynamic_range": round(float(np.max(envelope) - np.min(envelope)), 4),
        }

    def _compute_ai_indicators(
        self,
        spectral: dict,
        temporal: dict,
        pitch: dict,
        energy: dict,
    ) -> dict:
        """
        Compute features indicating AI generation (tuned for modern TTS).

        Modern AI (ElevenLabs etc) adds simulated breaths and jitter, so we must
        be more sensitive to 'slightly too perfect' signals.

        Args:
            spectral/temporal/pitch/energy: Feature dicts produced by the
                corresponding _analyze_* methods.

        Returns:
            Dict of per-feature indicator scores plus "combined_ai_likelihood"
            in [0, 1].
        """
        indicators = {}
        # 1. Pitch Consistency
        # AI pitch tracks are smoother than human vocal cords even with simulated emotion.
        pitch_stability = pitch.get("pitch_stability", 0.5)
        # RELAXED: scales up to 1.0 as stability approaches 0.75
        # This prevents high-quality human voice from flagging.
        indicators["pitch_regularity"] = min(1.0, pitch_stability / 0.75)
        # 2. Jitter (Micro-fluctuations)
        # Real voices have chaotic micro-tremors. AI simulates them but often perfectly.
        jitter = pitch.get("jitter", 0.02)
        # RELAXED: Only extremely low jitter (<0.025) is suspicious.
        indicators["low_jitter"] = max(0.0, 1.0 - (jitter / 0.025))
        # 3. Energy/Envelope Smoothness
        # Neural vocoders produce smoother envelopes than air pressure from lungs.
        roughness = energy.get("envelope_roughness", 0.01)
        # RELAXED: < 0.03 is suspicious.
        indicators["smooth_envelope"] = max(0.0, 1.0 - (roughness / 0.03))
        # 4. Silence/Noise Floor
        # Check if silence is "too digital" (low variance in zero crossing).
        zcr = temporal.get("zero_crossing_rate", 0.1)
        # RELAXED: Only mathematical silence (<0.01) is suspicious.
        indicators["unnatural_silence"] = 1.0 if zcr < 0.01 else 0.0
        # 5. Energy Consistency (Peaks) - only counts when very consistent (>0.8).
        peak_consistency = energy.get("peak_consistency", 0.5)
        indicators["energy_consistency"] = peak_consistency if peak_consistency > 0.8 else 0.0
        # --- Aggressive Scoring for Robustness ---
        # We assume if ANY strong indicator is present, chance of AI is high.
        scores = [
            indicators["pitch_regularity"] * 1.2,  # Weight pitch highest
            indicators["low_jitter"] * 1.0,
            indicators["smooth_envelope"] * 0.8,
            indicators["unnatural_silence"] * 0.5,
            indicators["energy_consistency"] * 0.6
        ]
        # Take the MAXIMUM strong signal, not just the average.
        # This catches cases where one feature is a "dead giveaway".
        strongest_signal = max(scores)
        average_signal = sum(scores) / len(scores)
        # Combined score is dominated by the strongest signal, clamped to [0, 1].
        combined_likelihood = (strongest_signal * 0.7) + (average_signal * 0.3)
        indicators["combined_ai_likelihood"] = min(1.0, combined_likelihood)
        return indicators

    def get_explanation_factors(self, forensics: dict, classification: Optional[str] = None) -> list[str]:
        """
        Get human-readable factors that contributed to detection.

        Args:
            forensics: Forensics analysis data (as returned by analyze()).
            classification: The final classification (AI_GENERATED or HUMAN).

        Returns:
            List of detected indicators in plain English (never empty).
        """
        factors = []
        indicators = forensics.get("ai_indicators", {})
        # If classified as AI, always show AI indicators.
        if classification == "AI_GENERATED":
            # Show AI indicators based on what we found.
            if indicators.get("pitch_regularity", 0) > 0.4:
                factors.append("unnaturally consistent pitch patterns")
            if indicators.get("low_jitter", 0) > 0.4:
                factors.append("absence of natural voice micro-variations")
            if indicators.get("energy_consistency", 0) > 0.4:
                factors.append("mechanical energy envelope patterns")
            if indicators.get("smooth_envelope", 0) > 0.4:
                factors.append("artificially smooth amplitude transitions")
            if indicators.get("unnatural_silence", 0) > 0.3:
                factors.append("irregular silence patterns")
            # If no strong indicators but still AI, give generic AI reason.
            if not factors:
                factors.append("subtle synthetic audio artifacts")
        else:  # HUMAN classification
            if forensics["pitch"]["jitter"] > 0.015:
                factors.append("natural pitch variations")
            if forensics["energy"]["envelope_roughness"] > 0.015:
                factors.append("organic voice texture")
            if 0.05 < forensics["temporal"]["silence_ratio"] < 0.25:
                factors.append("natural breathing patterns")
            if not factors:
                factors.append("natural human voice characteristics")
        return factors if factors else ["voice characteristics analyzed"]