Spaces:

arshan123
/

voice-detection-api

Sleeping

App Files Files Community

voice-detection-api / detector.py

arshan123

Added updates

e66cd72 about 1 month ago

raw

history blame contribute delete

34.3 kB

	import torch
	import librosa
	import numpy as np
	import scipy.stats as stats
	import torch.nn.functional as F
	from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration
	from concurrent.futures import ThreadPoolExecutor, as_completed
	import base64
	import io
	import tempfile
	import os
	import soundfile as sf
	import warnings

	# Suppress librosa warnings
	warnings.filterwarnings('ignore')

	class HybridEnsembleDetector:
	"""
	Hybrid AI Voice Detection System with Language Detection

	Features:
	1. Physics-based acoustic analysis
	2. Deep Learning deepfake detection
	3. Language identification using Whisper (focus on Indian languages)
	4. Auto-truncation to 30 seconds for faster processing
	"""

	def __init__(
	self,
	deepfake_model_path="garystafford/wav2vec2-deepfake-voice-detector",
	whisper_model_path="openai/whisper-base",
	physics_weight=0.4,
	dl_weight=0.6,
	use_local_deepfake_model=False,
	use_local_whisper_model=False,
	max_audio_duration=30, # seconds
	load_whisper=True,
	):
	"""
	Initialize the hybrid detector

	Args:
	deepfake_model_path: Path to deepfake detection model
	whisper_model_path: Path to Whisper model for language detection
	physics_weight: Weight for physics score (0-1)
	dl_weight: Weight for DL score (0-1)
	use_local_deepfake_model: Whether to load deepfake model from local path
	use_local_whisper_model: Whether to load Whisper from local path
	max_audio_duration: Maximum audio duration to process (seconds)
	load_whisper: If False, skip loading Whisper (saves GPU memory when language detection is not used)
	"""
	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# float16 on GPU (e.g. T4) = faster inference + less memory; no benefit on CPU
	self.torch_dtype = torch.float16 if self.device.type == "cuda" else torch.float32
	self.max_duration = max_audio_duration

	# Normalize weights
	total_weight = physics_weight + dl_weight
	self.physics_weight = physics_weight / total_weight
	self.dl_weight = dl_weight / total_weight

	print(f"🔧 Initializing Hybrid Detector with Language Detection")
	print(f" Device: {self.device} (dtype: {self.torch_dtype})")
	print(f" Physics Weight: {self.physics_weight*100:.0f}%")
	print(f" DL Weight: {self.dl_weight*100:.0f}%")
	print(f" Max Audio Duration: {self.max_duration}s")

	# --- LOAD DEEPFAKE DETECTION MODEL ---
	try:
	print(f"📥 Loading deepfake detection model from '{deepfake_model_path}'...")

	if use_local_deepfake_model:
	self.dl_model = AutoModelForAudioClassification.from_pretrained(
	deepfake_model_path,
	local_files_only=True,
	torch_dtype=self.torch_dtype,
	)
	self.feature_extractor = AutoFeatureExtractor.from_pretrained(
	deepfake_model_path,
	local_files_only=True,
	)
	else:
	self.dl_model = AutoModelForAudioClassification.from_pretrained(
	deepfake_model_path,
	torch_dtype=self.torch_dtype,
	)
	self.feature_extractor = AutoFeatureExtractor.from_pretrained(deepfake_model_path)

	self.dl_model.to(self.device)
	self.dl_model.eval()
	# torch.compile() not used: inductor backend requires a C++ compiler (g++), which
	# is often missing in minimal Docker/HF Spaces images and causes InvalidCxxCompiler at first run.
	self.dl_ready = True
	print("✅ Deepfake Detection Model Loaded")

	except Exception as e:
	print(f"⚠️ DL Model Load Failed: {e}")
	print(" Running in Physics-Only mode")
	self.dl_ready = False
	self.dl_weight = 0
	self.physics_weight = 1.0

	# --- LOAD WHISPER FOR LANGUAGE DETECTION (optional; skip if API does not use detect_language) ---
	self.lang_ready = False
	self.whisper_model = None
	self.whisper_processor = None
	self.language_map = {}
	if load_whisper:
	try:
	print(f"📥 Loading Whisper model for language detection from '{whisper_model_path}'...")

	if use_local_whisper_model:
	self.whisper_processor = WhisperProcessor.from_pretrained(
	whisper_model_path,
	local_files_only=True,
	)
	self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
	whisper_model_path,
	local_files_only=True,
	torch_dtype=self.torch_dtype,
	)
	else:
	self.whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
	self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
	whisper_model_path,
	torch_dtype=self.torch_dtype,
	)

	self.whisper_model.to(self.device)
	self.whisper_model.eval()
	self.lang_ready = True
	print("✅ Whisper Language Detection Model Loaded")

	# Language code mapping for Indian languages and common languages
	self.language_map = {
	'hi': 'Hindi',
	'bn': 'Bengali',
	'te': 'Telugu',
	'mr': 'Marathi',
	'ta': 'Tamil',
	'gu': 'Gujarati',
	'kn': 'Kannada',
	'ml': 'Malayalam',
	'or': 'Odia',
	'pa': 'Punjabi',
	'as': 'Assamese',
	'ur': 'Urdu',
	'en': 'English',
	'ne': 'Nepali',
	'si': 'Sinhala',
	'sa': 'Sanskrit',
	'sd': 'Sindhi',
	'ks': 'Kashmiri'
	}

	except Exception as e:
	print(f"⚠️ Whisper Model Load Failed: {e}")
	print(" Running without language detection")
	self.lang_ready = False
	else:
	print(" Skipping Whisper (load_whisper=False)")

	# --- PHYSICS ENGINE PARAMETERS ---
	self.CV_AI_THRESHOLD = 0.20
	self.CV_HUMAN_THRESHOLD = 0.32
	self.INTENSITY_MIN_STD = 0.05
	self.INTENSITY_MAX_STD = 0.15
	# Analyze only middle N seconds for physics (reduces latency ~60% vs full 30s)
	self.physics_analysis_duration = 8 # seconds; middle segment where voice is most stable

	# --- CHUNKING (MAX-POOLING) FOR LONG AUDIO ---
	self.chunk_duration_sec = 5
	self.max_chunks = 3 # If any chunk is AI, whole file is AI

	# --- CLASSIFICATION THRESHOLD (language-specific) ---
	self.default_ai_threshold = 0.55 # Standard
	self.tamil_ai_threshold = 0.45 # Harder to detect, be aggressive
	self.english_ai_threshold = 0.65 # High-quality human audio common, be conservative

	print("✅ Hybrid Detector Ready\n")

	# ==========================================================
	# HELPER: Audio Preprocessing
	# ==========================================================
	def preprocess_audio(self, audio_path, target_sr=16000):
	"""
	Load and preprocess audio:
	1. Load audio
	2. Convert to mono
	3. Truncate to max_duration if needed
	4. Resample to target_sr

	Args:
	audio_path: Path to audio file
	target_sr: Target sample rate

	Returns:
	tuple: (waveform_array, sample_rate, duration, was_truncated)
	"""
	try:
	# Load audio
	y, sr = librosa.load(audio_path, sr=None, mono=True)

	# Calculate duration
	duration = len(y) / sr
	was_truncated = False

	# Truncate if longer than max_duration
	if duration > self.max_duration:
	print(f" ⚠️ Audio is {duration:.1f}s, truncating to {self.max_duration}s")
	max_samples = int(self.max_duration * sr)
	y = y[:max_samples]
	duration = self.max_duration
	was_truncated = True

	# Resample if needed
	if sr != target_sr:
	y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
	sr = target_sr

	return y, sr, duration, was_truncated

	except Exception as e:
	raise ValueError(f"Failed to preprocess audio: {str(e)}")

	def _load_audio_once(self, audio_path):
	"""
	Load audio once at native sample rate and truncate to max_duration.
	Used to avoid loading the same file twice in physics + DL branches.
	Returns:
	tuple: (y, sr, duration, was_truncated)
	"""
	y, sr = librosa.load(audio_path, sr=None, mono=True)
	duration = len(y) / sr
	was_truncated = False
	if duration > self.max_duration:
	max_samples = int(self.max_duration * sr)
	y = y[:max_samples]
	duration = self.max_duration
	was_truncated = True
	return y, sr, duration, was_truncated

	def _chunk_audio(self, y, sr, duration):
	"""
	Split audio into fixed-duration chunks for max-pooling.
	Returns list of (y_chunk, sr, duration_chunk, was_truncated) for preloaded.
	"""
	chunks = []
	chunk_len = int(self.chunk_duration_sec * sr)
	min_chunk_samples = int(1.0 * sr) # skip chunks shorter than 1s
	for i in range(self.max_chunks):
	start = i * chunk_len
	if start >= len(y):
	break
	end = min(start + chunk_len, len(y))
	y_chunk = y[start:end]
	if len(y_chunk) < min_chunk_samples:
	break
	dur_chunk = len(y_chunk) / sr
	chunks.append((y_chunk.copy(), sr, dur_chunk, False))
	return chunks

	# ==========================================================
	# HELPER: Base64 Decoding
	# ==========================================================
	def decode_base64_audio(self, base64_string):
	"""
	Decode base64 audio and save to temporary file

	Args:
	base64_string: Base64 encoded audio data

	Returns:
	str: Path to temporary audio file
	"""
	try:
	# Decode base64
	audio_data = base64.b64decode(base64_string)

	# Create temporary file
	temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
	temp_file.write(audio_data)
	temp_file.close()

	return temp_file.name

	except Exception as e:
	raise ValueError(f"Failed to decode base64 audio: {str(e)}")

	# ==========================================================
	# LANGUAGE DETECTION
	# ==========================================================
	def detect_language(self, audio_path):
	"""
	Detect language using Whisper model

	Args:
	audio_path: Path to audio file

	Returns:
	str: Detected language name
	"""
	if not self.lang_ready:
	return "Unknown"

	try:
	# Load and preprocess audio for Whisper (uses 16kHz)
	# Use first 30 seconds for language detection
	audio, sr = librosa.load(audio_path, sr=16000, mono=True, duration=30)

	# Process audio with Whisper processor
	input_features = self.whisper_processor(
	audio,
	sampling_rate=16000,
	return_tensors="pt"
	).input_features

	input_features = input_features.to(device=self.device, dtype=self.torch_dtype)

	# Whisper language detection using forced_decoder_ids
	with torch.inference_mode():
	# Generate with language detection enabled
	generated_ids = self.whisper_model.generate(
	input_features,
	task="transcribe",
	return_dict_in_generate=True
	)

	# Decode the output
	full_output = self.whisper_processor.batch_decode(
	generated_ids.sequences,
	skip_special_tokens=False
	)[0]

	# Parse language from special tokens
	# Format: <\|startoftranscript\|><\|en\|><\|transcribe\|>...
	detected_lang = None

	# Look for language tokens in the format <\|xx\|>
	import re
	lang_pattern = r'<\\|([a-z]{2})\\|>'
	matches = re.findall(lang_pattern, full_output)

	if matches:
	# First match after startoftranscript is usually the language
	for match in matches:
	if match in self.language_map:
	detected_lang = match
	break

	if detected_lang:
	lang_name = self.language_map.get(detected_lang, detected_lang.upper())
	print(f" 🌐 Detected Language: {lang_name} ({detected_lang})")
	return lang_name
	else:
	# Fallback: if transcription successful, assume English
	transcription = self.whisper_processor.batch_decode(
	generated_ids.sequences,
	skip_special_tokens=True
	)[0]

	if len(transcription.strip()) > 0:
	print(f" 🌐 Detected Language: English (default)")
	return "English"
	else:
	return "Unknown"

	except Exception as e:
	print(f" ⚠️ Language detection error: {str(e)}")
	return "Unknown"

	# ==========================================================
	# PART A: PHYSICS ENGINE (FIXED)
	# ==========================================================
	def get_linear_score(self, val, min_val, max_val):
	"""Linear interpolation for scoring"""
	if val <= min_val:
	return 1.0
	if val >= max_val:
	return 0.0
	return 1.0 - ((val - min_val) / (max_val - min_val))

	def get_physics_score(self, audio_path, preloaded=None):
	"""
	Analyze audio using physics-based acoustic features.
	If preloaded is provided, use (y, sr, duration, was_truncated) instead of loading from path.

	Returns:
	tuple: (ai_score, method, features_dict)
	"""
	try:
	if preloaded is not None:
	y, sr, duration, was_truncated = preloaded
	else:
	y, sr, duration, was_truncated = self._load_audio_once(audio_path)

	# Use middle segment only for physics (faster; voice most stable in middle)
	seg_dur = min(self.physics_analysis_duration, duration)
	half = (duration - seg_dur) / 2.0
	start_samp = int(half * sr)
	end_samp = int((half + seg_dur) * sr)
	y_physics = y[start_samp:end_samp]

	# VAD: trim silence so physics doesn't see "perfect stability" from empty segments
	y_physics_trimmed, _ = librosa.effects.trim(y_physics, top_db=20)
	if len(y_physics_trimmed) >= int(0.5 * sr): # Keep at least 0.5s of audio
	y_physics = y_physics_trimmed

	print(f" 🔬 Running physics analysis on middle {seg_dur:.1f}s of {duration:.1f}s audio at {sr}Hz (VAD trimmed)")

	# Robust pitch tracking using pYIN (filters silence/noise; yin blindly estimates pitch there)
	try:
	f0, _, _ = librosa.pyin(
	y_physics,
	fmin=librosa.note_to_hz('C2'), # ~65 Hz
	fmax=librosa.note_to_hz('C7'), # ~2093 Hz
	sr=sr,
	frame_length=2048,
	)
	valid_f0 = f0[~np.isnan(f0)] if f0 is not None else np.array([])
	except Exception as pitch_error:
	print(f" ⚠️ Pitch detection failed: {pitch_error}, using fallback method")
	valid_f0 = np.array([])

	if len(valid_f0) < 10: # Need at least 10 valid pitch points
	print(f" ⚠️ Insufficient pitch data ({len(valid_f0)} points), using alternative features")
	# Fall back to non-pitch features (on segment)
	rms = librosa.feature.rms(y=y_physics)[0]
	centroid = librosa.feature.spectral_centroid(y=y_physics, sr=sr)[0]
	zcr = librosa.feature.zero_crossing_rate(y_physics)[0]

	feats = {
	'pitch_cv': 0.25, # Neutral value
	'intensity_std': np.std(rms),
	'freq_skew': stats.skew(centroid),
	'zcr_std': np.std(zcr),
	'mean_pitch': 0,
	'std_pitch': 0,
	'duration': duration,
	'was_truncated': was_truncated
	}

	# Score based on available features
	intensity_score = self.get_linear_score(
	feats['intensity_std'],
	self.INTENSITY_MIN_STD,
	self.INTENSITY_MAX_STD
	)

	zcr_score = self.get_linear_score(
	feats['zcr_std'],
	0.01,
	0.08
	)

	skew_score = self.get_linear_score(
	abs(feats['freq_skew']),
	0.1,
	1.0
	)

	# Weighted combination (no pitch)
	final_score = (intensity_score * 0.5 + zcr_score * 0.2 + skew_score * 0.3)

	print(f" 🔬 Physics score (no pitch): {final_score:.3f}")
	return round(final_score, 3), "Physics Analysis (Limited)", feats

	# Full analysis with pitch (on segment)
	rms = librosa.feature.rms(y=y_physics)[0]
	centroid = librosa.feature.spectral_centroid(y=y_physics, sr=sr)[0]

	mean_pitch = np.mean(valid_f0)
	std_pitch = np.std(valid_f0)

	# Calculate feature metrics
	feats = {
	'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
	'intensity_std': np.std(rms),
	'freq_skew': stats.skew(centroid),
	'mean_pitch': mean_pitch,
	'std_pitch': std_pitch,
	'duration': duration,
	'was_truncated': was_truncated
	}

	# Individual feature scores (higher = more AI-like)
	intensity_score = self.get_linear_score(
	feats['intensity_std'],
	self.INTENSITY_MIN_STD,
	self.INTENSITY_MAX_STD
	)

	pitch_score = self.get_linear_score(
	feats['pitch_cv'],
	self.CV_AI_THRESHOLD,
	self.CV_HUMAN_THRESHOLD
	)

	skew_score = self.get_linear_score(
	abs(feats['freq_skew']),
	0.1,
	1.0
	)

	# Weighted combination
	W_INTENSITY = 0.40
	W_PITCH = 0.40
	W_SKEW = 0.20

	base_score = (
	intensity_score * W_INTENSITY +
	pitch_score * W_PITCH +
	skew_score * W_SKEW
	)

	# Synergy bonus: if both intensity and pitch are suspicious
	if intensity_score > 0.4 and pitch_score > 0.4:
	final_score = min(base_score + 0.15, 1.0)
	else:
	final_score = base_score

	print(f" 🔬 Physics score: {final_score:.3f} (intensity:{intensity_score:.2f}, pitch:{pitch_score:.2f})")
	return round(final_score, 3), "Physics Analysis", feats

	except Exception as e:
	print(f" ❌ Physics analysis failed: {str(e)}")
	import traceback
	traceback.print_exc()
	return 0.0, f"Physics Error: {str(e)}", {'duration': 0, 'was_truncated': False}

	# ==========================================================
	# PART B: DEEP LEARNING ENGINE
	# ==========================================================
	def get_dl_score(self, audio_path, preloaded=None):
	"""
	Analyze audio using deep learning model.
	If preloaded is provided, use (y, sr, duration, was_truncated) and resample to 16kHz for the model.

	Returns:
	tuple: (ai_score, label)
	"""
	if not self.dl_ready:
	return 0.0, "Model not loaded"

	try:
	if preloaded is not None:
	y, sr, duration, was_truncated = preloaded
	if sr != 16000:
	y = librosa.resample(y, orig_sr=sr, target_sr=16000)
	waveform_np = y
	sr = 16000
	else:
	waveform_np, sr, duration, was_truncated = self.preprocess_audio(audio_path, target_sr=16000)

	# Process with feature extractor
	inputs = self.feature_extractor(
	waveform_np,
	sampling_rate=16000,
	return_tensors="pt",
	padding=True
	)

	# Move to device and match model dtype (important for float16 on GPU)
	inputs = {k: v.to(device=self.device, dtype=self.torch_dtype) for k, v in inputs.items()}

	# Run inference
	with torch.inference_mode():
	outputs = self.dl_model(**inputs)
	logits = outputs.logits
	probs = F.softmax(logits, dim=-1)

	# Get predictions
	# Class 0: Real, Class 1: Fake
	prob_real = probs[0][0].item()
	prob_fake = probs[0][1].item()

	# AI score is the fake probability
	ai_score = prob_fake

	label = "Fake/Deepfake" if prob_fake > 0.5 else "Real/Human"
	print(f"Confidence score generated by DL model:${ai_score}, label by DL model: ${label}")

	return round(ai_score, 3), label

	except Exception as e:
	print(f" ❌ DL analysis failed: {str(e)}")
	return 0.0, f"DL Error: {str(e)}"

	# ==========================================================
	# PART C: EXPLANATION GENERATOR
	# ==========================================================
	def generate_explanation(self, final_score, dl_score, dl_label,phys_score, phys_feats):
	"""
	Generate human-readable explanation for the classification

	Returns:
	str: Explanation text
	"""
	explanations = []

	if final_score > 0.55:
	# AI GENERATED

	# Deep Learning contributions
	if dl_score > 0.55 and self.dl_ready:
	if "Fake" in dl_label or "Deepfake" in dl_label:
	explanations.append(
	f"Deep learning model detected synthetic voice patterns "
	f"(confidence: {dl_score*100:.1f}%)"
	)

	# Physics contributions
	if phys_score > 0.55:
	p_cv = phys_feats.get('pitch_cv', 0)
	i_std = phys_feats.get('intensity_std', 0)

	if i_std < 0.06:
	explanations.append(
	f"Unnaturally consistent energy levels detected "
	f"(std: {i_std:.3f}, expected: >0.06)"
	)

	if p_cv < 0.22 and p_cv > 0:
	explanations.append(
	f"Robotic pitch modulation patterns "
	f"(CV: {p_cv:.2f}, expected: >0.22)"
	)

	if not explanations or (i_std >= 0.06 and p_cv >= 0.22):
	explanations.append(
	"Acoustic parameters lack natural human variability"
	)

	if not explanations:
	explanations.append(
	"Voice exhibits characteristics consistent with AI generation"
	)

	else:
	# HUMAN
	explanations.append(
	"Voice exhibits natural acoustic variability and human speech characteristics"
	)

	return "; ".join(explanations)

	# ==========================================================
	# PART D: MAIN ANALYSIS FUNCTION
	# ==========================================================
	def analyze(self, audio_input, input_type="file"):
	"""
	Main analysis function with configurable input types

	Args:
	audio_input: Either file path or base64 string
	input_type: "file" or "base64"

	Returns:
	dict: Analysis results following API response format
	"""
	temp_file = None

	try:
	# Handle input type
	if input_type == "base64":
	temp_file = self.decode_base64_audio(audio_input)
	audio_path = temp_file
	elif input_type == "file":
	audio_path = audio_input
	if not os.path.exists(audio_path):
	return {
	"status": "error",
	"error": f"Audio file not found: {audio_path}"
	}
	else:
	return {
	"status": "error",
	"error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
	}

	print(f"🎵 Analyzing: {os.path.basename(audio_path)}")

	# 1. Detect language for language-specific threshold
	language = self.detect_language(audio_path)
	if language == "Tamil":
	threshold = self.tamil_ai_threshold
	print(f" 📐 Using Tamil-specific AI threshold: {threshold}")
	elif language == "English":
	threshold = self.english_ai_threshold
	print(f" 📐 Using English-specific AI threshold: {threshold} (conservative)")
	else:
	threshold = self.default_ai_threshold

	# 2. Load audio once and split into chunks for max-pooling
	preloaded = self._load_audio_once(audio_path)
	y, sr, duration, was_truncated = preloaded
	chunks = self._chunk_audio(y, sr, duration)
	if not chunks:
	chunks = [preloaded] # Very short audio: single "chunk" = full audio

	# 3. Run Physics and DL per chunk; take max score (if any chunk is AI → whole file AI)
	best_score = -1.0
	best_phys_score, best_phys_method, best_phys_feats = 0.0, "", {}
	best_dl_score, best_dl_label = 0.0, ""
	any_chunk_studio_mode = False # True if ANY chunk has DL<0.10 and Physics>0.70

	for idx, preloaded_chunk in enumerate(chunks):
	phys_score, phys_method, phys_feats = 0.0, "", {}
	dl_score, dl_label = 0.0, ""
	with ThreadPoolExecutor(max_workers=2) as executor:
	future_phys = executor.submit(self.get_physics_score, None, preloaded_chunk)
	future_dl = executor.submit(self.get_dl_score, None, preloaded_chunk)
	for future in as_completed([future_phys, future_dl]):
	try:
	result = future.result()
	if len(result) == 3:
	phys_score, phys_method, phys_feats = result
	else:
	dl_score, dl_label = result
	except Exception as e:
	print(f" ⚠️ Chunk {idx+1} task error: {e}")
	if future is future_phys:
	phys_score, phys_method, phys_feats = 0.0, f"Error: {e}", {'duration': 0, 'was_truncated': False}
	else:
	dl_score, dl_label = 0.0, f"DL Error: {e}"

	if dl_score < 0.10 and phys_score > 0.70:
	any_chunk_studio_mode = True

	chunk_score = self.physics_weight * phys_score + self.dl_weight * dl_score
	if chunk_score > best_score:
	best_score = chunk_score
	best_phys_score, best_phys_method, best_phys_feats = phys_score, phys_method, phys_feats
	best_dl_score, best_dl_label = dl_score, dl_label
	print(f" Chunk {idx+1}/{len(chunks)} score: {chunk_score:.3f} (phys={phys_score:.2f}, dl={dl_score:.2f})")

	# Defensive: if all chunks failed, treat as HUMAN with low confidence
	if best_score < 0:
	print(f" ⚠️ No valid chunk scores; defaulting to HUMAN (confidence 0.0)")
	best_score = 0.0

	final_score = round(best_score, 2)
	phys_score, phys_feats = best_phys_score, best_phys_feats
	dl_score, dl_label = best_dl_score, best_dl_label

	# 4. Classification with language-specific threshold
	classification = "AI_GENERATED" if final_score > threshold else "HUMAN"

	# 4b. Studio Mode (best chunk): DL says human, Physics says "too clean" → studio recording
	if dl_score < 0.10 and phys_score > 0.70:
	classification = "HUMAN"
	print(f" 🎙️ Studio Mode (best chunk): DL={dl_score:.2f} (human) + Physics={phys_score:.2f} (clean) → HUMAN")

	# 4c. File-level Studio Mode: if ANY chunk was studio-like, don't call whole file AI (fixes false positives)
	if any_chunk_studio_mode and classification == "AI_GENERATED":
	classification = "HUMAN"
	print(f" 🎙️ Studio Mode (file-level): at least one chunk studio-like → HUMAN")

	# 5. Generate explanation (using best chunk's scores/feats)
	explanation = self.generate_explanation(
	final_score,
	dl_score,
	dl_label,
	phys_score,
	phys_feats
	)

	# 6. Return API-compliant response
	return {
	"status": "success",
	"classification": classification,
	"confidenceScore": float(final_score),
	"explanation": explanation,
	"debug": {
	"physics_score": float(phys_score),
	"dl_score": float(dl_score),
	"dl_label": dl_label,
	"physics_weight": f"{self.physics_weight*100:.0f}%",
	"dl_weight": f"{self.dl_weight*100:.0f}%",
	"audio_duration": float(phys_feats.get('duration', 0)),
	"was_truncated": bool(phys_feats.get('was_truncated', False)),
	"chunks_used": len(chunks),
	"ai_threshold_used": float(threshold),
	"language": language,
	"physics_features": {k: float(v) if isinstance(v, (np.floating, np.integer)) else v
	for k, v in phys_feats.items()
	if k not in ['duration', 'was_truncated']}
	}
	}

	except Exception as e:
	import traceback
	return {
	"status": "error",
	"error": str(e),
	"traceback": traceback.format_exc()
	}

	finally:
	# Clean up temporary file
	if temp_file and os.path.exists(temp_file):
	try:
	os.unlink(temp_file)
	except:
	pass

	# ==========================================================
	# UTILITY: Update Weights
	# ==========================================================
	def update_weights(self, physics_weight, dl_weight):
	"""
	Update ensemble weights dynamically

	Args:
	physics_weight: New physics weight (0-1)
	dl_weight: New DL weight (0-1)
	"""
	total = physics_weight + dl_weight
	self.physics_weight = physics_weight / total
	self.dl_weight = dl_weight / total

	print(f"⚙️ Weights updated:")
	print(f" Physics: {self.physics_weight*100:.0f}%")
	print(f" DL: {self.dl_weight*100:.0f}%")