# voice-detection-v2 / detector.py
# (Hugging Face upload by testing-ak — commit "Create detector.py", 53bc43f verified)
import torch
import librosa
import numpy as np
import io
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
class VoiceDetector:
    """Classifies audio clips as AI-generated or human speech.

    Wraps the Hugging Face ``MelodyMachine/Deepfake-audio-detection-V2``
    audio-classification model together with its feature extractor.
    """

    def __init__(self):
        """Download/load the detection model and put it in eval mode.

        Raises:
            Exception: re-raised unchanged if model loading fails.
        """
        print("⏳ Loading Robust AI Detection Model...")
        # FIX: Using the verified MelodyMachine model
        self.model_name = "MelodyMachine/Deepfake-audio-detection-V2"
        try:
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(self.model_name)
            self.model = AutoModelForAudioClassification.from_pretrained(self.model_name)
            # Inference mode: disables dropout / batch-norm updates.
            self.model.eval()
            # Print labels to debug (so we know what index 0 and 1 mean).
            print(f"βœ… Model Labels: {self.model.config.id2label}")
        except Exception as e:
            print(f"❌ CRITICAL ERROR: Failed to load AI model. {e}")
            raise e

    def preprocess_audio(self, audio_buffer: io.BytesIO, target_sr: int = 16000) -> np.ndarray:
        """Decode, resample, normalize, and duration-fix an audio clip.

        Args:
            audio_buffer: In-memory encoded audio (any format librosa can read).
            target_sr: Sample rate to resample to (model expects 16 kHz).

        Returns:
            1-D float waveform, peak-normalized, between 1.5 s and 5 s long.

        Raises:
            ValueError: if the buffer decodes to zero samples.
        """
        audio_buffer.seek(0)
        y, sr = librosa.load(audio_buffer, sr=target_sr)
        # FIX: guard empty audio — without this, the tile computation below
        # divides by len(y) == 0 and raises ZeroDivisionError.
        if len(y) == 0:
            raise ValueError("Audio buffer decoded to zero samples.")
        # 1. Normalize volume (crucial for quiet clips).
        y = librosa.util.normalize(y)
        # 2. Fix duration: if too short (< 1.5 s), loop the clip until it
        #    reaches the minimum length the model handles well.
        min_len = target_sr * 1.5
        if len(y) < min_len:
            # FIX: plain int() — no need for the module-level int_ shim.
            tile_factor = int(np.ceil(min_len / len(y)))
            y = np.tile(y, tile_factor)
        # 3. Limit duration: if > 5 s, keep the middle 5 s.
        #    Long files confuse the model logic if not chunked.
        max_len = target_sr * 5
        if len(y) > max_len:
            start = (len(y) - max_len) // 2
            y = y[start : start + max_len]
        return y

    def analyze(self, audio_buffer: io.BytesIO, language: str) -> dict:
        """Run the detector on a clip and return a classification dict.

        Args:
            audio_buffer: In-memory encoded audio clip.
            language: Unused by the model; kept for interface compatibility.

        Returns:
            Dict with keys ``classification`` ("AI_GENERATED" or "HUMAN"),
            ``confidenceScore`` (0.0-1.0, rounded to 2 dp), and
            ``explanation``. On any error, fails open as "HUMAN" with
            score 0.0 and the error text in ``explanation``.
        """
        try:
            # 1. Preprocess to a fixed-rate, bounded-length waveform.
            audio_input = self.preprocess_audio(audio_buffer)
            # 2. Prepare model input.
            inputs = self.feature_extractor(
                audio_input,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True,
            )
            # 3. Inference (no autograd needed).
            with torch.no_grad():
                logits = self.model(**inputs).logits
            # 4. Convert logits to probabilities.
            probs = F.softmax(logits, dim=-1)

            # 5. Map scores by label name; this survives either label order.
            id2label = self.model.config.id2label
            fake_score = 0.0
            real_score = 0.0
            found_fake = False
            found_real = False
            for idx, label in id2label.items():
                label_lower = str(label).lower()
                if "fake" in label_lower or "spoof" in label_lower:
                    fake_score = probs[0][idx].item()
                    found_fake = True
                elif "real" in label_lower or "bonafide" in label_lower:
                    real_score = probs[0][idx].item()
                    found_real = True
            if not (found_fake or found_real):
                # FIX: fallback now keys on "no label name matched" rather than
                # on both scores being exactly 0.0, which could misfire.
                # Generic labels ("LABEL_0"/"LABEL_1"): use the conventional
                # MelodyMachine ordering — index 0 = real, index 1 = fake.
                real_score = probs[0][0].item()
                fake_score = probs[0][1].item()
            print(f"πŸ” DEBUG: Real Score: {real_score:.4f} | Fake Score: {fake_score:.4f}")

            # 6. Decision: whichever class has the higher probability wins.
            if fake_score > real_score:
                classification = "AI_GENERATED"
                confidence = fake_score
                explanation = f"Detected synthetic artifacts with {int(fake_score*100)}% confidence."
            else:
                classification = "HUMAN"
                confidence = real_score
                explanation = f"Verified human vocal characteristics with {int(real_score*100)}% confidence."
            return {
                "classification": classification,
                "confidenceScore": round(confidence, 2),
                "explanation": explanation,
            }
        except Exception as e:
            # NOTE(review): errors fail open as "HUMAN" with score 0.0 —
            # preserved from the original contract; callers can detect
            # failure via the 0.0 score / "Error:" explanation prefix.
            print(f"Analysis Error: {e}")
            return {
                "classification": "HUMAN",
                "confidenceScore": 0.0,
                "explanation": f"Error: {str(e)}",
            }
def int_(val):
    """Coerce *val* (e.g. the numpy scalar returned by ``np.ceil``) to a builtin int."""
    return int(val)