File size: 1,840 Bytes
b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 3fc49d8 b7e88e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import torch
import gradio as gr
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
# 1. CONFIGURATION
# Hub checkpoint used for both the classifier architecture and the
# feature extractor.
MODEL_ID = "facebook/wav2vec2-xls-r-300m"
# Local file whose contents are loaded as the quantized model's state dict.
QUANTIZED_MODEL_PATH = "quantized_model.pth"

# 2. LOAD MODEL
print("Loading model architecture...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# Apply quantization structure: the Linear layers must already be swapped for
# their dynamically quantized int8 counterparts before the state dict below
# is loaded into the model.
model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# Load weights
print("Loading quantized weights...")
# NOTE(review): torch.load unpickles the file -- presumably this checkpoint is
# produced by the project itself; confirm it never comes from an untrusted source.
quantized_state = torch.load(QUANTIZED_MODEL_PATH, map_location="cpu")
model.load_state_dict(quantized_state)
del quantized_state  # do not keep a module-level reference to the loaded state dict
model.eval()
# 3. PREDICTION FUNCTION
def predict_audio(audio_path):
    """Classify an uploaded audio file as real or deepfake speech.

    Args:
        audio_path: Filesystem path of the audio file to analyse, or
            ``None`` when no file was supplied.

    Returns:
        The string ``"No Audio Provided"`` when ``audio_path`` is ``None`` or
        the file decodes to zero samples; otherwise a dict mapping
        ``"Deepfake"`` and ``"Real"`` to their softmax probabilities.
    """
    if audio_path is None:
        return "No Audio Provided"

    # Load and resample. The target rate is read from the feature extractor
    # so the rate used for decoding and the rate declared to the extractor
    # cannot drift apart.
    target_sr = feature_extractor.sampling_rate
    speech_array, _ = librosa.load(audio_path, sr=target_sr)
    if speech_array.size == 0:
        # A zero-length waveform gives the model nothing to classify.
        return "No Audio Provided"

    inputs = feature_extractor(
        speech_array,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Label 0 = Real, Label 1 = Deepfake (Double check your own labels!)
    fake_prob = probs[0][1].item()
    real_prob = probs[0][0].item()
    return {
        "Deepfake": fake_prob,
        "Real": real_prob,
    }
# 4. CREATE INTERFACE (Modified for Upload Only)
# The audio input accepts uploads only and is configured with
# type="filepath"; the output is a Label component showing the top two classes.
iface = gr.Interface(
    fn=predict_audio,
    inputs=gr.Audio(
        sources=["upload"],
        type="filepath",
        label="Upload Audio File",
    ),
    outputs=gr.Label(num_top_classes=2),
    title="Deepfake Audio Detection API",
    description="Upload an audio file (WAV/MP3) to check if it's real or fake.",
)

# Start the Gradio server. The stray trailing "|" that followed this call was
# a dangling operator (a syntax error) and has been removed.
iface.launch()