---
license: mit
datasets:
- LanceaKing/asvspoof2019
language:
- en
metrics:
- accuracy
---
|
|
# DeepVoiceGuard: Real-Time Audio Authenticity Detection |
|
|
|
|
|
**DeepVoiceGuard** is an advanced AI-powered tool for detecting whether an audio file is genuine or AI-generated. Built on a RawNet-based architecture and trained on the ASVspoof 2019 dataset, the model is optimized for real-time inference in ONNX format.
|
|
--- |
|
|
## Features
|
|
- **Real-Time Detection:** Analyze audio files quickly and efficiently to determine authenticity. |
|
|
- **Sliding Window Processing:** Processes long audio files in segments for accurate classification. |
|
|
- **ONNX Optimized:** Faster inference compared to traditional formats. |
|
|
- **Interactive Demo:** Test the model using [our Streamlit application](https://huggingface.co/spaces/Mrkomiljon/DeepVoiceGuard). |
|
|
--- |
|
|
## Model Overview
|
|
- **Architecture:** RawNet-based Neural Network |
|
|
- **Frameworks Used:** PyTorch, ONNX |
|
|
- **Dataset:** Trained on the ASVspoof 2019 Challenge dataset (Logical Access, "LA", partition)
|
|
- **Classes:** |
|
|
  - **Real:** Genuine human speech
  - **Fake:** AI-generated or spoofed audio
|
|
--- |
|
|
## Installation
|
|
Install the necessary dependencies: |
|
|
```bash
pip install onnxruntime librosa numpy requests streamlit
```
|
|
## How to Use

### Using the ONNX Model

```python
|
|
import os
from collections import Counter

import librosa
import numpy as np
import onnxruntime as ort
import requests
import streamlit as st
|
|
|
|
|
# Audio padding function |
|
|
def pad(x, max_len=64600):
    """
    Pad or trim an audio segment to a fixed length by repeating or slicing.

    Segments with at least ``max_len`` samples are truncated. Shorter segments
    are repeated end-to-end and then cut, so no silence is inserted.

    Args:
        x: NumPy array of audio samples (assumed 1-D; length is read from axis 0).
        max_len: Target number of samples.

    Returns:
        NumPy array holding exactly ``max_len`` samples.

    Raises:
        ValueError: If ``x`` is empty and ``max_len`` is positive, because
            there is nothing to repeat.
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]  # Trim if longer
    if x_len == 0:
        # Without this guard the repeat count below would divide by zero.
        raise ValueError("Cannot pad an empty audio segment")
    # Repeat to fill max_len (one extra repeat guarantees enough samples)
    num_repeats = (max_len // x_len) + 1
    padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
    return padded_x
|
|
# Preprocess audio for a single segment |
|
|
def preprocess_audio_segment(segment, cut=64600):
    """
    Turn one raw audio segment into a float32 batch of one.

    The segment is first forced to ``cut`` samples by ``pad`` and is then
    copied to float32 and given a leading batch axis.
    """
    fixed_length = pad(segment, max_len=cut)
    as_float32 = np.array(fixed_length, dtype=np.float32)
    return as_float32[np.newaxis]  # Add batch dimension
|
|
|
|
|
# Download ONNX model from Hugging Face |
|
|
def download_model(url, local_path="RawNet_model.onnx"):
    """
    Download the ONNX model from a URL if it doesn't already exist locally.

    Args:
        url: HTTP(S) address of the ONNX model file.
        local_path: Destination file. If it already exists, nothing is
            downloaded.

    Returns:
        ``local_path``.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        requests.RequestException: On connection errors or timeouts.
    """
    if os.path.exists(local_path):
        return local_path

    with st.spinner("Downloading ONNX model..."):
        # A timeout keeps the app from hanging forever on a stalled connection.
        response = requests.get(url, stream=True, timeout=60)
        if response.status_code != 200:
            response.close()
            raise RuntimeError(
                f"Failed to download ONNX model (HTTP {response.status_code})"
            )
        # Stream into a temporary file and rename it at the end, so an
        # interrupted download never leaves a truncated file that a later
        # call would mistake for a complete model.
        partial_path = f"{local_path}.part"
        with response, open(partial_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
        os.replace(partial_path, local_path)
    st.success("Model downloaded successfully!")
    return local_path
|
|
# Sliding window prediction function |
|
|
def predict_with_sliding_window(audio_path, onnx_model_path, window_size=64600, step_size=64600, sample_rate=16000):
    """
    Use a sliding window to predict if the audio is real or fake over the entire audio.

    The file is loaded at ``sample_rate``, cut into windows of ``window_size``
    samples taken every ``step_size`` samples, and each window is classified
    independently by the ONNX model.

    Args:
        audio_path: Path of the audio file to analyse.
        onnx_model_path: Path of the ONNX model file.
        window_size: Number of samples per window.
        step_size: Number of samples between consecutive window starts.
        sample_rate: Sampling rate the audio is loaded at.

    Returns:
        Tuple ``(majority_class, avg_probability)``. ``majority_class`` is the
        label ("Real" or "Fake") predicted for the most windows; on a tie the
        label that appeared first wins. ``avg_probability`` is the mean, over
        all windows, of each window's winning-class probability, as a
        percentage.

    Raises:
        ValueError: If the audio file contains no samples.
    """
    # Load ONNX runtime session
    ort_session = ort.InferenceSession(onnx_model_path)
    input_name = ort_session.get_inputs()[0].name  # Loop-invariant, look it up once

    # Load audio file
    waveform, _ = librosa.load(audio_path, sr=sample_rate)
    if len(waveform) == 0:
        # Fail early with a clear message instead of an opaque error at the vote.
        raise ValueError(f"No audio samples found in {audio_path!r}")

    total_segments = []
    total_probabilities = []

    # Sliding window processing
    for start in range(0, len(waveform), step_size):
        segment = waveform[start:start + window_size]

        # Preprocess the segment
        # NOTE: the segment is padded/trimmed to preprocess_audio_segment's
        # default length, independent of window_size.
        audio_tensor = preprocess_audio_segment(segment)

        # Perform inference
        outputs = ort_session.run(None, {input_name: audio_tensor})
        # Assumes the model emits log-probabilities, so exp() recovers
        # probabilities -- TODO confirm against the exported model.
        probabilities = np.exp(outputs[0])
        prediction = np.argmax(probabilities)

        # Store the results
        predicted_class = "Real" if prediction == 1 else "Fake"
        total_segments.append(predicted_class)
        total_probabilities.append(probabilities[0][prediction])

    # Final aggregation: majority voting. Counter orders equal counts by first
    # appearance, so ties are resolved deterministically rather than by set
    # iteration order.
    majority_class = Counter(total_segments).most_common(1)[0][0]
    avg_probability = np.mean(total_probabilities) * 100  # Average probability in percentage

    return majority_class, avg_probability
|
|
|
|
|
# Example |
|
|
# Assumes the model file is already present as "RawNet_model.onnx"
# (the default destination of download_model).
label, confidence = predict_with_sliding_window("example.wav", "RawNet_model.onnx")
print(f"Prediction: {label} ({confidence:.2f}% confidence)")
|
|
```
|
|
## Performance Metrics

- **Equal Error Rate (EER):** 4.21%
- **Accuracy:** 95.8%
- **ROC-AUC:** 0.986
|
|
|
|
|
## License
|
|
This project is licensed under the MIT License. |
|
|
|
|
|
## Contact
|
|
For inquiries or support, please contact: |
|
|
|
|
|
- GitHub: [Mrkomiljon](https://github.com/Mrkomiljon/DeepVoiceGuard) |
|
|
- Hugging Face: [DeepVoiceGuard](https://huggingface.co/spaces/Mrkomiljon/DeepVoiceGuard) |
|
|
|