Audio Classification
English
Audio
Classification
File size: 4,802 Bytes
55cf66a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
import librosa
import tensorflow as tf
import streamlit as st
import sounddevice as sd
import wave
import os

# Constants
# Module-level settings read by the feature-extraction functions in this file.
window_length = 0.02  # Analysis window in seconds (20 ms); turned into samples via int(window_length * sr)
hop_length = 0.0025  # Hop between windows in seconds (2.5 ms); turned into samples via int(hop_length * sr)
sample_rate = 22050  # Nominal sample rate in Hz. NOTE(review): not referenced in the visible code (audio is loaded with sr=None) — confirm it is still needed
n_mels = 128  # Number of mel filter banks passed to librosa.feature.melspectrogram
threshold_zcr = 0.1  # A frame is flagged as breath only if its zero-crossing rate exceeds this...
threshold_rmse = 0.1  # ...and its RMS energy exceeds this (both conditions must hold)
max_len = 500  # Every feature is padded/truncated to this many entries along the frame axis

# Load TFLite model
# The interpreter is built at module level, so this runs every time the script
# body executes. The model path is relative to the current working directory.
# NOTE(review): Streamlit presumably re-executes the script on each interaction,
# which would reload the model each time — confirm and consider caching.
interpreter = tf.lite.Interpreter(model_path="model_breath_logspec_mfcc_cnn.tflite")
# Tensors must be allocated before any set_tensor()/invoke() call.
interpreter.allocate_tensors()

# Get input and output details
# predict_audio() uses the 'index' entry of the first input/output descriptor
# to address the tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Function to extract breath features
def extract_breath_features(y, sr):
    """Flag, frame by frame, where the signal looks like a breath.

    A frame is flagged when its zero-crossing rate is above ``threshold_zcr``
    AND its RMS energy is above ``threshold_rmse``.

    Args:
        y: Audio samples, handed to librosa unchanged.
        sr: Sample rate of ``y`` in Hz; used to convert the module-level
            ``window_length`` and ``hop_length`` (seconds) into sample counts.

    Returns:
        A flat integer array holding 1 for flagged frames and 0 otherwise.
    """
    # Seconds -> samples; int() truncates any fractional sample.
    framing = {
        "frame_length": int(window_length * sr),
        "hop_length": int(hop_length * sr),
    }

    crossing_rate = librosa.feature.zero_crossing_rate(y=y, **framing)
    energy = librosa.feature.rms(y=y, **framing)

    # Collapse both librosa outputs into flat per-frame series.
    crossing_rate = crossing_rate.T.flatten()
    energy = energy.T.flatten()

    is_breath = np.logical_and(crossing_rate > threshold_zcr, energy > threshold_rmse)
    return np.where(is_breath, 1, 0)

# Feature extraction
def extract_features(file_path):
    """Load an audio file and build the stacked feature matrix for the model.

    Three features are computed, each padded/truncated to ``max_len`` frames,
    and stacked row-wise: 13 MFCCs, an ``n_mels``-band mel spectrogram passed
    through ``librosa.amplitude_to_db``, and the one-row breath indicator from
    ``extract_breath_features``.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        The stacked feature array, or ``None`` if any step raised (the error
        is reported to the user through ``st.error``).
    """
    try:
        # sr=None keeps the file's own sample rate instead of resampling.
        signal, native_sr = librosa.load(file_path, sr=None)

        coefficients = librosa.feature.mfcc(y=signal, sr=native_sr, n_mfcc=13)
        mel_spec = librosa.feature.melspectrogram(y=signal, sr=native_sr, n_mels=n_mels)
        log_mel = librosa.amplitude_to_db(mel_spec)
        breath_flags = extract_breath_features(signal, native_sr)

        # Bring every feature to the same number of frames so they can be stacked.
        rows = [
            librosa.util.fix_length(coefficients, size=max_len, axis=1),
            librosa.util.fix_length(log_mel, size=max_len, axis=1),
            librosa.util.fix_length(breath_flags, size=max_len),
        ]
        return np.vstack(rows)
    except Exception as e:
        st.error(f"Error processing audio: {e}")
        return None

# Prepare input for model
def prepare_single_data(features):
    """Shape one feature matrix into a float32 batch for the interpreter.

    Args:
        features: Feature array whose axis 1 is the frame axis.

    Returns:
        A float32 copy with axis 1 padded/truncated to ``max_len`` and a new
        leading (batch) and trailing (channel) axis of size 1.
    """
    fixed = librosa.util.fix_length(features, size=max_len, axis=1)
    # Same as indexing with [np.newaxis, ..., np.newaxis].
    batched = np.expand_dims(fixed, axis=(0, -1))
    return batched.astype(np.float32)

# Predict audio class
def predict_audio(file_path):
    """Classify one audio file with the module-level TFLite interpreter.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        ``(predicted_class, probabilities)`` where ``predicted_class`` is the
        argmax of the first output row and ``probabilities`` is that row, or
        ``(None, None)`` when feature extraction failed.
    """
    features = extract_features(file_path)
    if features is None:
        return None, None

    model_input = prepare_single_data(features)
    interpreter.set_tensor(input_details[0]['index'], model_input)
    interpreter.invoke()
    scores = interpreter.get_tensor(output_details[0]['index'])

    # One sample per batch, so only row 0 is of interest.
    return np.argmax(scores, axis=1)[0], scores[0]

# Record audio function
def record_audio(duration=5, samplerate=22050):
    """Capture mono 16-bit audio through sounddevice.

    Status messages are shown in the Streamlit page before and after the
    capture.

    Args:
        duration: Length of the recording in seconds.
        samplerate: Sample rate in Hz.

    Returns:
        ``(audio_data, samplerate)``: the array returned by ``sd.rec`` and the
        sample rate it was captured at.
    """
    st.info(f"🎤 Recording for {duration} seconds...")
    frame_count = int(duration * samplerate)
    recording = sd.rec(frame_count, samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()  # wait for the capture to finish before reporting success
    st.success("✅ Recording Complete!")
    return recording, samplerate

# Save recorded audio as .wav
def save_wav(file_path, audio_data, samplerate):
    """Write audio samples to ``file_path`` as an uncompressed PCM WAV file.

    The WAV header is derived from the data instead of being hard-coded to
    16-bit mono, so the header always matches the bytes that are written.

    Args:
        file_path: Destination path; an existing file is overwritten.
        audio_data: Array-like of samples, either 1-D (mono) or 2-D shaped
            ``(frames, channels)``. Integer arrays are written at their own
            width; floating-point arrays are treated as normalised to
            [-1.0, 1.0] and converted to 16-bit PCM.
        samplerate: Sample rate in Hz.

    Raises:
        ValueError: If the samples are neither floating-point nor an integer
            type of at most 4 bytes (the widest sample ``wave`` accepts).
    """
    samples = np.asarray(audio_data)

    if np.issubdtype(samples.dtype, np.floating):
        # Writing raw float bytes under a 16-bit header would produce noise,
        # so scale to int16. NaN becomes silence and out-of-range values
        # saturate instead of wrapping around.
        scaled = np.clip(np.nan_to_num(samples), -1.0, 1.0) * 32767.0
        samples = np.rint(scaled).astype(np.int16)

    # Validate before opening so a bad dtype does not leave an empty file behind.
    if not np.issubdtype(samples.dtype, np.integer) or samples.dtype.itemsize > 4:
        raise ValueError(f"unsupported sample dtype for WAV output: {samples.dtype}")

    # For (frames, channels) input, C-order tobytes() already yields the
    # frame-interleaved layout that WAV expects.
    n_channels = samples.shape[1] if samples.ndim == 2 else 1

    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(n_channels)
        wf.setsampwidth(samples.dtype.itemsize)
        wf.setframerate(samplerate)
        wf.writeframes(samples.tobytes())

# Streamlit UI
st.title('🎙️ Audio Deepfake Detection')
st.write('Upload or record an audio file to classify it as real or fake.')

# File uploader
uploaded_file = st.file_uploader('📂 Upload an audio file', type=['wav', 'mp3'])
recorded_file_path = "recorded_audio.wav"

# Record audio
# The slider is rendered unconditionally. st.button() is only True on the one
# rerun triggered by the click, so a slider created inside that branch would
# first appear after recording had already started with the default value,
# and would disappear again on the next interaction.
duration = st.slider("⏳ Set Duration (seconds)", 1, 10, 5)
if st.button("🎤 Record Live Audio"):
    audio_data, samplerate = record_audio(duration)
    save_wav(recorded_file_path, audio_data, samplerate)
    st.audio(recorded_file_path, format="audio/wav")

# Process uploaded or recorded audio (an upload takes precedence)
if uploaded_file is not None:
    # Keep the upload's own extension and MIME type so an MP3 is not stored
    # and served as if it were a WAV file.
    extension = os.path.splitext(uploaded_file.name)[1].lower() or ".wav"
    file_path = f"uploaded_audio{extension}"
    with open(file_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())
    st.audio(file_path, format="audio/mpeg" if extension == ".mp3" else "audio/wav")
elif os.path.exists(recorded_file_path):
    # NOTE(review): this also picks up a recording left on disk by an earlier
    # run or session — confirm that is intended.
    file_path = recorded_file_path
else:
    file_path = None

# Run prediction
if file_path:
    prediction, probability = predict_audio(file_path)
    if prediction is not None:
        # NOTE(review): assumes the model emits two scores with index 0 = real
        # and index 1 = fake — verify against the training label order.
        st.write(f'**Predicted Class:** {prediction}')
        st.write(f'**Probability of being Real:** {probability[0]*100:.2f}%')
        st.write(f'**Probability of being Fake:** {probability[1]*100:.2f}%')
    else:
        st.error("❌ Failed to process the audio file.")