Audio Classification
English
Audio
Classification
joka13 committed on
Commit
d71c7cb
·
verified ·
1 Parent(s): 55cf66a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -129
README.md CHANGED
@@ -9,137 +9,26 @@ metrics:
9
  - accuracy
10
  - f1
11
  - roc_auc
 
 
12
  base_model:
13
  - facebook/wav2vec2-base-960h
14
  - openai/whisper-large-v3-turbo
 
15
  pipeline_tag: audio-classification
16
  ---
17
- import numpy as np
18
- import librosa
19
- import tensorflow as tf
20
- import streamlit as st
21
- import sounddevice as sd
22
- import wave
23
- import os
24
-
25
- # Constants
26
- window_length = 0.02 # 20ms window length
27
- hop_length = 0.0025 # 2.5ms hop length
28
- sample_rate = 22050 # Standard audio sample rate
29
- n_mels = 128 # Number of mel filter banks
30
- threshold_zcr = 0.1 # Adjust this threshold to detect breath based on ZCR
31
- threshold_rmse = 0.1 # Adjust this threshold to detect breath based on RMSE
32
- max_len = 500 # Fix length for feature extraction
33
-
34
- # Load TFLite model
35
- interpreter = tf.lite.Interpreter(model_path="model_breath_logspec_mfcc_cnn.tflite")
36
- interpreter.allocate_tensors()
37
-
38
- # Get input and output details
39
- input_details = interpreter.get_input_details()
40
- output_details = interpreter.get_output_details()
41
-
42
- # Function to extract breath features
43
- def extract_breath_features(y, sr):
44
- frame_length = int(window_length * sr)
45
- hop_length_samples = int(hop_length * sr)
46
-
47
- zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=frame_length, hop_length=hop_length_samples)
48
- rmse = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length_samples)
49
-
50
- zcr = zcr.T.flatten()
51
- rmse = rmse.T.flatten()
52
-
53
- breaths = (zcr > threshold_zcr) & (rmse > threshold_rmse)
54
- breath_feature = np.where(breaths, 1, 0)
55
-
56
- return breath_feature
57
-
58
- # Feature extraction
59
- def extract_features(file_path):
60
- try:
61
- y, sr = librosa.load(file_path, sr=None)
62
- mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
63
- logspec = librosa.amplitude_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels))
64
- breath_feature = extract_breath_features(y, sr)
65
-
66
- # Fix lengths
67
- mfcc = librosa.util.fix_length(mfcc, size=max_len, axis=1)
68
- logspec = librosa.util.fix_length(logspec, size=max_len, axis=1)
69
- breath_feature = librosa.util.fix_length(breath_feature, size=max_len)
70
-
71
- return np.vstack((mfcc, logspec, breath_feature))
72
- except Exception as e:
73
- st.error(f"Error processing audio: {e}")
74
- return None
75
-
76
- # Prepare input for model
77
- def prepare_single_data(features):
78
- features = librosa.util.fix_length(features, size=max_len, axis=1)
79
- features = features[np.newaxis, ..., np.newaxis] # Add batch and channel dimensions
80
- return features.astype(np.float32) # Convert to FLOAT32
81
-
82
- # Predict audio class
83
- def predict_audio(file_path):
84
- features = extract_features(file_path)
85
- if features is not None:
86
- prepared_features = prepare_single_data(features)
87
- interpreter.set_tensor(input_details[0]['index'], prepared_features)
88
- interpreter.invoke()
89
- prediction = interpreter.get_tensor(output_details[0]['index'])
90
- predicted_class = np.argmax(prediction, axis=1)
91
- predicted_prob = prediction[0]
92
- return predicted_class[0], predicted_prob
93
- return None, None
94
-
95
- # Record audio function
96
- def record_audio(duration=5, samplerate=22050):
97
- st.info(f"Recording for {duration} seconds...")
98
- audio_data = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16)
99
- sd.wait()
100
- st.success("Recording Complete!")
101
- return audio_data, samplerate
102
-
103
- # Save recorded audio as .wav
104
- def save_wav(file_path, audio_data, samplerate):
105
- with wave.open(file_path, 'wb') as wf:
106
- wf.setnchannels(1)
107
- wf.setsampwidth(2)
108
- wf.setframerate(samplerate)
109
- wf.writeframes(audio_data.tobytes())
110
-
111
- # Streamlit UI
112
- st.title('🎙️ Audio Deepfake Detection')
113
- st.write('Upload or record an audio file to classify it as real or fake.')
114
-
115
- # File uploader
116
- uploaded_file = st.file_uploader('Upload an audio file', type=['wav', 'mp3'])
117
- recorded_file_path = "recorded_audio.wav"
118
-
119
- # Record audio button
120
- if st.button("Record Live Audio"):
121
- duration = st.slider("⏳ Set Duration (seconds)", 1, 10, 5)
122
- audio_data, samplerate = record_audio(duration)
123
- save_wav(recorded_file_path, audio_data, samplerate)
124
- st.audio(recorded_file_path, format="audio/wav")
125
-
126
- # Process uploaded or recorded audio
127
- if uploaded_file is not None:
128
- with open("uploaded_audio.wav", 'wb') as f:
129
- f.write(uploaded_file.getbuffer())
130
- file_path = "uploaded_audio.wav"
131
- st.audio(file_path, format="audio/wav")
132
- elif os.path.exists(recorded_file_path):
133
- file_path = recorded_file_path
134
- else:
135
- file_path = None
136
-
137
- # Run prediction
138
- if file_path:
139
- prediction, probability = predict_audio(file_path)
140
- if prediction is not None:
141
- st.write(f'**Predicted Class:** {prediction}')
142
- st.write(f'**Probability of being Real:** {probability[0]*100:.2f}%')
143
- st.write(f'**Probability of being Fake:** {probability[1]*100:.2f}%')
144
- else:
145
- st.error("Failed to process the audio file.")
 
9
  - accuracy
10
  - f1
11
  - roc_auc
12
+ - recall
13
+ - precision
14
  base_model:
15
  - facebook/wav2vec2-base-960h
16
  - openai/whisper-large-v3-turbo
17
+ - MelodyMachine/Deepfake-audio-detection-V2
18
  pipeline_tag: audio-classification
19
  ---
20
+ ---
21
+ datasets:
22
+ - DynamicSuperb/SpeechDetection_LJSpeech
23
+ - DynamicSuperb/AudioDeepFakeDetection_LJSpeech_WaveFake_MUSDB18HQ
24
+ - DynamicSuperb/SceneFakeDetection_SceneFake_ASPIRE
25
+ language:
26
+ - en
27
+ metrics:
28
+ - accuracy
29
+ - f1
30
+ - roc_auc
31
+ base_model:
32
+ - facebook/wav2vec2-base-960h
33
+ - openai/whisper-large-v3-turbo
34
+ pipeline_tag: audio-classification