mateo496 commited on
Commit
0a7ebe1
·
1 Parent(s): a12db03

Added README.md and src/data files

Browse files
README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environmental Sound Classification (ESC50) with Deep CNNs
2
+
3
+ A PyTorch reimplementation of the deep convolutional neural network approach from [Salamon & Bello (2017)](https://arxiv.org/pdf/1608.04363) for environmental sound classification, extended to handle 50 classes instead of the original 10.
4
+
5
+ ## Overview
6
+
7
+ This project implements a deep CNN architecture for environmental sound classification using log-mel spectrograms as input features. The implementation follows the methodology described in the paper "Deep Convolutional Neural Networks and Data Augmentation for Environmental Sound Classification" but is scaled to work with a more challenging 50-class classification task.
8
+
9
+ ### Key Features
10
+
11
+ - Deep CNN with 3 convolutional layers + 2 fully connected layers
12
+ - Log-mel spectrogram feature extraction using Essentia
13
+ - Data augmentation (time stretching, pitch shifting, dynamic range compression)
14
+ - Overlapping patch prediction for validation (1-frame hop)
15
+
16
+ ## Results
17
+
18
+ | Dataset | Classes | Accuracy (with augmentation) |
19
+ |---------|---------|--------------------------------|
20
+ | UrbanSound8K (paper) | 10 | 79% |
21
+ | **This project** | **50** | **74%** |
22
+
23
+ ## Architecture
24
+
25
+ ### Model Structure
26
+
27
+ ```
28
+ Input: Log-mel Spectrogram (128 × 128)
29
+
30
+ Conv2D(1→24, 5×5) + ReLU + MaxPool(4×2)
31
+
32
+ Conv2D(24→48, 5×5) + ReLU + MaxPool(4×2)
33
+
34
+ Conv2D(48→48, 5×5) + ReLU
35
+
36
+ Flatten → Dense(2400→64) + ReLU + Dropout(0.5)
37
+
38
+ Dense(64→50) + Softmax
39
+
40
+ Output: 50 classes
41
+ ```
42
+
43
+ ### Training Configuration
44
+
45
+ - **Optimizer**: SGD with momentum (0.9)
46
+ - **Learning Rate**: 0.01
47
+ - **Batch Size**: 100 TF-patches
48
+ - **L2 Regularization**: 0.001 (on classifier layers only)
49
+ - **Dropout**: 0.5 (on classifier layers)
50
+ - **Gradient Clipping**: max_norm=1.0
51
+ - **Epochs**: 100
src/data/__pycache__/augment.cpython-311.pyc ADDED
Binary file (10.6 kB). View file
 
src/data/__pycache__/datasets.cpython-311.pyc ADDED
Binary file (5.86 kB). View file
 
src/data/augment.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import tqdm
import essentia.standard as es
import librosa
import numpy as np
import os
import soundfile as sf

# Target sampling rate (Hz) used throughout the augmentation pipeline.
sample_rate = 44100

# Log-mel extraction settings, passed to data_treatment via **parameters.
parameters = {
    "n_bands" : 128,      # number of mel bands (spectrogram height)
    "n_mels" : 128,       # NOTE(review): not used inside data_treatment; kept for the signature
    "frame_size" : 1024,  # analysis frame length in samples
    "hop_size": 1024,     # hop equals frame size -> non-overlapping frames
    "sample_rate": sample_rate,
    "fft_size": 8192,     # frames are zero-padded up to this FFT size
}
18
+
19
def data_treatment(
    audio_path,
    n_bands, n_mels, frame_size, hop_size, sample_rate, fft_size
):
    """Compute a dB-scaled mel spectrogram for every audio file in *audio_path*.

    The class label is parsed from the ESC-50 filename convention
    (``fold-clipID-take-label.wav``, optionally carrying ``_AUG`` suffixes).

    Args:
        audio_path: directory containing the .wav files.
        n_bands: number of mel bands.
        n_mels: unused; kept for signature compatibility.
        frame_size: analysis frame length in samples.
        hop_size: hop between consecutive frames in samples.
        sample_rate: sampling rate the audio is decoded at.
        fft_size: FFT size; each frame is zero-padded to this length.

    Returns:
        Tuple ``(spectrograms, labels)``: a list of ``(n_frames, n_bands)``
        float arrays in dB (peak-normalised to 0 dB) and an int label array.
    """
    labels = []
    log_mel_spectrograms = []
    filenames = os.listdir(audio_path)

    # The analyzers are configured identically for every file, so build them
    # once here instead of once per file (the original rebuilt them in the loop).
    window = es.Windowing(type="hann")
    spectrum = es.Spectrum(size=fft_size)
    mel = es.MelBands(
        numberBands=n_bands,
        inputSize=fft_size // 2 + 1,
        sampleRate=sample_rate,
        lowFrequencyBound=0,
        highFrequencyBound=sample_rate / 2,
    )

    for filename in tqdm.tqdm(filenames, desc="Processing audio files"):
        # Label = last dash-separated field, stripped of the extension and of
        # any "_AUG" augmentation suffix.
        label = filename.split("-")[-1].split(".")[0].split("_")[0]
        labels.append(int(label))

        file_path = os.path.join(audio_path, filename)
        # Pass sampleRate explicitly: MonoLoader's default happens to be
        # 44100 Hz, but relying on it would silently break for other rates.
        audio = es.MonoLoader(filename=file_path, sampleRate=sample_rate)()

        mel_frames = []
        for frame in es.FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size):
            # Zero-pad each frame up to fft_size for a finer frequency grid.
            frame_padded = np.pad(frame, (0, fft_size - len(frame)), mode='constant')
            mel_frames.append(mel(spectrum(window(frame_padded))))

        mel_spectrogram = np.array(mel_frames)

        # Convert to dB (epsilon avoids log(0)) and normalise the peak to 0 dB.
        mel_spectrogram_db = 10 * np.log10(mel_spectrogram + 1e-10)
        mel_spectrogram_db = mel_spectrogram_db - mel_spectrogram_db.max()

        log_mel_spectrograms.append(mel_spectrogram_db)
    return log_mel_spectrograms, np.array(labels)
66
+
67
def pad(audio, target_seconds, sample_rate):
    """Right-pad *audio* with zeros up to ``target_seconds * sample_rate`` samples.

    Audio that is already long enough is returned unchanged (no truncation).
    """
    target_len = int(sample_rate * target_seconds)
    shortfall = target_len - len(audio)
    if shortfall > 0:
        return np.pad(audio, (0, shortfall), mode="constant")
    return audio
74
+
75
def time_stretch_augmentation(file_path, sample_rate, rate):
    """Load *file_path* and time-stretch it by *rate* (>1 = faster/shorter).

    The result is zero-padded back up to 5 s. NOTE(review): rates < 1 produce
    clips longer than 5 s which are returned at their stretched length.
    """
    audio, _ = librosa.load(file_path, sr=sample_rate)
    stretched = librosa.effects.time_stretch(audio.astype(np.float32), rate=rate)
    return pad(stretched, 5, sample_rate)
79
+
80
def pitch_shift_augmentation(file_path, sample_rate, semitones):
    """Load *file_path* and shift its pitch by *semitones* (duration preserved)."""
    audio, _ = librosa.load(file_path, sr=sample_rate)
    shifted = librosa.effects.pitch_shift(
        audio.astype(np.float32), sr=sample_rate, n_steps=semitones
    )
    return shifted
83
+
84
# Dynamic range compression presets: threshold (dBFS), ratio, attack/release (ms).
_DRC_PRESETS = {
    "music_standard": (-20, 2.0, 5, 50),
    "film_standard": (-25, 4.0, 10, 100),
    "speech": (-18, 3.0, 2, 40),
    "radio": (-15, 3.5, 1, 200),
}


def drc_augmentation(file_path, sample_rate, compression):
    """Apply a simple feed-forward dynamic range compressor to *file_path*.

    Args:
        file_path: audio file to load.
        sample_rate: rate to decode at; also drives the envelope ballistics.
        compression: one of the preset names in ``_DRC_PRESETS``.

    Returns:
        The compressed mono signal as a float array.

    Raises:
        ValueError: if *compression* is not a known preset (the original
            code fell through to a ``NameError`` in that case).
    """
    try:
        threshold_db, ratio, attack_ms, release_ms = _DRC_PRESETS[compression]
    except KeyError:
        raise ValueError(f"unknown compression preset: {compression!r}") from None

    audio, _ = librosa.load(file_path, sr=sample_rate)
    threshold = 10 ** (threshold_db / 20)  # dBFS -> linear amplitude

    # One-pole smoothing coefficients for the gain envelope.
    attack_coeff = np.exp(-1.0 / (0.001 * attack_ms * sample_rate))
    release_coeff = np.exp(-1.0 / (0.001 * release_ms * sample_rate))

    audio_filtered = np.zeros_like(audio)
    gain = 1.0

    # The envelope is a recursive filter, so this must stay a per-sample loop.
    for n in range(len(audio)):
        level = abs(audio[n])
        if level > threshold:
            # NOTE(review): a textbook compressor uses (1/ratio - 1) here;
            # the original exponent (ratio - 1) is kept to preserve behaviour
            # — confirm against intent.
            desired_gain = (threshold / level) ** (ratio - 1)
        else:
            desired_gain = 1.0

        # Attack when the gain must drop, release while it recovers.
        coeff = attack_coeff if desired_gain < gain else release_coeff
        gain = coeff * (gain - desired_gain) + desired_gain

        audio_filtered[n] = audio[n] * gain

    return audio_filtered
114
+
115
def _drc_compress(audio, sample_rate, compression):
    """Feed-forward compressor operating on an in-memory signal.

    Same math as ``drc_augmentation``, but it takes the audio directly
    instead of reloading the file, so augmentations can be chained.
    """
    # preset -> (threshold_db, ratio, attack_ms, release_ms)
    presets = {
        "music_standard": (-20, 2.0, 5, 50),
        "film_standard": (-25, 4.0, 10, 100),
        "speech": (-18, 3.0, 2, 40),
        "radio": (-15, 3.5, 1, 200),
    }
    if compression not in presets:
        raise ValueError(f"unknown compression preset: {compression!r}")
    threshold_db, ratio, attack_ms, release_ms = presets[compression]

    threshold = 10 ** (threshold_db / 20)
    attack_coeff = np.exp(-1.0 / (0.001 * attack_ms * sample_rate))
    release_coeff = np.exp(-1.0 / (0.001 * release_ms * sample_rate))

    out = np.zeros_like(audio)
    gain = 1.0
    for n in range(len(audio)):
        level = abs(audio[n])
        desired = (threshold / level) ** (ratio - 1) if level > threshold else 1.0
        coeff = attack_coeff if desired < gain else release_coeff
        gain = coeff * (gain - desired) + desired
        out[n] = audio[n] * gain
    return out


def augment_dataset(audio_path, output_path, probability_list):
    """Write one randomly augmented copy of every clip in *audio_path*.

    ``probability_list`` is ``(p_ts, p_ps, p_drc)``: each augmentation fires
    when ``np.random.rand() > p``, i.e. with probability ``1 - p``.

    Fixes over the original implementation:
    - augmentations now stack: the original reloaded the unmodified file for
      every augmentation, so only the last one survived even though the
      output filename claimed all of them;
    - each clip is written exactly once (the original wrote an intermediate
      file per suffix, and suffixes containing '.' corrupted the name on the
      next ``split(".")``).
    Clips with no triggered augmentation are skipped, as before.
    """
    filenames = os.listdir(audio_path)

    p1, p2, p3 = probability_list
    os.makedirs(output_path, exist_ok=True)

    for filename in tqdm.tqdm(filenames, desc="Processing audio files"):
        augmentations = []
        audio, _ = librosa.load(os.path.join(audio_path, filename), sr=sample_rate)

        # Time stretch (TS)
        if np.random.rand() > p1:
            stretch_rate = np.random.choice([0.81, 0.93, 1.07, 1.23])
            stretched = librosa.effects.time_stretch(audio.astype(np.float32), rate=stretch_rate)
            audio = pad(stretched, 5, sample_rate)
            augmentations.append(f"TS{stretch_rate}")

        # Pitch shift (PS)
        if np.random.rand() > p2:
            semitone = np.random.choice([-3.5, -2.5, -2, -1, 1, 2.5, 3, 3.5])
            audio = librosa.effects.pitch_shift(
                audio.astype(np.float32), sr=sample_rate, n_steps=semitone
            )
            augmentations.append(f"PS{semitone}")

        # Dynamic range compression (DRC)
        if np.random.rand() > p3:
            compression = np.random.choice(["radio", "film_standard", "music_standard", "speech"])
            audio = _drc_compress(audio, sample_rate, compression)
            augmentations.append(f"DRC{compression}")

        if augmentations:
            # Build the suffixed name once and write once.
            stem, ext = filename.rsplit(".", 1)
            out_name = stem + "".join(f"_{aug}" for aug in augmentations) + "." + ext
            sf.write(os.path.join(output_path, out_name), audio, sample_rate)
149
+
150
def create_augmented_datasets(input_path, output_path):
    """Generate five augmented dataset variants under *output_path*/1..5.

    Each triple is ``(p_ts, p_ps, p_drc)``; an augmentation fires when
    ``np.random.rand() > p`` (so 0.0 means "always", 1.0 means "never").
    """
    probability_lists = (
        (0.0, 1.0, 1.0),  # time stretch only
        (1.0, 1.0, 0.0),  # DRC only
        (1.0, 0.0, 1.0),  # pitch shift only
        (0.0, 0.0, 0.0),  # all three
        (0.5, 0.5, 0.5),  # each with probability 0.5
    )
    for i, probability_list in enumerate(probability_lists, start=1):
        augmented_path = os.path.join(output_path, str(i))
        os.makedirs(augmented_path, exist_ok=True)
        augment_dataset(input_path, augmented_path, probability_list)
161
+
162
def create_log_mel(input_path, output_path):
    """Extract log-mel spectrograms for every sub-directory of *input_path*.

    Saves ``X.npy`` (object array of variable-length spectrograms) and
    ``y.npy`` (labels) into *output_path*; returns ``(X, y)`` where X is the
    plain Python list and y the label array.
    """
    X, y = [], []
    for directory in os.listdir(input_path):
        log_mels, labels = data_treatment(os.path.join(input_path, directory), **parameters)
        X.extend(log_mels)
        y.extend(labels)

    # Spectrograms differ in frame count, so store them as a 1-D object
    # array rather than a rectangular numeric array.
    X_array = np.empty(len(X), dtype=object)
    for i, spec in enumerate(X):
        X_array[i] = spec

    y = np.array(y)
    os.makedirs(output_path, exist_ok=True)

    np.save(os.path.join(output_path, "X.npy"), X_array, allow_pickle=True)
    np.save(os.path.join(output_path, 'y.npy'), y)
    return X, y
181
+
182
if __name__ == "__main__":
    # Un-augmented source clips live in data/audio/0; the augmented variants
    # are written to sibling directories data/audio/1..5.
    input_path = "data/audio/0"
    output_base_path = "data/audio"

    create_augmented_datasets(input_path, output_base_path)
187
+
src/data/datasets.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import numpy as np
from torch.utils.data import Dataset

# Side length (in frames) of the square time-frequency patches fed to the CNN.
cnn_input_length = 128
6
+
7
class SpectrogramDataset(Dataset):
    """Dataset over variable-length spectrograms.

    In ``'train'`` mode ``__getitem__`` returns a random fixed-size time
    patch as a ``(1, patch_length, n_mels)`` float tensor plus the label as a
    long tensor; in any other mode it returns the raw spectrogram and label
    unchanged.
    """

    def __init__(self, spectrograms, labels, patch_length=cnn_input_length, mode='train'):
        self.spectrograms = spectrograms
        self.labels = labels
        self.patch_length = patch_length
        self.mode = mode

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        spec = self.spectrograms[idx]
        label = self.labels[idx]

        if self.mode != 'train':
            # Validation/test: hand back the full spectrogram untouched.
            return spec, label

        n_frames = spec.shape[0]
        if n_frames < self.patch_length:
            # Too short: zero-pad the time axis up to patch_length.
            patch = np.pad(spec, ((0, self.patch_length - n_frames), (0, 0)), mode='constant')
        else:
            # Long enough: pick a uniformly random patch_length window.
            start = np.random.randint(0, n_frames - self.patch_length + 1)
            patch = spec[start:start + self.patch_length]

        # Prepend the channel dimension expected by Conv2d.
        return (
            torch.tensor(patch[np.newaxis, :, :], dtype=torch.float32),
            torch.tensor(label, dtype=torch.long),
        )
36
+
37
class FullTFPatchesDataset(Dataset):
    """Enumerates every overlapping TF patch (1-frame hop) of every spectrogram.

    Intended for validation: each spectrogram contributes one item per
    possible ``patch_length`` window (or a single zero-padded item when it is
    too short), so predictions can be averaged over all patches of a clip.
    """

    def __init__(self, spectrograms, labels, patch_length=128):
        self.patch_length = patch_length
        self.spectrograms = spectrograms
        # Flat index: one (spectrogram index, start frame, label) per patch.
        self.patch_indices = []
        for spec_idx, spec in enumerate(spectrograms):
            label = labels[spec_idx]
            n_starts = spec.shape[0] - patch_length + 1
            if n_starts >= 1:
                self.patch_indices.extend(
                    (spec_idx, start, label) for start in range(n_starts)
                )
            else:
                # Short clip: single entry, zero-padded at __getitem__ time.
                self.patch_indices.append((spec_idx, 0, label))

    def __len__(self):
        return len(self.patch_indices)

    def __getitem__(self, idx):
        spec_idx, start_frame, label = self.patch_indices[idx]
        spec = self.spectrograms[spec_idx]

        if spec.shape[0] >= self.patch_length:
            patch = spec[start_frame:start_frame + self.patch_length]
        else:
            patch = np.pad(
                spec, ((0, self.patch_length - spec.shape[0]), (0, 0)), mode='constant'
            )

        # Channel dimension for Conv2d input.
        patch = patch[np.newaxis, :, :]
        return torch.tensor(patch, dtype=torch.float32), torch.tensor(label, dtype=torch.long)
72
+
73
+
74
class RandomPatchDataset(Dataset):
    """Yields one random fixed-size TF patch per spectrogram.

    Same patch policy as SpectrogramDataset in 'train' mode: short clips are
    zero-padded on the time axis, longer ones yield a random window.
    """

    def __init__(self, spectrograms, labels, patch_length=128):
        self.spectrograms = spectrograms
        self.labels = labels
        self.patch_length = patch_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        spec = self.spectrograms[idx]
        label = self.labels[idx]
        surplus = spec.shape[0] - self.patch_length

        if surplus >= 0:
            # Uniformly random window over the valid start positions.
            start = np.random.randint(0, surplus + 1)
            patch = spec[start:start + self.patch_length]
        else:
            # Zero-pad the time axis up to patch_length.
            patch = np.pad(spec, ((0, -surplus), (0, 0)), mode='constant')

        patch = patch[np.newaxis, :, :]
        return torch.tensor(patch, dtype=torch.float32), torch.tensor(label, dtype=torch.long)
src/data/download.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
import zipfile
import io
import os
import shutil

# GitHub zip archive of the ESC-50 dataset repository.
repo_url = "https://github.com/karolpiczak/ESC-50/archive/refs/heads/master.zip"
# Everything is unpacked under data/; the raw clips end up in data/audio/0.
repo_dst_dir = "data"
audio_dst_dir = os.path.join(repo_dst_dir, "audio", "0")

# Repository entries that are no longer needed once the audio is extracted.
paths_to_delete = [
    ".gitignore",
    "esc50.gif",
    "LICENSE",
    "pytest.ini",
    "README.md",
    "requirements.txt",
    "tests",
    "meta",
    ".github",
    ".circleci"
]
+ ]
23
+
24
def download_and_extract(url, dst_dir, timeout=300):
    """Download a zip archive from *url* and extract it into *dst_dir*.

    Args:
        url: HTTP(S) address of the zip file.
        dst_dir: directory to extract into (created if missing).
        timeout: seconds before the download aborts. The original request had
            no timeout and could hang forever on a stalled connection.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    os.makedirs(dst_dir, exist_ok=True)
    print(f"Downloading from {url}")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # The whole archive is buffered in memory before extraction; ESC-50 is
    # several hundred MB, so this assumes enough RAM rather than streaming.
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        print(f"Extracting to {dst_dir}")
        z.extractall(dst_dir)
    print("Done extracting.")
34
+
35
def clean_files(repo_dir, paths_to_delete):
    """Delete the listed files/directories under *repo_dir*.

    Entries that do not exist are silently skipped.
    """
    for entry in paths_to_delete:
        target = os.path.join(repo_dir, entry)
        if os.path.isdir(target):
            shutil.rmtree(target)
            print(f"Deleted directory: {target}")
        elif os.path.isfile(target):
            os.remove(target)
            print(f"Deleted file: {target}")
44
+
45
def move_audio_files(src_dir, dst_dir):
    """Move every regular file from *src_dir* into *dst_dir* (created if missing).

    Sub-directories of *src_dir* are left in place.
    """
    os.makedirs(dst_dir, exist_ok=True)
    print(f"Moving audio files from {src_dir} to {dst_dir}")

    for filename in os.listdir(src_dir):
        src_file = os.path.join(src_dir, filename)
        if os.path.isfile(src_file):
            shutil.move(src_file, os.path.join(dst_dir, filename))
    print(f"Moved all audio files to {dst_dir}")
55
+
56
def download_clean():
    """Fetch ESC-50, strip repo clutter, and leave the clips in data/audio/0."""
    # 1. Download and unzip into data/ (produces data/ESC-50-master/).
    download_and_extract(repo_url, repo_dst_dir)

    extracted_dir = os.path.join(repo_dst_dir, "ESC-50-master")
    audio_src_dir = os.path.join(extracted_dir, "audio")

    # 2. Drop repository files that are not needed.
    clean_files(extracted_dir, paths_to_delete)

    # 3. Relocate the clips, then remove the now-redundant checkout.
    move_audio_files(audio_src_dir, audio_dst_dir)
    shutil.rmtree(extracted_dir)
    print(f"Cleanup complete. Audio files are in {audio_dst_dir}")
73
+
74
if __name__ == "__main__":
    # Run the full download + cleanup pipeline when executed as a script.
    download_clean()