File size: 6,459 Bytes
e9ee222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
import random
import shutil

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam


# --- Configuration constants ---
DATA_SOURCE_PATH = 'data'  # root folder expected to contain 'parkinson/' and 'healthy/' audio subfolders
SPECTROGRAM_PATH = 'spectrograms_stft_5s_grayscale'  # output root for generated spectrogram images (train/validation splits)
MODEL_SAVE_PATH = 'parkinson_cnn_model_stft_grayscale.h5'  # path where ModelCheckpoint writes the best model
IMG_HEIGHT, IMG_WIDTH = 224, 224  # image size the generators resize spectrograms to before feeding the CNN
BATCH_SIZE = 32  # mini-batch size for both train and validation generators
TARGET_DURATION_S = 5  # every clip is cropped/padded to this many seconds before the STFT

# Data Augmentation 
def augment_audio(y, sr):
    """Return a randomly augmented copy of waveform ``y``.

    Applies, in order: a random pitch shift (±2 semitones), a random time
    stretch (0.9x–1.1x, then restored to the original length), and additive
    Gaussian noise scaled to the signal's peak absolute amplitude.

    Args:
        y: 1-D float waveform (already standardized to the target duration).
        sr: sample rate of ``y`` in Hz.

    Returns:
        Augmented waveform with the same number of samples as ``y``.
    """
    pitch_steps = random.uniform(-2, 2)
    # pitch_shift returns a new array, so no defensive copy of y is needed.
    y_aug = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_steps)
    stretch_rate = random.uniform(0.9, 1.1)
    y_aug = librosa.effects.time_stretch(y_aug, rate=stretch_rate)
    # BUG FIX: time stretching changes the sample count, which broke the
    # caller's 5 s standardization; pad/trim back to the original length.
    y_aug = librosa.util.fix_length(y_aug, size=len(y))
    # BUG FIX: use the absolute peak. np.amax(y) can be ~0 (or negative) for
    # signals whose largest excursion is negative, silently disabling (or
    # sign-flipping) the noise term.
    noise_amp = 0.005 * np.random.uniform() * np.amax(np.abs(y))
    y_aug = y_aug + noise_amp * np.random.normal(size=len(y_aug))
    return y_aug

# Spectrogram Creation 
def create_stft_spectrogram(audio_file, save_path, augment=False):
    """
    Creates a high-quality GRAYSCALE spectrogram from a standardized 5s audio segment.

    Pipeline: load audio at native sample rate -> center-crop or zero-pad to
    TARGET_DURATION_S seconds -> (optionally) augment -> noise-reduce ->
    STFT -> dB scale -> save as an axis-free grayscale PNG.

    Args:
        audio_file: path to the input .wav/.mp3 file.
        save_path: path the PNG is written to.
        augment: when True, apply random pitch/stretch/noise augmentation.

    Returns:
        True on success, False if any step failed (the error is printed).
    """
    try:
        y, sr = librosa.load(audio_file, sr=None)

        # Explicit int: sr arithmetic must yield an integer sample count.
        target_samples = int(TARGET_DURATION_S * sr)

        if len(y) > target_samples:
            # Center-crop longer recordings; floor division keeps it integral.
            start_index = (len(y) - target_samples) // 2
            y_segment = y[start_index : start_index + target_samples]
        else:
            # Shorter (or exact-length) recordings are zero-padded symmetrically.
            y_segment = librosa.util.pad_center(y, size=target_samples)

        if augment:
            y_segment = augment_audio(y_segment, sr)

        y_reduced = nr.reduce_noise(y=y_segment, sr=sr)

        N_FFT = 1024
        HOP_LENGTH = 256
        S_audio = librosa.stft(y_reduced, n_fft=N_FFT, hop_length=HOP_LENGTH)
        Y_db = librosa.amplitude_to_db(np.abs(S_audio), ref=np.max)

        fig = plt.figure(figsize=(12, 4))
        try:
            # 'gray_r' renders a single-channel-friendly image; log frequency
            # axis emphasizes the lower bands where voice energy concentrates.
            librosa.display.specshow(Y_db, sr=sr, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', cmap='gray_r')
            plt.axis('off')
            plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        finally:
            # BUG FIX: close the figure even if specshow/savefig raises;
            # previously a failure leaked one open figure per bad file.
            plt.close(fig)
        return True

    except Exception as e:
        print(f"      - Error processing {audio_file}: {e}")
        return False

# Data Preparation
def process_all_audio_files():
    """Regenerate the spectrogram dataset from DATA_SOURCE_PATH.

    For each category ('parkinson', 'healthy'): shuffles the audio files,
    splits 80/20 into train/validation, writes one original + two augmented
    spectrograms per training file and one spectrogram per validation file.
    The split happens per source file (before augmentation), so augmented
    copies of a recording never leak into the validation set.
    """
    # Start from a clean slate so stale images from a previous run can't mix in.
    if os.path.exists(SPECTROGRAM_PATH):
        shutil.rmtree(SPECTROGRAM_PATH)
    print(f"Starting audio to Grayscale STFT Spectrogram conversion ({TARGET_DURATION_S}s)...")
    for split in ['train', 'validation']:
        for category in ['parkinson', 'healthy']:
            os.makedirs(os.path.join(SPECTROGRAM_PATH, split, category), exist_ok=True)
    for category in ['parkinson', 'healthy']:
        source_dir = os.path.join(DATA_SOURCE_PATH, category)
        # ROBUSTNESS: a missing category folder previously crashed os.listdir.
        if not os.path.isdir(source_dir):
            print(f"Warning: source directory '{source_dir}' not found; skipping.")
            continue
        all_files = [f for f in os.listdir(source_dir) if f.lower().endswith(('.wav', '.mp3'))]
        if not all_files:
            continue
        random.shuffle(all_files)
        split_index = int(len(all_files) * 0.8)
        train_files, validation_files = all_files[:split_index], all_files[split_index:]
        print(f"--- Processing Category: {category} ---")
        for filename in train_files:
            file_path = os.path.join(source_dir, filename)
            base_name = os.path.splitext(filename)[0]
            # i == 0 is the unaugmented original; i == 1, 2 are augmented copies.
            for i in range(3):
                save_path = os.path.join(SPECTROGRAM_PATH, 'train', category, f"{base_name}_aug_{i}.png")
                create_stft_spectrogram(file_path, save_path, augment=(i > 0))
        for filename in validation_files:
            file_path = os.path.join(source_dir, filename)
            base_name = os.path.splitext(filename)[0]
            save_path = os.path.join(SPECTROGRAM_PATH, 'validation', category, f"{base_name}.png")
            create_stft_spectrogram(file_path, save_path, augment=False)
    print("Spectrogram generation complete.")


def train_cnn_model():
    """
    Trains a CNN model optimized for grayscale spectrograms.

    Loads train/validation images from SPECTROGRAM_PATH as single-channel
    224x224 inputs, builds a 3-conv-block binary classifier, and trains up to
    100 epochs with early stopping, LR reduction, and best-model checkpointing
    to MODEL_SAVE_PATH.
    """
    if not os.path.exists(SPECTROGRAM_PATH):
        print("Spectrograms not found.")
        return

    # Only rescaling here — augmentation already happened at the audio level.
    train_datagen = ImageDataGenerator(rescale=1./255)
    validation_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_directory(
        os.path.join(SPECTROGRAM_PATH, 'train'),
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='binary',
        color_mode='grayscale' # Tells Keras to load images with 1 channel
    )
    validation_generator = validation_datagen.flow_from_directory(
        os.path.join(SPECTROGRAM_PATH, 'validation'),
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='binary',
        color_mode='grayscale'
    )

    if train_generator.samples == 0:
        print("Error: No training images were generated.")
        return
    # ROBUSTNESS: an empty validation set would make model.fit fail later
    # with a far less obvious error; bail out early instead.
    if validation_generator.samples == 0:
        print("Error: No validation images were generated.")
        return

    # Three conv blocks (32 -> 64 -> 128 filters) with batch norm + pooling,
    # then a regularized dense head ending in a sigmoid for binary output.
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 1), padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()

    callbacks_list = [
        # Stop when val_loss stalls; roll back to the best weights seen.
        EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=7),
        # Persist the model with the highest validation accuracy.
        ModelCheckpoint(MODEL_SAVE_PATH, monitor='val_accuracy', save_best_only=True, mode='max')
    ]

    model.fit(
        train_generator,
        epochs=100,
        validation_data=validation_generator,
        callbacks=callbacks_list
    )
    print(f"Grayscale model training complete. Best model saved to {MODEL_SAVE_PATH}")


# Script entry point: regenerate the spectrogram dataset from the raw audio,
# then train the CNN on the freshly generated images.
if __name__ == '__main__':
    process_all_audio_files()
    train_cnn_model()