import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np


# Basic building block for the ResNet-style CNN
# Uses two convolutional layers with batch normalization
class BasicBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # first conv layer with specified stride
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, 
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # second conv layer always has stride 1
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # downsample is used when dimensions change
        self.downsample = downsample

    def forward(self, x):
        # save input for skip connection
        identity = x
        # pass through first conv + batchnorm + relu
        out = F.relu(self.bn1(self.conv1(x)))
        # pass through second conv + batchnorm
        out = self.bn2(self.conv2(out))
        
        # apply downsample if needed to match dimensions
        if self.downsample is not None:
            identity = self.downsample(x)
        
        # add skip connection and apply relu
        out += identity
        out = F.relu(out)
        return out
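

# Quick shape check for BasicBlock (an illustrative sketch, not part of the
# original module): a stride-2 block halves the spatial dims, so the skip
# path needs a matching 1x1 downsample.
#   block = BasicBlock(64, 128, stride=2,
#                      downsample=nn.Sequential(
#                          nn.Conv2d(64, 128, kernel_size=1, stride=2, bias=False),
#                          nn.BatchNorm2d(128)))
#   out = block(torch.randn(1, 64, 32, 32))  # -> torch.Size([1, 128, 16, 16])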


# Main CNN model for speech style classification
# Architecture based on ResNet with custom layer configuration
class SpeechStyleCNN(nn.Module):
    def __init__(self, num_classes=2):
        super(SpeechStyleCNN, self).__init__()
        
        # initial convolution layer - takes 3-channel input (the single-channel
        # mel spectrogram is replicated to 3 channels, matching a ResNet stem)
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        # stack of residual blocks with increasing channel sizes
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        self.layer4 = self._make_layer(256, 512, 2, stride=2)
        
        # global average pooling and final classification layer
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)
        
    # helper function to create a layer of residual blocks
    def _make_layer(self, in_channels, out_channels, blocks, stride=1):
        downsample = None
        # need downsample when stride changes or channels don't match
        if stride != 1 or in_channels != out_channels:
            downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        
        # create list of blocks
        layers = []
        # first block may have different stride
        layers.append(BasicBlock(in_channels, out_channels, stride, downsample))
        # remaining blocks have stride 1
        for _ in range(1, blocks):
            layers.append(BasicBlock(out_channels, out_channels))
        
        return nn.Sequential(*layers)
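
    # For example, _make_layer(64, 128, blocks=2, stride=2) builds (sketch):
    #   nn.Sequential(
    #       BasicBlock(64, 128, stride=2, downsample=<1x1 conv + BatchNorm>),
    #       BasicBlock(128, 128),
    #   )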
    
    def forward(self, x):
        # initial conv block
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)
        
        # pass through all residual layers
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        # global pooling and classification
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        
        return x
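

# Illustrative tensor flow for one 3 s clip at 16 kHz (hop_length=512 yields
# ~94 spectrogram frames; all sizes assume the defaults used below):
#   (1, 3, 128, 94) -> conv1 + maxpool -> (1, 64, 32, 24)
#   -> layer1..layer4 -> (1, 512, 4, 3) -> avgpool + fc -> (1, 2) logits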


# Main classifier class that combines CNN with acoustic feature analysis
class AudioClassifier:  
    # dictionary of available pre-trained models
    AVAILABLE_MODELS = {
        '3s_window': 'spectrogram_cnn_3s_window.pth',
    }
    
    @classmethod
    def get_model_path(cls, model_name='3s_window'):
        # returns the full path to a model file
        if model_name not in cls.AVAILABLE_MODELS:
            print(f"Model not found: {model_name}")
            return None
        return os.path.join(os.path.dirname(__file__), cls.AVAILABLE_MODELS[model_name])
    
    def __init__(self, model_path=None, device=None):
        # set up device - use GPU if available
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)
        
        # initialize the CNN model
        self.model = SpeechStyleCNN().to(self.device)
        
        # use default model path if not specified
        if model_path is None:
            model_path = os.path.join(os.path.dirname(__file__), 'spectrogram_cnn_3s_window.pth')
        
        # load pre-trained weights
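        # (note: with weights_only=False, torch.load unpickles arbitrary
        # objects, so only load checkpoint files from trusted sources)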
        try:
            print(f"Attempting to load model from: {model_path}")
            state_dict = torch.load(model_path, map_location=self.device, weights_only=False)
            self.model.load_state_dict(state_dict)
            print(f"✓ Successfully loaded trained model from: {model_path}")
        except FileNotFoundError:
            print(f"Could not find model file at {model_path}")
            print("Make sure the model file exists in the correct location")
        except Exception as e:
            print(f"Something went wrong loading the model: {e}")
        
        # set model to evaluation mode
        self.model.eval()
        
        # audio processing parameters
        self.sample_rate = 16000
        self.n_mels = 128
        self.n_fft = 2048
        self.hop_length = 512
        
    # extract mel spectrogram from audio file
    def extract_mel_spectrogram(self, audio_path, window_size=3.0):
        # load audio at target sample rate
        audio, sr = librosa.load(audio_path, sr=self.sample_rate)
        
        # calculate window size in samples
        window_samples = int(window_size * sr)
        
        # for longer audio, use multiple overlapping windows
        if len(audio) > window_samples * 1.5:
            hop_samples = window_samples // 2
            windows = []
            # extract overlapping windows
            for start in range(0, len(audio) - window_samples, hop_samples):
                window = audio[start:start + window_samples]
                windows.append(window)
            
            # always add a window covering the tail of the audio (this branch
            # guarantees len(audio) > window_samples, so no check is needed)
            windows.append(audio[-window_samples:])
            
            # compute mel spectrogram for each window
            mel_specs = []
            for window in windows[:5]:  # limit to 5 windows
                mel_spec = librosa.feature.melspectrogram(
                    y=window,
                    sr=sr,
                    n_mels=self.n_mels,
                    n_fft=self.n_fft,
                    hop_length=self.hop_length
                )
                mel_specs.append(mel_spec)
            
            # average the spectrograms
            mel_spec = np.mean(mel_specs, axis=0)
        else:
            # for short audio, pad or truncate
            if len(audio) < window_samples:
                audio = np.pad(audio, (0, window_samples - len(audio)), mode='constant')
            else:
                audio = audio[:window_samples]
            
            mel_spec = librosa.feature.melspectrogram(
                y=audio,
                sr=sr,
                n_mels=self.n_mels,
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )
        
        # convert to decibels
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        
        # normalize to 0-1 range, guarding against constant input (e.g. silence)
        value_range = mel_spec_db.max() - mel_spec_db.min()
        mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (value_range if value_range > 0 else 1.0)
        # stack into 3 channels for CNN input
        mel_spec_3ch = np.stack([mel_spec_norm, mel_spec_norm, mel_spec_norm], axis=0)
        
        return mel_spec_3ch
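
    # Shape note (illustrative, assuming the defaults above): a 3 s window at
    # 16 kHz with hop_length=512 yields roughly 94 frames, so the array
    # returned here has shape (3, 128, ~94).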
    
    # extract acoustic features from audio
    def extract_acoustic_features(self, audio_path):
        audio, sr = librosa.load(audio_path, sr=self.sample_rate)
        
        features = {}
        
        # tempo/rhythm estimation
        onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
        tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
        features['tempo'] = float(tempo)
        
        # pitch tracking
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)
        
        # calculate pitch statistics
        if pitch_values:
            features['pitch_mean'] = float(np.mean(pitch_values))
            features['pitch_std'] = float(np.std(pitch_values))
            features['pitch_range'] = float(np.max(pitch_values) - np.min(pitch_values))
        else:
            features['pitch_mean'] = 0.0
            features['pitch_std'] = 0.0
            features['pitch_range'] = 0.0
        
        # energy/loudness features
        rms = librosa.feature.rms(y=audio)[0]
        features['energy_mean'] = float(np.mean(rms))
        features['energy_std'] = float(np.std(rms))
        
        # zero crossing rate - higher for noisy/unvoiced (fricative-heavy) speech
        zcr = librosa.feature.zero_crossing_rate(audio)[0]
        features['zcr_mean'] = float(np.mean(zcr))
        features['zcr_std'] = float(np.std(zcr))
        
        # spectral centroid - brightness of sound
        spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
        features['spectral_centroid_mean'] = float(np.mean(spectral_centroids))
        features['spectral_centroid_std'] = float(np.std(spectral_centroids))
        
        return features
    
    # compute prosody scores from acoustic features
    # uses thresholds calibrated from training data
    def _compute_prosody_scores(self, features):
        individual_scores = {}

        # spectral centroid variability - best discriminating feature
        sc_std = features['spectral_centroid_std']
        if sc_std >= 1080:
            spectral_score = 0.9  # strongly indicates read
        elif sc_std >= 1040:
            spectral_score = 0.7
        elif sc_std >= 1000:
            spectral_score = 0.5
        elif sc_std >= 970:
            spectral_score = 0.3
        else:
            spectral_score = 0.1  # strongly spontaneous
        
        individual_scores['spectral_variability'] = {
            'score': spectral_score,
            'value': sc_std,
            'interpretation': (
                'high variability (read)' if spectral_score > 0.6
                else 'low variability (spontaneous)' if spectral_score < 0.4
                else 'moderate'
            )
        }
        
        # zero crossing rate - second best feature
        zcr = features['zcr_mean']
        if zcr >= 0.125:
            zcr_score = 0.9
        elif zcr >= 0.110:
            zcr_score = 0.7
        elif zcr >= 0.100:
            zcr_score = 0.5
        elif zcr >= 0.092:
            zcr_score = 0.3
        else:
            zcr_score = 0.1
        
        individual_scores['zcr_mean'] = {
            'score': zcr_score,
            'value': zcr,
            'interpretation': (
                'high ZCR (read)' if zcr_score > 0.6
                else 'low ZCR (spontaneous)' if zcr_score < 0.4
                else 'moderate'
            )
        }
        
        # energy level - read speech tends to be lower energy
        energy = features['energy_mean']
        if energy < 0.055:
            energy_score = 0.85
        elif energy < 0.062:
            energy_score = 0.65
        elif energy < 0.070:
            energy_score = 0.4
        else:
            energy_score = 0.15
        
        individual_scores['energy_level'] = {
            'score': energy_score,
            'value': energy,
            'interpretation': (
                'low energy (read)' if energy_score > 0.6
                else 'high energy (spontaneous)' if energy_score < 0.4
                else 'moderate'
            )
        }
        
        # pitch range feature
        pitch_range = features.get('pitch_range', 3828)
        if pitch_range < 3815:
            pitch_range_score = 0.7
        elif pitch_range < 3828:
            pitch_range_score = 0.5
        else:
            pitch_range_score = 0.3
        
        individual_scores['pitch_range'] = {
            'score': pitch_range_score,
            'value': pitch_range,
            'interpretation': (
                'narrow (read)' if pitch_range_score > 0.6
                else 'wide (spontaneous)' if pitch_range_score < 0.4
                else 'moderate'
            )
        }
        
        # energy variability
        energy_std = features.get('energy_std', 0.047)
        if energy_std < 0.042:
            energy_std_score = 0.7
        elif energy_std < 0.048:
            energy_std_score = 0.5
        else:
            energy_std_score = 0.3
        
        individual_scores['energy_std'] = {
            'score': energy_std_score,
            'value': energy_std,
            'interpretation': (
                'steady (read)' if energy_std_score > 0.6
                else 'variable (spontaneous)' if energy_std_score < 0.4
                else 'moderate'
            )
        }
        
        # zcr variability
        zcr_std = features.get('zcr_std', 0.111)
        if zcr_std >= 0.115:
            zcr_std_score = 0.7
        elif zcr_std >= 0.105:
            zcr_std_score = 0.5
        else:
            zcr_std_score = 0.3
        
        individual_scores['zcr_std'] = {
            'score': zcr_std_score,
            'value': zcr_std,
            'interpretation': (
                'variable ZCR (read)' if zcr_std_score > 0.6
                else 'steady ZCR (spontaneous)' if zcr_std_score < 0.4
                else 'moderate'
            )
        }
        
        # weights based on feature importance from analysis
        weights = {
            'spectral_variability': 0.30,
            'zcr_mean': 0.25,
            'energy_level': 0.20,
            'pitch_range': 0.10,
            'energy_std': 0.08,
            'zcr_std': 0.07,
        }
        
        # calculate weighted overall score
        overall_score = (
            spectral_score * weights['spectral_variability'] +
            zcr_score * weights['zcr_mean'] +
            energy_score * weights['energy_level'] +
            pitch_range_score * weights['pitch_range'] +
            energy_std_score * weights['energy_std'] +
            zcr_std_score * weights['zcr_std']
        )
        
        # determine classification based on thresholds
        if overall_score > 0.58:
            classification = 'read'
            confidence = 0.5 + (overall_score - 0.5) * 0.9
        elif overall_score < 0.42:
            classification = 'spontaneous'
            confidence = 0.5 + (0.5 - overall_score) * 0.9
        else:
            classification = 'read' if overall_score >= 0.50 else 'spontaneous'
            confidence = 0.5 + abs(overall_score - 0.5) * 0.6
        
        return {
            'classification': classification,
            'confidence': min(0.95, confidence),
            'overall_score': overall_score,
            'individual_scores': individual_scores
        }
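
    # Worked example (illustrative numbers, not from real data): with
    # spectral_score=0.9, zcr_score=0.7, energy_score=0.65 and the remaining
    # three scores at 0.5, the weighted sum is
    #   0.9*0.30 + 0.7*0.25 + 0.65*0.20 + 0.5*(0.10 + 0.08 + 0.07) = 0.70,
    # which clears the 0.58 threshold, so the clip is labeled 'read' with
    # confidence 0.5 + (0.70 - 0.5) * 0.9 = 0.68.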
    
    # main classification method - combines CNN and prosody analysis
    def classify(self, audio_path):
        # extract mel spectrogram for CNN
        mel_spec = self.extract_mel_spectrogram(audio_path)
        
        # convert to tensor and add batch dimension
        mel_tensor = torch.FloatTensor(mel_spec).unsqueeze(0).to(self.device)
        
        # get CNN predictions
        with torch.no_grad():
            logits = self.model(mel_tensor)
            probabilities = F.softmax(logits, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            cnn_confidence = probabilities[0, predicted_class].item()
            
            print(f"CNN Logits: {logits[0].cpu().numpy()}")
            print(f"CNN Probabilities: Class 0 (read)={probabilities[0, 0].item():.3f}, Class 1 (spontaneous)={probabilities[0, 1].item():.3f}")
            print(f"CNN Prediction: Class {predicted_class} ({['read', 'spontaneous'][predicted_class]}) with confidence {cnn_confidence:.3f}")
        
        # extract acoustic features for prosody analysis
        acoustic_features = self.extract_acoustic_features(audio_path)
        
        # compute prosody-based scores
        prosody_scores = self._compute_prosody_scores(acoustic_features)
        prosody_classification = prosody_scores['classification']
        prosody_confidence = prosody_scores['confidence']
        
        # map CNN class to label
        cnn_class_name = 'read' if predicted_class == 0 else 'spontaneous'
        read_prob = probabilities[0, 0].item()
        
        print(f"CNN classification: {cnn_class_name}")
        print(f"Prosody classification: {prosody_classification} (conf={prosody_confidence:.2f})")
        
        # combine CNN and prosody - prosody is more reliable
        final_classification = prosody_classification
        final_confidence = prosody_confidence
        
        # boost confidence when both methods agree; if they disagree but the
        # CNN is highly confident and prosody is not, defer to the CNN label
        if cnn_class_name == prosody_classification:
            final_confidence = min(0.95, prosody_confidence * 1.15)
        elif read_prob > 0.85 and cnn_class_name == 'read':
            if prosody_confidence < 0.65:
                final_classification = 'read'
                final_confidence = 0.55
        elif read_prob < 0.10 and cnn_class_name == 'spontaneous':
            if prosody_confidence < 0.65:
                final_classification = 'spontaneous'
                final_confidence = 0.55
        
        return {
            'classification': final_classification,
            'confidence': float(final_confidence),
            'cnn_classification': cnn_class_name,
            'cnn_confidence': float(cnn_confidence),
            'prosody_classification': prosody_classification,
            'prosody_confidence': float(prosody_confidence),
            'prosody_scores': prosody_scores['individual_scores'],
            'acoustic_features': acoustic_features,
            'interpretation': self._interpret_classification(
                final_classification, final_confidence, 
                cnn_class_name, cnn_confidence,
                prosody_classification, prosody_confidence,
                prosody_scores, acoustic_features
            )
        }
    
    # generate human-readable interpretation of classification
    def _interpret_classification(
        self, 
        final_class, 
        final_confidence,
        cnn_class,
        cnn_confidence,
        prosody_class,
        prosody_confidence,
        prosody_scores,
        features
    ):   
        interpretation = f"## Classification: **{final_class.upper()}** SPEECH\n\n"
        interpretation += f"**Confidence:** {final_confidence*100:.1f}%\n\n"
        
        if final_class == 'read':
            interpretation += "**Description:** The speech exhibits characteristics of read or scripted content. "
            interpretation += "The audio shows consistent prosodic patterns typical of someone reading from prepared text, "
            interpretation += "with steady pacing, uniform intonation, and regular energy levels.\n\n"
        else:
            interpretation += "**Description:** The speech exhibits characteristics of spontaneous speaking. "
            interpretation += "The audio shows natural prosodic variation typical of extemporaneous speech, "
            interpretation += "with variable pacing, dynamic intonation, and natural energy fluctuations.\n\n"
        
        return interpretation


# test code - runs when script is executed directly
if __name__ == "__main__":
    classifier = AudioClassifier()
    print("\nAvailable pre-trained models:")
    for name, filename in AudioClassifier.AVAILABLE_MODELS.items():
        print(f"  - {name}: {filename}")
    
    print("\nModel architecture:")
    print(classifier.model)
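
    # Hedged usage sketch: classify a local recording. The path below is a
    # placeholder (no such file ships with this module); point it at a real
    # audio file before uncommenting.
    # result = classifier.classify("example.wav")
    # print(result['classification'], f"{result['confidence']:.2f}")
    # print(result['interpretation'])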