AIZEN-007 committed
Commit f491cd5 · verified · 1 Parent(s): 783be15

Create main2.py

Files changed (1): main2.py +739 -0
main2.py ADDED
import os
import tempfile
import numpy as np
import pandas as pd
import torch
import torchaudio
import librosa
import matplotlib.pyplot as plt
import csv
from typing import List, Dict, Tuple, Optional
from scipy.stats import kurtosis, skew
import concurrent.futures
import multiprocessing
from functools import partial
import time
import threading
from queue import Queue
from dotenv import load_dotenv
from groq import Groq

# Import required models
from pyannote.audio import Pipeline
import whisper
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from torch_vggish_yamnet import yamnet
from torch_vggish_yamnet.input_proc import WaveformToInput
import warnings
warnings.filterwarnings("ignore")

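# Dependency note: the imports above imply roughly these packages (pip names are
# my best guess; this commit does not pin them):
#   pip install torch torchaudio librosa numpy pandas matplotlib scipy
#   pip install pyannote.audio openai-whisper transformers torch-vggish-yamnet
#   pip install python-dotenv groq
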
class UnifiedAudioAnalyzer:
    """
    Unified Audio Analysis System combining:
    1. Speaker Diarization + Transcription
    2. Audio Event Detection (YAMNet)
    3. Emotion Recognition + Paralinguistic Features

    Enhanced with parallel processing for faster execution
    """

    def __init__(self, enable_parallel_processing=True, max_workers=None):
        """Initialize all models and components"""
        print("🔄 Initializing Unified Audio Analyzer...")

        # Configure device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Parallel processing settings
        self.enable_parallel_processing = enable_parallel_processing
        self.max_workers = max_workers or max(1, multiprocessing.cpu_count() - 1)
        print(f"Parallel processing: {'Enabled' if enable_parallel_processing else 'Disabled'}")
        if enable_parallel_processing:
            print(f"Max workers: {self.max_workers}")

        # Initialize models
        self._load_diarization_models()
        self._load_emotion_models()
        self._load_event_detection_models()
        self._load_class_names()

        print("✅ All models loaded successfully!")

    def _load_diarization_models(self):
        """Load speaker diarization and transcription models"""
        print("Loading speaker diarization and transcription models...")

        # Load pyannote diarization pipeline
        try:
            self.diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1"
                # Uncomment and add your token: use_auth_token="YOUR_HUGGINGFACE_TOKEN"
            )
            if torch.cuda.is_available():
                self.diarization_pipeline = self.diarization_pipeline.to(self.device)
        except Exception as e:
            print(f"Warning: Could not load diarization model: {e}")
            self.diarization_pipeline = None

        # Load Whisper transcription model
        try:
            self.whisper_model = whisper.load_model("base")
        except Exception as e:
            print(f"Warning: Could not load Whisper model: {e}")
            self.whisper_model = None

    def _load_emotion_models(self):
        """Load emotion recognition models"""
        print("Loading emotion recognition models...")

        try:
            self.emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(
                "Dpngtm/wav2vec2-emotion-recognition"
            )
            self.emotion_processor = Wav2Vec2Processor.from_pretrained(
                "Dpngtm/wav2vec2-emotion-recognition"
            )
            self.emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
        except Exception as e:
            print(f"Warning: Could not load emotion model: {e}")
            self.emotion_model = None

    def _load_event_detection_models(self):
        """Load YAMNet for audio event detection"""
        print("Loading audio event detection models...")

        try:
            self.yamnet_model = yamnet.yamnet(pretrained=True)
            self.yamnet_model.eval()
            self.yamnet_converter = WaveformToInput()
        except Exception as e:
            print(f"Warning: Could not load YAMNet model: {e}")
            self.yamnet_model = None

    def _load_class_names(self):
        """Load AudioSet class names for YAMNet from CSV"""
        csv_path = "yamnet_class_map.csv"
        self.audioset_classes = []
        try:
            with open(csv_path, "r") as f:
                reader = csv.reader(f)
                next(reader)  # skip header
                for row in reader:
                    self.audioset_classes.append(row[2])  # display_name
        except Exception as e:
            print(f"Warning: Could not load class names from {csv_path}: {e}")
            # Fallback to common AudioSet classes
            self.audioset_classes = [
                "Speech", "Male speech, man speaking", "Female speech, woman speaking",
                "Child speech, kid speaking", "Conversation", "Narration, monologue",
                "Babbling", "Speech synthesizer", "Shout", "Bellow", "Whoop", "Yell",
                "Children shouting", "Screaming", "Whispering", "Laughter", "Baby laughter",
                "Giggle", "Snicker", "Belly laugh", "Chuckle, chortle", "Crying, sobbing",
                "Baby cry, infant cry", "Whimper", "Wail, moan", "Sigh", "Singing",
                "Choir", "Yodeling", "Chant", "Mantra", "Male singing", "Female singing",
                "Child singing", "Synthetic singing", "Rapping", "Humming", "Music",
                "Musical instrument", "Piano", "Guitar", "Drum", "Orchestra", "Pop music",
                "Rock music", "Jazz", "Classical music", "Electronic music", "Animal",
                "Dog", "Cat", "Bird", "Insect", "Vehicle", "Car", "Motorcycle", "Train",
                "Aircraft", "Helicopter", "Wind", "Rain", "Thunder", "Water", "Fire",
                "Applause", "Crowd", "Footsteps", "Door", "Bell", "Alarm", "Clock"
            ]

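    # yamnet_class_map.csv is not created by this commit. The canonical copy ships
    # with TensorFlow's YAMNet (research/audioset/yamnet/yamnet_class_map.csv in the
    # tensorflow/models repo); one way to fetch it, assuming that path still holds:
    #   curl -O https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv
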
    def _transcribe_segment_parallel(self, segment_data):
        """Helper function for parallel transcription of segments"""
        segment, sample_rate, speaker, start_time, end_time, whisper_model = segment_data

        try:
            # Create temporary file for this segment
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_filename = temp_file.name
            # Handle is closed above before torchaudio writes to the path
            torchaudio.save(temp_filename, segment, sample_rate)

            # Transcribe segment
            try:
                transcription_result = whisper_model.transcribe(
                    temp_filename,
                    language="en",
                    temperature=0,
                    no_speech_threshold=0.6
                )
                segment_text = transcription_result["text"].strip()

                if segment_text:
                    result = {
                        "speaker": speaker,
                        "start": round(start_time, 2),
                        "end": round(end_time, 2),
                        "duration": round(end_time - start_time, 2),
                        "text": segment_text,
                        # openai-whisper's result dict has no "language_probability"
                        # key, so this .get() always falls back to 0.0
                        "confidence": transcription_result.get("language_probability", 0.0)
                    }
                else:
                    result = None

            except Exception as e:
                print(f"⚠️ Error transcribing segment: {e}")
                result = None

            finally:
                # Clean up temp file
                try:
                    os.unlink(temp_filename)
                except OSError:
                    pass

            return result

        except Exception as e:
            print(f"⚠️ Error in parallel transcription: {e}")
            return None

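    # NOTE: _transcribe_segment_parallel is defined but never called in this file.
    # A minimal sketch of how it could be wired in (hypothetical; `segment_tasks`
    # would be tuples built from the diarization turns, matching the unpacking above):
    #
    #   with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as pool:
    #       results = [r for r in pool.map(self._transcribe_segment_parallel, segment_tasks)
    #                  if r is not None]
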
    def transcribe_with_diarization(self, audio_file: str, min_segment_duration: float = 1.0) -> List[Dict]:
        """Perform speaker diarization and transcription (aligned with main.py logic)"""
        if self.diarization_pipeline is None or self.whisper_model is None:
            print("❌ Diarization or transcription models not available")
            return []

        print("🎯 Performing speaker diarization and transcription...")

        # Perform diarization (note: the speaker count is hardcoded to 2 here)
        diarization_result = self.diarization_pipeline(audio_file, num_speakers=2)

        # Load audio
        waveform, sample_rate = torchaudio.load(audio_file)
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
            sample_rate = 16000

        results = []
        temp_files = []

        try:
            for turn, _, speaker in diarization_result.itertracks(yield_label=True):
                if turn.end - turn.start < min_segment_duration:
                    continue

                # Extract segment
                start_sample = int(turn.start * sample_rate)
                end_sample = int(turn.end * sample_rate)
                segment = waveform[:, start_sample:end_sample]

                # Create temporary file for transcription
                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                    temp_filename = temp_file.name
                    temp_files.append(temp_filename)
                # Handle is closed above before torchaudio writes to the path
                torchaudio.save(temp_filename, segment, sample_rate)

                # Transcribe
                try:
                    transcription_result = self.whisper_model.transcribe(
                        temp_filename,
                        language="en",
                        temperature=0,
                        no_speech_threshold=0.6
                    )
                    segment_text = transcription_result["text"].strip()

                    if segment_text:
                        results.append({
                            "speaker": speaker,
                            "start": round(turn.start, 2),
                            "end": round(turn.end, 2),
                            "duration": round(turn.end - turn.start, 2),
                            "text": segment_text,
                            # see note in _transcribe_segment_parallel: this key is
                            # absent from Whisper's output, so the default applies
                            "confidence": transcription_result.get("language_probability", 0.0)
                        })
                except Exception as e:
                    print(f"⚠️ Error transcribing segment: {e}")
                    continue

        finally:
            # Cleanup temp files
            for temp_file in temp_files:
                try:
                    os.unlink(temp_file)
                except OSError:
                    pass

        return results

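    # Whisper's transcribe() also accepts an in-memory array, which would avoid the
    # temp-file round trip above (a sketch, assuming mono float32 audio at 16 kHz,
    # which is what openai-whisper expects):
    #
    #   audio_np = segment.squeeze(0).numpy().astype("float32")
    #   result = self.whisper_model.transcribe(audio_np, language="en", temperature=0)
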
    def detect_audio_events(self, audio_file: str, top_k: int = 10) -> Dict:
        """Detect audio events using YAMNet"""
        if self.yamnet_model is None:
            print("❌ YAMNet model not available")
            return {}

        print("🔊 Detecting audio events...")

        try:
            # Load and preprocess audio
            waveform, sr = torchaudio.load(audio_file)
            if sr != 16000:
                waveform = torchaudio.functional.resample(waveform, sr, 16000)

            # Process through YAMNet
            inputs = self.yamnet_converter(waveform, 16000)

            with torch.no_grad():
                embeddings, logits = self.yamnet_model(inputs)
                mean_logits = logits.mean(dim=0)
                probs = torch.softmax(mean_logits, dim=-1)
                top_probs, top_idx = torch.topk(probs, top_k)

            # Format results
            events = []
            for i in range(top_k):
                idx = top_idx[i].item()
                prob = top_probs[i].item()
                if idx < len(self.audioset_classes):
                    label = self.audioset_classes[idx]
                else:
                    label = f"Unknown_Class_{idx}"

                events.append({
                    "event": label,
                    "class_id": idx,
                    "probability": prob
                })

            return {
                "top_events": events,
                "total_classes": len(self.audioset_classes)
            }

        except Exception as e:
            print(f"⚠️ Error in event detection: {e}")
            return {}

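    # The logits above are per YAMNet frame (roughly 0.96 s windows hopped every
    # ~0.48 s); averaging them yields clip-level scores. If event timing mattered,
    # a per-frame view could look like this (sketch):
    #
    #   frame_probs = torch.softmax(logits, dim=-1)   # [n_frames, n_classes]
    #   dominant = frame_probs.argmax(dim=-1)         # top class index per frame
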
    def _extract_feature_chunk(self, audio_chunk, sr, feature_type):
        """Helper function for parallel feature extraction"""
        try:
            if feature_type == "mfcc":
                mfcc = librosa.feature.mfcc(y=audio_chunk, sr=sr, n_mfcc=13)
                features = {}
                for i in range(13):
                    features[f'mfcc_{i+1}_mean'] = float(np.mean(mfcc[i]))
                    features[f'mfcc_{i+1}_std'] = float(np.std(mfcc[i]))
                return features

            elif feature_type == "chroma":
                chroma = librosa.feature.chroma_stft(y=audio_chunk, sr=sr)
                features = {}
                for i in range(12):
                    features[f'chroma_{i+1}_mean'] = float(np.mean(chroma[i]))
                return features

            elif feature_type == "spectral":
                features = {}
                features['spectral_centroid_mean'] = float(np.mean(librosa.feature.spectral_centroid(y=audio_chunk, sr=sr)[0]))
                features['spectral_rolloff_mean'] = float(np.mean(librosa.feature.spectral_rolloff(y=audio_chunk, sr=sr)[0]))
                return features

            elif feature_type == "basic":
                features = {}
                features['rms_energy'] = float(np.mean(librosa.feature.rms(y=audio_chunk)[0]))
                features['zero_crossing_rate'] = float(np.mean(librosa.feature.zero_crossing_rate(audio_chunk)[0]))
                return features

            # Unknown feature type: return an empty dict so callers can update() safely
            return {}

        except Exception as e:
            print(f"⚠️ Error extracting {feature_type} features: {e}")
            return {}

    def extract_paralinguistic_features(self, audio_data, sr):
        """Extract comprehensive paralinguistic features"""
        print("🎵 Extracting paralinguistic features...")

        features = {}

        # Basic properties
        features['duration'] = len(audio_data) / sr
        features['sample_rate'] = sr

        if self.enable_parallel_processing:
            print("🚀 Using parallel feature extraction...")

            # Prepare feature extraction tasks
            feature_tasks = [
                ("mfcc", audio_data, sr),
                ("chroma", audio_data, sr),
                ("spectral", audio_data, sr),
                ("basic", audio_data, sr)
            ]

            # Execute feature extraction in parallel; threads help here because
            # librosa/NumPy spend most of their time in native code that releases the GIL
            with concurrent.futures.ThreadPoolExecutor(max_workers=min(4, self.max_workers)) as executor:
                future_to_feature = {
                    executor.submit(self._extract_feature_chunk, audio_chunk, sr, feature_type): feature_type
                    for feature_type, audio_chunk, sr in feature_tasks
                }

                for future in concurrent.futures.as_completed(future_to_feature):
                    feature_result = future.result()
                    features.update(feature_result)
        else:
            # Sequential feature extraction (original logic)
            # Energy features
            features['rms_energy'] = float(np.mean(librosa.feature.rms(y=audio_data)[0]))
            features['zero_crossing_rate'] = float(np.mean(librosa.feature.zero_crossing_rate(audio_data)[0]))

            # MFCC features
            mfcc = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
            for i in range(13):
                features[f'mfcc_{i+1}_mean'] = float(np.mean(mfcc[i]))
                features[f'mfcc_{i+1}_std'] = float(np.std(mfcc[i]))

            # Spectral features
            features['spectral_centroid_mean'] = float(np.mean(librosa.feature.spectral_centroid(y=audio_data, sr=sr)[0]))
            features['spectral_rolloff_mean'] = float(np.mean(librosa.feature.spectral_rolloff(y=audio_data, sr=sr)[0]))

            # Chroma features
            chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)
            for i in range(12):
                features[f'chroma_{i+1}_mean'] = float(np.mean(chroma[i]))

        # Pitch features (kept sequential due to complexity)
        try:
            pitches, magnitudes = librosa.piptrack(y=audio_data, sr=sr, threshold=0.1)
            pitch_values = []
            for t in range(pitches.shape[1]):
                index = magnitudes[:, t].argmax()
                pitch = pitches[index, t]
                if pitch > 0:
                    pitch_values.append(pitch)

            if pitch_values:
                features['pitch_mean'] = float(np.mean(pitch_values))
                features['pitch_std'] = float(np.std(pitch_values))
                features['pitch_min'] = float(np.min(pitch_values))
                features['pitch_max'] = float(np.max(pitch_values))
            else:
                features.update({'pitch_mean': 0.0, 'pitch_std': 0.0, 'pitch_min': 0.0, 'pitch_max': 0.0})
        except Exception:
            features.update({'pitch_mean': 0.0, 'pitch_std': 0.0, 'pitch_min': 0.0, 'pitch_max': 0.0})

        # Tempo
        try:
            tempo, _ = librosa.beat.beat_track(y=audio_data, sr=sr)
            if isinstance(tempo, np.ndarray):
                features['tempo'] = float(tempo.item() if tempo.size == 1 else tempo[0])
            else:
                features['tempo'] = float(tempo)
        except Exception:
            features['tempo'] = 0.0

        return features

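    # librosa.pyin is a common, more robust alternative to the piptrack loop above
    # (a sketch; the fmin/fmax bounds are typical speech values, not from this commit):
    #
    #   f0, voiced_flag, voiced_prob = librosa.pyin(audio_data, fmin=65, fmax=400, sr=sr)
    #   pitch_values = f0[~np.isnan(f0)]
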
    def predict_emotion(self, audio_data, sr):
        """Predict emotion using transformer model"""
        if self.emotion_model is None:
            return None

        print("😊 Predicting emotions...")

        try:
            # Resample to 16kHz if needed
            if sr != 16000:
                audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)

            # Process through model
            inputs = self.emotion_processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)

            with torch.no_grad():
                outputs = self.emotion_model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Get emotion probabilities
            emotion_probs = {}
            for i, emotion in enumerate(self.emotion_labels):
                emotion_probs[emotion] = predictions[0][i].item()

            predicted_emotion = self.emotion_labels[predictions.argmax().item()]
            confidence = predictions.max().item()

            return {
                'predicted_emotion': predicted_emotion,
                'confidence': confidence,
                'all_emotions': emotion_probs
            }

        except Exception as e:
            print(f"⚠️ Error in emotion prediction: {e}")
            return None

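    # On long recordings a single clip-level prediction can wash out short-lived
    # emotions; a common workaround (sketch, the window length is an assumption)
    # is to score fixed windows and aggregate:
    #
    #   win = 16000 * 5  # 5-second windows at 16 kHz
    #   chunks = [audio_data[i:i + win] for i in range(0, len(audio_data), win)]
    #   per_chunk = [self.predict_emotion(c, 16000) for c in chunks if len(c) > 16000]
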
    def analyze_complete_audio(self, audio_file: str) -> Dict:
        """Perform complete unified audio analysis with parallel processing"""
        if not os.path.exists(audio_file):
            print(f"❌ Audio file not found: {audio_file}")
            return {}

        print(f"\n🚀 Starting complete analysis of: {audio_file}")
        print("="*60)

        start_time = time.time()

        # Load audio for paralinguistic analysis
        try:
            audio_data, sr = librosa.load(audio_file, sr=22050)
            audio_data, _ = librosa.effects.trim(audio_data, top_db=20)
            audio_data = librosa.util.normalize(audio_data)
        except Exception as e:
            print(f"❌ Error loading audio: {e}")
            return {}

        if self.enable_parallel_processing:
            print("🚀 Running analysis components in parallel...")

            # Create a queue for results
            results_queue = Queue()

            # Define analysis functions
            def run_diarization():
                result = self.transcribe_with_diarization(audio_file)
                results_queue.put(('diarization', result))

            def run_event_detection():
                result = self.detect_audio_events(audio_file)
                results_queue.put(('events', result))

            def run_feature_extraction():
                result = self.extract_paralinguistic_features(audio_data, sr)
                results_queue.put(('features', result))

            def run_emotion_prediction():
                result = self.predict_emotion(audio_data, sr)
                results_queue.put(('emotion', result))

            # Start threads for parallel execution (the heavy lifting happens in
            # torch/librosa native code, which releases the GIL)
            threads = [
                threading.Thread(target=run_diarization),
                threading.Thread(target=run_event_detection),
                threading.Thread(target=run_feature_extraction),
                threading.Thread(target=run_emotion_prediction)
            ]

            # Start all threads
            for thread in threads:
                thread.start()

            # Wait for all threads to complete
            for thread in threads:
                thread.join()

            # Collect results
            analysis_components = {}
            while not results_queue.empty():
                component, result = results_queue.get()
                analysis_components[component] = result

            # Assign results
            diarization_results = analysis_components.get('diarization', [])
            event_results = analysis_components.get('events', {})
            paralinguistic_features = analysis_components.get('features', {})
            emotion_results = analysis_components.get('emotion', None)

        else:
            # Sequential processing (original logic)
            # 1. Speaker Diarization + Transcription
            diarization_results = self.transcribe_with_diarization(audio_file)

            # 2. Audio Event Detection
            event_results = self.detect_audio_events(audio_file)

            # 3. Paralinguistic Features
            paralinguistic_features = self.extract_paralinguistic_features(audio_data, sr)

            # 4. Emotion Recognition
            emotion_results = self.predict_emotion(audio_data, sr)

        processing_time = time.time() - start_time
        print(f"⏱️ Total processing time: {processing_time:.2f} seconds")

        # Combine all results
        complete_analysis = {
            'file_info': {
                'filename': os.path.basename(audio_file),
                'filepath': audio_file,
                'duration': paralinguistic_features.get('duration', 0),
                'sample_rate': paralinguistic_features.get('sample_rate', 0),
                'processing_time': processing_time
            },
            'diarization_transcription': diarization_results,
            'audio_events': event_results,
            'paralinguistic_features': paralinguistic_features,
            'emotion_analysis': emotion_results
        }

        return complete_analysis

    def print_analysis_summary(self, analysis_results: Dict):
        """Print formatted analysis summary"""
        if not analysis_results:
            print("❌ No analysis results to display")
            return

        file_info = analysis_results.get('file_info', {})
        diarization = analysis_results.get('diarization_transcription', [])
        events = analysis_results.get('audio_events', {})
        emotion = analysis_results.get('emotion_analysis', {})

        print(f"\n{'='*80}")
        print("🎯 UNIFIED AUDIO ANALYSIS RESULTS")
        print(f"{'='*80}")

        # File Information
        print(f"📁 File: {file_info.get('filename', 'Unknown')}")
        print(f"⏱️ Duration: {file_info.get('duration', 0):.2f} seconds")
        print(f"🔊 Sample Rate: {file_info.get('sample_rate', 0)} Hz")
        print(f"⚡ Processing Time: {file_info.get('processing_time', 0):.2f} seconds")

        # 1. Speaker Diarization Results
        print("\n🎤 SPEAKER DIARIZATION & TRANSCRIPTION")
        print("-" * 50)
        if diarization:
            speakers = set(seg['speaker'] for seg in diarization)
            print(f"Speakers detected: {len(speakers)}")
            print(f"Total segments: {len(diarization)}")

            for i, segment in enumerate(diarization, 1):
                print(f"{i}. {segment['speaker']} [{segment['start']:.1f}s-{segment['end']:.1f}s]: {segment['text'][:80]}{'...' if len(segment['text']) > 80 else ''}")
        else:
            print("No diarization results available")

        # 2. Audio Event Detection
        print("\n🔊 AUDIO EVENT DETECTION (Top 10)")
        print("-" * 50)
        top_events = events.get('top_events', [])
        if top_events:
            for i, event in enumerate(top_events[:10], 1):
                print(f"{i:2d}. {event['event']:<30} | Probability: {event['probability']:.4f}")
        else:
            print("No audio events detected")

        # 3. Emotion Analysis
        print("\n😊 EMOTION ANALYSIS")
        print("-" * 30)
        if emotion:
            print(f"Predicted Emotion: {emotion['predicted_emotion']} (Confidence: {emotion['confidence']:.3f})")
            print("\nAll Emotion Probabilities:")
            for emo, prob in emotion['all_emotions'].items():
                print(f"  {emo.capitalize():<12}: {prob:.3f}")
        else:
            print("No emotion analysis available")

        # 4. Key Paralinguistic Features
        features = analysis_results.get('paralinguistic_features', {})
        if features:
            print("\n🎵 KEY PARALINGUISTIC FEATURES")
            print("-" * 40)
            print(f"RMS Energy: {features.get('rms_energy', 0):.4f}")
            print(f"Pitch Mean: {features.get('pitch_mean', 0):.2f} Hz")
            print(f"Spectral Centroid: {features.get('spectral_centroid_mean', 0):.2f} Hz")
            print(f"Tempo: {features.get('tempo', 0):.2f} BPM")
            print(f"Zero Crossing Rate: {features.get('zero_crossing_rate', 0):.4f}")

    def save_results_to_csv(self, analysis_results: Dict, output_prefix: str = "unified_analysis"):
        """Save analysis results to CSV files"""
        if not analysis_results:
            print("❌ No results to save")
            return

        # Save diarization results
        diarization = analysis_results.get('diarization_transcription', [])
        if diarization:
            df_diarization = pd.DataFrame(diarization)
            diarization_file = f"{output_prefix}_diarization.csv"
            df_diarization.to_csv(diarization_file, index=False)
            print(f"💾 Diarization results saved to: {diarization_file}")

        # Save audio events
        events = analysis_results.get('audio_events', {}).get('top_events', [])
        if events:
            df_events = pd.DataFrame(events)
            events_file = f"{output_prefix}_audio_events.csv"
            df_events.to_csv(events_file, index=False)
            print(f"💾 Audio events saved to: {events_file}")

        # Save paralinguistic features
        features = analysis_results.get('paralinguistic_features', {})
        if features:
            df_features = pd.DataFrame([features])
            features_file = f"{output_prefix}_features.csv"
            df_features.to_csv(features_file, index=False)
            print(f"💾 Features saved to: {features_file}")

        # Save emotion analysis
        emotion = analysis_results.get('emotion_analysis', {})
        if emotion:
            df_emotion = pd.DataFrame([emotion])
            emotion_file = f"{output_prefix}_emotion.csv"
            df_emotion.to_csv(emotion_file, index=False)
            print(f"💾 Emotion analysis saved to: {emotion_file}")

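    # When the nested structure should survive intact, a single JSON dump is a
    # compact alternative (sketch; default=str papers over non-serializable values):
    #
    #   import json
    #   with open(f"{output_prefix}.json", "w") as f:
    #       json.dump(analysis_results, f, indent=2, default=str)
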
def summarize_audio_analysis_with_llm(analysis_results: dict) -> str:
    """
    Send all analysis results to a Groq LLM (gpt-oss-20b) and get a summary
    describing relationships between diarization, events, emotion, and features.
    Requires GROQ_API_KEY in environment.
    """
    # Prepare the prompt
    prompt = (
        "You are an expert audio scene interpreter. Given the structured audio analysis results, "
        "summarize what is happening in plain, natural language, as if explaining the situation to someone. "
        "Avoid technical terms, metrics, or probabilities. Instead, combine the speaker's words, background "
        "sounds, emotions and other paralinguistic features to infer the most likely real-world context. Keep it short and clear.\n\n"
        "Sample input: a recording of a person on a call while arriving at an airport (with background noise of airplanes, announcements, and crowd chatter). Sample output: the vehicle sounds suggest the person is on a highway, the airplane sounds indicate a nearby airport, and the announcements carry flight schedule information, which means the person has reached the boarding area or waiting hall.\n\n"
        f"Audio Analysis Results:\n{analysis_results}\n\n"
        "Plain Summary:"
    )

    # Load environment variables
    load_dotenv()
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise ValueError("GROQ_API_KEY environment variable not set.")

    # Initialize Groq client
    client = Groq(api_key=api_key)

    # Make the API call
    response = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[
            {"role": "system", "content": "You are an expert audio analyst."},
            {"role": "user", "content": prompt},
        ],
    )

    # Extract summary
    summary = response.choices[0].message.content.strip()
    return summary

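# The summarizer reads GROQ_API_KEY via python-dotenv, so a .env file next to this
# script is enough (value shown is a placeholder):
#
#   GROQ_API_KEY=your_groq_api_key_here
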
def main():
    """Main function demonstrating usage"""
    # Initialize analyzer with parallel processing enabled
    analyzer = UnifiedAudioAnalyzer(enable_parallel_processing=True, max_workers=None)

    # Specify input audio file
    audio_file = "dataset/flight/15.wav"  # Update with your audio file path

    if os.path.exists(audio_file):
        # Perform complete analysis
        results = analyzer.analyze_complete_audio(audio_file)

        # Print summary
        analyzer.print_analysis_summary(results)

        # Save results to CSV files
        # analyzer.save_results_to_csv(results, "my_audio_analysis")

        print("\n✅ Analysis complete! Uncomment save_results_to_csv above to write CSV files.")
        summary = summarize_audio_analysis_with_llm(results)
        print("\n=== LLM Summary of Audio Analysis ===")
        print(summary)
    else:
        print(f"❌ Audio file not found: {audio_file}")
        print("Please update the audio_file path to point to your audio file.")

if __name__ == "__main__":
    main()
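
# Usage: python main2.py
# Assumes an audio file at dataset/flight/15.wav (edit `audio_file` in main()),
# yamnet_class_map.csv alongside this script, and GROQ_API_KEY in the environment.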