File size: 26,873 Bytes
fec9168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
#!/usr/bin/env python3
"""
ESC-50 Preprocessing Script for Duration Task

This script processes all ESC-50 audio clips to:
1. Apply amplitude-based filtering to detect actual sound regions
2. Calculate effective duration (portion containing actual sound)
3. Save trimmed audio files (with silence removed)
4. Generate a CSV with all metadata including effective durations

Usage:
    python preprocess_esc50.py --config config.yaml
    python preprocess_esc50.py --config config.yaml --threshold-db -40 --min-sound-ms 50
"""

import argparse
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm

# Put the directory that contains this script at the front of sys.path
# (Path(__file__).parent is the script's own directory, not its parent);
# presumably this lets the local `utils` package imported below resolve
# regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

from utils.logger import setup_logger

# Module-level logger used for the debug/info/error output in this file.
logger = setup_logger(__name__)


def get_amplitude_array(audio: AudioSegment) -> np.ndarray:
    """
    Convert AudioSegment to a mono numpy array of normalized amplitudes.

    Multi-channel audio (stereo or more) is down-mixed by averaging all
    channels of each frame, so the result always has one value per frame.

    Args:
        audio: Input audio segment

    Returns:
        1-D float array of amplitude values, scaled by the full-scale value
        of the segment's sample width (roughly -1 to 1)
    """
    samples = np.array(audio.get_array_of_samples())

    # Samples of multi-channel audio are interleaved frame by frame, so a
    # (frames, channels) reshape followed by a row mean gives the mono mix.
    # Using audio.channels (instead of a hard-coded 2) also covers segments
    # with more than two channels.
    if audio.channels > 1:
        samples = samples.reshape((-1, audio.channels)).mean(axis=1)

    # Full-scale magnitude for signed PCM of this width (e.g. 32768 for 16-bit)
    max_val = float(2 ** (audio.sample_width * 8 - 1))
    samples = samples / max_val

    return samples


def compute_rms_envelope(samples: np.ndarray, frame_size_ms: int, hop_size_ms: int, 
                         sample_rate: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute RMS envelope of audio signal.

    The signal is cut into frames of frame_size_ms that start every
    hop_size_ms; only frames that fit entirely inside the signal are used.

    Args:
        samples: Audio samples as numpy array (any numeric sequence is accepted)
        frame_size_ms: Frame size in milliseconds
        hop_size_ms: Hop size in milliseconds
        sample_rate: Audio sample rate

    Returns:
        Tuple of (rms_values, time_stamps_ms); time stamps are the start time
        of each frame. Both arrays are empty when the signal is shorter than
        one frame.
    """
    # Work in float so squaring integer PCM data cannot overflow
    samples = np.asarray(samples, dtype=float)

    # Durations shorter than one sample truncate to 0, which would make
    # range() raise (hop) or produce NaN frames (size); clamp to one sample.
    frame_size = max(1, int(sample_rate * frame_size_ms / 1000))
    hop_size = max(1, int(sample_rate * hop_size_ms / 1000))

    frame_starts = range(0, len(samples) - frame_size + 1, hop_size)

    rms_values = [
        np.sqrt(np.mean(samples[start:start + frame_size] ** 2))
        for start in frame_starts
    ]
    # Sample index -> milliseconds
    time_stamps = [start / sample_rate * 1000 for start in frame_starts]

    return np.array(rms_values), np.array(time_stamps)


def rms_to_db(rms: np.ndarray, reference: float = 1.0) -> np.ndarray:
    """
    Express RMS levels in decibels relative to a reference level.

    Args:
        rms: RMS values
        reference: Level that maps to 0 dB (default 1.0 for normalized audio)

    Returns:
        dB values, computed as 20 * log10(rms / reference)
    """
    # Floor the input at a tiny positive value so silent frames give a
    # large negative dB figure instead of -inf from log10(0)
    floored = np.maximum(rms, 1e-10)
    ratio = floored / reference
    return 20 * np.log10(ratio)


def detect_sound_regions(
    audio: AudioSegment,
    threshold_db: float = -40.0,
    min_sound_duration_ms: int = 50,
    frame_size_ms: int = 20,
    hop_size_ms: int = 10,
    merge_gap_ms: int = 100,
    threshold_strategy: str = 'noise_floor',
    noise_floor_percentile: float = 10.0,
    noise_floor_delta_db: float = 15.0
) -> List[Tuple[int, int]]:
    """
    Find the time spans of an audio clip whose RMS level exceeds a threshold.

    The clip's RMS envelope is converted to dB and compared against a
    per-clip threshold chosen by one of two strategies:
    - 'noise_floor': threshold = percentile(db_values, p) + delta_db,
      capped at 1 dB below the loudest frame
    - any other value (intended: 'peak_relative'):
      threshold = peak_db + threshold_db

    Runs of consecutive frames above the threshold become regions. Regions
    shorter than min_sound_duration_ms are dropped first; the survivors are
    then merged when the gap between them is at most merge_gap_ms.

    Args:
        audio: Input audio segment
        threshold_db: Offset added to the peak level (normally negative);
            only used when the strategy is not 'noise_floor'
        min_sound_duration_ms: Minimum duration of sound region to keep
        frame_size_ms: Frame size for RMS computation
        hop_size_ms: Hop size for RMS computation
        merge_gap_ms: Merge regions separated by at most this gap
        threshold_strategy: 'peak_relative' or 'noise_floor'
        noise_floor_percentile: Percentile of frame levels used as noise floor
        noise_floor_delta_db: dB above noise floor to set threshold

    Returns:
        List of (start_ms, end_ms) tuples for sound regions, in time order;
        empty when the clip yields no RMS frames or nothing passes the filter
    """
    rms_values, time_stamps = compute_rms_envelope(
        get_amplitude_array(audio), frame_size_ms, hop_size_ms, audio.frame_rate
    )

    # Clip too short to produce even one analysis frame
    if not len(rms_values):
        return []

    db_values = rms_to_db(rms_values)
    peak_db = np.max(db_values)

    # Pick the absolute dB threshold for this clip
    if threshold_strategy != 'noise_floor':
        # Fixed offset below the loudest frame (threshold_db is negative)
        absolute_threshold = peak_db + threshold_db
        logger.debug(
            f"Peak-relative threshold: peak={peak_db:.1f}dB, offset={threshold_db}dB, "
            f"threshold={absolute_threshold:.1f}dB"
        )
    else:
        # Estimate the background level from a low percentile, then require
        # frames to sit delta dB above it. Cap the result 1 dB under the peak
        # so the threshold can never end up above every frame.
        noise_floor_db = np.percentile(db_values, noise_floor_percentile)
        absolute_threshold = min(noise_floor_db + noise_floor_delta_db, peak_db - 1.0)
        logger.debug(
            f"Noise-floor threshold: floor={noise_floor_db:.1f}dB (p{noise_floor_percentile}), "
            f"delta={noise_floor_delta_db}dB, threshold={absolute_threshold:.1f}dB, peak={peak_db:.1f}dB"
        )

    candidates = []

    def _keep_if_long_enough(begin_ms, finish_ms):
        # Record a run of loud frames unless it is shorter than the minimum
        if finish_ms - begin_ms >= min_sound_duration_ms:
            candidates.append((int(begin_ms), int(finish_ms)))

    # Walk the frames; open_since holds the start time of the current run
    # of above-threshold frames, or None while below the threshold.
    open_since = None
    for is_loud, frame_ms in zip(db_values > absolute_threshold, time_stamps):
        if is_loud:
            if open_since is None:
                open_since = frame_ms
        elif open_since is not None:
            # First quiet frame closes the run at its own start time
            _keep_if_long_enough(open_since, frame_ms)
            open_since = None

    # A run still open at the end of the clip is closed one hop past the
    # start of the last frame
    if open_since is not None:
        _keep_if_long_enough(open_since, time_stamps[-1] + hop_size_ms)

    # Fuse neighbours whose separating gap is small enough
    merged = []
    for begin_ms, finish_ms in candidates:
        if merged and begin_ms - merged[-1][1] <= merge_gap_ms:
            merged[-1] = (merged[-1][0], finish_ms)
        else:
            merged.append((begin_ms, finish_ms))

    return merged


def get_sound_regions(
    audio: AudioSegment,
    threshold_db: float = -40.0,
    min_sound_duration_ms: int = 50,
    threshold_strategy: str = 'noise_floor',
    noise_floor_percentile: float = 10.0,
    noise_floor_delta_db: float = 15.0
) -> List[Tuple[int, int]]:
    """
    Convenience wrapper around detect_sound_regions().

    Forwards the threshold-related options and leaves the framing and
    merging options of detect_sound_regions() at their defaults.

    Args:
        audio: Input audio segment
        threshold_db: dB threshold below peak (used if strategy='peak_relative')
        min_sound_duration_ms: Minimum duration of sound region to keep
        threshold_strategy: 'peak_relative' or 'noise_floor'
        noise_floor_percentile: Percentile for noise floor estimation
        noise_floor_delta_db: dB above noise floor to set threshold

    Returns:
        List of (start_ms, end_ms) tuples for sound regions
    """
    detection_options = dict(
        threshold_db=threshold_db,
        min_sound_duration_ms=min_sound_duration_ms,
        threshold_strategy=threshold_strategy,
        noise_floor_percentile=noise_floor_percentile,
        noise_floor_delta_db=noise_floor_delta_db,
    )
    return detect_sound_regions(audio, **detection_options)


def extract_sound_with_edges_trimmed(
    audio: AudioSegment,
    regions: List[Tuple[int, int]],
    min_silence_to_trim_ms: int = 100,
    buffer_ratio: float = 0.1,
    min_buffer_ms: int = 200
) -> AudioSegment:
    """
    Extract audio with ONLY leading and trailing silence removed IF present.

    Each edge is handled independently:
    - The edge is trimmed only if its silence is >= min_silence_to_trim_ms
    - A buffer of silence is kept next to the sound to preserve transients.
      The buffer is max(min_buffer_ms, silence * buffer_ratio), so with the
      defaults at least 200ms is kept and an edge is effectively shortened
      only when its silence exceeds 200ms.

    Everything between the first sound start and the last sound end is
    kept untouched, including silence between sounds, which preserves the
    timing of periodic sounds (clock ticks, footsteps, typing).

    Args:
        audio: Input audio segment
        regions: List of (start_ms, end_ms) tuples for sound regions
        min_silence_to_trim_ms: Only trim if edge silence is at least this long (default 100ms)
        buffer_ratio: Fraction of the edge silence to keep as buffer (default 0.1 = 10%)
        min_buffer_ms: Lower bound on the kept buffer in ms (default 200).
                     Example with defaults: 500ms silence -> keep 200ms,
                     3000ms silence -> keep 300ms. Pass 0 for a purely
                     proportional buffer (500ms silence -> keep 50ms).

    Returns:
        Audio segment with edges trimmed (or original if no regions were given)
    """
    if not regions:
        # No sound detected - return original
        return audio

    # Overall sound boundaries; min/max rather than first/last element so
    # the result does not depend on the regions being sorted
    first_sound_start_ms = min(start for start, _ in regions)
    last_sound_end_ms = max(end for _, end in regions)
    audio_duration_ms = len(audio)

    # Actual silence durations at the two edges
    leading_silence_ms = first_sound_start_ms
    trailing_silence_ms = audio_duration_ms - last_sound_end_ms

    # Leading edge: cut only when the silence is significant, and keep a
    # buffer in front of the first sound
    if leading_silence_ms >= min_silence_to_trim_ms:
        buffer_ms = max(min_buffer_ms, int(leading_silence_ms * buffer_ratio))
        trim_start_ms = max(0, first_sound_start_ms - buffer_ms)
    else:
        # Not enough silence to trim - keep from start
        trim_start_ms = 0

    # Trailing edge: same rule, mirrored
    if trailing_silence_ms >= min_silence_to_trim_ms:
        buffer_ms = max(min_buffer_ms, int(trailing_silence_ms * buffer_ratio))
        trim_end_ms = min(audio_duration_ms, last_sound_end_ms + buffer_ms)
    else:
        # Not enough silence to trim - keep to end
        trim_end_ms = audio_duration_ms

    # Extract the edge-trimmed portion (internal structure preserved)
    trimmed_audio = audio[trim_start_ms:trim_end_ms]

    logger.debug(
        f"Edge trim: {audio_duration_ms}ms -> {len(trimmed_audio)}ms "
        f"(leading_silence={leading_silence_ms}ms, trailing_silence={trailing_silence_ms}ms, "
        f"trim_start={trim_start_ms}ms, trim_end={trim_end_ms}ms)"
    )

    return trimmed_audio


def extract_all_sound_regions(
    audio: AudioSegment,
    regions: List[Tuple[int, int]],
    crossfade_ms: int = 10,
    padding_ms: int = 20
) -> AudioSegment:
    """
    Cut out every sound region and splice the pieces together.

    WARNING: Dropping the silence between regions destroys natural
    periodicity. extract_sound_with_edges_trimmed() is the better choice
    for most use cases; this function is kept for backward compatibility.

    Args:
        audio: Input audio segment
        regions: List of (start_ms, end_ms) tuples for sound regions
        crossfade_ms: Crossfade duration when joining regions
        padding_ms: Padding around each region to avoid cutting transients

    Returns:
        Audio segment containing only the (padded) sound portions, or the
        original audio when no regions are given
    """
    if not regions:
        return audio

    total_ms = len(audio)

    # Slice out each region, widened by the padding but clamped to the clip
    pieces = [
        audio[max(0, begin - padding_ms):min(total_ms, finish + padding_ms)]
        for begin, finish in regions
    ]

    # Glue the pieces left to right; a single piece passes through as-is
    joined, *remaining = pieces
    for piece in remaining:
        # A crossfade needs both sides to be longer than the fade itself;
        # otherwise fall back to plain concatenation
        fade_fits = len(joined) > crossfade_ms and len(piece) > crossfade_ms
        if fade_fits:
            joined = joined.append(piece, crossfade=crossfade_ms)
        else:
            joined = joined + piece

    return joined


def process_esc50_dataset(
    audio_dir: str,
    metadata_path: str,
    output_dir: str,
    threshold_db: float = -40.0,
    min_sound_duration_ms: int = 50,
    save_trimmed_audio: bool = True,
    threshold_strategy: str = 'noise_floor',
    noise_floor_percentile: float = 10.0,
    noise_floor_delta_db: float = 15.0
) -> pd.DataFrame:
    """
    Process entire ESC-50 dataset and compute effective durations.

    For every clip listed in the metadata CSV this detects sound regions,
    trims only the leading/trailing silence (internal structure is kept,
    see extract_sound_with_edges_trimmed for the exact rule), measures the
    trimmed clip, and records one result row. A clip that fails for any
    reason is logged and still gets a row, with empty measurements and an
    extra 'error' column holding the message.

    Side effects: creates output_dir, optionally writes trimmed WAV files to
    output_dir/trimmed_audio, writes output_dir/effective_durations.csv and
    prints summary statistics.

    Supports two threshold strategies for adaptive per-clip thresholding:
    - 'peak_relative': threshold = peak_db + threshold_db (fixed offset from peak)
    - 'noise_floor': threshold = percentile(db, p) + delta (adapts to noise floor)

    Args:
        audio_dir: Path to ESC-50 audio directory
        metadata_path: Path to ESC-50 metadata CSV (needs the columns
            filename, category, fold, target and esc10)
        output_dir: Output directory for processed files
        threshold_db: dB threshold for silence detection (peak_relative mode)
        min_sound_duration_ms: Minimum sound duration to keep
        save_trimmed_audio: Whether to save trimmed audio files
        threshold_strategy: 'peak_relative' or 'noise_floor' (recommended)
        noise_floor_percentile: Percentile for noise floor estimation (default 10)
        noise_floor_delta_db: dB above noise floor to set threshold (default 15)

    Returns:
        DataFrame with processed metadata, one row per clip
    """
    # Load original metadata
    original_metadata = pd.read_csv(metadata_path)
    logger.info(f"Loaded metadata for {len(original_metadata)} clips")

    # Create output directories
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    trimmed_audio_dir = output_path / "trimmed_audio"
    if save_trimmed_audio:
        trimmed_audio_dir.mkdir(exist_ok=True)

    # Settings columns are identical for every row, so build them once.
    # Parameters that do not apply to the chosen strategy are recorded as None.
    uses_noise_floor = threshold_strategy == 'noise_floor'
    run_settings = {
        'threshold_strategy': threshold_strategy,
        'threshold_db_used': threshold_db if threshold_strategy == 'peak_relative' else None,
        'noise_floor_percentile': noise_floor_percentile if uses_noise_floor else None,
        'noise_floor_delta_db': noise_floor_delta_db if uses_noise_floor else None,
        'min_sound_duration_ms_used': min_sound_duration_ms,
    }

    # Measurement columns for a clip that could not be processed
    failed_measurements = {
        'raw_duration_s': None,
        'final_duration_s': None,
        'effective_duration_s': None,
        'num_sound_regions': 0,
        'peak_amplitude_db': None,
        'avg_rms_db': None,
        'trimmed_filename': None,
    }

    audio_root = Path(audio_dir)
    results = []

    for _, row in tqdm(original_metadata.iterrows(), total=len(original_metadata), 
                       desc="Processing ESC-50 clips"):
        filename = row['filename']
        audio_path = audio_root / filename

        # Columns copied straight from the original metadata. The key order
        # of the merged dicts below fixes the column order of the CSV.
        clip_identity = {
            'filename': filename,
            'category': row['category'],
            'fold': row['fold'],
            'target': row['target'],
            'esc10': row['esc10'],
        }

        try:
            # Load audio; len() of an AudioSegment is its duration in ms
            audio = AudioSegment.from_file(str(audio_path), format="wav")
            raw_duration_s = len(audio) / 1000.0

            # Detect sound regions (using adaptive threshold)
            regions = get_sound_regions(
                audio, 
                threshold_db=threshold_db, 
                min_sound_duration_ms=min_sound_duration_ms,
                threshold_strategy=threshold_strategy,
                noise_floor_percentile=noise_floor_percentile,
                noise_floor_delta_db=noise_floor_delta_db
            )

            # Trim edges only (leading and trailing silence)
            trimmed_audio = extract_sound_with_edges_trimmed(audio, regions)
            final_duration_s = len(trimmed_audio) / 1000.0

            # Peak amplitude and RMS of the trimmed audio, in dB; the small
            # constant keeps log10 finite for digital silence
            samples = get_amplitude_array(trimmed_audio)
            peak_amplitude = np.max(np.abs(samples))
            peak_amplitude_db = 20 * np.log10(peak_amplitude + 1e-10)
            rms = np.sqrt(np.mean(samples ** 2))
            avg_rms_db = 20 * np.log10(rms + 1e-10)

            # Effective duration = total length of the detected regions;
            # fall back to the trimmed length when nothing was detected
            if regions:
                effective_duration_s = sum(end - start for start, end in regions) / 1000.0
            else:
                effective_duration_s = final_duration_s

            # Save trimmed audio under the clip's original file name
            trimmed_filename = None
            if save_trimmed_audio:
                trimmed_filename = filename
                trimmed_path = trimmed_audio_dir / trimmed_filename
                # export() hands back the open output file; close it right
                # away so handles do not pile up over thousands of clips
                exported = trimmed_audio.export(str(trimmed_path), format="wav")
                exported.close()

            measurements = {
                'raw_duration_s': round(raw_duration_s, 4),
                'final_duration_s': round(final_duration_s, 4),
                'effective_duration_s': round(effective_duration_s, 4),
                'num_sound_regions': len(regions),
                'peak_amplitude_db': round(peak_amplitude_db, 2),
                'avg_rms_db': round(avg_rms_db, 2),
                'trimmed_filename': trimmed_filename,
            }
            results.append({**clip_identity, **measurements, **run_settings})

        except Exception as e:
            # Deliberately broad: one unreadable clip must not abort the
            # whole run. The failure is logged and kept in the output.
            logger.error(f"Error processing {filename}: {e}")
            results.append({
                **clip_identity,
                **failed_measurements,
                **run_settings,
                'error': str(e)
            })

    # Create DataFrame
    results_df = pd.DataFrame(results)

    # Save CSV
    csv_path = output_path / "effective_durations.csv"
    results_df.to_csv(csv_path, index=False)
    logger.info(f"Saved effective durations to {csv_path}")

    # Print summary statistics
    print_summary_statistics(results_df)

    return results_df


def print_summary_statistics(df: pd.DataFrame):
    """
    Print a human-readable report about a processed dataset to stdout.

    Rows without an effective duration (clips that failed) are counted as
    errors and excluded from every statistic.

    Args:
        df: Result table with at least the columns category,
            raw_duration_s, final_duration_s and effective_duration_s
    """
    rule = "=" * 60

    print("\n" + rule)
    print("ESC-50 Preprocessing Summary")
    print(rule)

    # Failed clips carry no effective duration
    ok = df[df['effective_duration_s'].notna()]
    n_total, n_ok = len(df), len(ok)

    print(f"\nTotal clips processed: {n_total}")
    print(f"Successfully processed: {n_ok}")
    print(f"Errors: {n_total - n_ok}")

    # Same four-number summary for each of the three duration columns
    sections = (
        ("\nRaw duration statistics:", 'raw_duration_s'),
        ("\nFinal duration statistics (edges trimmed, internal structure preserved):",
         'final_duration_s'),
        ("\nEffective duration statistics (sum of sound regions only):",
         'effective_duration_s'),
    )
    for heading, column in sections:
        print(heading)
        values = ok[column]
        print(f"  Mean: {values.mean():.3f}s")
        print(f"  Std:  {values.std():.3f}s")
        print(f"  Min:  {values.min():.3f}s")
        print(f"  Max:  {values.max():.3f}s")

    # Effective vs final: the gap is the silence kept between sounds
    print("\nComparison (final includes internal silences):")
    effective_mean = ok['effective_duration_s'].mean()
    print(f"  Avg effective: {effective_mean:.3f}s")
    final_mean = ok['final_duration_s'].mean()
    print(f"  Avg final:     {final_mean:.3f}s")
    print(f"  Difference:    {final_mean - effective_mean:.3f}s (internal silences)")

    # Share of the raw length removed by edge trimming, in percent
    raw_mean = ok['raw_duration_s'].mean()
    reduction = (1 - final_mean / raw_mean) * 100
    print(f"\nAverage edge trimming reduction: {reduction:.1f}%")

    # Per-category table, sorted from longest to shortest mean
    print("\nEffective duration by category (top 10 longest):")
    by_category = ok.groupby('category')['effective_duration_s'].agg(['mean', 'std', 'min', 'max'])
    by_category = by_category.sort_values('mean', ascending=False)
    print(by_category.head(10).to_string())

    print("\nEffective duration by category (top 10 shortest):")
    print(by_category.tail(10).to_string())

    print("\n" + rule)


def load_config(config_path: str) -> dict:
    """
    Load configuration from YAML file.

    Args:
        config_path: Path to the YAML configuration file

    Returns:
        The parsed configuration. An empty (or comment-only) file gives an
        empty dict instead of None, so callers can use .get() right away.

    Raises:
        OSError: If the file cannot be opened
        yaml.YAMLError: If the content is not valid YAML
    """
    # Imported here rather than at module level, as in the original code,
    # so PyYAML is only needed when a config file is actually loaded
    import yaml

    # Read as UTF-8 explicitly instead of the platform's locale encoding
    with open(config_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document
        return yaml.safe_load(f) or {}


def _cli_or_config(cli_value, section: dict, key: str, fallback):
    """Return the CLI value if one was given, else section[key], else fallback."""
    if cli_value is not None:
        return cli_value
    return section.get(key, fallback)


def main():
    """
    Command-line entry point: resolve settings and preprocess the dataset.

    Every preprocessing option is resolved with the same precedence:
    command-line flag, then the `tasks.duration` section of the config
    file, then a built-in default. The audio and metadata paths come from
    the `esc50` section of the config file. Config sections that are
    missing or empty are treated as empty mappings.

    Returns:
        DataFrame returned by process_esc50_dataset()
    """
    parser = argparse.ArgumentParser(
        description="Preprocess ESC-50 dataset for duration task"
    )
    parser.add_argument(
        '--config', '-c',
        type=str,
        default='config.yaml',
        help='Path to configuration file'
    )
    # The options below default to None so that "flag not given" can be
    # told apart from an explicit value and the config file can fill in.
    parser.add_argument(
        '--threshold-db',
        type=float,
        default=None,
        help='dB threshold below peak for silence detection (default: -40)'
    )
    parser.add_argument(
        '--min-sound-ms',
        type=int,
        default=None,
        help='Minimum sound duration in ms to keep (default: 50)'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default=None,
        help='Output directory (default: from config or ESC-50_preprocessed)'
    )
    parser.add_argument(
        '--no-trimmed-audio',
        action='store_true',
        help='Do not save trimmed audio files (only save CSV)'
    )
    parser.add_argument(
        '--threshold-strategy',
        type=str,
        choices=['peak_relative', 'noise_floor'],
        default=None,
        help='Threshold strategy: peak_relative (old) or noise_floor (adaptive, recommended)'
    )
    parser.add_argument(
        '--noise-floor-percentile',
        type=float,
        default=None,
        help='Percentile for noise floor estimation (default: 10)'
    )
    parser.add_argument(
        '--noise-floor-delta-db',
        type=float,
        default=None,
        help='dB above noise floor to set threshold (default: 15)'
    )

    args = parser.parse_args()

    # Load config. An empty YAML file, or a section key with nothing under
    # it, parses to None; `or {}` keeps the .get() calls below working.
    config = load_config(args.config) or {}

    # Get ESC-50 paths from config
    esc50_config = config.get('esc50') or {}
    audio_dir = esc50_config.get('audio_path', '/home/debarpanb1/ESC-50_github/audio')
    metadata_path = esc50_config.get('metadata_path', '/home/debarpanb1/ESC-50_github/meta/esc50.csv')

    # Get duration task config for preprocessing parameters
    duration_config = (config.get('tasks') or {}).get('duration') or {}

    # Resolve each option: CLI flag > config file > built-in default
    threshold_db = _cli_or_config(
        args.threshold_db, duration_config, 'amplitude_threshold_db', -40.0
    )
    min_sound_ms = _cli_or_config(
        args.min_sound_ms, duration_config, 'min_sound_duration_ms', 50
    )
    output_dir = _cli_or_config(
        args.output_dir, duration_config, 'preprocessed_data_path',
        '/home/debarpanb1/TREA_2.0/ESC-50_preprocessed'
    )
    # noise_floor is the recommended/default strategy
    threshold_strategy = _cli_or_config(
        args.threshold_strategy, duration_config, 'threshold_strategy', 'noise_floor'
    )
    noise_floor_percentile = _cli_or_config(
        args.noise_floor_percentile, duration_config, 'noise_floor_percentile', 10.0
    )
    noise_floor_delta_db = _cli_or_config(
        args.noise_floor_delta_db, duration_config, 'noise_floor_delta_db', 15.0
    )

    save_trimmed_audio = not args.no_trimmed_audio

    # Log configuration
    logger.info("=" * 60)
    logger.info("ESC-50 Preprocessing Configuration")
    logger.info("=" * 60)
    logger.info(f"Audio directory: {audio_dir}")
    logger.info(f"Metadata path: {metadata_path}")
    logger.info(f"Output directory: {output_dir}")
    logger.info(f"Threshold strategy: {threshold_strategy}")
    if threshold_strategy == 'peak_relative':
        logger.info(f"  Peak-relative threshold dB: {threshold_db}")
    else:
        logger.info(f"  Noise floor percentile: {noise_floor_percentile}")
        logger.info(f"  Noise floor delta dB: {noise_floor_delta_db}")
    logger.info(f"Min sound duration (ms): {min_sound_ms}")
    # The trimmer keeps max(200ms, 10% of the edge silence) as buffer
    logger.info("Adaptive edge trimming: only if silence >= 100ms, keep max(200ms, 10%) buffer")
    logger.info(f"Save trimmed audio: {save_trimmed_audio}")
    logger.info("=" * 60)

    # Process dataset
    results_df = process_esc50_dataset(
        audio_dir=audio_dir,
        metadata_path=metadata_path,
        output_dir=output_dir,
        threshold_db=threshold_db,
        min_sound_duration_ms=min_sound_ms,
        save_trimmed_audio=save_trimmed_audio,
        threshold_strategy=threshold_strategy,
        noise_floor_percentile=noise_floor_percentile,
        noise_floor_delta_db=noise_floor_delta_db
    )

    logger.info("\nPreprocessing complete!")
    logger.info(f"Results saved to: {output_dir}")

    return results_df


# Run the preprocessing pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()