File size: 29,303 Bytes
0cf1a58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
import os
import random
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import time
from scipy.signal import find_peaks
import wandb
from tqdm import tqdm
import concurrent.futures
from typing import List, Sequence, Union
import time
from memory_profiler import profile

# Optionally redirect the dp_matrix plot to a specific directory (used by demo).
# Set via set_dp_matrix_out_dir() before calling inference; reset to None after.
_dp_matrix_out_dir = None

def set_dp_matrix_out_dir(path):
    global _dp_matrix_out_dir
    _dp_matrix_out_dir = path

timit_leehon_39_phonemes = [
    'ao', 'ae', 'ah','aw', 'er', 'ay', 
    'b', 'sil', 'ch', 'd', 'dh', 'dx', 'eh', 'el', 'm', 'en', 'ng', 'ey',
    'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'v', 'w', 'y', 'z', 'sh', 't', 'r', 's', 'th','uh', 'uw', 'oy', 'ow','p'
]

timit_61_phonemes = [
        'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay',
        'b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi', 'er', 'ey',
        'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'm', 'n', 'ng', 'l',
        'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw','ux',
        'v', 'w', 'y', 'z', 'zh'
    ]

# Create mappings
# phoneme_to_idx = {phoneme: idx for idx, phoneme in enumerate(timit_61_phonemes)}
phoneme_to_idx_MACRO = {phoneme: idx for idx, phoneme in enumerate(timit_leehon_39_phonemes)}

idx_to_phoneme_MACRO = {idx: phoneme for phoneme, idx in phoneme_to_idx_MACRO.items()}
timit_to_leehon_map_MACRO = {
        'aa': 'ao', 'ae': 'ae', 'ah': 'ah', 'ao': 'ao', 'aw': 'aw', 'ax': 'ah', 'ax-h': 'ah', 'axr': 'er', 'ay': 'ay',
        'b': 'b', 'bcl': 'sil', 'ch': 'ch', 'd': 'd', 'dcl': 'sil', 'dh': 'dh', 'dx': 'dx', 'eh': 'eh', 'el': 'el',
        'em': 'm', 'en': 'en', 'eng': 'ng', 'epi': 'sil', 'er': 'er', 'ey': 'ey', 'f': 'f', 'g': 'g', 'gcl': 'sil',
        'h#': 'sil', 'hh': 'hh', 'hv': 'hh', 'ih': 'ih', 'ix': 'ih', 'iy': 'iy', 'jh': 'jh', 'k': 'k', 'kcl': 'sil',
        'l': 'el', 'm': 'm', 'n': 'en', 'ng': 'ng', 'nx': 'en', 'ow': 'ow', 'oy': 'oy', 'p': 'p', 'pau': 'sil', 'pcl': 'sil',
        'q': 't', 'qcl': 'sil', 'r': 'r', 's': 's', 'sh': 'sh', 't': 't', 'tcl': 'sil', 'th': 'th', 'uh': 'uh', 'uw': 'uw',
        'ux': 'uw', 'v': 'v', 'w': 'w', 'y': 'y', 'z': 'z', 'zh': 'sh',
}

def create_truth_probs_real(segments, phonemes, phoneme_to_index, num_frames):
    segments = [0] + list(segments)
    num_phonemes = len(phoneme_to_index)
    probs_real = torch.zeros((num_frames, num_phonemes), dtype=torch.float32)
    for seg_idx in range(len(phonemes)):
        start = int(segments[seg_idx])
        end = int(segments[seg_idx + 1]) if seg_idx + 1 < len(segments) else num_frames
        if end > start:
            ph_label = phonemes[seg_idx].lower()
            ph_index = phoneme_to_index.get(ph_label, phoneme_to_index.get('sil', 0))
            probs_real[start:end, ph_index] = 1.0
    return probs_real


# ------------------------------- ablations -------------------------------

# phoneme alignment with classic (hard) DP
def phoneme_alignment_Hard_DP(p_seq, w_phi, original_lengths, len_ratio, derivative_preds_np, probs_real):
    # Gamma is kept for signature consistency but not used in hard DP
    gamma = 1e-20 
    T = int(original_lengths[0])
    n = len(p_seq)
    device = derivative_preds_np.device

    if isinstance(probs_real, np.ndarray):
        probs_real = torch.tensor(probs_real, device=device)
    cumsum_probs = torch.cumsum(probs_real, dim=0)
    
    phoneme_mappings = {p.lower(): timit_to_leehon_map_MACRO.get(p.lower(), 'sil') if p.lower() not in timit_leehon_39_phonemes else p.lower() for p in p_seq}
    derivatives = torch.cat([torch.tensor([0], device=device), torch.diff(derivative_preds_np, dim=0)])

    # Initialize DP matrix with very low value
    dp_mat = torch.full((n, T, T), float(-1e9), device=device)
    p_idx0 = phoneme_to_idx_MACRO[phoneme_mappings[p_seq[0].lower()]]

    # Initial state for first phoneme
    t_e = torch.arange(T, device=device)
    dp_mat[0, 0, :] = (
        w_phi[0] * compute_phi_1(derivatives, 0, t_e)
    )

    # Forward Pass
    for i in tqdm(range(1, n)):
        p_idx = phoneme_to_idx_MACRO[phoneme_mappings[p_seq[i].lower()]]
        t_start = torch.arange(T, device=device)
        t_end = torch.arange(T, device=device)
        t_start_grid, t_end_grid = torch.meshgrid(t_start, t_end, indexing='ij')
        valid_mask = t_start_grid < t_end_grid
        
        phi1_dev = compute_phi_1(derivatives, t_start_grid, t_end_grid)
        phi2 = compute_phi_2(cumsum_probs, p_idx, t_start_grid, t_end_grid)
        total_phi = w_phi[0] * phi1_dev
        
        prev_scores = torch.full((T, T), float(-1e9), device=device)
        
        for t_end_val in range(T):
            valid_starts = t_start[t_start < t_end_val]
            if valid_starts.numel() == 0:
                continue
            
            # --- CLASSIC DP CHANGE ---
            # Instead of LogSumExp (Soft-Max), use Hard Max
            prev = dp_mat[i-1, :valid_starts[-1]+1, valid_starts]
            max_prev, _ = torch.max(prev, dim=0) 
            prev_scores[valid_starts, t_end_val] = max_prev

        dp_mat[i] = torch.where(valid_mask, total_phi + prev_scores, torch.full_like(total_phi, float(-1e9)))

    # Backtracking (Classic Argmax)
    best_start_times = torch.zeros((n), dtype=derivative_preds_np.dtype, device=device)
    best_prev_t_end = T - 1
    
    for i in range(n):
        cur_ph = n - 1 - i
        # Find the exact index that gave the maximum score
        scores = dp_mat[cur_ph, :, best_prev_t_end]
        
        # --- CLASSIC DP CHANGE ---
        # Instead of expected_idx (Soft-Argmax), use Hard Argmax
        best_t_start = torch.argmax(scores)
        
        best_start_times[cur_ph] = best_t_start.to(derivative_preds_np.dtype)
        best_prev_t_end = int(best_t_start.item())
        
    # Visualization Code (unchanged logic, updated labels)
    dp_mat_cpu = dp_mat.detach().cpu()
    best_start_times_cpu = best_start_times.detach().cpu().numpy()
    dp_to_plot = dp_mat_cpu.max(dim=1)[0].numpy()
    masked_dp = np.ma.masked_where(dp_to_plot <= -1e8, dp_to_plot)

    plt.figure(figsize=(12, 6))
    cmap = plt.cm.viridis
    cmap.set_bad(color='white')
    plt.imshow(masked_dp, aspect='auto', origin='lower', cmap=cmap)
    plt.colorbar(label='Hard DP Score')
    plt.xlabel('End time (frame)')
    plt.ylabel('Phoneme index')
    plt.title('Classic (Hard) DP Matrix with Best Path')
    plt.plot(best_start_times_cpu, range(len(best_start_times_cpu)), 'r.-', label='Argmax path')
    plt.legend()
    plt.tight_layout()
    plt.savefig('dp_matrix_hard_classic.png')
    plt.close()
        
    return best_start_times

# ------------------second ablations - naive peak detection ------------------

from scipy.signal import find_peaks

def phoneme_alignment_naive_peak_detection(p_seq, w_phi, original_lengths, len_ratio, derivative_preds_np, probs_real):
    """
    Ablation version: Replaces DP with Naive Scipy Peak Detection.
    """
    gamma = 1e-20 
    T = int(original_lengths[0])
    n = len(p_seq)
    device = derivative_preds_np.device

    # --- Keep identical preprocessing to ensure 'plug & play' ---
    if isinstance(probs_real, np.ndarray):
        probs_real = torch.tensor(probs_real, device=device)
    
    # We don't actually need cumsum_probs or phoneme_mappings for naive peak detection,
    # but we keep them defined to avoid any potential scope issues if you add code back.
    cumsum_probs = torch.cumsum(probs_real, dim=0)
    signal = derivative_preds_np.detach().cpu().numpy().flatten()

    # --- Naive Peak Detection ---
    # To get exactly 'n' boundaries for 'n' phonemes, we pick the top n most prominent peaks.
    peaks, properties = find_peaks(signal, prominence=0.05) 
    
    peak_heights = signal[peaks]
    
    # Sort peaks by height and take the top 'n'
    top_indices = np.argsort(peak_heights)[-n:]
    best_peaks = np.sort(peaks[top_indices])
    if len(best_peaks) < n:
        filler = np.linspace(0, T-1, n)
        best_peaks = filler # Fallback
    best_start_times = torch.tensor(best_peaks, dtype=derivative_preds_np.dtype, device=device)

    # --- Mock DP Matrix for Plotting ---
    dp_mat = torch.full((n, T, T), float(-1e9), device=device)
    for i, peak_time in enumerate(best_peaks):
        dp_mat[i, :, int(peak_time)] = 1.0 

    # --- Identical Plotting Logic ---
    dp_mat_cpu = dp_mat.detach().cpu()
    best_start_times_cpu = best_start_times.detach().cpu().numpy()
    dp_to_plot = dp_mat_cpu.max(dim=1)[0].numpy()
    
    masked_dp = np.ma.masked_where(dp_to_plot <= -1e8, dp_to_plot)

    plt.figure(figsize=(12, 6))
    cmap = plt.cm.viridis
    cmap.set_bad(color='white')
    
    plt.imshow(masked_dp, aspect='auto', origin='lower', cmap=cmap)
    plt.colorbar(label='Peak Detection (Naive)')
    plt.xlabel('End time (frame)')
    plt.ylabel('Phoneme index')
    plt.title('Naive Peak Detection (Ablation)')
    plt.plot(best_start_times_cpu, range(len(best_start_times_cpu)), 'r.-', label='Detected Peaks')
    plt.legend()
    plt.tight_layout()
    save_path = 'peak_detection_ablation.png'
    plt.savefig(save_path)
    plt.close()
    
    print(f"Ablation plot saved as {save_path}")

    return best_start_times

# ------------------------- phoneme alignment main ------------------------
def phoneme_alignment(p_seq, w_phi, original_lengths, len_ratio, derivative_preds_np, probs_real):
    gamma = 1e-20
    T = int(original_lengths[0])
    n = len(p_seq)
    device = derivative_preds_np.device

    if isinstance(probs_real, np.ndarray):
        probs_real = torch.tensor(probs_real, device=device)
    cumsum_probs = torch.cumsum(probs_real, dim=0)
    
    
    
    phoneme_mappings = {p.lower(): timit_to_leehon_map_MACRO.get(p.lower(), 'sil') if p.lower() not in timit_leehon_39_phonemes else p.lower() for p in p_seq}
    derivatives = torch.cat([torch.tensor([0], device=derivative_preds_np.device), torch.diff(derivative_preds_np, dim=0)])

    dp_mat = torch.full((n, T, T), float(-1e9), device=device)

    p_idx0 = phoneme_to_idx_MACRO[phoneme_mappings[p_seq[0].lower()]]

    # Vectorized init for first phoneme
    t_e = torch.arange(T, device=device)
    dp_mat[0, 0, :] = (
        w_phi[0] * compute_phi_1(derivatives, 0, t_e)
        + w_phi[1] * compute_phi_1(derivatives, 0, t_e)

    )

    for i in tqdm(range(1, n)):
        p_idx = phoneme_to_idx_MACRO[phoneme_mappings[p_seq[i].lower()]]
        # Vectorized over t_start and t_end
        t_start = torch.arange(T, device=device)
        t_end = torch.arange(T, device=device)
        t_start_grid, t_end_grid = torch.meshgrid(t_start, t_end, indexing='ij')
        valid_mask = t_start_grid < t_end_grid

        phi1_dev = compute_phi_1(derivatives, t_start_grid, t_end_grid)
        phi1 = compute_phi_1(derivative_preds_np, t_start_grid, t_end_grid)
        phi2 = compute_phi_2(cumsum_probs, p_idx, t_start_grid, t_end_grid)
        total_phi = w_phi[0] * phi1_dev + w_phi[1] * phi2

        # Max over all possible previous end times.
        # Vectorized equivalent of the original per-t_end loop: for each previous
        # end time s, logsumexp over dp_mat[i-1]'s start-rows is the SAME regardless
        # of t_end (invalid spans start>=end are -1e9 and never contribute), so the
        # O(T) inner loop over t_end collapses to one column-wise logsumexp + mask.
        # Bit-identical output; ~T x faster on long utterances.
        col_lse = torch.logsumexp(dp_mat[i-1] / gamma, dim=0) * gamma  # (T,) over start rows
        prev_scores = torch.where(
            valid_mask,
            col_lse.unsqueeze(1).expand(T, T),
            torch.full((T, T), float(-1e9), device=device),
        )
        dp_mat[i] = torch.where(valid_mask, total_phi + prev_scores, torch.full_like(total_phi, float(-1e9)))

    # Backtracking
    best_start_times = torch.zeros((n), dtype=derivative_preds_np.dtype, device=device)
    best_prev_t_end = T-1
    for i in range(n):
        cur_ph = n-1-i
        scores = dp_mat[cur_ph, :, best_prev_t_end]
        soft_weights = torch.softmax(scores / gamma, dim=0)
        expected_idx = (soft_weights * torch.arange(T, device=device, dtype=derivative_preds_np.dtype)).sum()
        best_start_times[cur_ph] = expected_idx      
        best_prev_t_end = int(expected_idx.round().item())
        
    # DP-matrix figure is a debug artifact only; best_start_times (the return
    # value) is already computed above. Skip entirely unless an output dir is set
    # (the web demo sets it). Saves ~570ms/utterance in training/eval.
    if _dp_matrix_out_dir is not None:
        dp_mat_cpu = dp_mat.detach().cpu()
        best_start_times_cpu = best_start_times.detach().cpu().numpy()
        dp_to_plot = dp_mat_cpu.max(dim=1)[0].numpy()
        masked_dp = np.ma.masked_where(dp_to_plot <= -1e8, dp_to_plot)  # mask all values <= -1e8

        plt.figure(figsize=(12, 6))
        cmap = plt.cm.viridis
        cmap.set_bad(color='white')
        real_min = masked_dp.min()
        real_max = masked_dp.max()
        # Plot DP matrix (max over start times)
        plt.imshow(masked_dp, aspect='auto', origin='lower', cmap=cmap, vmin=real_min, vmax=real_max)
        plt.colorbar(label='DP Score (max over start)')
        plt.xlabel('End time (frame)')
        plt.ylabel('Phoneme index')
        plt.title('DP Matrix with Best Path')
        # Overlay best_start_times as a red line
        plt.plot(best_start_times_cpu, range(len(best_start_times_cpu)), 'r.-', label='Best start times')
        plt.legend()
        plt.tight_layout()
        _save_path = os.path.join(_dp_matrix_out_dir or '.', 'dp_matrix_with_path.png')
        plt.savefig(_save_path)
        print(f"DP matrix with path plot saved as {_save_path}")
        
    return best_start_times

def compute_phi_1(derivative_preds_np: torch.Tensor, t_start: Union[torch.Tensor, int], t_end: Union[torch.Tensor, int]) -> torch.Tensor:
    """
    Computes phi_1 for dynamic programming.
    t_start and t_end can be scalars or tensors of the same shape.
    Returns a tensor of scores.
    """
    # Ensure t_start and t_end are tensors
    t_start = torch.as_tensor(t_start, device=derivative_preds_np.device)
    t_end = torch.as_tensor(t_end, device=derivative_preds_np.device)
    # Broadcast to same shape
    t_start, t_end = torch.broadcast_tensors(t_start, t_end)
    # Valid indices
    valid = (t_end < derivative_preds_np.shape[0]-1) & (t_start < derivative_preds_np.shape[0]-1) & (t_end > 0) & (t_start > 0)
    score = torch.zeros_like(t_start, dtype=derivative_preds_np.dtype, device=derivative_preds_np.device)

    eps = 1e-6
    tanh_scale = 1e-3 #1e-2 #0.5
    if valid.any():
        
        # start_pos - 
        idx_s = t_start[valid].long()
        s_center = torch.tanh(tanh_scale * derivative_preds_np[idx_s])
        s_prev = torch.tanh(tanh_scale * derivative_preds_np[idx_s -1])
        s_next = torch.tanh(tanh_scale * derivative_preds_np[idx_s +1])
        delta_prev_s = s_center - s_prev
        delta_next_s = s_center - s_next
        scores_zerocross_s = (1-torch.sqrt(s_center**2)) +  torch.sqrt(delta_prev_s **2 + eps) + torch.sqrt(delta_next_s**2 + eps)
        # orig - 
        score[valid] += scores_zerocross_s

        # end_pos - 
        idx_e = t_end[valid].long()
        e_center = torch.tanh(tanh_scale * derivative_preds_np[idx_e]) #do i need this? not sure
        e_prev = torch.tanh(tanh_scale * derivative_preds_np[idx_e -1])
        e_next = torch.tanh(tanh_scale * derivative_preds_np[idx_e +1])
        delta_prev_e = e_center - e_prev
        delta_next_e = e_center - e_next
        scores_zerocross_e = (1-torch.sqrt(e_center**2)) + torch.sqrt(delta_prev_e **2 + eps) + torch.sqrt(delta_next_e**2 + eps)
        # orig - 
        score[valid] += scores_zerocross_e
    return score

def compute_phi_2(cumsum_probs: torch.Tensor, p: int, t_start: Union[torch.Tensor, int], t_end: Union[torch.Tensor, int]) -> torch.Tensor:
    """
    Computes phi_2 for dynamic programming.
    t_start and t_end can be scalars or tensors of the same shape.
    Returns a tensor of scores.
    """
    t_start = torch.as_tensor(t_start, device=cumsum_probs.device)
    t_end = torch.as_tensor(t_end, device=cumsum_probs.device)
    t_start, t_end = torch.broadcast_tensors(t_start, t_end)
    # Valid indices
    valid = (t_end < cumsum_probs.shape[0]) & (t_start < cumsum_probs.shape[0]) & (t_end > 0) & (t_start >= 0)
    probs_score = torch.zeros_like(t_start, dtype=cumsum_probs.dtype, device=cumsum_probs.device)
    # Only assign where valid
    probs_score[valid] = cumsum_probs[t_end[valid], p] - torch.where(
        t_start[valid] > 0,
        cumsum_probs[t_start[valid], p],
        torch.zeros_like(t_start[valid], dtype=cumsum_probs.dtype, device=cumsum_probs.device)
    )
    lengths = (t_end - t_start).clamp(min=1)
    probs_score[valid] = probs_score[valid] / lengths[valid]
    return (probs_score)

def best_phoneme_for_segments(cumsum_probs: torch.Tensor, t_start: torch.Tensor, t_end: torch.Tensor):
    """
    For each (t_start, t_end) pair (tensors broadcasted to same shape),
    compute the average probability per phoneme over the segment and return:
      - max_vals: tensor of shape (pairs,) with the max average prob per pair
      - max_idx:  LongTensor of shape (pairs,) with the argmax phoneme index per pair
    """
    device = cumsum_probs.device
    t_start = torch.as_tensor(t_start, device=device)
    t_end = torch.as_tensor(t_end, device=device)
    t_start, t_end = torch.broadcast_tensors(t_start, t_end)

    valid = (t_end < cumsum_probs.shape[0]) & (t_start < cumsum_probs.shape[0]) & (t_end > 0) & (t_start >= 0)
    max_vals = torch.zeros_like(t_start, dtype=cumsum_probs.dtype, device=device)
    max_idx = torch.full_like(t_start, -1, dtype=torch.long, device=device)

    if not valid.any():
        return max_vals, max_idx

    idx_end = t_end[valid].long()
    idx_start = t_start[valid].long()
    probs_end = cumsum_probs[idx_end]  # (k, P)
    probs_start = torch.zeros_like(probs_end)
    nonzero_mask = idx_start > 0
    if nonzero_mask.any():
        probs_start[nonzero_mask] = cumsum_probs[idx_start[nonzero_mask]]

    segment_sum = probs_end - probs_start  # (k, P)
    lengths = (t_end[valid] - t_start[valid]).clamp(min=1).unsqueeze(1).to(segment_sum.dtype)
    segment_mean = segment_sum / lengths  # (k, P)

    vals, idxs = segment_mean.max(dim=1)  # per-row max and argmax
    max_vals[valid] = vals
    max_idx[valid] = idxs.long()
    return max_vals, max_idx

def get_timit_61_phoneme_mappings():
    """
    Returns the TIMIT 61 phoneme-to-index mapping and the reverse index-to-phoneme mapping.

    Returns:
        phoneme_to_idx (dict): Dictionary mapping phonemes to unique indices.
        idx_to_phoneme (dict): Dictionary mapping indices to their corresponding phonemes.
    """
    # this is actually including the leehon 39 phonemes!!!!!
    timit_61_phonemes = [
        'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay',
        'b', 'bcl', 'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi', 'er', 'ey',
        'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng',
        'nx', 'ow', 'oy', 'p', 'pau', 'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux',
        'v', 'w', 'y', 'z', 'zh'
    ]
    timit_leehon_39_phonemes = [
        'ao', 'ae', 'ah','aw', 'er', 'ay', 
        'b', 'sil', 'ch', 'd', 'dh', 'dx', 'eh', 'el', 'm', 'en', 'ng', 'ey',
        'f', 'g', 'hh', 'ih', 'iy', 'jh', 'k', 'v', 'w', 'y', 'z', 'sh', 't', 'r', 's', 'th','uh', 'uw', 'oy', 'ow','p'
    ]
    # Create mappings
    phoneme_to_idx = {phoneme: idx for idx, phoneme in enumerate(timit_leehon_39_phonemes)}
    idx_to_phoneme = {idx: phoneme for phoneme, idx in phoneme_to_idx.items()}

    return phoneme_to_idx, idx_to_phoneme

# --------------------------------


def timit_to_leehon(timit_label):
    # Mapping of TIMIT 61 phonemes to Leehon 39 phonemes
    timit_to_leehon_map = {
        'aa': 'ao', 'ae': 'ae', 'ah': 'ah', 'ao': 'ao', 'aw': 'aw', 'ax': 'ah', 'ax-h': 'ah', 'axr': 'er', 'ay': 'ay',
        'b': 'b', 'bcl': 'sil', 'ch': 'ch', 'd': 'd', 'dcl': 'sil', 'dh': 'dh', 'dx': 'dx', 'eh': 'eh', 'el': 'el',
        'em': 'm', 'en': 'en', 'eng': 'ng', 'epi': 'sil', 'er': 'er', 'ey': 'ey', 'f': 'f', 'g': 'g', 'gcl': 'sil',
        'h#': 'sil', 'hh': 'hh', 'hv': 'hh', 'ih': 'ih', 'ix': 'ih', 'iy': 'iy', 'jh': 'jh', 'k': 'k', 'kcl': 'sil',
        'l': 'el', 'm': 'm', 'n': 'en', 'ng': 'ng', 'nx': 'en', 'ow': 'ow', 'oy': 'oy', 'p': 'p', 'pau': 'sil', 'pcl': 'sil',
        'q': 't', 'qcl': 'sil', 'r': 'r', 's': 's', 'sh': 'sh', 't': 't', 'tcl': 'sil', 'th': 'th', 'uh': 'uh', 'uw': 'uw',
        'ux': 'uw', 'v': 'v', 'w': 'w', 'y': 'y', 'z': 'z', 'zh': 'sh', '':'sil'
    }
    
    # Return the corresponding Leehon 39 label, or None if the label is not found
    return timit_to_leehon_map.get(timit_label.lower(), None)

def load_phoneme_stats():
    phonemes_path = "phonemes_39"
    stats_path = "phoneme_stats_39.out"
    
    # Load phoneme names
    with open(phonemes_path, "r") as f:
        phonemes = [line.strip() for line in f]

    # Load mu values (second row of stats file)
    with open(stats_path, "r") as f:
        lines = f.readlines()
        mu_values = list(map(float, lines[1].strip().split()))  # Convert to float
        sigma_values = list(map(float, lines[2].strip().split()))  # Convert to float
    
    # Create phoneme-to-mu dictionary
    phoneme_mu_dict = dict(zip(phonemes, mu_values))
    phoneme_sigma_dict = dict(zip(phonemes, sigma_values))
    
    return phoneme_mu_dict, phoneme_sigma_dict

# Load phoneme stats once


def get_mu_stats(p):
    phoneme_mu_dict, _ = load_phoneme_stats()
    """Return the mu value for the given phoneme p."""
    return phoneme_mu_dict.get(p, None)  # Return None if phoneme is not found

def get_sigma_stats(p):
    _, phoneme_sigma_dict = load_phoneme_stats()
    """Return the mu value for the given phoneme p."""
    return phoneme_sigma_dict.get(p, None)  # Return None if phoneme is not found

def replicate_first_k_frames(x, k, dim):
    return torch.cat([x.index_select(dim=dim, index=torch.LongTensor([0] * k).to(x.device)), x], dim=dim)

class LambdaLayer(nn.Module):
    def __init__(self, lambd):
        super(LambdaLayer, self).__init__()
        self.lambd = lambd
    def forward(self, x):
        return self.lambd(x)

class PrintShapeLayer(nn.Module):
    def __init__(self):
        super(PrintShapeLayer, self).__init__()
    def forward(self, x):
        print(x.shape)
        return x

def length_to_mask(length, max_len=None, dtype=None):
    """length: B.
    return B x max_len.
    If max_len is None, then max of length will be used.
    """
    assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
    max_len = max_len or length.max().item()
    mask = torch.arange(max_len, device=length.device,
                        dtype=length.dtype).expand(len(length), max_len) < length.unsqueeze(1)
    if dtype is not None:
        mask = torch.as_tensor(mask, dtype=dtype, device=length.device)
    return mask

def detect_peaks_worker(xi,w_phi, p_seq, original_lengths, probs_real, len_ratio, width, distance):
    print(f"num peaks = {len(p_seq)}")
    print(f"xi type: {type(xi)}")
    preds_np = xi.requires_grad_(True)
    median_h = preds_np.median()
    preds_np = preds_np - median_h
    derivative_preds_np = preds_np
    xmin, xmax = xi.min(), xi.max()
    xi = (xi - xmin) / (xmax - xmin)
    xi = xi.flatten()
    
    peaks = phoneme_alignment(p_seq,w_phi, original_lengths, len_ratio, derivative_preds_np, probs_real)
    
    if len(peaks) == 0:
        peaks = torch.tensor([xi.shape[0] - 1], device=xi.device)
    
    return peaks

def detect_peaks(x,w_phi, original_lengths_all, phonemes, len_ratio, probs_real_all):
    """Detect peaks of next_frame_classifier using multithreading."""
    
    out = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        xi=x
        p_seq = phonemes
        original_lengths = original_lengths_all
        probs_real = probs_real_all
        if len(xi)!=0:
            result = detect_peaks_worker(xi, w_phi, p_seq, [original_lengths], probs_real, len_ratio, width=None, distance=None)
        out.append(result)
    
    return out

class PrecisionRecallMetric:
    def __init__(self):
        self.precision_counter = 0
        self.recall_counter = 0
        self.pred_counter = 0
        self.gt_counter = 0
        self.eps = 1e-5
        self.data = []
        self.tolerance = 2
        self.width_range = [1]
        self.distance_range = [1]

    def get_metrics(self, precision_counter, recall_counter, pred_counter, gt_counter):
        EPS = 1e-7
        
        precision = precision_counter / (pred_counter + self.eps)
        recall = recall_counter / (gt_counter + self.eps)
        f1 = 2 * (precision * recall) / (precision + recall + self.eps)
        
        os = recall / (precision + EPS) - 1
        r1 = np.sqrt((1 - recall) ** 2 + os ** 2)
        r2 = (-os + recall - 1) / (np.sqrt(2))
        rval = 1 - (np.abs(r1) + np.abs(r2)) / 2

        return precision, recall, f1, rval

    def zero(self):
        self.data = []
        
    def update(self, seg, pos_pred, length,original_lengths_all, probs_all,phonemes_all):
        for seg_i, pos_pred_i, length_i , original_length, probs,phonemes in zip(seg, pos_pred, length,original_lengths_all,probs_all,phonemes_all):
            self.data.append((seg_i, pos_pred_i, length_i.item(),[original_length.item()], probs, phonemes))


    def get_stats(self, width=None, distance=None):
        print(f"calculating metrics using {len(self.data)} entries")
        max_rval = -float("inf")
        min_l1_dist = float("inf")
        best_params = None
        segs = list(map(lambda x: x[0], self.data))
        length = list(map(lambda x: x[2], self.data))
        yhats = list(map(lambda x: x[1], self.data))
        original_lengths_all = list(map(lambda x: x[3], self.data))
        probs = list(map(lambda x: x[4], self.data))
        phonemes = list(map(lambda x: x[5], self.data))

        width_range = self.width_range
        distance_range = self.distance_range

        if width is not None:
            width_range = [width]
            distance_range = [distance]
        sr = 16000
        len_ratio = 161.34011627906978

        for width in width_range:
            for distance in distance_range:
                    for (y, yhat,original_len, phoneme, prob) in zip(segs, yhats, original_lengths_all, phonemes, probs):
                        if isinstance(y,list):
                            y = torch.tensor(y, device=yhat.device, dtype=yhat.dtype)
                        peaks = detect_peaks(x=yhat,w_phi= [0.5,0.5],
                                             original_lengths_all = original_len[0],
                                             phonemes = phoneme,
                                             len_ratio = 161.34011627906978 , 
                                             probs_real_all = prob)
                        peaks = peaks[0]* len_ratio/sr
                        yhat = peaks
                        yhat = yhat[1:]

                        if isinstance(y,list):
                            y = torch.tensor(y, device=yhat.device, dtype=yhat.dtype)
                            y = y*len_ratio/sr
                        l1_dist = torch.mean(torch.abs(y - yhat)).item()
                        l2_dist = torch.mean((y - yhat)**2).item()
                        if l1_dist<min_l1_dist:
                            min_l1_dist = l1_dist
                            out = (l1_dist,l2_dist)
                            best_params = width, distance
        self.zero()
        print(f"best peak detection params: {best_params} (width, distance)")
        print(f"best peak detection L1_DIST: {l1_dist}")
        print(f"best peak detection L2_DIST: {l2_dist}")
        return out, best_params


class StatsMeter:
    def __init__(self):
        self.data = []

    def update(self, item):
        if type(item) == list:
            self.data.extend(item)
        else:
            self.data.append(item)

    def get_stats(self):
        data = np.array(self.data)
        if len(data)==0:
            return float('nan')
        mean = data.mean()
        return mean

    def zero(self):
        self.data.clear()
        assert len(self.data) == 0, "StatsMeter didn't clear"


class Timer:
    def __init__(self, msg):
        self.msg = msg
        self.start_time = None

    def __enter__(self):
        self.start_time = time.time()
        print(f"{self.msg} -- started")

    def __exit__(self, exc_type, exc_value, exc_tb):
        print(f"{self.msg} -- done in {(time.time() - self.start_time)} secs")


def max_min_norm(x):
    x -= x.min(-1, keepdim=True)[0]
    x /= x.max(-1, keepdim=True)[0]
    return x


def line():
    print(90 * "-")