hetchyy committed on
Commit
2aebf7d
·
verified ·
1 Parent(s): 0b34cc0

Upload folder using huggingface_hub

Browse files
src/alignment/_dp_core.c CHANGED
The diff for this file is too large to render. See raw diff
 
src/alignment/_dp_core.pyx CHANGED
@@ -411,24 +411,28 @@ cdef tuple _align_full_3d(
411
  cdef double *cost_3d = NULL
412
  cdef int *start_3d = NULL
413
  cdef int *max_j_3d = NULL
 
414
  cdef int *par_i = NULL
415
  cdef int *par_k = NULL
416
  cdef int *par_j = NULL
417
  cdef char *par_t = NULL # 0=sub, 1=del, 2=ins, 3=wrap
 
418
 
419
  cost_3d = <double *>malloc(total_3d * sizeof(double))
420
  start_3d = <int *>malloc(total_3d * sizeof(int))
421
  max_j_3d = <int *>malloc(total_3d * sizeof(int))
 
422
  par_i = <int *>malloc(total_3d * sizeof(int))
423
  par_k = <int *>malloc(total_3d * sizeof(int))
424
  par_j = <int *>malloc(total_3d * sizeof(int))
425
  par_t = <char *>malloc(total_3d * sizeof(char))
426
 
427
- if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or
428
  par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL):
429
  if cost_3d != NULL: free(cost_3d)
430
  if start_3d != NULL: free(start_3d)
431
  if max_j_3d != NULL: free(max_j_3d)
 
432
  if par_i != NULL: free(par_i)
433
  if par_k != NULL: free(par_k)
434
  if par_j != NULL: free(par_j)
@@ -439,12 +443,14 @@ cdef tuple _align_full_3d(
439
  cdef int i, j, k
440
  cdef int koff, koff_src, koff_dst
441
  cdef long base_i, base_prev
 
442
 
443
  # Initialize all to INF / -1
444
  for idx in range(total_3d):
445
  cost_3d[idx] = INF_VAL
446
  start_3d[idx] = -1
447
  max_j_3d[idx] = -1
 
448
  par_i[idx] = -1
449
  par_k[idx] = -1
450
  par_j[idx] = -1
@@ -456,6 +462,7 @@ cdef tuple _align_full_3d(
456
  cost_3d[j] = 0.0 # i=0, k=0, j
457
  start_3d[j] = j
458
  max_j_3d[j] = j
 
459
 
460
  # Variables for DP transitions
461
  cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val
@@ -477,6 +484,7 @@ cdef tuple _align_full_3d(
477
  cost_3d[idx] = i * cost_del
478
  start_3d[idx] = 0
479
  max_j_3d[idx] = 0
 
480
  par_i[idx] = i - 1
481
  par_k[idx] = 0
482
  par_j[idx] = 0
@@ -506,17 +514,24 @@ cdef tuple _align_full_3d(
506
  start_3d[idx] = start_3d[base_prev + koff + j - 1]
507
  mj_val = max_j_3d[base_prev + koff + j - 1]
508
  max_j_3d[idx] = j if j > mj_val else mj_val
 
 
 
509
  par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0
510
  elif del_opt <= ins_opt:
511
  cost_3d[idx] = del_opt
512
  start_3d[idx] = start_3d[base_prev + koff + j]
513
  max_j_3d[idx] = max_j_3d[base_prev + koff + j]
 
514
  par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1
515
  elif ins_opt < INF_VAL:
516
  cost_3d[idx] = ins_opt
517
  start_3d[idx] = start_3d[base_i + koff + j - 1]
518
  mj_val = max_j_3d[base_i + koff + j - 1]
519
  max_j_3d[idx] = j if j > mj_val else mj_val
 
 
 
520
  par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2
521
 
522
  # Wrap transitions (within same row i)
@@ -544,6 +559,9 @@ cdef tuple _align_full_3d(
544
  start_3d[idx] = start_3d[base_i + koff_src + j_end]
545
  mj_val = max_j_3d[base_i + koff_src + j_end]
546
  max_j_3d[idx] = j_end if j_end > mj_val else mj_val
 
 
 
547
  par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3
548
 
549
  # Insertion re-sweep from wrap positions
@@ -556,6 +574,9 @@ cdef tuple _align_full_3d(
556
  start_3d[idx] = start_3d[base_i + koff_dst + j - 1]
557
  mj_val = max_j_3d[base_i + koff_dst + j - 1]
558
  max_j_3d[idx] = j if j > mj_val else mj_val
 
 
 
559
  par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2
560
 
561
  # ------------------------------------------------------------------
@@ -567,7 +588,7 @@ cdef tuple _align_full_3d(
567
  cdef int best_k_val = 0, best_max_j = -1
568
 
569
  cdef double dist, norm_dist, prior, score, phoneme_cost
570
- cdef int j_start_val, ref_len, denom, start_word, max_j_val
571
 
572
  base_i = <long>m * layer_stride
573
  for k in range(K + 1):
@@ -603,7 +624,10 @@ cdef tuple _align_full_3d(
603
  else:
604
  start_word = enc.R_w[j - 1]
605
 
606
- prior = prior_weight * fabs(<double>(start_word - expected_word))
 
 
 
607
  score = norm_dist + prior
608
  if sc_mode == 2: # additive
609
  score = score + k * wrap_score_cost
@@ -618,7 +642,7 @@ cdef tuple _align_full_3d(
618
  best_max_j = max_j_val
619
 
620
  if best_j < 0:
621
- free(cost_3d); free(start_3d); free(max_j_3d)
622
  free(par_i); free(par_k); free(par_j); free(par_t)
623
  return (None, None, float('inf'), float('inf'), 0, 0, [])
624
 
 
411
  cdef double *cost_3d = NULL
412
  cdef int *start_3d = NULL
413
  cdef int *max_j_3d = NULL
414
+ cdef int *min_w_3d = NULL # minimum word index reached along path
415
  cdef int *par_i = NULL
416
  cdef int *par_k = NULL
417
  cdef int *par_j = NULL
418
  cdef char *par_t = NULL # 0=sub, 1=del, 2=ins, 3=wrap
419
+ cdef int BIG_W = 999999
420
 
421
  cost_3d = <double *>malloc(total_3d * sizeof(double))
422
  start_3d = <int *>malloc(total_3d * sizeof(int))
423
  max_j_3d = <int *>malloc(total_3d * sizeof(int))
424
+ min_w_3d = <int *>malloc(total_3d * sizeof(int))
425
  par_i = <int *>malloc(total_3d * sizeof(int))
426
  par_k = <int *>malloc(total_3d * sizeof(int))
427
  par_j = <int *>malloc(total_3d * sizeof(int))
428
  par_t = <char *>malloc(total_3d * sizeof(char))
429
 
430
+ if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or min_w_3d == NULL or
431
  par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL):
432
  if cost_3d != NULL: free(cost_3d)
433
  if start_3d != NULL: free(start_3d)
434
  if max_j_3d != NULL: free(max_j_3d)
435
+ if min_w_3d != NULL: free(min_w_3d)
436
  if par_i != NULL: free(par_i)
437
  if par_k != NULL: free(par_k)
438
  if par_j != NULL: free(par_j)
 
443
  cdef int i, j, k
444
  cdef int koff, koff_src, koff_dst
445
  cdef long base_i, base_prev
446
+ cdef int w_j, mw_val
447
 
448
  # Initialize all to INF / -1
449
  for idx in range(total_3d):
450
  cost_3d[idx] = INF_VAL
451
  start_3d[idx] = -1
452
  max_j_3d[idx] = -1
453
+ min_w_3d[idx] = BIG_W
454
  par_i[idx] = -1
455
  par_k[idx] = -1
456
  par_j[idx] = -1
 
462
  cost_3d[j] = 0.0 # i=0, k=0, j
463
  start_3d[j] = j
464
  max_j_3d[j] = j
465
+ min_w_3d[j] = enc.R_w[j] if j < n else BIG_W
466
 
467
  # Variables for DP transitions
468
  cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val
 
484
  cost_3d[idx] = i * cost_del
485
  start_3d[idx] = 0
486
  max_j_3d[idx] = 0
487
+ min_w_3d[idx] = min_w_3d[base_prev + koff]
488
  par_i[idx] = i - 1
489
  par_k[idx] = 0
490
  par_j[idx] = 0
 
514
  start_3d[idx] = start_3d[base_prev + koff + j - 1]
515
  mj_val = max_j_3d[base_prev + koff + j - 1]
516
  max_j_3d[idx] = j if j > mj_val else mj_val
517
+ w_j = enc.R_w[j - 1]
518
+ mw_val = min_w_3d[base_prev + koff + j - 1]
519
+ min_w_3d[idx] = w_j if w_j < mw_val else mw_val
520
  par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0
521
  elif del_opt <= ins_opt:
522
  cost_3d[idx] = del_opt
523
  start_3d[idx] = start_3d[base_prev + koff + j]
524
  max_j_3d[idx] = max_j_3d[base_prev + koff + j]
525
+ min_w_3d[idx] = min_w_3d[base_prev + koff + j]
526
  par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1
527
  elif ins_opt < INF_VAL:
528
  cost_3d[idx] = ins_opt
529
  start_3d[idx] = start_3d[base_i + koff + j - 1]
530
  mj_val = max_j_3d[base_i + koff + j - 1]
531
  max_j_3d[idx] = j if j > mj_val else mj_val
532
+ w_j = enc.R_w[j - 1]
533
+ mw_val = min_w_3d[base_i + koff + j - 1]
534
+ min_w_3d[idx] = w_j if w_j < mw_val else mw_val
535
  par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2
536
 
537
  # Wrap transitions (within same row i)
 
559
  start_3d[idx] = start_3d[base_i + koff_src + j_end]
560
  mj_val = max_j_3d[base_i + koff_src + j_end]
561
  max_j_3d[idx] = j_end if j_end > mj_val else mj_val
562
+ mw_val = min_w_3d[base_i + koff_src + j_end]
563
+ w_j = enc.R_w[j_sw]
564
+ min_w_3d[idx] = w_j if w_j < mw_val else mw_val
565
  par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3
566
 
567
  # Insertion re-sweep from wrap positions
 
574
  start_3d[idx] = start_3d[base_i + koff_dst + j - 1]
575
  mj_val = max_j_3d[base_i + koff_dst + j - 1]
576
  max_j_3d[idx] = j if j > mj_val else mj_val
577
+ w_j = enc.R_w[j - 1]
578
+ mw_val = min_w_3d[base_i + koff_dst + j - 1]
579
+ min_w_3d[idx] = w_j if w_j < mw_val else mw_val
580
  par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2
581
 
582
  # ------------------------------------------------------------------
 
588
  cdef int best_k_val = 0, best_max_j = -1
589
 
590
  cdef double dist, norm_dist, prior, score, phoneme_cost
591
+ cdef int j_start_val, ref_len, denom, start_word, max_j_val, min_word_val, eff_start
592
 
593
  base_i = <long>m * layer_stride
594
  for k in range(K + 1):
 
624
  else:
625
  start_word = enc.R_w[j - 1]
626
 
627
+ # Use earliest word the path actually touches for fair prior
628
+ min_word_val = min_w_3d[idx]
629
+ eff_start = min_word_val if min_word_val < start_word and min_word_val < BIG_W else start_word
630
+ prior = prior_weight * fabs(<double>(eff_start - expected_word))
631
  score = norm_dist + prior
632
  if sc_mode == 2: # additive
633
  score = score + k * wrap_score_cost
 
642
  best_max_j = max_j_val
643
 
644
  if best_j < 0:
645
+ free(cost_3d); free(start_3d); free(max_j_3d); free(min_w_3d)
646
  free(par_i); free(par_k); free(par_j); free(par_t)
647
  return (None, None, float('inf'), float('inf'), 0, 0, [])
648
 
src/alignment/phoneme_matcher.py CHANGED
@@ -408,12 +408,18 @@ def align_wraparound(
408
  parent = [[[None] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
409
  start_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
410
  max_j_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
 
 
 
 
 
411
 
412
  # Initialize: k=0, free starts at word boundaries
413
  for j in word_starts:
414
  dp[0][0][j] = 0.0
415
  start_arr[0][0][j] = j
416
  max_j_arr[0][0][j] = j
 
417
 
418
  # Fill DP
419
  for i in range(1, m + 1):
@@ -423,6 +429,7 @@ def align_wraparound(
423
  parent[i][k][0] = (i - 1, k, 0, 'D')
424
  start_arr[i][k][0] = 0
425
  max_j_arr[i][k][0] = 0
 
426
 
427
  for j in range(1, n + 1):
428
  del_opt = dp[i-1][k][j] + cost_del if dp[i-1][k][j] < INF else INF
@@ -433,18 +440,22 @@ def align_wraparound(
433
  best = min(del_opt, ins_opt, sub_opt)
434
  if best < INF:
435
  dp[i][k][j] = best
 
436
  if best == sub_opt:
437
  parent[i][k][j] = (i - 1, k, j - 1, 'S')
438
  start_arr[i][k][j] = start_arr[i-1][k][j-1]
439
  max_j_arr[i][k][j] = max(max_j_arr[i-1][k][j-1], j)
 
440
  elif best == del_opt:
441
  parent[i][k][j] = (i - 1, k, j, 'D')
442
  start_arr[i][k][j] = start_arr[i-1][k][j]
443
  max_j_arr[i][k][j] = max_j_arr[i-1][k][j]
 
444
  else:
445
  parent[i][k][j] = (i, k, j - 1, 'I')
446
  start_arr[i][k][j] = start_arr[i][k][j-1]
447
  max_j_arr[i][k][j] = max(max_j_arr[i][k][j-1], j)
 
448
 
449
  # Wrap transitions
450
  for k in range(K):
@@ -462,6 +473,7 @@ def align_wraparound(
462
  parent[i][k+1][j_s] = (i, k, j_end, 'W')
463
  start_arr[i][k+1][j_s] = start_arr[i][k][j_end]
464
  max_j_arr[i][k+1][j_s] = max(max_j_arr[i][k][j_end], j_end)
 
465
 
466
  # Re-propagate insertions from wrap positions
467
  for j in range(1, n + 1):
@@ -471,6 +483,8 @@ def align_wraparound(
471
  parent[i][k+1][j] = (i, k+1, j-1, 'I')
472
  start_arr[i][k+1][j] = start_arr[i][k+1][j-1]
473
  max_j_arr[i][k+1][j] = max(max_j_arr[i][k+1][j-1], j)
 
 
474
 
475
  # Best-match selection
476
  best_score = INF
@@ -504,7 +518,10 @@ def align_wraparound(
504
  nd = pc / denom
505
 
506
  sw = R_phone_to_word[j_s] if j_s < n else R_phone_to_word[j - 1]
507
- prior = prior_weight * abs(sw - expected_word)
 
 
 
508
  score = nd + prior
509
  if scoring_mode == "additive":
510
  score += k * wrap_score_cost
 
408
  parent = [[[None] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
409
  start_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
410
  max_j_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
411
+ # Track minimum word index reached along each path so wrap paths
412
+ # can't game the position prior by starting near the expected word
413
+ # then jumping backward.
414
+ BIG_W = 999999
415
+ min_w_arr = [[[BIG_W] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
416
 
417
  # Initialize: k=0, free starts at word boundaries
418
  for j in word_starts:
419
  dp[0][0][j] = 0.0
420
  start_arr[0][0][j] = j
421
  max_j_arr[0][0][j] = j
422
+ min_w_arr[0][0][j] = R_phone_to_word[j] if j < n else BIG_W
423
 
424
  # Fill DP
425
  for i in range(1, m + 1):
 
429
  parent[i][k][0] = (i - 1, k, 0, 'D')
430
  start_arr[i][k][0] = 0
431
  max_j_arr[i][k][0] = 0
432
+ min_w_arr[i][k][0] = min_w_arr[i-1][k][0]
433
 
434
  for j in range(1, n + 1):
435
  del_opt = dp[i-1][k][j] + cost_del if dp[i-1][k][j] < INF else INF
 
440
  best = min(del_opt, ins_opt, sub_opt)
441
  if best < INF:
442
  dp[i][k][j] = best
443
+ w_j = R_phone_to_word[j - 1] if j > 0 else BIG_W
444
  if best == sub_opt:
445
  parent[i][k][j] = (i - 1, k, j - 1, 'S')
446
  start_arr[i][k][j] = start_arr[i-1][k][j-1]
447
  max_j_arr[i][k][j] = max(max_j_arr[i-1][k][j-1], j)
448
+ min_w_arr[i][k][j] = min(min_w_arr[i-1][k][j-1], w_j)
449
  elif best == del_opt:
450
  parent[i][k][j] = (i - 1, k, j, 'D')
451
  start_arr[i][k][j] = start_arr[i-1][k][j]
452
  max_j_arr[i][k][j] = max_j_arr[i-1][k][j]
453
+ min_w_arr[i][k][j] = min_w_arr[i-1][k][j]
454
  else:
455
  parent[i][k][j] = (i, k, j - 1, 'I')
456
  start_arr[i][k][j] = start_arr[i][k][j-1]
457
  max_j_arr[i][k][j] = max(max_j_arr[i][k][j-1], j)
458
+ min_w_arr[i][k][j] = min(min_w_arr[i][k][j-1], w_j)
459
 
460
  # Wrap transitions
461
  for k in range(K):
 
473
  parent[i][k+1][j_s] = (i, k, j_end, 'W')
474
  start_arr[i][k+1][j_s] = start_arr[i][k][j_end]
475
  max_j_arr[i][k+1][j_s] = max(max_j_arr[i][k][j_end], j_end)
476
+ min_w_arr[i][k+1][j_s] = min(min_w_arr[i][k][j_end], R_phone_to_word[j_s])
477
 
478
  # Re-propagate insertions from wrap positions
479
  for j in range(1, n + 1):
 
483
  parent[i][k+1][j] = (i, k+1, j-1, 'I')
484
  start_arr[i][k+1][j] = start_arr[i][k+1][j-1]
485
  max_j_arr[i][k+1][j] = max(max_j_arr[i][k+1][j-1], j)
486
+ w_j = R_phone_to_word[j - 1] if j > 0 else BIG_W
487
+ min_w_arr[i][k+1][j] = min(min_w_arr[i][k+1][j-1], w_j)
488
 
489
  # Best-match selection
490
  best_score = INF
 
518
  nd = pc / denom
519
 
520
  sw = R_phone_to_word[j_s] if j_s < n else R_phone_to_word[j - 1]
521
+ # Use the earliest word the path actually touches for a fair prior
522
+ mw = min_w_arr[m][k][j]
523
+ eff_sw = min(sw, mw) if mw < BIG_W else sw
524
+ prior = prior_weight * abs(eff_sw - expected_word)
525
  score = nd + prior
526
  if scoring_mode == "additive":
527
  score += k * wrap_score_cost
src/mfa.py CHANGED
@@ -694,7 +694,8 @@ def compute_mfa_timestamps(current_html, json_output, segment_dir, cached_log_ro
694
  # (patch-based edits update state but skip output_html)
695
  if segment_dir:
696
  from src.ui.segments import render_segments
697
- current_html = render_segments(segments_state, segment_dir=str(segment_dir))
 
698
 
699
  if not current_html or '<span class="word"' not in current_html:
700
  yield current_html, gr.update(), gr.update(), gr.update(), gr.update()
 
694
  # (patch-based edits update state but skip output_html)
695
  if segment_dir:
696
  from src.ui.segments import render_segments
697
+ full_audio_url = f"/gradio_api/file={segment_dir}/full.wav"
698
+ current_html = render_segments(segments_state, full_audio_url=full_audio_url, segment_dir=str(segment_dir))
699
 
700
  if not current_html or '<span class="word"' not in current_html:
701
  yield current_html, gr.update(), gr.update(), gr.update(), gr.update()