Spaces:
Running on Zero
Running on Zero
Upload folder using huggingface_hub
Browse files
- src/alignment/_dp_core.c +0 -0
- src/alignment/_dp_core.pyx +28 -4
- src/alignment/phoneme_matcher.py +18 -1
- src/mfa.py +2 -1
src/alignment/_dp_core.c
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/alignment/_dp_core.pyx
CHANGED
|
@@ -411,24 +411,28 @@ cdef tuple _align_full_3d(
|
|
| 411 |
cdef double *cost_3d = NULL
|
| 412 |
cdef int *start_3d = NULL
|
| 413 |
cdef int *max_j_3d = NULL
|
|
|
|
| 414 |
cdef int *par_i = NULL
|
| 415 |
cdef int *par_k = NULL
|
| 416 |
cdef int *par_j = NULL
|
| 417 |
cdef char *par_t = NULL # 0=sub, 1=del, 2=ins, 3=wrap
|
|
|
|
| 418 |
|
| 419 |
cost_3d = <double *>malloc(total_3d * sizeof(double))
|
| 420 |
start_3d = <int *>malloc(total_3d * sizeof(int))
|
| 421 |
max_j_3d = <int *>malloc(total_3d * sizeof(int))
|
|
|
|
| 422 |
par_i = <int *>malloc(total_3d * sizeof(int))
|
| 423 |
par_k = <int *>malloc(total_3d * sizeof(int))
|
| 424 |
par_j = <int *>malloc(total_3d * sizeof(int))
|
| 425 |
par_t = <char *>malloc(total_3d * sizeof(char))
|
| 426 |
|
| 427 |
-
if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or
|
| 428 |
par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL):
|
| 429 |
if cost_3d != NULL: free(cost_3d)
|
| 430 |
if start_3d != NULL: free(start_3d)
|
| 431 |
if max_j_3d != NULL: free(max_j_3d)
|
|
|
|
| 432 |
if par_i != NULL: free(par_i)
|
| 433 |
if par_k != NULL: free(par_k)
|
| 434 |
if par_j != NULL: free(par_j)
|
|
@@ -439,12 +443,14 @@ cdef tuple _align_full_3d(
|
|
| 439 |
cdef int i, j, k
|
| 440 |
cdef int koff, koff_src, koff_dst
|
| 441 |
cdef long base_i, base_prev
|
|
|
|
| 442 |
|
| 443 |
# Initialize all to INF / -1
|
| 444 |
for idx in range(total_3d):
|
| 445 |
cost_3d[idx] = INF_VAL
|
| 446 |
start_3d[idx] = -1
|
| 447 |
max_j_3d[idx] = -1
|
|
|
|
| 448 |
par_i[idx] = -1
|
| 449 |
par_k[idx] = -1
|
| 450 |
par_j[idx] = -1
|
|
@@ -456,6 +462,7 @@ cdef tuple _align_full_3d(
|
|
| 456 |
cost_3d[j] = 0.0 # i=0, k=0, j
|
| 457 |
start_3d[j] = j
|
| 458 |
max_j_3d[j] = j
|
|
|
|
| 459 |
|
| 460 |
# Variables for DP transitions
|
| 461 |
cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val
|
|
@@ -477,6 +484,7 @@ cdef tuple _align_full_3d(
|
|
| 477 |
cost_3d[idx] = i * cost_del
|
| 478 |
start_3d[idx] = 0
|
| 479 |
max_j_3d[idx] = 0
|
|
|
|
| 480 |
par_i[idx] = i - 1
|
| 481 |
par_k[idx] = 0
|
| 482 |
par_j[idx] = 0
|
|
@@ -506,17 +514,24 @@ cdef tuple _align_full_3d(
|
|
| 506 |
start_3d[idx] = start_3d[base_prev + koff + j - 1]
|
| 507 |
mj_val = max_j_3d[base_prev + koff + j - 1]
|
| 508 |
max_j_3d[idx] = j if j > mj_val else mj_val
|
|
|
|
|
|
|
|
|
|
| 509 |
par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0
|
| 510 |
elif del_opt <= ins_opt:
|
| 511 |
cost_3d[idx] = del_opt
|
| 512 |
start_3d[idx] = start_3d[base_prev + koff + j]
|
| 513 |
max_j_3d[idx] = max_j_3d[base_prev + koff + j]
|
|
|
|
| 514 |
par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1
|
| 515 |
elif ins_opt < INF_VAL:
|
| 516 |
cost_3d[idx] = ins_opt
|
| 517 |
start_3d[idx] = start_3d[base_i + koff + j - 1]
|
| 518 |
mj_val = max_j_3d[base_i + koff + j - 1]
|
| 519 |
max_j_3d[idx] = j if j > mj_val else mj_val
|
|
|
|
|
|
|
|
|
|
| 520 |
par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2
|
| 521 |
|
| 522 |
# Wrap transitions (within same row i)
|
|
@@ -544,6 +559,9 @@ cdef tuple _align_full_3d(
|
|
| 544 |
start_3d[idx] = start_3d[base_i + koff_src + j_end]
|
| 545 |
mj_val = max_j_3d[base_i + koff_src + j_end]
|
| 546 |
max_j_3d[idx] = j_end if j_end > mj_val else mj_val
|
|
|
|
|
|
|
|
|
|
| 547 |
par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3
|
| 548 |
|
| 549 |
# Insertion re-sweep from wrap positions
|
|
@@ -556,6 +574,9 @@ cdef tuple _align_full_3d(
|
|
| 556 |
start_3d[idx] = start_3d[base_i + koff_dst + j - 1]
|
| 557 |
mj_val = max_j_3d[base_i + koff_dst + j - 1]
|
| 558 |
max_j_3d[idx] = j if j > mj_val else mj_val
|
|
|
|
|
|
|
|
|
|
| 559 |
par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2
|
| 560 |
|
| 561 |
# ------------------------------------------------------------------
|
|
@@ -567,7 +588,7 @@ cdef tuple _align_full_3d(
|
|
| 567 |
cdef int best_k_val = 0, best_max_j = -1
|
| 568 |
|
| 569 |
cdef double dist, norm_dist, prior, score, phoneme_cost
|
| 570 |
-
cdef int j_start_val, ref_len, denom, start_word, max_j_val
|
| 571 |
|
| 572 |
base_i = <long>m * layer_stride
|
| 573 |
for k in range(K + 1):
|
|
@@ -603,7 +624,10 @@ cdef tuple _align_full_3d(
|
|
| 603 |
else:
|
| 604 |
start_word = enc.R_w[j - 1]
|
| 605 |
|
| 606 |
-
|
|
|
|
|
|
|
|
|
|
| 607 |
score = norm_dist + prior
|
| 608 |
if sc_mode == 2: # additive
|
| 609 |
score = score + k * wrap_score_cost
|
|
@@ -618,7 +642,7 @@ cdef tuple _align_full_3d(
|
|
| 618 |
best_max_j = max_j_val
|
| 619 |
|
| 620 |
if best_j < 0:
|
| 621 |
-
free(cost_3d); free(start_3d); free(max_j_3d)
|
| 622 |
free(par_i); free(par_k); free(par_j); free(par_t)
|
| 623 |
return (None, None, float('inf'), float('inf'), 0, 0, [])
|
| 624 |
|
|
|
|
| 411 |
cdef double *cost_3d = NULL
|
| 412 |
cdef int *start_3d = NULL
|
| 413 |
cdef int *max_j_3d = NULL
|
| 414 |
+
cdef int *min_w_3d = NULL # minimum word index reached along path
|
| 415 |
cdef int *par_i = NULL
|
| 416 |
cdef int *par_k = NULL
|
| 417 |
cdef int *par_j = NULL
|
| 418 |
cdef char *par_t = NULL # 0=sub, 1=del, 2=ins, 3=wrap
|
| 419 |
+
cdef int BIG_W = 999999
|
| 420 |
|
| 421 |
cost_3d = <double *>malloc(total_3d * sizeof(double))
|
| 422 |
start_3d = <int *>malloc(total_3d * sizeof(int))
|
| 423 |
max_j_3d = <int *>malloc(total_3d * sizeof(int))
|
| 424 |
+
min_w_3d = <int *>malloc(total_3d * sizeof(int))
|
| 425 |
par_i = <int *>malloc(total_3d * sizeof(int))
|
| 426 |
par_k = <int *>malloc(total_3d * sizeof(int))
|
| 427 |
par_j = <int *>malloc(total_3d * sizeof(int))
|
| 428 |
par_t = <char *>malloc(total_3d * sizeof(char))
|
| 429 |
|
| 430 |
+
if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or min_w_3d == NULL or
|
| 431 |
par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL):
|
| 432 |
if cost_3d != NULL: free(cost_3d)
|
| 433 |
if start_3d != NULL: free(start_3d)
|
| 434 |
if max_j_3d != NULL: free(max_j_3d)
|
| 435 |
+
if min_w_3d != NULL: free(min_w_3d)
|
| 436 |
if par_i != NULL: free(par_i)
|
| 437 |
if par_k != NULL: free(par_k)
|
| 438 |
if par_j != NULL: free(par_j)
|
|
|
|
| 443 |
cdef int i, j, k
|
| 444 |
cdef int koff, koff_src, koff_dst
|
| 445 |
cdef long base_i, base_prev
|
| 446 |
+
cdef int w_j, mw_val
|
| 447 |
|
| 448 |
# Initialize all to INF / -1
|
| 449 |
for idx in range(total_3d):
|
| 450 |
cost_3d[idx] = INF_VAL
|
| 451 |
start_3d[idx] = -1
|
| 452 |
max_j_3d[idx] = -1
|
| 453 |
+
min_w_3d[idx] = BIG_W
|
| 454 |
par_i[idx] = -1
|
| 455 |
par_k[idx] = -1
|
| 456 |
par_j[idx] = -1
|
|
|
|
| 462 |
cost_3d[j] = 0.0 # i=0, k=0, j
|
| 463 |
start_3d[j] = j
|
| 464 |
max_j_3d[j] = j
|
| 465 |
+
min_w_3d[j] = enc.R_w[j] if j < n else BIG_W
|
| 466 |
|
| 467 |
# Variables for DP transitions
|
| 468 |
cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val
|
|
|
|
| 484 |
cost_3d[idx] = i * cost_del
|
| 485 |
start_3d[idx] = 0
|
| 486 |
max_j_3d[idx] = 0
|
| 487 |
+
min_w_3d[idx] = min_w_3d[base_prev + koff]
|
| 488 |
par_i[idx] = i - 1
|
| 489 |
par_k[idx] = 0
|
| 490 |
par_j[idx] = 0
|
|
|
|
| 514 |
start_3d[idx] = start_3d[base_prev + koff + j - 1]
|
| 515 |
mj_val = max_j_3d[base_prev + koff + j - 1]
|
| 516 |
max_j_3d[idx] = j if j > mj_val else mj_val
|
| 517 |
+
w_j = enc.R_w[j - 1]
|
| 518 |
+
mw_val = min_w_3d[base_prev + koff + j - 1]
|
| 519 |
+
min_w_3d[idx] = w_j if w_j < mw_val else mw_val
|
| 520 |
par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0
|
| 521 |
elif del_opt <= ins_opt:
|
| 522 |
cost_3d[idx] = del_opt
|
| 523 |
start_3d[idx] = start_3d[base_prev + koff + j]
|
| 524 |
max_j_3d[idx] = max_j_3d[base_prev + koff + j]
|
| 525 |
+
min_w_3d[idx] = min_w_3d[base_prev + koff + j]
|
| 526 |
par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1
|
| 527 |
elif ins_opt < INF_VAL:
|
| 528 |
cost_3d[idx] = ins_opt
|
| 529 |
start_3d[idx] = start_3d[base_i + koff + j - 1]
|
| 530 |
mj_val = max_j_3d[base_i + koff + j - 1]
|
| 531 |
max_j_3d[idx] = j if j > mj_val else mj_val
|
| 532 |
+
w_j = enc.R_w[j - 1]
|
| 533 |
+
mw_val = min_w_3d[base_i + koff + j - 1]
|
| 534 |
+
min_w_3d[idx] = w_j if w_j < mw_val else mw_val
|
| 535 |
par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2
|
| 536 |
|
| 537 |
# Wrap transitions (within same row i)
|
|
|
|
| 559 |
start_3d[idx] = start_3d[base_i + koff_src + j_end]
|
| 560 |
mj_val = max_j_3d[base_i + koff_src + j_end]
|
| 561 |
max_j_3d[idx] = j_end if j_end > mj_val else mj_val
|
| 562 |
+
mw_val = min_w_3d[base_i + koff_src + j_end]
|
| 563 |
+
w_j = enc.R_w[j_sw]
|
| 564 |
+
min_w_3d[idx] = w_j if w_j < mw_val else mw_val
|
| 565 |
par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3
|
| 566 |
|
| 567 |
# Insertion re-sweep from wrap positions
|
|
|
|
| 574 |
start_3d[idx] = start_3d[base_i + koff_dst + j - 1]
|
| 575 |
mj_val = max_j_3d[base_i + koff_dst + j - 1]
|
| 576 |
max_j_3d[idx] = j if j > mj_val else mj_val
|
| 577 |
+
w_j = enc.R_w[j - 1]
|
| 578 |
+
mw_val = min_w_3d[base_i + koff_dst + j - 1]
|
| 579 |
+
min_w_3d[idx] = w_j if w_j < mw_val else mw_val
|
| 580 |
par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2
|
| 581 |
|
| 582 |
# ------------------------------------------------------------------
|
|
|
|
| 588 |
cdef int best_k_val = 0, best_max_j = -1
|
| 589 |
|
| 590 |
cdef double dist, norm_dist, prior, score, phoneme_cost
|
| 591 |
+
cdef int j_start_val, ref_len, denom, start_word, max_j_val, min_word_val, eff_start
|
| 592 |
|
| 593 |
base_i = <long>m * layer_stride
|
| 594 |
for k in range(K + 1):
|
|
|
|
| 624 |
else:
|
| 625 |
start_word = enc.R_w[j - 1]
|
| 626 |
|
| 627 |
+
# Use earliest word the path actually touches for fair prior
|
| 628 |
+
min_word_val = min_w_3d[idx]
|
| 629 |
+
eff_start = min_word_val if min_word_val < start_word and min_word_val < BIG_W else start_word
|
| 630 |
+
prior = prior_weight * fabs(<double>(eff_start - expected_word))
|
| 631 |
score = norm_dist + prior
|
| 632 |
if sc_mode == 2: # additive
|
| 633 |
score = score + k * wrap_score_cost
|
|
|
|
| 642 |
best_max_j = max_j_val
|
| 643 |
|
| 644 |
if best_j < 0:
|
| 645 |
+
free(cost_3d); free(start_3d); free(max_j_3d); free(min_w_3d)
|
| 646 |
free(par_i); free(par_k); free(par_j); free(par_t)
|
| 647 |
return (None, None, float('inf'), float('inf'), 0, 0, [])
|
| 648 |
|
src/alignment/phoneme_matcher.py
CHANGED
|
@@ -408,12 +408,18 @@ def align_wraparound(
|
|
| 408 |
parent = [[[None] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
| 409 |
start_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
| 410 |
max_j_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
# Initialize: k=0, free starts at word boundaries
|
| 413 |
for j in word_starts:
|
| 414 |
dp[0][0][j] = 0.0
|
| 415 |
start_arr[0][0][j] = j
|
| 416 |
max_j_arr[0][0][j] = j
|
|
|
|
| 417 |
|
| 418 |
# Fill DP
|
| 419 |
for i in range(1, m + 1):
|
|
@@ -423,6 +429,7 @@ def align_wraparound(
|
|
| 423 |
parent[i][k][0] = (i - 1, k, 0, 'D')
|
| 424 |
start_arr[i][k][0] = 0
|
| 425 |
max_j_arr[i][k][0] = 0
|
|
|
|
| 426 |
|
| 427 |
for j in range(1, n + 1):
|
| 428 |
del_opt = dp[i-1][k][j] + cost_del if dp[i-1][k][j] < INF else INF
|
|
@@ -433,18 +440,22 @@ def align_wraparound(
|
|
| 433 |
best = min(del_opt, ins_opt, sub_opt)
|
| 434 |
if best < INF:
|
| 435 |
dp[i][k][j] = best
|
|
|
|
| 436 |
if best == sub_opt:
|
| 437 |
parent[i][k][j] = (i - 1, k, j - 1, 'S')
|
| 438 |
start_arr[i][k][j] = start_arr[i-1][k][j-1]
|
| 439 |
max_j_arr[i][k][j] = max(max_j_arr[i-1][k][j-1], j)
|
|
|
|
| 440 |
elif best == del_opt:
|
| 441 |
parent[i][k][j] = (i - 1, k, j, 'D')
|
| 442 |
start_arr[i][k][j] = start_arr[i-1][k][j]
|
| 443 |
max_j_arr[i][k][j] = max_j_arr[i-1][k][j]
|
|
|
|
| 444 |
else:
|
| 445 |
parent[i][k][j] = (i, k, j - 1, 'I')
|
| 446 |
start_arr[i][k][j] = start_arr[i][k][j-1]
|
| 447 |
max_j_arr[i][k][j] = max(max_j_arr[i][k][j-1], j)
|
|
|
|
| 448 |
|
| 449 |
# Wrap transitions
|
| 450 |
for k in range(K):
|
|
@@ -462,6 +473,7 @@ def align_wraparound(
|
|
| 462 |
parent[i][k+1][j_s] = (i, k, j_end, 'W')
|
| 463 |
start_arr[i][k+1][j_s] = start_arr[i][k][j_end]
|
| 464 |
max_j_arr[i][k+1][j_s] = max(max_j_arr[i][k][j_end], j_end)
|
|
|
|
| 465 |
|
| 466 |
# Re-propagate insertions from wrap positions
|
| 467 |
for j in range(1, n + 1):
|
|
@@ -471,6 +483,8 @@ def align_wraparound(
|
|
| 471 |
parent[i][k+1][j] = (i, k+1, j-1, 'I')
|
| 472 |
start_arr[i][k+1][j] = start_arr[i][k+1][j-1]
|
| 473 |
max_j_arr[i][k+1][j] = max(max_j_arr[i][k+1][j-1], j)
|
|
|
|
|
|
|
| 474 |
|
| 475 |
# Best-match selection
|
| 476 |
best_score = INF
|
|
@@ -504,7 +518,10 @@ def align_wraparound(
|
|
| 504 |
nd = pc / denom
|
| 505 |
|
| 506 |
sw = R_phone_to_word[j_s] if j_s < n else R_phone_to_word[j - 1]
|
| 507 |
-
|
|
|
|
|
|
|
|
|
|
| 508 |
score = nd + prior
|
| 509 |
if scoring_mode == "additive":
|
| 510 |
score += k * wrap_score_cost
|
|
|
|
| 408 |
parent = [[[None] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
| 409 |
start_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
| 410 |
max_j_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
| 411 |
+
# Track minimum word index reached along each path so wrap paths
|
| 412 |
+
# can't game the position prior by starting near the expected word
|
| 413 |
+
# then jumping backward.
|
| 414 |
+
BIG_W = 999999
|
| 415 |
+
min_w_arr = [[[BIG_W] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
|
| 416 |
|
| 417 |
# Initialize: k=0, free starts at word boundaries
|
| 418 |
for j in word_starts:
|
| 419 |
dp[0][0][j] = 0.0
|
| 420 |
start_arr[0][0][j] = j
|
| 421 |
max_j_arr[0][0][j] = j
|
| 422 |
+
min_w_arr[0][0][j] = R_phone_to_word[j] if j < n else BIG_W
|
| 423 |
|
| 424 |
# Fill DP
|
| 425 |
for i in range(1, m + 1):
|
|
|
|
| 429 |
parent[i][k][0] = (i - 1, k, 0, 'D')
|
| 430 |
start_arr[i][k][0] = 0
|
| 431 |
max_j_arr[i][k][0] = 0
|
| 432 |
+
min_w_arr[i][k][0] = min_w_arr[i-1][k][0]
|
| 433 |
|
| 434 |
for j in range(1, n + 1):
|
| 435 |
del_opt = dp[i-1][k][j] + cost_del if dp[i-1][k][j] < INF else INF
|
|
|
|
| 440 |
best = min(del_opt, ins_opt, sub_opt)
|
| 441 |
if best < INF:
|
| 442 |
dp[i][k][j] = best
|
| 443 |
+
w_j = R_phone_to_word[j - 1] if j > 0 else BIG_W
|
| 444 |
if best == sub_opt:
|
| 445 |
parent[i][k][j] = (i - 1, k, j - 1, 'S')
|
| 446 |
start_arr[i][k][j] = start_arr[i-1][k][j-1]
|
| 447 |
max_j_arr[i][k][j] = max(max_j_arr[i-1][k][j-1], j)
|
| 448 |
+
min_w_arr[i][k][j] = min(min_w_arr[i-1][k][j-1], w_j)
|
| 449 |
elif best == del_opt:
|
| 450 |
parent[i][k][j] = (i - 1, k, j, 'D')
|
| 451 |
start_arr[i][k][j] = start_arr[i-1][k][j]
|
| 452 |
max_j_arr[i][k][j] = max_j_arr[i-1][k][j]
|
| 453 |
+
min_w_arr[i][k][j] = min_w_arr[i-1][k][j]
|
| 454 |
else:
|
| 455 |
parent[i][k][j] = (i, k, j - 1, 'I')
|
| 456 |
start_arr[i][k][j] = start_arr[i][k][j-1]
|
| 457 |
max_j_arr[i][k][j] = max(max_j_arr[i][k][j-1], j)
|
| 458 |
+
min_w_arr[i][k][j] = min(min_w_arr[i][k][j-1], w_j)
|
| 459 |
|
| 460 |
# Wrap transitions
|
| 461 |
for k in range(K):
|
|
|
|
| 473 |
parent[i][k+1][j_s] = (i, k, j_end, 'W')
|
| 474 |
start_arr[i][k+1][j_s] = start_arr[i][k][j_end]
|
| 475 |
max_j_arr[i][k+1][j_s] = max(max_j_arr[i][k][j_end], j_end)
|
| 476 |
+
min_w_arr[i][k+1][j_s] = min(min_w_arr[i][k][j_end], R_phone_to_word[j_s])
|
| 477 |
|
| 478 |
# Re-propagate insertions from wrap positions
|
| 479 |
for j in range(1, n + 1):
|
|
|
|
| 483 |
parent[i][k+1][j] = (i, k+1, j-1, 'I')
|
| 484 |
start_arr[i][k+1][j] = start_arr[i][k+1][j-1]
|
| 485 |
max_j_arr[i][k+1][j] = max(max_j_arr[i][k+1][j-1], j)
|
| 486 |
+
w_j = R_phone_to_word[j - 1] if j > 0 else BIG_W
|
| 487 |
+
min_w_arr[i][k+1][j] = min(min_w_arr[i][k+1][j-1], w_j)
|
| 488 |
|
| 489 |
# Best-match selection
|
| 490 |
best_score = INF
|
|
|
|
| 518 |
nd = pc / denom
|
| 519 |
|
| 520 |
sw = R_phone_to_word[j_s] if j_s < n else R_phone_to_word[j - 1]
|
| 521 |
+
# Use the earliest word the path actually touches for a fair prior
|
| 522 |
+
mw = min_w_arr[m][k][j]
|
| 523 |
+
eff_sw = min(sw, mw) if mw < BIG_W else sw
|
| 524 |
+
prior = prior_weight * abs(eff_sw - expected_word)
|
| 525 |
score = nd + prior
|
| 526 |
if scoring_mode == "additive":
|
| 527 |
score += k * wrap_score_cost
|
src/mfa.py
CHANGED
|
@@ -694,7 +694,8 @@ def compute_mfa_timestamps(current_html, json_output, segment_dir, cached_log_ro
|
|
| 694 |
# (patch-based edits update state but skip output_html)
|
| 695 |
if segment_dir:
|
| 696 |
from src.ui.segments import render_segments
|
| 697 |
-
|
|
|
|
| 698 |
|
| 699 |
if not current_html or '<span class="word"' not in current_html:
|
| 700 |
yield current_html, gr.update(), gr.update(), gr.update(), gr.update()
|
|
|
|
| 694 |
# (patch-based edits update state but skip output_html)
|
| 695 |
if segment_dir:
|
| 696 |
from src.ui.segments import render_segments
|
| 697 |
+
full_audio_url = f"/gradio_api/file={segment_dir}/full.wav"
|
| 698 |
+
current_html = render_segments(segments_state, full_audio_url=full_audio_url, segment_dir=str(segment_dir))
|
| 699 |
|
| 700 |
if not current_html or '<span class="word"' not in current_html:
|
| 701 |
yield current_html, gr.update(), gr.update(), gr.update(), gr.update()
|