Quran-multi-aligner

Running on Zero

App Files Community

hetchyy committed 13 days ago

Commit

2aebf7d

verified ·

1 Parent(s): 0b34cc0

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

src/alignment/_dp_core.c +0 -0
src/alignment/_dp_core.pyx +28 -4
src/alignment/phoneme_matcher.py +18 -1
src/mfa.py +2 -1

src/alignment/_dp_core.c CHANGED Viewed

The diff for this file is too large to render. See raw diff

src/alignment/_dp_core.pyx CHANGED Viewed

@@ -411,24 +411,28 @@ cdef tuple _align_full_3d(
     cdef double *cost_3d = NULL
     cdef int *start_3d = NULL
     cdef int *max_j_3d = NULL
     cdef int *par_i = NULL
     cdef int *par_k = NULL
     cdef int *par_j = NULL
     cdef char *par_t = NULL  # 0=sub, 1=del, 2=ins, 3=wrap
     cost_3d = <double *>malloc(total_3d * sizeof(double))
     start_3d = <int *>malloc(total_3d * sizeof(int))
     max_j_3d = <int *>malloc(total_3d * sizeof(int))
     par_i = <int *>malloc(total_3d * sizeof(int))
     par_k = <int *>malloc(total_3d * sizeof(int))
     par_j = <int *>malloc(total_3d * sizeof(int))
     par_t = <char *>malloc(total_3d * sizeof(char))
-    if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or
             par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL):
         if cost_3d != NULL: free(cost_3d)
         if start_3d != NULL: free(start_3d)
         if max_j_3d != NULL: free(max_j_3d)
         if par_i != NULL: free(par_i)
         if par_k != NULL: free(par_k)
         if par_j != NULL: free(par_j)
@@ -439,12 +443,14 @@ cdef tuple _align_full_3d(
     cdef int i, j, k
     cdef int koff, koff_src, koff_dst
     cdef long base_i, base_prev
     # Initialize all to INF / -1
     for idx in range(total_3d):
         cost_3d[idx] = INF_VAL
         start_3d[idx] = -1
         max_j_3d[idx] = -1
         par_i[idx] = -1
         par_k[idx] = -1
         par_j[idx] = -1
@@ -456,6 +462,7 @@ cdef tuple _align_full_3d(
             cost_3d[j] = 0.0    # i=0, k=0, j
             start_3d[j] = j
             max_j_3d[j] = j
     # Variables for DP transitions
     cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val
@@ -477,6 +484,7 @@ cdef tuple _align_full_3d(
                 cost_3d[idx] = i * cost_del
                 start_3d[idx] = 0
                 max_j_3d[idx] = 0
                 par_i[idx] = i - 1
                 par_k[idx] = 0
                 par_j[idx] = 0
@@ -506,17 +514,24 @@ cdef tuple _align_full_3d(
                     start_3d[idx] = start_3d[base_prev + koff + j - 1]
                     mj_val = max_j_3d[base_prev + koff + j - 1]
                     max_j_3d[idx] = j if j > mj_val else mj_val
                     par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0
                 elif del_opt <= ins_opt:
                     cost_3d[idx] = del_opt
                     start_3d[idx] = start_3d[base_prev + koff + j]
                     max_j_3d[idx] = max_j_3d[base_prev + koff + j]
                     par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1
                 elif ins_opt < INF_VAL:
                     cost_3d[idx] = ins_opt
                     start_3d[idx] = start_3d[base_i + koff + j - 1]
                     mj_val = max_j_3d[base_i + koff + j - 1]
                     max_j_3d[idx] = j if j > mj_val else mj_val
                     par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2
         # Wrap transitions (within same row i)
@@ -544,6 +559,9 @@ cdef tuple _align_full_3d(
                         start_3d[idx] = start_3d[base_i + koff_src + j_end]
                         mj_val = max_j_3d[base_i + koff_src + j_end]
                         max_j_3d[idx] = j_end if j_end > mj_val else mj_val
                         par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3
             # Insertion re-sweep from wrap positions
@@ -556,6 +574,9 @@ cdef tuple _align_full_3d(
                     start_3d[idx] = start_3d[base_i + koff_dst + j - 1]
                     mj_val = max_j_3d[base_i + koff_dst + j - 1]
                     max_j_3d[idx] = j if j > mj_val else mj_val
                     par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2
     # ------------------------------------------------------------------
@@ -567,7 +588,7 @@ cdef tuple _align_full_3d(
     cdef int best_k_val = 0, best_max_j = -1
     cdef double dist, norm_dist, prior, score, phoneme_cost
-    cdef int j_start_val, ref_len, denom, start_word, max_j_val
     base_i = <long>m * layer_stride
     for k in range(K + 1):
@@ -603,7 +624,10 @@ cdef tuple _align_full_3d(
             else:
                 start_word = enc.R_w[j - 1]
-            prior = prior_weight * fabs(<double>(start_word - expected_word))
             score = norm_dist + prior
             if sc_mode == 2:   # additive
                 score = score + k * wrap_score_cost
@@ -618,7 +642,7 @@ cdef tuple _align_full_3d(
                 best_max_j = max_j_val
     if best_j < 0:
-        free(cost_3d); free(start_3d); free(max_j_3d)
         free(par_i); free(par_k); free(par_j); free(par_t)
         return (None, None, float('inf'), float('inf'), 0, 0, [])

     cdef double *cost_3d = NULL
     cdef int *start_3d = NULL
     cdef int *max_j_3d = NULL
+    cdef int *min_w_3d = NULL  # minimum word index reached along path
     cdef int *par_i = NULL
     cdef int *par_k = NULL
     cdef int *par_j = NULL
     cdef char *par_t = NULL  # 0=sub, 1=del, 2=ins, 3=wrap
+    cdef int BIG_W = 999999
     cost_3d = <double *>malloc(total_3d * sizeof(double))
     start_3d = <int *>malloc(total_3d * sizeof(int))
     max_j_3d = <int *>malloc(total_3d * sizeof(int))
+    min_w_3d = <int *>malloc(total_3d * sizeof(int))
     par_i = <int *>malloc(total_3d * sizeof(int))
     par_k = <int *>malloc(total_3d * sizeof(int))
     par_j = <int *>malloc(total_3d * sizeof(int))
     par_t = <char *>malloc(total_3d * sizeof(char))
+    if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or min_w_3d == NULL or
             par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL):
         if cost_3d != NULL: free(cost_3d)
         if start_3d != NULL: free(start_3d)
         if max_j_3d != NULL: free(max_j_3d)
+        if min_w_3d != NULL: free(min_w_3d)
         if par_i != NULL: free(par_i)
         if par_k != NULL: free(par_k)
         if par_j != NULL: free(par_j)
     cdef int i, j, k
     cdef int koff, koff_src, koff_dst
     cdef long base_i, base_prev
+    cdef int w_j, mw_val
     # Initialize all to INF / -1
     for idx in range(total_3d):
         cost_3d[idx] = INF_VAL
         start_3d[idx] = -1
         max_j_3d[idx] = -1
+        min_w_3d[idx] = BIG_W
         par_i[idx] = -1
         par_k[idx] = -1
         par_j[idx] = -1
             cost_3d[j] = 0.0    # i=0, k=0, j
             start_3d[j] = j
             max_j_3d[j] = j
+            min_w_3d[j] = enc.R_w[j] if j < n else BIG_W
     # Variables for DP transitions
     cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val
                 cost_3d[idx] = i * cost_del
                 start_3d[idx] = 0
                 max_j_3d[idx] = 0
+                min_w_3d[idx] = min_w_3d[base_prev + koff]
                 par_i[idx] = i - 1
                 par_k[idx] = 0
                 par_j[idx] = 0
                     start_3d[idx] = start_3d[base_prev + koff + j - 1]
                     mj_val = max_j_3d[base_prev + koff + j - 1]
                     max_j_3d[idx] = j if j > mj_val else mj_val
+                    w_j = enc.R_w[j - 1]
+                    mw_val = min_w_3d[base_prev + koff + j - 1]
+                    min_w_3d[idx] = w_j if w_j < mw_val else mw_val
                     par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0
                 elif del_opt <= ins_opt:
                     cost_3d[idx] = del_opt
                     start_3d[idx] = start_3d[base_prev + koff + j]
                     max_j_3d[idx] = max_j_3d[base_prev + koff + j]
+                    min_w_3d[idx] = min_w_3d[base_prev + koff + j]
                     par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1
                 elif ins_opt < INF_VAL:
                     cost_3d[idx] = ins_opt
                     start_3d[idx] = start_3d[base_i + koff + j - 1]
                     mj_val = max_j_3d[base_i + koff + j - 1]
                     max_j_3d[idx] = j if j > mj_val else mj_val
+                    w_j = enc.R_w[j - 1]
+                    mw_val = min_w_3d[base_i + koff + j - 1]
+                    min_w_3d[idx] = w_j if w_j < mw_val else mw_val
                     par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2
         # Wrap transitions (within same row i)
                         start_3d[idx] = start_3d[base_i + koff_src + j_end]
                         mj_val = max_j_3d[base_i + koff_src + j_end]
                         max_j_3d[idx] = j_end if j_end > mj_val else mj_val
+                        mw_val = min_w_3d[base_i + koff_src + j_end]
+                        w_j = enc.R_w[j_sw]
+                        min_w_3d[idx] = w_j if w_j < mw_val else mw_val
                         par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3
             # Insertion re-sweep from wrap positions
                     start_3d[idx] = start_3d[base_i + koff_dst + j - 1]
                     mj_val = max_j_3d[base_i + koff_dst + j - 1]
                     max_j_3d[idx] = j if j > mj_val else mj_val
+                    w_j = enc.R_w[j - 1]
+                    mw_val = min_w_3d[base_i + koff_dst + j - 1]
+                    min_w_3d[idx] = w_j if w_j < mw_val else mw_val
                     par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2
     # ------------------------------------------------------------------
     cdef int best_k_val = 0, best_max_j = -1
     cdef double dist, norm_dist, prior, score, phoneme_cost
+    cdef int j_start_val, ref_len, denom, start_word, max_j_val, min_word_val, eff_start
     base_i = <long>m * layer_stride
     for k in range(K + 1):
             else:
                 start_word = enc.R_w[j - 1]
+            # Use earliest word the path actually touches for fair prior
+            min_word_val = min_w_3d[idx]
+            eff_start = min_word_val if min_word_val < start_word and min_word_val < BIG_W else start_word
+            prior = prior_weight * fabs(<double>(eff_start - expected_word))
             score = norm_dist + prior
             if sc_mode == 2:   # additive
                 score = score + k * wrap_score_cost
                 best_max_j = max_j_val
     if best_j < 0:
+        free(cost_3d); free(start_3d); free(max_j_3d); free(min_w_3d)
         free(par_i); free(par_k); free(par_j); free(par_t)
         return (None, None, float('inf'), float('inf'), 0, 0, [])

src/alignment/phoneme_matcher.py CHANGED Viewed

@@ -408,12 +408,18 @@ def align_wraparound(
     parent = [[[None] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
     start_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
     max_j_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
     # Initialize: k=0, free starts at word boundaries
     for j in word_starts:
         dp[0][0][j] = 0.0
         start_arr[0][0][j] = j
         max_j_arr[0][0][j] = j
     # Fill DP
     for i in range(1, m + 1):
@@ -423,6 +429,7 @@ def align_wraparound(
                 parent[i][k][0] = (i - 1, k, 0, 'D')
                 start_arr[i][k][0] = 0
                 max_j_arr[i][k][0] = 0
             for j in range(1, n + 1):
                 del_opt = dp[i-1][k][j] + cost_del if dp[i-1][k][j] < INF else INF
@@ -433,18 +440,22 @@ def align_wraparound(
                 best = min(del_opt, ins_opt, sub_opt)
                 if best < INF:
                     dp[i][k][j] = best
                     if best == sub_opt:
                         parent[i][k][j] = (i - 1, k, j - 1, 'S')
                         start_arr[i][k][j] = start_arr[i-1][k][j-1]
                         max_j_arr[i][k][j] = max(max_j_arr[i-1][k][j-1], j)
                     elif best == del_opt:
                         parent[i][k][j] = (i - 1, k, j, 'D')
                         start_arr[i][k][j] = start_arr[i-1][k][j]
                         max_j_arr[i][k][j] = max_j_arr[i-1][k][j]
                     else:
                         parent[i][k][j] = (i, k, j - 1, 'I')
                         start_arr[i][k][j] = start_arr[i][k][j-1]
                         max_j_arr[i][k][j] = max(max_j_arr[i][k][j-1], j)
         # Wrap transitions
         for k in range(K):
@@ -462,6 +473,7 @@ def align_wraparound(
                         parent[i][k+1][j_s] = (i, k, j_end, 'W')
                         start_arr[i][k+1][j_s] = start_arr[i][k][j_end]
                         max_j_arr[i][k+1][j_s] = max(max_j_arr[i][k][j_end], j_end)
             # Re-propagate insertions from wrap positions
             for j in range(1, n + 1):
@@ -471,6 +483,8 @@ def align_wraparound(
                     parent[i][k+1][j] = (i, k+1, j-1, 'I')
                     start_arr[i][k+1][j] = start_arr[i][k+1][j-1]
                     max_j_arr[i][k+1][j] = max(max_j_arr[i][k+1][j-1], j)
     # Best-match selection
     best_score = INF
@@ -504,7 +518,10 @@ def align_wraparound(
             nd = pc / denom
             sw = R_phone_to_word[j_s] if j_s < n else R_phone_to_word[j - 1]
-            prior = prior_weight * abs(sw - expected_word)
             score = nd + prior
             if scoring_mode == "additive":
                 score += k * wrap_score_cost

     parent = [[[None] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
     start_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
     max_j_arr = [[[-1] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
+    # Track minimum word index reached along each path so wrap paths
+    # can't game the position prior by starting near the expected word
+    # then jumping backward.
+    BIG_W = 999999
+    min_w_arr = [[[BIG_W] * (n + 1) for _ in range(K + 1)] for _ in range(m + 1)]
     # Initialize: k=0, free starts at word boundaries
     for j in word_starts:
         dp[0][0][j] = 0.0
         start_arr[0][0][j] = j
         max_j_arr[0][0][j] = j
+        min_w_arr[0][0][j] = R_phone_to_word[j] if j < n else BIG_W
     # Fill DP
     for i in range(1, m + 1):
                 parent[i][k][0] = (i - 1, k, 0, 'D')
                 start_arr[i][k][0] = 0
                 max_j_arr[i][k][0] = 0
+                min_w_arr[i][k][0] = min_w_arr[i-1][k][0]
             for j in range(1, n + 1):
                 del_opt = dp[i-1][k][j] + cost_del if dp[i-1][k][j] < INF else INF
                 best = min(del_opt, ins_opt, sub_opt)
                 if best < INF:
                     dp[i][k][j] = best
+                    w_j = R_phone_to_word[j - 1] if j > 0 else BIG_W
                     if best == sub_opt:
                         parent[i][k][j] = (i - 1, k, j - 1, 'S')
                         start_arr[i][k][j] = start_arr[i-1][k][j-1]
                         max_j_arr[i][k][j] = max(max_j_arr[i-1][k][j-1], j)
+                        min_w_arr[i][k][j] = min(min_w_arr[i-1][k][j-1], w_j)
                     elif best == del_opt:
                         parent[i][k][j] = (i - 1, k, j, 'D')
                         start_arr[i][k][j] = start_arr[i-1][k][j]
                         max_j_arr[i][k][j] = max_j_arr[i-1][k][j]
+                        min_w_arr[i][k][j] = min_w_arr[i-1][k][j]
                     else:
                         parent[i][k][j] = (i, k, j - 1, 'I')
                         start_arr[i][k][j] = start_arr[i][k][j-1]
                         max_j_arr[i][k][j] = max(max_j_arr[i][k][j-1], j)
+                        min_w_arr[i][k][j] = min(min_w_arr[i][k][j-1], w_j)
         # Wrap transitions
         for k in range(K):
                         parent[i][k+1][j_s] = (i, k, j_end, 'W')
                         start_arr[i][k+1][j_s] = start_arr[i][k][j_end]
                         max_j_arr[i][k+1][j_s] = max(max_j_arr[i][k][j_end], j_end)
+                        min_w_arr[i][k+1][j_s] = min(min_w_arr[i][k][j_end], R_phone_to_word[j_s])
             # Re-propagate insertions from wrap positions
             for j in range(1, n + 1):
                     parent[i][k+1][j] = (i, k+1, j-1, 'I')
                     start_arr[i][k+1][j] = start_arr[i][k+1][j-1]
                     max_j_arr[i][k+1][j] = max(max_j_arr[i][k+1][j-1], j)
+                    w_j = R_phone_to_word[j - 1] if j > 0 else BIG_W
+                    min_w_arr[i][k+1][j] = min(min_w_arr[i][k+1][j-1], w_j)
     # Best-match selection
     best_score = INF
             nd = pc / denom
             sw = R_phone_to_word[j_s] if j_s < n else R_phone_to_word[j - 1]
+            # Use the earliest word the path actually touches for a fair prior
+            mw = min_w_arr[m][k][j]
+            eff_sw = min(sw, mw) if mw < BIG_W else sw
+            prior = prior_weight * abs(eff_sw - expected_word)
             score = nd + prior
             if scoring_mode == "additive":
                 score += k * wrap_score_cost

src/mfa.py CHANGED Viewed

@@ -694,7 +694,8 @@ def compute_mfa_timestamps(current_html, json_output, segment_dir, cached_log_ro
     # (patch-based edits update state but skip output_html)
     if segment_dir:
         from src.ui.segments import render_segments
-        current_html = render_segments(segments_state, segment_dir=str(segment_dir))
     if not current_html or '<span class="word"' not in current_html:
         yield current_html, gr.update(), gr.update(), gr.update(), gr.update()

     # (patch-based edits update state but skip output_html)
     if segment_dir:
         from src.ui.segments import render_segments
+        full_audio_url = f"/gradio_api/file={segment_dir}/full.wav"
+        current_html = render_segments(segments_state, full_audio_url=full_audio_url, segment_dir=str(segment_dir))
     if not current_html or '<span class="word"' not in current_html:
         yield current_html, gr.update(), gr.update(), gr.update(), gr.update()