Spaces:
Running on Zero
Running on Zero
| # cython: boundscheck=False, wraparound=False, cdivision=True | |
| """ | |
| Cython-accelerated word-boundary-constrained substring Levenshtein DP | |
| with wraparound support for repetition detection. | |
| When max_wraps=0, uses rolling rows (identical to the old standard DP). | |
| When max_wraps>0, uses full 3D matrix with parent pointers for traceback. | |
| """ | |
| from libc.stdlib cimport malloc, free | |
| from libc.math cimport INFINITY, fabs | |
| # --------------------------------------------------------------------------- | |
| # Phoneme → integer encoding (built lazily on first call) | |
| # --------------------------------------------------------------------------- | |
| cdef dict _phoneme_to_id = {} | |
| cdef int _num_phonemes = 0 | |
| cdef double *_sub_matrix = NULL # flat _num_phonemes × _num_phonemes | |
| cdef double _default_sub = 1.0 | |
| cdef int _encode_phoneme(str p): | |
| """Return integer id for *p*, assigning a new one if unseen.""" | |
| global _num_phonemes | |
| cdef int pid | |
| try: | |
| pid = _phoneme_to_id[p] | |
| except KeyError: | |
| pid = _num_phonemes | |
| _phoneme_to_id[p] = pid | |
| _num_phonemes += 1 | |
| return pid | |
| def init_substitution_matrix(dict sub_costs, double default_sub): | |
| """Build the dense substitution-cost matrix from the Python dict. | |
| Must be called once before the first DP call (phoneme_matcher.py does | |
| this at import time). | |
| Parameters | |
| ---------- | |
| sub_costs : dict[(str, str), float] | |
| Phoneme-pair substitution costs (both orderings already present). | |
| default_sub : float | |
| Cost used for pairs not in *sub_costs*. | |
| """ | |
| global _sub_matrix, _default_sub, _num_phonemes | |
| _default_sub = default_sub | |
| # First pass: make sure every phoneme in sub_costs has an id | |
| for (a, b) in sub_costs: | |
| _encode_phoneme(a) | |
| _encode_phoneme(b) | |
| # Allocate matrix (will be re-allocated if new phonemes appear later) | |
| _rebuild_matrix(sub_costs) | |
| cdef void _rebuild_matrix(dict sub_costs): | |
| """(Re)allocate and fill the dense cost matrix.""" | |
| global _sub_matrix, _num_phonemes, _default_sub | |
| cdef int size = _num_phonemes | |
| cdef int i, j | |
| if _sub_matrix != NULL: | |
| free(_sub_matrix) | |
| _sub_matrix = <double *>malloc(size * size * sizeof(double)) | |
| if _sub_matrix == NULL: | |
| raise MemoryError("Failed to allocate substitution matrix") | |
| # Fill with default | |
| for i in range(size * size): | |
| _sub_matrix[i] = _default_sub | |
| # Diagonal = 0 (match) | |
| for i in range(size): | |
| _sub_matrix[i * size + i] = 0.0 | |
| # Overrides from dict | |
| cdef int aid, bid | |
| cdef double cost | |
| for (a, b), cost in sub_costs.items(): | |
| aid = _phoneme_to_id.get(a, -1) | |
| bid = _phoneme_to_id.get(b, -1) | |
| if aid >= 0 and bid >= 0: | |
| _sub_matrix[aid * size + bid] = cost | |
| cdef inline double _get_sub_cost(int pid, int rid, int size) nogil: | |
| """Look up substitution cost from the dense matrix.""" | |
| if pid == rid: | |
| return 0.0 | |
| if pid < size and rid < size: | |
| return _sub_matrix[pid * size + rid] | |
| return _default_sub | |
| # --------------------------------------------------------------------------- | |
| # Helper: grow matrix when new phonemes are encountered at runtime | |
| # --------------------------------------------------------------------------- | |
| cdef void _grow_matrix(): | |
| """Expand the substitution matrix to cover newly added phonemes. | |
| New rows/columns are filled with the default substitution cost, | |
| diagonal with 0.0. Existing entries are preserved. | |
| """ | |
| global _sub_matrix, _num_phonemes | |
| cdef int new_size = _num_phonemes | |
| cdef double *new_mat | |
| cdef int i | |
| if _sub_matrix == NULL: | |
| _sub_matrix = <double *>malloc(new_size * new_size * sizeof(double)) | |
| if _sub_matrix == NULL: | |
| return | |
| for i in range(new_size * new_size): | |
| _sub_matrix[i] = _default_sub | |
| for i in range(new_size): | |
| _sub_matrix[i * new_size + i] = 0.0 | |
| return | |
| new_mat = <double *>malloc(new_size * new_size * sizeof(double)) | |
| if new_mat == NULL: | |
| return | |
| for i in range(new_size * new_size): | |
| new_mat[i] = _default_sub | |
| for i in range(new_size): | |
| new_mat[i * new_size + i] = 0.0 | |
| free(_sub_matrix) | |
| _sub_matrix = new_mat | |
| # --------------------------------------------------------------------------- | |
| # Shared: encode inputs and precompute boundaries | |
| # --------------------------------------------------------------------------- | |
| cdef struct EncodedInput: | |
| int *P_ids | |
| int *R_ids | |
| int *R_w | |
| char *start_bd | |
| char *end_bd | |
| int *ws_pos # word-start positions (sorted) | |
| int *we_pos # word-end positions (sorted) | |
| int n_ws # count of word-start positions | |
| int n_we # count of word-end positions | |
| int mat_size # phoneme matrix size at time of encoding | |
| int m # len(P) | |
| int n # len(R) | |
| cdef EncodedInput _encode_inputs(list P_list, list R_list, list R_phone_to_word_list) except *: | |
| """Encode string lists to C arrays, precompute word boundaries.""" | |
| cdef EncodedInput enc | |
| cdef int m = len(P_list) | |
| cdef int n = len(R_list) | |
| cdef int i, j | |
| cdef bint need_rebuild = False | |
| enc.m = m | |
| enc.n = n | |
| enc.P_ids = NULL | |
| enc.R_ids = NULL | |
| enc.R_w = NULL | |
| enc.start_bd = NULL | |
| enc.end_bd = NULL | |
| enc.ws_pos = NULL | |
| enc.we_pos = NULL | |
| enc.P_ids = <int *>malloc(m * sizeof(int)) | |
| enc.R_ids = <int *>malloc(n * sizeof(int)) | |
| enc.R_w = <int *>malloc(n * sizeof(int)) | |
| if enc.P_ids == NULL or enc.R_ids == NULL or enc.R_w == NULL: | |
| _free_encoded(&enc) | |
| raise MemoryError() | |
| for i in range(m): | |
| p = P_list[i] | |
| if p not in _phoneme_to_id: | |
| _encode_phoneme(p) | |
| need_rebuild = True | |
| enc.P_ids[i] = _phoneme_to_id[p] | |
| for j in range(n): | |
| r = R_list[j] | |
| if r not in _phoneme_to_id: | |
| _encode_phoneme(r) | |
| need_rebuild = True | |
| enc.R_ids[j] = _phoneme_to_id[r] | |
| enc.R_w[j] = <int>R_phone_to_word_list[j] | |
| if need_rebuild and _sub_matrix != NULL: | |
| _grow_matrix() | |
| enc.mat_size = _num_phonemes | |
| # Precompute boundary flags | |
| enc.start_bd = <char *>malloc((n + 1) * sizeof(char)) | |
| enc.end_bd = <char *>malloc((n + 1) * sizeof(char)) | |
| if enc.start_bd == NULL or enc.end_bd == NULL: | |
| _free_encoded(&enc) | |
| raise MemoryError() | |
| enc.start_bd[0] = 1 | |
| for j in range(1, n): | |
| enc.start_bd[j] = 1 if enc.R_w[j] != enc.R_w[j - 1] else 0 | |
| enc.start_bd[n] = 0 | |
| enc.end_bd[0] = 0 | |
| for j in range(1, n): | |
| enc.end_bd[j] = 1 if enc.R_w[j] != enc.R_w[j - 1] else 0 | |
| enc.end_bd[n] = 1 | |
| # Build sorted arrays of boundary positions | |
| enc.n_ws = 0 | |
| enc.n_we = 0 | |
| for j in range(n + 1): | |
| if enc.start_bd[j]: enc.n_ws += 1 | |
| if enc.end_bd[j]: enc.n_we += 1 | |
| enc.ws_pos = <int *>malloc(enc.n_ws * sizeof(int)) | |
| enc.we_pos = <int *>malloc(enc.n_we * sizeof(int)) | |
| if enc.ws_pos == NULL or enc.we_pos == NULL: | |
| _free_encoded(&enc) | |
| raise MemoryError() | |
| cdef int ws_i = 0, we_i = 0 | |
| for j in range(n + 1): | |
| if enc.start_bd[j]: | |
| enc.ws_pos[ws_i] = j; ws_i += 1 | |
| if enc.end_bd[j]: | |
| enc.we_pos[we_i] = j; we_i += 1 | |
| return enc | |
| cdef void _free_encoded(EncodedInput *enc): | |
| """Free all arrays in an EncodedInput.""" | |
| if enc.P_ids != NULL: free(enc.P_ids) | |
| if enc.R_ids != NULL: free(enc.R_ids) | |
| if enc.R_w != NULL: free(enc.R_w) | |
| if enc.start_bd != NULL: free(enc.start_bd) | |
| if enc.end_bd != NULL: free(enc.end_bd) | |
| if enc.ws_pos != NULL: free(enc.ws_pos) | |
| if enc.we_pos != NULL: free(enc.we_pos) | |
| enc.P_ids = NULL | |
| enc.R_ids = NULL | |
| enc.R_w = NULL | |
| enc.start_bd = NULL | |
| enc.end_bd = NULL | |
| enc.ws_pos = NULL | |
| enc.we_pos = NULL | |
| # --------------------------------------------------------------------------- | |
| # Rolling-row DP for max_wraps=0 (fast path, 89% of segments) | |
| # --------------------------------------------------------------------------- | |
| cdef tuple _align_rolling( | |
| EncodedInput *enc, | |
| int expected_word, | |
| double prior_weight, | |
| double cost_sub, | |
| double cost_del, | |
| double cost_ins, | |
| ): | |
| """Standard word-boundary DP using rolling rows. No wraparound.""" | |
| cdef int m = enc.m, n = enc.n | |
| cdef int mat_size = enc.mat_size | |
| cdef double INF_VAL = INFINITY | |
| # Allocate rolling rows | |
| cdef double *prev_cost = <double *>malloc((n + 1) * sizeof(double)) | |
| cdef double *curr_cost = <double *>malloc((n + 1) * sizeof(double)) | |
| cdef int *prev_start = <int *>malloc((n + 1) * sizeof(int)) | |
| cdef int *curr_start = <int *>malloc((n + 1) * sizeof(int)) | |
| if prev_cost == NULL or curr_cost == NULL or prev_start == NULL or curr_start == NULL: | |
| if prev_cost != NULL: free(prev_cost) | |
| if curr_cost != NULL: free(curr_cost) | |
| if prev_start != NULL: free(prev_start) | |
| if curr_start != NULL: free(curr_start) | |
| raise MemoryError() | |
| cdef int i, j | |
| cdef double del_option, ins_option, sub_option, sc | |
| cdef double *tmp_d | |
| cdef int *tmp_i | |
| cdef bint col0_start = enc.start_bd[0] | |
| # Initialize row 0 | |
| for j in range(n + 1): | |
| if enc.start_bd[j]: | |
| prev_cost[j] = 0.0 | |
| prev_start[j] = j | |
| else: | |
| prev_cost[j] = INF_VAL | |
| prev_start[j] = -1 | |
| # Core DP loop | |
| for i in range(1, m + 1): | |
| if col0_start: | |
| curr_cost[0] = i * cost_del | |
| curr_start[0] = 0 | |
| else: | |
| curr_cost[0] = INF_VAL | |
| curr_start[0] = -1 | |
| for j in range(1, n + 1): | |
| del_option = prev_cost[j] + cost_del | |
| ins_option = curr_cost[j - 1] + cost_ins | |
| sc = _get_sub_cost(enc.P_ids[i - 1], enc.R_ids[j - 1], mat_size) | |
| sub_option = prev_cost[j - 1] + sc | |
| if sub_option <= del_option and sub_option <= ins_option: | |
| curr_cost[j] = sub_option | |
| curr_start[j] = prev_start[j - 1] | |
| elif del_option <= ins_option: | |
| curr_cost[j] = del_option | |
| curr_start[j] = prev_start[j] | |
| else: | |
| curr_cost[j] = ins_option | |
| curr_start[j] = curr_start[j - 1] | |
| tmp_d = prev_cost; prev_cost = curr_cost; curr_cost = tmp_d | |
| tmp_i = prev_start; prev_start = curr_start; curr_start = tmp_i | |
| # Best-match selection | |
| cdef double best_score = INF_VAL | |
| cdef int best_j = -1, best_j_start = -1 | |
| cdef double best_cost_val = INF_VAL, best_norm = INF_VAL | |
| cdef double dist, norm_dist, prior, score | |
| cdef int j_start_val, ref_len, denom, start_word | |
| for j in range(1, n + 1): | |
| if not enc.end_bd[j]: | |
| continue | |
| if prev_cost[j] >= INF_VAL: | |
| continue | |
| dist = prev_cost[j] | |
| j_start_val = prev_start[j] | |
| ref_len = j - j_start_val | |
| denom = m if m > ref_len else ref_len | |
| if denom < 1: | |
| denom = 1 | |
| norm_dist = dist / denom | |
| if j_start_val < n: | |
| start_word = enc.R_w[j_start_val] | |
| else: | |
| start_word = enc.R_w[j - 1] | |
| prior = prior_weight * fabs(<double>(start_word - expected_word)) | |
| score = norm_dist + prior | |
| if score < best_score: | |
| best_score = score | |
| best_j = j | |
| best_j_start = j_start_val | |
| best_cost_val = dist | |
| best_norm = norm_dist | |
| free(prev_cost); free(curr_cost) | |
| free(prev_start); free(curr_start) | |
| if best_j < 0: | |
| return (None, None, float('inf'), float('inf'), 0, 0, []) | |
| return (best_j, best_j_start, best_cost_val, best_norm, 0, best_j, []) | |
| # --------------------------------------------------------------------------- | |
| # Full 3D DP for max_wraps>0 (with traceback) | |
| # --------------------------------------------------------------------------- | |
| cdef tuple _align_full_3d( | |
| EncodedInput *enc, | |
| int expected_word, | |
| double prior_weight, | |
| double cost_sub, | |
| double cost_del, | |
| double cost_ins, | |
| double wrap_penalty, | |
| int max_wraps, | |
| int sc_mode, # 0=subtract, 1=no_subtract, 2=additive | |
| double wrap_score_cost, | |
| double wrap_span_weight, | |
| ): | |
| """Wraparound DP with full 3D matrix and parent pointers for traceback.""" | |
| cdef int m = enc.m, n = enc.n | |
| cdef int K = max_wraps | |
| cdef int mat_size = enc.mat_size | |
| cdef double INF_VAL = INFINITY | |
| # 3D indexing: [i * layer_stride + k * col_stride + j] | |
| cdef int col_stride = n + 1 | |
| cdef int layer_stride = (K + 1) * col_stride | |
| cdef long total_3d = <long>(m + 1) * layer_stride | |
| # Allocate 3D arrays | |
| cdef double *cost_3d = NULL | |
| cdef int *start_3d = NULL | |
| cdef int *max_j_3d = NULL | |
| cdef int *min_w_3d = NULL # minimum word index reached along path | |
| cdef int *par_i = NULL | |
| cdef int *par_k = NULL | |
| cdef int *par_j = NULL | |
| cdef char *par_t = NULL # 0=sub, 1=del, 2=ins, 3=wrap | |
| cdef int BIG_W = 999999 | |
| cost_3d = <double *>malloc(total_3d * sizeof(double)) | |
| start_3d = <int *>malloc(total_3d * sizeof(int)) | |
| max_j_3d = <int *>malloc(total_3d * sizeof(int)) | |
| min_w_3d = <int *>malloc(total_3d * sizeof(int)) | |
| par_i = <int *>malloc(total_3d * sizeof(int)) | |
| par_k = <int *>malloc(total_3d * sizeof(int)) | |
| par_j = <int *>malloc(total_3d * sizeof(int)) | |
| par_t = <char *>malloc(total_3d * sizeof(char)) | |
| if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or min_w_3d == NULL or | |
| par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL): | |
| if cost_3d != NULL: free(cost_3d) | |
| if start_3d != NULL: free(start_3d) | |
| if max_j_3d != NULL: free(max_j_3d) | |
| if min_w_3d != NULL: free(min_w_3d) | |
| if par_i != NULL: free(par_i) | |
| if par_k != NULL: free(par_k) | |
| if par_j != NULL: free(par_j) | |
| if par_t != NULL: free(par_t) | |
| raise MemoryError() | |
| cdef long idx | |
| cdef int i, j, k | |
| cdef int koff, koff_src, koff_dst | |
| cdef long base_i, base_prev | |
| cdef int w_j, mw_val | |
| # Initialize all to INF / -1 | |
| for idx in range(total_3d): | |
| cost_3d[idx] = INF_VAL | |
| start_3d[idx] = -1 | |
| max_j_3d[idx] = -1 | |
| min_w_3d[idx] = BIG_W | |
| par_i[idx] = -1 | |
| par_k[idx] = -1 | |
| par_j[idx] = -1 | |
| par_t[idx] = -1 | |
| # Row 0, k=0: free starts at word boundaries | |
| for j in range(n + 1): | |
| if enc.start_bd[j]: | |
| cost_3d[j] = 0.0 # i=0, k=0, j | |
| start_3d[j] = j | |
| max_j_3d[j] = j | |
| min_w_3d[j] = enc.R_w[j] if j < n else BIG_W | |
| # Variables for DP transitions | |
| cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val | |
| cdef int j_end, j_sw, mj_val, word_span | |
| cdef int we_i, ws_i | |
| # Fill DP | |
| for i in range(1, m + 1): | |
| base_i = <long>i * layer_stride | |
| base_prev = <long>(i - 1) * layer_stride | |
| # Standard transitions for each k | |
| for k in range(K + 1): | |
| koff = k * col_stride | |
| # Column 0: deletion base case, k=0 only | |
| if k == 0 and enc.start_bd[0]: | |
| idx = base_i + koff | |
| cost_3d[idx] = i * cost_del | |
| start_3d[idx] = 0 | |
| max_j_3d[idx] = 0 | |
| min_w_3d[idx] = min_w_3d[base_prev + koff] | |
| par_i[idx] = i - 1 | |
| par_k[idx] = 0 | |
| par_j[idx] = 0 | |
| par_t[idx] = 1 # del | |
| for j in range(1, n + 1): | |
| idx = base_i + koff + j | |
| # Deletion: prev row, same j, same k | |
| del_opt = INF_VAL | |
| if cost_3d[base_prev + koff + j] < INF_VAL: | |
| del_opt = cost_3d[base_prev + koff + j] + cost_del | |
| # Insertion: same row, j-1, same k | |
| ins_opt = INF_VAL | |
| if cost_3d[base_i + koff + j - 1] < INF_VAL: | |
| ins_opt = cost_3d[base_i + koff + j - 1] + cost_ins | |
| # Substitution: prev row, j-1, same k | |
| sub_opt = INF_VAL | |
| if cost_3d[base_prev + koff + j - 1] < INF_VAL: | |
| sc = _get_sub_cost(enc.P_ids[i - 1], enc.R_ids[j - 1], mat_size) | |
| sub_opt = cost_3d[base_prev + koff + j - 1] + sc | |
| if sub_opt <= del_opt and sub_opt <= ins_opt: | |
| cost_3d[idx] = sub_opt | |
| start_3d[idx] = start_3d[base_prev + koff + j - 1] | |
| mj_val = max_j_3d[base_prev + koff + j - 1] | |
| max_j_3d[idx] = j if j > mj_val else mj_val | |
| w_j = enc.R_w[j - 1] | |
| mw_val = min_w_3d[base_prev + koff + j - 1] | |
| min_w_3d[idx] = w_j if w_j < mw_val else mw_val | |
| par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0 | |
| elif del_opt <= ins_opt: | |
| cost_3d[idx] = del_opt | |
| start_3d[idx] = start_3d[base_prev + koff + j] | |
| max_j_3d[idx] = max_j_3d[base_prev + koff + j] | |
| min_w_3d[idx] = min_w_3d[base_prev + koff + j] | |
| par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1 | |
| elif ins_opt < INF_VAL: | |
| cost_3d[idx] = ins_opt | |
| start_3d[idx] = start_3d[base_i + koff + j - 1] | |
| mj_val = max_j_3d[base_i + koff + j - 1] | |
| max_j_3d[idx] = j if j > mj_val else mj_val | |
| w_j = enc.R_w[j - 1] | |
| mw_val = min_w_3d[base_i + koff + j - 1] | |
| min_w_3d[idx] = w_j if w_j < mw_val else mw_val | |
| par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2 | |
| # Wrap transitions (within same row i) | |
| for k in range(K): | |
| koff_src = k * col_stride | |
| koff_dst = (k + 1) * col_stride | |
| for we_i in range(enc.n_we): | |
| j_end = enc.we_pos[we_i] | |
| if cost_3d[base_i + koff_src + j_end] >= INF_VAL: | |
| continue | |
| cost_at_end = cost_3d[base_i + koff_src + j_end] | |
| for ws_i in range(enc.n_ws): | |
| j_sw = enc.ws_pos[ws_i] | |
| if j_sw >= j_end: | |
| continue | |
| word_span = enc.R_w[j_end - 1] - enc.R_w[j_sw] | |
| if word_span < 0: | |
| word_span = -word_span | |
| new_cost = cost_at_end + wrap_penalty + wrap_span_weight * word_span | |
| idx = base_i + koff_dst + j_sw | |
| if new_cost < cost_3d[idx]: | |
| cost_3d[idx] = new_cost | |
| start_3d[idx] = start_3d[base_i + koff_src + j_end] | |
| mj_val = max_j_3d[base_i + koff_src + j_end] | |
| max_j_3d[idx] = j_end if j_end > mj_val else mj_val | |
| mw_val = min_w_3d[base_i + koff_src + j_end] | |
| w_j = enc.R_w[j_sw] | |
| min_w_3d[idx] = w_j if w_j < mw_val else mw_val | |
| par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3 | |
| # Insertion re-sweep from wrap positions | |
| for j in range(1, n + 1): | |
| idx = base_i + koff_dst + j | |
| ins_opt = cost_3d[base_i + koff_dst + j - 1] + cost_ins \ | |
| if cost_3d[base_i + koff_dst + j - 1] < INF_VAL else INF_VAL | |
| if ins_opt < cost_3d[idx]: | |
| cost_3d[idx] = ins_opt | |
| start_3d[idx] = start_3d[base_i + koff_dst + j - 1] | |
| mj_val = max_j_3d[base_i + koff_dst + j - 1] | |
| max_j_3d[idx] = j if j > mj_val else mj_val | |
| w_j = enc.R_w[j - 1] | |
| mw_val = min_w_3d[base_i + koff_dst + j - 1] | |
| min_w_3d[idx] = w_j if w_j < mw_val else mw_val | |
| par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2 | |
| # ------------------------------------------------------------------ | |
| # Best-match selection (end boundaries only, across all k) | |
| # ------------------------------------------------------------------ | |
| cdef double best_score = INF_VAL | |
| cdef int best_j = -1, best_j_start = -1 | |
| cdef double best_cost_val = INF_VAL, best_norm = INF_VAL | |
| cdef int best_k_val = 0, best_max_j = -1 | |
| cdef double dist, norm_dist, prior, score, phoneme_cost | |
| cdef int j_start_val, ref_len, denom, start_word, max_j_val, min_word_val, eff_start | |
| base_i = <long>m * layer_stride | |
| for k in range(K + 1): | |
| koff = k * col_stride | |
| for j in range(1, n + 1): | |
| if not enc.end_bd[j]: | |
| continue | |
| idx = base_i + koff + j | |
| if cost_3d[idx] >= INF_VAL: | |
| continue | |
| dist = cost_3d[idx] | |
| j_start_val = start_3d[idx] | |
| if j_start_val < 0: | |
| continue | |
| max_j_val = max_j_3d[idx] | |
| ref_len = (max_j_val if max_j_val > j else j) - j_start_val | |
| if ref_len <= 0: | |
| continue | |
| denom = m if m > ref_len else ref_len | |
| if denom < 1: | |
| denom = 1 | |
| if sc_mode == 1: # no_subtract | |
| phoneme_cost = dist | |
| else: # subtract or additive | |
| phoneme_cost = dist - k * wrap_penalty | |
| norm_dist = phoneme_cost / denom | |
| if j_start_val < n: | |
| start_word = enc.R_w[j_start_val] | |
| else: | |
| start_word = enc.R_w[j - 1] | |
| # Use earliest word the path actually touches for fair prior | |
| min_word_val = min_w_3d[idx] | |
| eff_start = min_word_val if min_word_val < start_word and min_word_val < BIG_W else start_word | |
| prior = prior_weight * fabs(<double>(eff_start - expected_word)) | |
| score = norm_dist + prior | |
| if sc_mode == 2: # additive | |
| score = score + k * wrap_score_cost | |
| if score < best_score: | |
| best_score = score | |
| best_j = j | |
| best_j_start = j_start_val | |
| best_cost_val = dist | |
| best_norm = norm_dist | |
| best_k_val = k | |
| best_max_j = max_j_val | |
| if best_j < 0: | |
| free(cost_3d); free(start_3d); free(max_j_3d); free(min_w_3d) | |
| free(par_i); free(par_k); free(par_j); free(par_t) | |
| return (None, None, float('inf'), float('inf'), 0, 0, []) | |
| # ------------------------------------------------------------------ | |
| # Traceback: walk parent pointers, collect wrap points | |
| # ------------------------------------------------------------------ | |
| wrap_points = [] | |
| cdef int ci = m, ck = best_k_val, cj = best_j | |
| cdef int pi, pk, pj | |
| cdef char pt | |
| while True: | |
| idx = <long>ci * layer_stride + ck * col_stride + cj | |
| if par_i[idx] < 0: | |
| break | |
| pi = par_i[idx] | |
| pk = par_k[idx] | |
| pj = par_j[idx] | |
| pt = par_t[idx] | |
| if pt == 3: # wrap | |
| # Wrap: at P position ci, R jumped from pj (j_end) back to cj (j_start) | |
| wrap_points.append((ci, pj, cj)) | |
| ci = pi; ck = pk; cj = pj | |
| wrap_points.reverse() # chronological order | |
| free(cost_3d); free(start_3d); free(max_j_3d) | |
| free(par_i); free(par_k); free(par_j); free(par_t) | |
| return (best_j, best_j_start, best_cost_val, best_norm, best_k_val, best_max_j, wrap_points) | |
| # --------------------------------------------------------------------------- | |
| # Public API: unified wraparound DP | |
| # --------------------------------------------------------------------------- | |
| def cy_align_wraparound( | |
| list P_list, | |
| list R_list, | |
| list R_phone_to_word_list, | |
| int expected_word, | |
| double prior_weight, | |
| double cost_sub, | |
| double cost_del, | |
| double cost_ins, | |
| double wrap_penalty = 2.0, | |
| int max_wraps = 0, | |
| str scoring_mode = "subtract", | |
| double wrap_score_cost = 0.01, | |
| double wrap_span_weight = 0.1, | |
| ): | |
| """Wraparound word-boundary-constrained substring alignment (Cython). | |
| When max_wraps=0, uses rolling rows (fast path, no traceback needed). | |
| When max_wraps>0, uses full 3D matrix with parent pointers for traceback. | |
| Returns (best_j, best_j_start, best_cost, best_norm_dist, best_k, best_max_j, wrap_points). | |
| wrap_points: list of (i, j_end, j_start) — P position and R positions of each wrap. | |
| Empty list when no wraps detected. | |
| scoring_mode: | |
| "subtract" — phoneme_cost = dist - k*wrap_penalty (wrap is free in score) | |
| "no_subtract" — phoneme_cost = dist (wrap penalty stays in score) | |
| "additive" — phoneme_cost = dist - k*wrap_penalty, score += k*wrap_score_cost | |
| """ | |
| cdef int m = len(P_list) | |
| cdef int n = len(R_list) | |
| if m == 0 or n == 0: | |
| return (None, None, float('inf'), float('inf'), 0, 0, []) | |
| # Encode inputs | |
| cdef EncodedInput enc = _encode_inputs(P_list, R_list, R_phone_to_word_list) | |
| # Encode scoring mode outside of branches (cdef not allowed inside if blocks) | |
| cdef int sc_mode | |
| if scoring_mode == "no_subtract": | |
| sc_mode = 1 | |
| elif scoring_mode == "additive": | |
| sc_mode = 2 | |
| else: | |
| sc_mode = 0 | |
| cdef tuple result | |
| try: | |
| if max_wraps == 0: | |
| # Fast path: rolling rows, no wraparound | |
| result = _align_rolling( | |
| &enc, expected_word, prior_weight, | |
| cost_sub, cost_del, cost_ins, | |
| ) | |
| else: | |
| result = _align_full_3d( | |
| &enc, expected_word, prior_weight, | |
| cost_sub, cost_del, cost_ins, | |
| wrap_penalty, max_wraps, | |
| sc_mode, wrap_score_cost, | |
| wrap_span_weight, | |
| ) | |
| finally: | |
| _free_encoded(&enc) | |
| return result | |