# cython: boundscheck=False, wraparound=False, cdivision=True """ Cython-accelerated word-boundary-constrained substring Levenshtein DP with wraparound support for repetition detection. When max_wraps=0, uses rolling rows (identical to the old standard DP). When max_wraps>0, uses full 3D matrix with parent pointers for traceback. """ from libc.stdlib cimport malloc, free from libc.math cimport INFINITY, fabs # --------------------------------------------------------------------------- # Phoneme → integer encoding (built lazily on first call) # --------------------------------------------------------------------------- cdef dict _phoneme_to_id = {} cdef int _num_phonemes = 0 cdef double *_sub_matrix = NULL # flat _num_phonemes × _num_phonemes cdef double _default_sub = 1.0 cdef int _encode_phoneme(str p): """Return integer id for *p*, assigning a new one if unseen.""" global _num_phonemes cdef int pid try: pid = _phoneme_to_id[p] except KeyError: pid = _num_phonemes _phoneme_to_id[p] = pid _num_phonemes += 1 return pid def init_substitution_matrix(dict sub_costs, double default_sub): """Build the dense substitution-cost matrix from the Python dict. Must be called once before the first DP call (phoneme_matcher.py does this at import time). Parameters ---------- sub_costs : dict[(str, str), float] Phoneme-pair substitution costs (both orderings already present). default_sub : float Cost used for pairs not in *sub_costs*. """ global _sub_matrix, _default_sub, _num_phonemes _default_sub = default_sub # First pass: make sure every phoneme in sub_costs has an id for (a, b) in sub_costs: _encode_phoneme(a) _encode_phoneme(b) # Allocate matrix (will be re-allocated if new phonemes appear later) _rebuild_matrix(sub_costs) cdef void _rebuild_matrix(dict sub_costs): """(Re)allocate and fill the dense cost matrix.""" global _sub_matrix, _num_phonemes, _default_sub cdef int size = _num_phonemes cdef int i, j if _sub_matrix != NULL: free(_sub_matrix) _sub_matrix = malloc(size * size * sizeof(double)) if _sub_matrix == NULL: raise MemoryError("Failed to allocate substitution matrix") # Fill with default for i in range(size * size): _sub_matrix[i] = _default_sub # Diagonal = 0 (match) for i in range(size): _sub_matrix[i * size + i] = 0.0 # Overrides from dict cdef int aid, bid cdef double cost for (a, b), cost in sub_costs.items(): aid = _phoneme_to_id.get(a, -1) bid = _phoneme_to_id.get(b, -1) if aid >= 0 and bid >= 0: _sub_matrix[aid * size + bid] = cost cdef inline double _get_sub_cost(int pid, int rid, int size) nogil: """Look up substitution cost from the dense matrix.""" if pid == rid: return 0.0 if pid < size and rid < size: return _sub_matrix[pid * size + rid] return _default_sub # --------------------------------------------------------------------------- # Helper: grow matrix when new phonemes are encountered at runtime # --------------------------------------------------------------------------- cdef void _grow_matrix(): """Expand the substitution matrix to cover newly added phonemes. New rows/columns are filled with the default substitution cost, diagonal with 0.0. Existing entries are preserved. """ global _sub_matrix, _num_phonemes cdef int new_size = _num_phonemes cdef double *new_mat cdef int i if _sub_matrix == NULL: _sub_matrix = malloc(new_size * new_size * sizeof(double)) if _sub_matrix == NULL: return for i in range(new_size * new_size): _sub_matrix[i] = _default_sub for i in range(new_size): _sub_matrix[i * new_size + i] = 0.0 return new_mat = malloc(new_size * new_size * sizeof(double)) if new_mat == NULL: return for i in range(new_size * new_size): new_mat[i] = _default_sub for i in range(new_size): new_mat[i * new_size + i] = 0.0 free(_sub_matrix) _sub_matrix = new_mat # --------------------------------------------------------------------------- # Shared: encode inputs and precompute boundaries # --------------------------------------------------------------------------- cdef struct EncodedInput: int *P_ids int *R_ids int *R_w char *start_bd char *end_bd int *ws_pos # word-start positions (sorted) int *we_pos # word-end positions (sorted) int n_ws # count of word-start positions int n_we # count of word-end positions int mat_size # phoneme matrix size at time of encoding int m # len(P) int n # len(R) cdef EncodedInput _encode_inputs(list P_list, list R_list, list R_phone_to_word_list) except *: """Encode string lists to C arrays, precompute word boundaries.""" cdef EncodedInput enc cdef int m = len(P_list) cdef int n = len(R_list) cdef int i, j cdef bint need_rebuild = False enc.m = m enc.n = n enc.P_ids = NULL enc.R_ids = NULL enc.R_w = NULL enc.start_bd = NULL enc.end_bd = NULL enc.ws_pos = NULL enc.we_pos = NULL enc.P_ids = malloc(m * sizeof(int)) enc.R_ids = malloc(n * sizeof(int)) enc.R_w = malloc(n * sizeof(int)) if enc.P_ids == NULL or enc.R_ids == NULL or enc.R_w == NULL: _free_encoded(&enc) raise MemoryError() for i in range(m): p = P_list[i] if p not in _phoneme_to_id: _encode_phoneme(p) need_rebuild = True enc.P_ids[i] = _phoneme_to_id[p] for j in range(n): r = R_list[j] if r not in _phoneme_to_id: _encode_phoneme(r) need_rebuild = True enc.R_ids[j] = _phoneme_to_id[r] enc.R_w[j] = R_phone_to_word_list[j] if need_rebuild and _sub_matrix != NULL: _grow_matrix() enc.mat_size = _num_phonemes # Precompute boundary flags enc.start_bd = malloc((n + 1) * sizeof(char)) enc.end_bd = malloc((n + 1) * sizeof(char)) if enc.start_bd == NULL or enc.end_bd == NULL: _free_encoded(&enc) raise MemoryError() enc.start_bd[0] = 1 for j in range(1, n): enc.start_bd[j] = 1 if enc.R_w[j] != enc.R_w[j - 1] else 0 enc.start_bd[n] = 0 enc.end_bd[0] = 0 for j in range(1, n): enc.end_bd[j] = 1 if enc.R_w[j] != enc.R_w[j - 1] else 0 enc.end_bd[n] = 1 # Build sorted arrays of boundary positions enc.n_ws = 0 enc.n_we = 0 for j in range(n + 1): if enc.start_bd[j]: enc.n_ws += 1 if enc.end_bd[j]: enc.n_we += 1 enc.ws_pos = malloc(enc.n_ws * sizeof(int)) enc.we_pos = malloc(enc.n_we * sizeof(int)) if enc.ws_pos == NULL or enc.we_pos == NULL: _free_encoded(&enc) raise MemoryError() cdef int ws_i = 0, we_i = 0 for j in range(n + 1): if enc.start_bd[j]: enc.ws_pos[ws_i] = j; ws_i += 1 if enc.end_bd[j]: enc.we_pos[we_i] = j; we_i += 1 return enc cdef void _free_encoded(EncodedInput *enc): """Free all arrays in an EncodedInput.""" if enc.P_ids != NULL: free(enc.P_ids) if enc.R_ids != NULL: free(enc.R_ids) if enc.R_w != NULL: free(enc.R_w) if enc.start_bd != NULL: free(enc.start_bd) if enc.end_bd != NULL: free(enc.end_bd) if enc.ws_pos != NULL: free(enc.ws_pos) if enc.we_pos != NULL: free(enc.we_pos) enc.P_ids = NULL enc.R_ids = NULL enc.R_w = NULL enc.start_bd = NULL enc.end_bd = NULL enc.ws_pos = NULL enc.we_pos = NULL # --------------------------------------------------------------------------- # Rolling-row DP for max_wraps=0 (fast path, 89% of segments) # --------------------------------------------------------------------------- cdef tuple _align_rolling( EncodedInput *enc, int expected_word, double prior_weight, double cost_sub, double cost_del, double cost_ins, ): """Standard word-boundary DP using rolling rows. No wraparound.""" cdef int m = enc.m, n = enc.n cdef int mat_size = enc.mat_size cdef double INF_VAL = INFINITY # Allocate rolling rows cdef double *prev_cost = malloc((n + 1) * sizeof(double)) cdef double *curr_cost = malloc((n + 1) * sizeof(double)) cdef int *prev_start = malloc((n + 1) * sizeof(int)) cdef int *curr_start = malloc((n + 1) * sizeof(int)) if prev_cost == NULL or curr_cost == NULL or prev_start == NULL or curr_start == NULL: if prev_cost != NULL: free(prev_cost) if curr_cost != NULL: free(curr_cost) if prev_start != NULL: free(prev_start) if curr_start != NULL: free(curr_start) raise MemoryError() cdef int i, j cdef double del_option, ins_option, sub_option, sc cdef double *tmp_d cdef int *tmp_i cdef bint col0_start = enc.start_bd[0] # Initialize row 0 for j in range(n + 1): if enc.start_bd[j]: prev_cost[j] = 0.0 prev_start[j] = j else: prev_cost[j] = INF_VAL prev_start[j] = -1 # Core DP loop for i in range(1, m + 1): if col0_start: curr_cost[0] = i * cost_del curr_start[0] = 0 else: curr_cost[0] = INF_VAL curr_start[0] = -1 for j in range(1, n + 1): del_option = prev_cost[j] + cost_del ins_option = curr_cost[j - 1] + cost_ins sc = _get_sub_cost(enc.P_ids[i - 1], enc.R_ids[j - 1], mat_size) sub_option = prev_cost[j - 1] + sc if sub_option <= del_option and sub_option <= ins_option: curr_cost[j] = sub_option curr_start[j] = prev_start[j - 1] elif del_option <= ins_option: curr_cost[j] = del_option curr_start[j] = prev_start[j] else: curr_cost[j] = ins_option curr_start[j] = curr_start[j - 1] tmp_d = prev_cost; prev_cost = curr_cost; curr_cost = tmp_d tmp_i = prev_start; prev_start = curr_start; curr_start = tmp_i # Best-match selection cdef double best_score = INF_VAL cdef int best_j = -1, best_j_start = -1 cdef double best_cost_val = INF_VAL, best_norm = INF_VAL cdef double dist, norm_dist, prior, score cdef int j_start_val, ref_len, denom, start_word for j in range(1, n + 1): if not enc.end_bd[j]: continue if prev_cost[j] >= INF_VAL: continue dist = prev_cost[j] j_start_val = prev_start[j] ref_len = j - j_start_val denom = m if m > ref_len else ref_len if denom < 1: denom = 1 norm_dist = dist / denom if j_start_val < n: start_word = enc.R_w[j_start_val] else: start_word = enc.R_w[j - 1] prior = prior_weight * fabs((start_word - expected_word)) score = norm_dist + prior if score < best_score: best_score = score best_j = j best_j_start = j_start_val best_cost_val = dist best_norm = norm_dist free(prev_cost); free(curr_cost) free(prev_start); free(curr_start) if best_j < 0: return (None, None, float('inf'), float('inf'), 0, 0, []) return (best_j, best_j_start, best_cost_val, best_norm, 0, best_j, []) # --------------------------------------------------------------------------- # Full 3D DP for max_wraps>0 (with traceback) # --------------------------------------------------------------------------- cdef tuple _align_full_3d( EncodedInput *enc, int expected_word, double prior_weight, double cost_sub, double cost_del, double cost_ins, double wrap_penalty, int max_wraps, int sc_mode, # 0=subtract, 1=no_subtract, 2=additive double wrap_score_cost, double wrap_span_weight, ): """Wraparound DP with full 3D matrix and parent pointers for traceback.""" cdef int m = enc.m, n = enc.n cdef int K = max_wraps cdef int mat_size = enc.mat_size cdef double INF_VAL = INFINITY # 3D indexing: [i * layer_stride + k * col_stride + j] cdef int col_stride = n + 1 cdef int layer_stride = (K + 1) * col_stride cdef long total_3d = (m + 1) * layer_stride # Allocate 3D arrays cdef double *cost_3d = NULL cdef int *start_3d = NULL cdef int *max_j_3d = NULL cdef int *min_w_3d = NULL # minimum word index reached along path cdef int *par_i = NULL cdef int *par_k = NULL cdef int *par_j = NULL cdef char *par_t = NULL # 0=sub, 1=del, 2=ins, 3=wrap cdef int BIG_W = 999999 cost_3d = malloc(total_3d * sizeof(double)) start_3d = malloc(total_3d * sizeof(int)) max_j_3d = malloc(total_3d * sizeof(int)) min_w_3d = malloc(total_3d * sizeof(int)) par_i = malloc(total_3d * sizeof(int)) par_k = malloc(total_3d * sizeof(int)) par_j = malloc(total_3d * sizeof(int)) par_t = malloc(total_3d * sizeof(char)) if (cost_3d == NULL or start_3d == NULL or max_j_3d == NULL or min_w_3d == NULL or par_i == NULL or par_k == NULL or par_j == NULL or par_t == NULL): if cost_3d != NULL: free(cost_3d) if start_3d != NULL: free(start_3d) if max_j_3d != NULL: free(max_j_3d) if min_w_3d != NULL: free(min_w_3d) if par_i != NULL: free(par_i) if par_k != NULL: free(par_k) if par_j != NULL: free(par_j) if par_t != NULL: free(par_t) raise MemoryError() cdef long idx cdef int i, j, k cdef int koff, koff_src, koff_dst cdef long base_i, base_prev cdef int w_j, mw_val # Initialize all to INF / -1 for idx in range(total_3d): cost_3d[idx] = INF_VAL start_3d[idx] = -1 max_j_3d[idx] = -1 min_w_3d[idx] = BIG_W par_i[idx] = -1 par_k[idx] = -1 par_j[idx] = -1 par_t[idx] = -1 # Row 0, k=0: free starts at word boundaries for j in range(n + 1): if enc.start_bd[j]: cost_3d[j] = 0.0 # i=0, k=0, j start_3d[j] = j max_j_3d[j] = j min_w_3d[j] = enc.R_w[j] if j < n else BIG_W # Variables for DP transitions cdef double del_opt, ins_opt, sub_opt, sc, new_cost, cost_at_end, best_val cdef int j_end, j_sw, mj_val, word_span cdef int we_i, ws_i # Fill DP for i in range(1, m + 1): base_i = i * layer_stride base_prev = (i - 1) * layer_stride # Standard transitions for each k for k in range(K + 1): koff = k * col_stride # Column 0: deletion base case, k=0 only if k == 0 and enc.start_bd[0]: idx = base_i + koff cost_3d[idx] = i * cost_del start_3d[idx] = 0 max_j_3d[idx] = 0 min_w_3d[idx] = min_w_3d[base_prev + koff] par_i[idx] = i - 1 par_k[idx] = 0 par_j[idx] = 0 par_t[idx] = 1 # del for j in range(1, n + 1): idx = base_i + koff + j # Deletion: prev row, same j, same k del_opt = INF_VAL if cost_3d[base_prev + koff + j] < INF_VAL: del_opt = cost_3d[base_prev + koff + j] + cost_del # Insertion: same row, j-1, same k ins_opt = INF_VAL if cost_3d[base_i + koff + j - 1] < INF_VAL: ins_opt = cost_3d[base_i + koff + j - 1] + cost_ins # Substitution: prev row, j-1, same k sub_opt = INF_VAL if cost_3d[base_prev + koff + j - 1] < INF_VAL: sc = _get_sub_cost(enc.P_ids[i - 1], enc.R_ids[j - 1], mat_size) sub_opt = cost_3d[base_prev + koff + j - 1] + sc if sub_opt <= del_opt and sub_opt <= ins_opt: cost_3d[idx] = sub_opt start_3d[idx] = start_3d[base_prev + koff + j - 1] mj_val = max_j_3d[base_prev + koff + j - 1] max_j_3d[idx] = j if j > mj_val else mj_val w_j = enc.R_w[j - 1] mw_val = min_w_3d[base_prev + koff + j - 1] min_w_3d[idx] = w_j if w_j < mw_val else mw_val par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 0 elif del_opt <= ins_opt: cost_3d[idx] = del_opt start_3d[idx] = start_3d[base_prev + koff + j] max_j_3d[idx] = max_j_3d[base_prev + koff + j] min_w_3d[idx] = min_w_3d[base_prev + koff + j] par_i[idx] = i - 1; par_k[idx] = k; par_j[idx] = j; par_t[idx] = 1 elif ins_opt < INF_VAL: cost_3d[idx] = ins_opt start_3d[idx] = start_3d[base_i + koff + j - 1] mj_val = max_j_3d[base_i + koff + j - 1] max_j_3d[idx] = j if j > mj_val else mj_val w_j = enc.R_w[j - 1] mw_val = min_w_3d[base_i + koff + j - 1] min_w_3d[idx] = w_j if w_j < mw_val else mw_val par_i[idx] = i; par_k[idx] = k; par_j[idx] = j - 1; par_t[idx] = 2 # Wrap transitions (within same row i) for k in range(K): koff_src = k * col_stride koff_dst = (k + 1) * col_stride for we_i in range(enc.n_we): j_end = enc.we_pos[we_i] if cost_3d[base_i + koff_src + j_end] >= INF_VAL: continue cost_at_end = cost_3d[base_i + koff_src + j_end] for ws_i in range(enc.n_ws): j_sw = enc.ws_pos[ws_i] if j_sw >= j_end: continue word_span = enc.R_w[j_end - 1] - enc.R_w[j_sw] if word_span < 0: word_span = -word_span new_cost = cost_at_end + wrap_penalty + wrap_span_weight * word_span idx = base_i + koff_dst + j_sw if new_cost < cost_3d[idx]: cost_3d[idx] = new_cost start_3d[idx] = start_3d[base_i + koff_src + j_end] mj_val = max_j_3d[base_i + koff_src + j_end] max_j_3d[idx] = j_end if j_end > mj_val else mj_val mw_val = min_w_3d[base_i + koff_src + j_end] w_j = enc.R_w[j_sw] min_w_3d[idx] = w_j if w_j < mw_val else mw_val par_i[idx] = i; par_k[idx] = k; par_j[idx] = j_end; par_t[idx] = 3 # Insertion re-sweep from wrap positions for j in range(1, n + 1): idx = base_i + koff_dst + j ins_opt = cost_3d[base_i + koff_dst + j - 1] + cost_ins \ if cost_3d[base_i + koff_dst + j - 1] < INF_VAL else INF_VAL if ins_opt < cost_3d[idx]: cost_3d[idx] = ins_opt start_3d[idx] = start_3d[base_i + koff_dst + j - 1] mj_val = max_j_3d[base_i + koff_dst + j - 1] max_j_3d[idx] = j if j > mj_val else mj_val w_j = enc.R_w[j - 1] mw_val = min_w_3d[base_i + koff_dst + j - 1] min_w_3d[idx] = w_j if w_j < mw_val else mw_val par_i[idx] = i; par_k[idx] = k + 1; par_j[idx] = j - 1; par_t[idx] = 2 # ------------------------------------------------------------------ # Best-match selection (end boundaries only, across all k) # ------------------------------------------------------------------ cdef double best_score = INF_VAL cdef int best_j = -1, best_j_start = -1 cdef double best_cost_val = INF_VAL, best_norm = INF_VAL cdef int best_k_val = 0, best_max_j = -1 cdef double dist, norm_dist, prior, score, phoneme_cost cdef int j_start_val, ref_len, denom, start_word, max_j_val, min_word_val, eff_start base_i = m * layer_stride for k in range(K + 1): koff = k * col_stride for j in range(1, n + 1): if not enc.end_bd[j]: continue idx = base_i + koff + j if cost_3d[idx] >= INF_VAL: continue dist = cost_3d[idx] j_start_val = start_3d[idx] if j_start_val < 0: continue max_j_val = max_j_3d[idx] ref_len = (max_j_val if max_j_val > j else j) - j_start_val if ref_len <= 0: continue denom = m if m > ref_len else ref_len if denom < 1: denom = 1 if sc_mode == 1: # no_subtract phoneme_cost = dist else: # subtract or additive phoneme_cost = dist - k * wrap_penalty norm_dist = phoneme_cost / denom if j_start_val < n: start_word = enc.R_w[j_start_val] else: start_word = enc.R_w[j - 1] # Use earliest word the path actually touches for fair prior min_word_val = min_w_3d[idx] eff_start = min_word_val if min_word_val < start_word and min_word_val < BIG_W else start_word prior = prior_weight * fabs((eff_start - expected_word)) score = norm_dist + prior if sc_mode == 2: # additive score = score + k * wrap_score_cost if score < best_score: best_score = score best_j = j best_j_start = j_start_val best_cost_val = dist best_norm = norm_dist best_k_val = k best_max_j = max_j_val if best_j < 0: free(cost_3d); free(start_3d); free(max_j_3d); free(min_w_3d) free(par_i); free(par_k); free(par_j); free(par_t) return (None, None, float('inf'), float('inf'), 0, 0, []) # ------------------------------------------------------------------ # Traceback: walk parent pointers, collect wrap points # ------------------------------------------------------------------ wrap_points = [] cdef int ci = m, ck = best_k_val, cj = best_j cdef int pi, pk, pj cdef char pt while True: idx = ci * layer_stride + ck * col_stride + cj if par_i[idx] < 0: break pi = par_i[idx] pk = par_k[idx] pj = par_j[idx] pt = par_t[idx] if pt == 3: # wrap # Wrap: at P position ci, R jumped from pj (j_end) back to cj (j_start) wrap_points.append((ci, pj, cj)) ci = pi; ck = pk; cj = pj wrap_points.reverse() # chronological order free(cost_3d); free(start_3d); free(max_j_3d) free(par_i); free(par_k); free(par_j); free(par_t) return (best_j, best_j_start, best_cost_val, best_norm, best_k_val, best_max_j, wrap_points) # --------------------------------------------------------------------------- # Public API: unified wraparound DP # --------------------------------------------------------------------------- def cy_align_wraparound( list P_list, list R_list, list R_phone_to_word_list, int expected_word, double prior_weight, double cost_sub, double cost_del, double cost_ins, double wrap_penalty = 2.0, int max_wraps = 0, str scoring_mode = "subtract", double wrap_score_cost = 0.01, double wrap_span_weight = 0.1, ): """Wraparound word-boundary-constrained substring alignment (Cython). When max_wraps=0, uses rolling rows (fast path, no traceback needed). When max_wraps>0, uses full 3D matrix with parent pointers for traceback. Returns (best_j, best_j_start, best_cost, best_norm_dist, best_k, best_max_j, wrap_points). wrap_points: list of (i, j_end, j_start) — P position and R positions of each wrap. Empty list when no wraps detected. scoring_mode: "subtract" — phoneme_cost = dist - k*wrap_penalty (wrap is free in score) "no_subtract" — phoneme_cost = dist (wrap penalty stays in score) "additive" — phoneme_cost = dist - k*wrap_penalty, score += k*wrap_score_cost """ cdef int m = len(P_list) cdef int n = len(R_list) if m == 0 or n == 0: return (None, None, float('inf'), float('inf'), 0, 0, []) # Encode inputs cdef EncodedInput enc = _encode_inputs(P_list, R_list, R_phone_to_word_list) # Encode scoring mode outside of branches (cdef not allowed inside if blocks) cdef int sc_mode if scoring_mode == "no_subtract": sc_mode = 1 elif scoring_mode == "additive": sc_mode = 2 else: sc_mode = 0 cdef tuple result try: if max_wraps == 0: # Fast path: rolling rows, no wraparound result = _align_rolling( &enc, expected_word, prior_weight, cost_sub, cost_del, cost_ins, ) else: result = _align_full_3d( &enc, expected_word, prior_weight, cost_sub, cost_del, cost_ins, wrap_penalty, max_wraps, sc_mode, wrap_score_cost, wrap_span_weight, ) finally: _free_encoded(&enc) return result