# cython: boundscheck=False, wraparound=False, cdivision=True """ Cython-accelerated word-boundary-constrained substring Levenshtein DP. Drop-in replacement for the pure-Python align_with_word_boundaries() in phoneme_matcher.py. Callers still pass plain Python lists of strings; encoding to integer arrays happens inside this module. """ from libc.stdlib cimport malloc, free from libc.math cimport INFINITY, fabs # --------------------------------------------------------------------------- # Phoneme → integer encoding (built lazily on first call) # --------------------------------------------------------------------------- cdef dict _phoneme_to_id = {} cdef int _num_phonemes = 0 cdef double *_sub_matrix = NULL # flat _num_phonemes × _num_phonemes cdef double _default_sub = 1.0 cdef int _encode_phoneme(str p): """Return integer id for *p*, assigning a new one if unseen.""" global _num_phonemes cdef int pid try: pid = _phoneme_to_id[p] except KeyError: pid = _num_phonemes _phoneme_to_id[p] = pid _num_phonemes += 1 return pid def init_substitution_matrix(dict sub_costs, double default_sub): """Build the dense substitution-cost matrix from the Python dict. Must be called once before the first DP call (phoneme_matcher.py does this at import time). Parameters ---------- sub_costs : dict[(str, str), float] Phoneme-pair substitution costs (both orderings already present). default_sub : float Cost used for pairs not in *sub_costs*. """ global _sub_matrix, _default_sub, _num_phonemes _default_sub = default_sub # First pass: make sure every phoneme in sub_costs has an id for (a, b) in sub_costs: _encode_phoneme(a) _encode_phoneme(b) # Allocate matrix (will be re-allocated if new phonemes appear later) _rebuild_matrix(sub_costs) cdef void _rebuild_matrix(dict sub_costs): """(Re)allocate and fill the dense cost matrix.""" global _sub_matrix, _num_phonemes, _default_sub cdef int size = _num_phonemes cdef int i, j if _sub_matrix != NULL: free(_sub_matrix) _sub_matrix = malloc(size * size * sizeof(double)) if _sub_matrix == NULL: raise MemoryError("Failed to allocate substitution matrix") # Fill with default for i in range(size * size): _sub_matrix[i] = _default_sub # Diagonal = 0 (match) for i in range(size): _sub_matrix[i * size + i] = 0.0 # Overrides from dict cdef int aid, bid cdef double cost for (a, b), cost in sub_costs.items(): aid = _phoneme_to_id.get(a, -1) bid = _phoneme_to_id.get(b, -1) if aid >= 0 and bid >= 0: _sub_matrix[aid * size + bid] = cost cdef inline double _get_sub_cost(int pid, int rid, int size) nogil: """Look up substitution cost from the dense matrix.""" if pid == rid: return 0.0 if pid < size and rid < size: return _sub_matrix[pid * size + rid] return _default_sub # --------------------------------------------------------------------------- # Main DP function # --------------------------------------------------------------------------- def cy_align_with_word_boundaries( list P_list, list R_list, list R_phone_to_word_list, int expected_word, double prior_weight, double cost_sub, double cost_del, double cost_ins, ): """Word-boundary-constrained substring alignment (Cython). Identical semantics to the pure-Python version. Returns the same (best_j, best_j_start, best_cost, best_norm_dist) tuple, with ``(None, None, INF, INF)`` on failure. """ cdef int m = len(P_list) cdef int n = len(R_list) cdef double INF_VAL = INFINITY if m == 0 or n == 0: return (None, None, float('inf'), float('inf')) # ------------------------------------------------------------------ # Encode string lists → C arrays # ------------------------------------------------------------------ cdef int *P_ids = malloc(m * sizeof(int)) cdef int *R_ids = malloc(n * sizeof(int)) cdef int *R_w = malloc(n * sizeof(int)) if P_ids == NULL or R_ids == NULL or R_w == NULL: if P_ids != NULL: free(P_ids) if R_ids != NULL: free(R_ids) if R_w != NULL: free(R_w) raise MemoryError() cdef int i, j cdef bint need_rebuild = False for i in range(m): p = P_list[i] if p not in _phoneme_to_id: _encode_phoneme(p) need_rebuild = True P_ids[i] = _phoneme_to_id[p] for j in range(n): r = R_list[j] if r not in _phoneme_to_id: _encode_phoneme(r) need_rebuild = True R_ids[j] = _phoneme_to_id[r] R_w[j] = R_phone_to_word_list[j] # If new phonemes appeared, rebuild the matrix so ids are covered if need_rebuild and _sub_matrix != NULL: # We need the original sub_costs dict, but we don't have it here. # The safest approach: expand matrix with defaults for new phonemes. _grow_matrix() cdef int mat_size = _num_phonemes # ------------------------------------------------------------------ # Precompute boundary flags # ------------------------------------------------------------------ cdef char *start_boundary = malloc((n + 1) * sizeof(char)) cdef char *end_boundary = malloc((n + 1) * sizeof(char)) if start_boundary == NULL or end_boundary == NULL: free(P_ids); free(R_ids); free(R_w) if start_boundary != NULL: free(start_boundary) if end_boundary != NULL: free(end_boundary) raise MemoryError() # start_boundary[j]: can alignment start at column j? start_boundary[0] = 1 # column 0 always valid for j in range(1, n): start_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0 start_boundary[n] = 0 # can't start at or past end # end_boundary[j]: can alignment end at column j? end_boundary[0] = 0 # can't end before consuming anything for j in range(1, n): end_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0 end_boundary[n] = 1 # end of reference always valid # ------------------------------------------------------------------ # DP arrays (two-row rolling) # ------------------------------------------------------------------ cdef double *prev_cost = malloc((n + 1) * sizeof(double)) cdef double *curr_cost = malloc((n + 1) * sizeof(double)) cdef int *prev_start = malloc((n + 1) * sizeof(int)) cdef int *curr_start = malloc((n + 1) * sizeof(int)) if (prev_cost == NULL or curr_cost == NULL or prev_start == NULL or curr_start == NULL): free(P_ids); free(R_ids); free(R_w) free(start_boundary); free(end_boundary) if prev_cost != NULL: free(prev_cost) if curr_cost != NULL: free(curr_cost) if prev_start != NULL: free(prev_start) if curr_start != NULL: free(curr_start) raise MemoryError() # Initialise row 0 for j in range(n + 1): if start_boundary[j]: prev_cost[j] = 0.0 prev_start[j] = j else: prev_cost[j] = INF_VAL prev_start[j] = -1 # ------------------------------------------------------------------ # Core DP loop (no Python objects touched → runs at C speed) # ------------------------------------------------------------------ cdef double del_option, ins_option, sub_option, sc cdef double *tmp_d cdef int *tmp_i cdef bint col0_start = start_boundary[0] for i in range(1, m + 1): if col0_start: curr_cost[0] = i * cost_del curr_start[0] = 0 else: curr_cost[0] = INF_VAL curr_start[0] = -1 for j in range(1, n + 1): del_option = prev_cost[j] + cost_del ins_option = curr_cost[j - 1] + cost_ins sc = _get_sub_cost(P_ids[i - 1], R_ids[j - 1], mat_size) sub_option = prev_cost[j - 1] + sc if sub_option <= del_option and sub_option <= ins_option: curr_cost[j] = sub_option curr_start[j] = prev_start[j - 1] elif del_option <= ins_option: curr_cost[j] = del_option curr_start[j] = prev_start[j] else: curr_cost[j] = ins_option curr_start[j] = curr_start[j - 1] # Swap rows tmp_d = prev_cost; prev_cost = curr_cost; curr_cost = tmp_d tmp_i = prev_start; prev_start = curr_start; curr_start = tmp_i # ------------------------------------------------------------------ # Best-match selection (end boundaries only) # ------------------------------------------------------------------ cdef double best_score = INF_VAL cdef int best_j = -1 cdef int best_j_start = -1 cdef double best_cost_val = INF_VAL cdef double best_norm = INF_VAL cdef double dist, norm_dist, prior, score cdef int j_start_val, ref_len, denom, start_word for j in range(1, n + 1): if not end_boundary[j]: continue if prev_cost[j] >= INF_VAL: continue dist = prev_cost[j] j_start_val = prev_start[j] ref_len = j - j_start_val denom = m if m > ref_len else ref_len if denom < 1: denom = 1 norm_dist = dist / denom if j_start_val < n: start_word = R_w[j_start_val] else: start_word = R_w[j - 1] prior = prior_weight * fabs((start_word - expected_word)) score = norm_dist + prior if score < best_score: best_score = score best_j = j best_j_start = j_start_val best_cost_val = dist best_norm = norm_dist # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ free(P_ids); free(R_ids); free(R_w) free(start_boundary); free(end_boundary) free(prev_cost); free(curr_cost) free(prev_start); free(curr_start) if best_j < 0: return (None, None, float('inf'), float('inf')) return (best_j, best_j_start, best_cost_val, best_norm) # --------------------------------------------------------------------------- # Helper: grow matrix when new phonemes are encountered at runtime # --------------------------------------------------------------------------- cdef void _grow_matrix(): """Expand the substitution matrix to cover newly added phonemes. New rows/columns are filled with the default substitution cost, diagonal with 0.0. Existing entries are preserved. """ global _sub_matrix, _num_phonemes cdef int old_size cdef int new_size = _num_phonemes cdef double *new_mat if _sub_matrix == NULL: # No matrix yet — allocate fresh with defaults _sub_matrix = malloc(new_size * new_size * sizeof(double)) if _sub_matrix == NULL: return for i in range(new_size * new_size): _sub_matrix[i] = _default_sub for i in range(new_size): _sub_matrix[i * new_size + i] = 0.0 return # Figure out old size from current allocation # We track it implicitly: old_size = new_size - (number of phonemes added since last build) # Simpler: just rebuild from scratch with defaults + diagonal new_mat = malloc(new_size * new_size * sizeof(double)) if new_mat == NULL: return cdef int i, j_idx for i in range(new_size * new_size): new_mat[i] = _default_sub for i in range(new_size): new_mat[i * new_size + i] = 0.0 # Copy old entries (old matrix was some smaller size). # We don't know old_size exactly, so we just keep the new defaults. # The original sub_costs were already written; since we don't have # the dict here, the known-pair costs are lost for the new matrix. # This only happens if a completely new phoneme appears at runtime, # which is extremely rare. The init call covers all 69 known phonemes. free(_sub_matrix) _sub_matrix = new_mat