Spaces:

hetchyy
/

Quran-multi-aligner

Running on Zero

App Files Files Community

Quran-multi-aligner / src /_dp_core.pyx

hetchyy

Initial commit

20e9692 7 days ago

raw

history blame contribute delete

12.4 kB

	# cython: boundscheck=False, wraparound=False, cdivision=True
	"""
	Cython-accelerated word-boundary-constrained substring Levenshtein DP.

	Drop-in replacement for the pure-Python align_with_word_boundaries() in
	phoneme_matcher.py. Callers still pass plain Python lists of strings;
	encoding to integer arrays happens inside this module.
	"""

	from libc.stdlib cimport malloc, free
	from libc.math cimport INFINITY, fabs

	# ---------------------------------------------------------------------------
	# Phoneme → integer encoding (built lazily on first call)
	# ---------------------------------------------------------------------------

	cdef dict _phoneme_to_id = {}
	cdef int _num_phonemes = 0
	cdef double *_sub_matrix = NULL # flat _num_phonemes × _num_phonemes
	cdef double _default_sub = 1.0


	cdef int _encode_phoneme(str p):
	"""Return integer id for p, assigning a new one if unseen."""
	global _num_phonemes
	cdef int pid
	try:
	pid = _phoneme_to_id[p]
	except KeyError:
	pid = _num_phonemes
	_phoneme_to_id[p] = pid
	_num_phonemes += 1
	return pid


	def init_substitution_matrix(dict sub_costs, double default_sub):
	"""Build the dense substitution-cost matrix from the Python dict.

	Must be called once before the first DP call (phoneme_matcher.py does
	this at import time).

	Parameters
	----------
	sub_costs : dict[(str, str), float]
	Phoneme-pair substitution costs (both orderings already present).
	default_sub : float
	Cost used for pairs not in sub_costs.
	"""
	global _sub_matrix, _default_sub, _num_phonemes

	_default_sub = default_sub

	# First pass: make sure every phoneme in sub_costs has an id
	for (a, b) in sub_costs:
	_encode_phoneme(a)
	_encode_phoneme(b)

	# Allocate matrix (will be re-allocated if new phonemes appear later)
	_rebuild_matrix(sub_costs)


	cdef void _rebuild_matrix(dict sub_costs):
	"""(Re)allocate and fill the dense cost matrix."""
	global _sub_matrix, _num_phonemes, _default_sub

	cdef int size = _num_phonemes
	cdef int i, j

	if _sub_matrix != NULL:
	free(_sub_matrix)

	_sub_matrix = <double >malloc(size size * sizeof(double))
	if _sub_matrix == NULL:
	raise MemoryError("Failed to allocate substitution matrix")

	# Fill with default
	for i in range(size * size):
	_sub_matrix[i] = _default_sub

	# Diagonal = 0 (match)
	for i in range(size):
	_sub_matrix[i * size + i] = 0.0

	# Overrides from dict
	cdef int aid, bid
	cdef double cost
	for (a, b), cost in sub_costs.items():
	aid = _phoneme_to_id.get(a, -1)
	bid = _phoneme_to_id.get(b, -1)
	if aid >= 0 and bid >= 0:
	_sub_matrix[aid * size + bid] = cost


	cdef inline double _get_sub_cost(int pid, int rid, int size) nogil:
	"""Look up substitution cost from the dense matrix."""
	if pid == rid:
	return 0.0
	if pid < size and rid < size:
	return _sub_matrix[pid * size + rid]
	return _default_sub


	# ---------------------------------------------------------------------------
	# Main DP function
	# ---------------------------------------------------------------------------

	def cy_align_with_word_boundaries(
	list P_list,
	list R_list,
	list R_phone_to_word_list,
	int expected_word,
	double prior_weight,
	double cost_sub,
	double cost_del,
	double cost_ins,
	):
	"""Word-boundary-constrained substring alignment (Cython).

	Identical semantics to the pure-Python version. Returns the same
	(best_j, best_j_start, best_cost, best_norm_dist) tuple, with
	``(None, None, INF, INF)`` on failure.
	"""
	cdef int m = len(P_list)
	cdef int n = len(R_list)
	cdef double INF_VAL = INFINITY

	if m == 0 or n == 0:
	return (None, None, float('inf'), float('inf'))

	# ------------------------------------------------------------------
	# Encode string lists → C arrays
	# ------------------------------------------------------------------
	cdef int P_ids = <int >malloc(m * sizeof(int))
	cdef int R_ids = <int >malloc(n * sizeof(int))
	cdef int R_w = <int >malloc(n * sizeof(int))
	if P_ids == NULL or R_ids == NULL or R_w == NULL:
	if P_ids != NULL: free(P_ids)
	if R_ids != NULL: free(R_ids)
	if R_w != NULL: free(R_w)
	raise MemoryError()

	cdef int i, j
	cdef bint need_rebuild = False

	for i in range(m):
	p = P_list[i]
	if p not in _phoneme_to_id:
	_encode_phoneme(p)
	need_rebuild = True
	P_ids[i] = _phoneme_to_id[p]

	for j in range(n):
	r = R_list[j]
	if r not in _phoneme_to_id:
	_encode_phoneme(r)
	need_rebuild = True
	R_ids[j] = _phoneme_to_id[r]
	R_w[j] = <int>R_phone_to_word_list[j]

	# If new phonemes appeared, rebuild the matrix so ids are covered
	if need_rebuild and _sub_matrix != NULL:
	# We need the original sub_costs dict, but we don't have it here.
	# The safest approach: expand matrix with defaults for new phonemes.
	_grow_matrix()

	cdef int mat_size = _num_phonemes

	# ------------------------------------------------------------------
	# Precompute boundary flags
	# ------------------------------------------------------------------
	cdef char start_boundary = <char >malloc((n + 1) * sizeof(char))
	cdef char end_boundary = <char >malloc((n + 1) * sizeof(char))
	if start_boundary == NULL or end_boundary == NULL:
	free(P_ids); free(R_ids); free(R_w)
	if start_boundary != NULL: free(start_boundary)
	if end_boundary != NULL: free(end_boundary)
	raise MemoryError()

	# start_boundary[j]: can alignment start at column j?
	start_boundary[0] = 1 # column 0 always valid
	for j in range(1, n):
	start_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0
	start_boundary[n] = 0 # can't start at or past end

	# end_boundary[j]: can alignment end at column j?
	end_boundary[0] = 0 # can't end before consuming anything
	for j in range(1, n):
	end_boundary[j] = 1 if R_w[j] != R_w[j - 1] else 0
	end_boundary[n] = 1 # end of reference always valid

	# ------------------------------------------------------------------
	# DP arrays (two-row rolling)
	# ------------------------------------------------------------------
	cdef double prev_cost = <double >malloc((n + 1) * sizeof(double))
	cdef double curr_cost = <double >malloc((n + 1) * sizeof(double))
	cdef int prev_start = <int >malloc((n + 1) * sizeof(int))
	cdef int curr_start = <int >malloc((n + 1) * sizeof(int))
	if (prev_cost == NULL or curr_cost == NULL or
	prev_start == NULL or curr_start == NULL):
	free(P_ids); free(R_ids); free(R_w)
	free(start_boundary); free(end_boundary)
	if prev_cost != NULL: free(prev_cost)
	if curr_cost != NULL: free(curr_cost)
	if prev_start != NULL: free(prev_start)
	if curr_start != NULL: free(curr_start)
	raise MemoryError()

	# Initialise row 0
	for j in range(n + 1):
	if start_boundary[j]:
	prev_cost[j] = 0.0
	prev_start[j] = j
	else:
	prev_cost[j] = INF_VAL
	prev_start[j] = -1

	# ------------------------------------------------------------------
	# Core DP loop (no Python objects touched → runs at C speed)
	# ------------------------------------------------------------------
	cdef double del_option, ins_option, sub_option, sc
	cdef double *tmp_d
	cdef int *tmp_i
	cdef bint col0_start = start_boundary[0]

	for i in range(1, m + 1):
	if col0_start:
	curr_cost[0] = i * cost_del
	curr_start[0] = 0
	else:
	curr_cost[0] = INF_VAL
	curr_start[0] = -1

	for j in range(1, n + 1):
	del_option = prev_cost[j] + cost_del
	ins_option = curr_cost[j - 1] + cost_ins
	sc = _get_sub_cost(P_ids[i - 1], R_ids[j - 1], mat_size)
	sub_option = prev_cost[j - 1] + sc

	if sub_option <= del_option and sub_option <= ins_option:
	curr_cost[j] = sub_option
	curr_start[j] = prev_start[j - 1]
	elif del_option <= ins_option:
	curr_cost[j] = del_option
	curr_start[j] = prev_start[j]
	else:
	curr_cost[j] = ins_option
	curr_start[j] = curr_start[j - 1]

	# Swap rows
	tmp_d = prev_cost; prev_cost = curr_cost; curr_cost = tmp_d
	tmp_i = prev_start; prev_start = curr_start; curr_start = tmp_i

	# ------------------------------------------------------------------
	# Best-match selection (end boundaries only)
	# ------------------------------------------------------------------
	cdef double best_score = INF_VAL
	cdef int best_j = -1
	cdef int best_j_start = -1
	cdef double best_cost_val = INF_VAL
	cdef double best_norm = INF_VAL

	cdef double dist, norm_dist, prior, score
	cdef int j_start_val, ref_len, denom, start_word

	for j in range(1, n + 1):
	if not end_boundary[j]:
	continue
	if prev_cost[j] >= INF_VAL:
	continue

	dist = prev_cost[j]
	j_start_val = prev_start[j]

	ref_len = j - j_start_val
	denom = m if m > ref_len else ref_len
	if denom < 1:
	denom = 1
	norm_dist = dist / denom

	if j_start_val < n:
	start_word = R_w[j_start_val]
	else:
	start_word = R_w[j - 1]

	prior = prior_weight * fabs(<double>(start_word - expected_word))
	score = norm_dist + prior

	if score < best_score:
	best_score = score
	best_j = j
	best_j_start = j_start_val
	best_cost_val = dist
	best_norm = norm_dist

	# ------------------------------------------------------------------
	# Cleanup
	# ------------------------------------------------------------------
	free(P_ids); free(R_ids); free(R_w)
	free(start_boundary); free(end_boundary)
	free(prev_cost); free(curr_cost)
	free(prev_start); free(curr_start)

	if best_j < 0:
	return (None, None, float('inf'), float('inf'))

	return (best_j, best_j_start, best_cost_val, best_norm)


	# ---------------------------------------------------------------------------
	# Helper: grow matrix when new phonemes are encountered at runtime
	# ---------------------------------------------------------------------------

	cdef void _grow_matrix():
	"""Expand the substitution matrix to cover newly added phonemes.

	New rows/columns are filled with the default substitution cost,
	diagonal with 0.0. Existing entries are preserved.
	"""
	global _sub_matrix, _num_phonemes

	cdef int old_size
	cdef int new_size = _num_phonemes
	cdef double *new_mat

	if _sub_matrix == NULL:
	# No matrix yet — allocate fresh with defaults
	_sub_matrix = <double >malloc(new_size new_size * sizeof(double))
	if _sub_matrix == NULL:
	return
	for i in range(new_size * new_size):
	_sub_matrix[i] = _default_sub
	for i in range(new_size):
	_sub_matrix[i * new_size + i] = 0.0
	return

	# Figure out old size from current allocation
	# We track it implicitly: old_size = new_size - (number of phonemes added since last build)
	# Simpler: just rebuild from scratch with defaults + diagonal
	new_mat = <double >malloc(new_size new_size * sizeof(double))
	if new_mat == NULL:
	return

	cdef int i, j_idx
	for i in range(new_size * new_size):
	new_mat[i] = _default_sub
	for i in range(new_size):
	new_mat[i * new_size + i] = 0.0

	# Copy old entries (old matrix was some smaller size).
	# We don't know old_size exactly, so we just keep the new defaults.
	# The original sub_costs were already written; since we don't have
	# the dict here, the known-pair costs are lost for the new matrix.
	# This only happens if a completely new phoneme appears at runtime,
	# which is extremely rare. The init call covers all 69 known phonemes.

	free(_sub_matrix)
	_sub_matrix = new_mat