Advanced-MIDI-Renderer

Sleeping

App Files Files Community

Advanced-MIDI-Renderer / TCUPY.py

asigalov61

Upload 2 files

530d442 verified 6 months ago

raw

history blame contribute delete

49.5 kB

	#! /usr/bin/python3

	r'''############################################################################
	################################################################################
	#
	#
	# Tegridy Cupy Python Module (TCUPY)
	# Version 1.0
	#
	# Project Los Angeles
	#
	# Tegridy Code 2025
	#
	# https://github.com/asigalov61/tegridy-tools
	#
	#
	################################################################################
	#
	# Copyright 2024 Project Los Angeles / Tegridy Code
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	################################################################################
	################################################################################
	#
	# Critical dependencies
	#
	# !pip install cupy-cuda12x
	# !pip install numpy==1.24.4
	#
	################################################################################
	'''

	################################################################################

	print('=' * 70)
	print('Loading module...')
	print('Please wait...')
	print('=' * 70)

	################################################################################

	import sys
	import os

	################################################################################

	try:
	import cupy as cp
	import cupy as np
	print('=' * 70)
	print('CuPy is found!')
	print('Will use CuPy and GPU for processing!')
	print('=' * 70)

	except ImportError as e:
	print(f"Error: Could not import CuPy. Details: {e}")
	# Handle the error, such as providing a fallback or exiting the program
	# For example:
	print("Please make sure CuPy is installed.")
	print('=' * 70)

	raise RuntimeError("CuPy could not be loaded!") from e

	################################################################################

	from collections import defaultdict, deque
	from typing import Optional, Tuple, Dict, Any, List

	################################################################################

	# Constants
	MEMORY_LEN = 12 # Autoregressive context length
	SEQUENCE_LENGTH = 32 # Each sequence has 24 triplets

	# Baseline penalty values:
	REPETITION_PENALTY = (1.0, 1.0, 1.0) # base repetition penalty per element
	SPIKE_PENALTY_STRENGTH = (1.0, 1.0, 1.0) # base spike penalty strength per element
	SPIKE_SIGMA = (1.0, 1.0, 1.0) # baseline sigma value per element (minimum allowed)

	###################################################################################

	def find_numpy_array(src_array, trg_array):

	"""
	Finds 1D numpy array in 2D numpy array
	"""

	match_mask = np.all(src_array == trg_array, axis=1)

	return np.where(match_mask)[0]

	###################################################################################

	def vertical_list_search(src_list, trg_list):

	"""
	For each vertical window of consecutive rows of height len(trg_list) in src_list,
	this function checks whether for every offset j (0 <= j < len(trg_list)) the row
	at index (window_start + j) contains trg_list[j].

	It returns a list of windows (each a list of consecutive row indices) that meet this condition.
	"""

	if not src_list or not trg_list:
	return []

	n = len(src_list)
	k = len(trg_list)

	num_windows = n - k + 1

	if num_windows <= 0:
	return []

	# Determine the maximum row length.
	max_len = max(len(row) for row in src_list)

	# Determine a fill value guaranteed to be less than any valid value.
	global_min = min(min(row) for row in src_list if row)
	fill_value = global_min - 1

	# Build a padded 2D array A (shape n x max_len) from src_list.
	A = np.full((n, max_len), fill_value, dtype=np.int64)
	for i, row in enumerate(src_list):
	L = len(row)
	A[i, :L] = row

	# For each unique target in trg_list, compute a Boolean vector of length n.
	# present[t][i] will be True if A[i, :] contains t, else False.
	unique_targets = set(trg_list)

	present_dict = {}

	for t in unique_targets:
	# Compute along axis=1 so that for each row we see if any element equals t.
	present_dict[t] = np.any(A == t, axis=1)

	# Build a Boolean array B of shape (k, num_windows) where for each offset j,
	# B[j, s] = present_dict[ trg_list[j] ][s + j] for each window starting index s.
	B = np.empty((k, num_windows), dtype=bool)

	for j in range(k):
	t = trg_list[j]
	# For a vertical window starting at s, row s+j should contain t.
	B[j, :] = present_dict[t][j: j + num_windows]

	# A window is valid if all k rows in that window contain the required target.
	valid_windows_mask = np.all(B, axis=0)
	valid_starts = np.nonzero(valid_windows_mask)[0]

	# Create output windows (each as a list of consecutive row indices).
	result = [list(range(s, s + k)) for s in valid_starts]

	return result


	###################################################################################

	def pack_sequences(train_data, pad_val=-1):
	"""
	Packs a list of variable-length token sequences into a 2D CuPy array.

	This version computes lengths and builds the padded array and mask entirely on GPU.
	It converts each sequence into a CuPy array, concatenates them, and assigns tokens in one shot.

	Returns:
	batch: a CuPy array of shape (n, max_len)
	lengths: a CuPy array of shape (n,) containing each sequence's length.
	"""
	n = len(train_data)
	# Compute lengths of each sequence and convert to a CuPy array.
	lengths = cp.array([len(seq) for seq in train_data], dtype=cp.int64)
	max_len_val = int(cp.max(lengths).get())
	# Allocate the padded 2D array filled with pad_val.
	batch = cp.full((n, max_len_val), pad_val, dtype=cp.int64)
	# Create a boolean mask: for each row, positions less than the sequence length are valid.
	mask = cp.arange(max_len_val).reshape(1, max_len_val) < lengths.reshape(n, 1)
	# Convert each sequence to a CuPy array and concatenate them.
	sequences = [cp.array(seq, dtype=cp.int64) for seq in train_data]
	flat = cp.concatenate(sequences)
	# Fill in the valid positions.
	batch[mask] = flat
	return batch, lengths

	###################################################################################

	def count_best_pair_gpu(batch, lengths, factor, pad_val=-1):
	"""
	Given the entire GPU-resident packed data, compute the most frequent
	adjacent pair (encoded as: pair_val = first * factor + second) on GPU.
	"""
	n, L = batch.shape
	cols = cp.arange(L - 1, dtype=cp.int64)
	cols_expanded = cp.broadcast_to(cols, (n, L - 1))
	valid_mask = cols_expanded < cp.reshape(lengths, (n, 1)) - 1

	first_tokens = batch[:, :L - 1]
	second_tokens = batch[:, 1:L]
	valid_first = first_tokens[valid_mask]
	valid_second = second_tokens[valid_mask]

	pairs = valid_first * factor + valid_second
	if pairs.size == 0:
	return None

	sorted_pairs = cp.sort(pairs)
	diff = cp.diff(sorted_pairs)
	boundaries = cp.nonzero(diff)[0] + 1
	group_starts = cp.concatenate([cp.array([0], dtype=cp.int64), boundaries])
	group_ends = cp.concatenate([boundaries, cp.array([sorted_pairs.size], dtype=cp.int64)])
	group_counts = group_ends - group_starts

	max_idx = int(cp.argmax(group_counts))
	best_pair_enc = int(sorted_pairs[group_starts[max_idx]])
	best_freq = int(group_counts[max_idx])
	first = best_pair_enc // factor
	second = best_pair_enc % factor
	return (first, second, best_freq)

	###################################################################################

	merge_kernel_code = r'''
	extern "C" __global__
	void merge_pair_kernel(const long* input, long* output,
	const long* input_lengths, long* output_lengths,
	const long num_rows, const long num_cols,
	const long a, const long b, const long new_token,
	const long pad_val) {
	int row = blockIdx.x * blockDim.x + threadIdx.x;
	if (row >= num_rows) return;
	long in_length = input_lengths[row];
	long out_idx = 0;
	bool skip_next = false;
	for (long i = 0; i < in_length; i++) {
	if (skip_next) {
	skip_next = false;
	continue;
	}
	long token = input[row * num_cols + i];
	if (i < in_length - 1 && token == a && input[row * num_cols + i + 1] == b) {
	output[row * num_cols + out_idx] = new_token;
	out_idx++;
	skip_next = true;
	} else {
	output[row * num_cols + out_idx] = token;
	out_idx++;
	}
	}
	output_lengths[row] = out_idx;
	for (long j = out_idx; j < num_cols; j++) {
	output[row * num_cols + j] = pad_val;
	}
	}
	'''
	merge_kernel = cp.RawKernel(merge_kernel_code, 'merge_pair_kernel')

	###################################################################################

	def learn_bpe_codes_gpu(train_data, vocab_size=4096, max_merges=None, pad_val=-1):
	"""
	Learn BPE merge rules completely on GPU.

	The training data is packed once (using the vectorized pack_sequences).
	On each merge iteration, the best adjacent pair is computed on GPU and then merged
	into a new token via a custom merge kernel (with double-buffering).

	Returns:
	codes: a list of merge rules as ((first, second), new_token)
	final_data: the merged training data (list of sequences)
	"""
	# Pack the entire dataset onto GPU.
	batch, lengths = pack_sequences(train_data, pad_val)
	n, L = batch.shape

	# Initialize vocabulary and the next available token.
	initial_vocab = {token for seq in train_data for token in seq}
	next_token = max(initial_vocab) + 1
	codes = []
	merge_count = 0
	pbar = tqdm.tqdm(total=max_merges if max_merges is not None else None,
	desc="Learning BPE Codes (GPU)", leave=True)

	# Preallocate buffers for double-buffering.
	work_batch = cp.empty_like(batch)
	work_lengths = cp.empty_like(lengths)
	input_batch = batch
	input_lengths = lengths

	threads_per_block = 128
	blocks = (n + threads_per_block - 1) // threads_per_block

	while next_token < vocab_size and (max_merges is None or merge_count < max_merges):
	# Early stop if all sequences have collapsed (checked on GPU).
	if bool(cp.all(input_lengths == 1)):
	pbar.write("All sequences have collapsed; stopping early.")
	break

	factor = next_token # by construction, every token is < next_token
	best = count_best_pair_gpu(input_batch, input_lengths, factor, pad_val)
	if best is None:
	pbar.write("No mergeable pairs found; stopping early.")
	break

	best_pair = (best[0], best[1])
	best_freq = best[2]
	if best_freq < 2:
	pbar.write("Best pair frequency is less than 2; stopping early.")
	break

	codes.append((best_pair, next_token))

	# Launch the merge kernel.
	merge_kernel((blocks,), (threads_per_block,),
	(input_batch,
	work_batch,
	input_lengths,
	work_lengths,
	cp.int64(n),
	cp.int64(L),
	cp.int64(best_pair[0]),
	cp.int64(best_pair[1]),
	cp.int64(next_token),
	cp.int64(pad_val)))
	# Swap buffers for double-buffering.
	input_batch, work_batch = work_batch, input_batch
	input_lengths, work_lengths = work_lengths, input_lengths

	next_token += 1
	merge_count += 1
	pbar.update(1)
	pbar.close()

	final_batch = cp.asnumpy(input_batch)
	final_lengths = cp.asnumpy(input_lengths)
	final_data = [final_batch[i, :final_lengths[i]].tolist() for i in range(n)]
	return codes, final_data

	###################################################################################

	fused_merge_kernel_code = r'''
	extern "C" __global__
	void fused_merge_kernel(long* data_in, long* data_out, long* lengths, const long pad_val,
	const long num_rows, const long max_len, const long num_merges, const long* merge_rules) {
	int row = blockIdx.x * blockDim.x + threadIdx.x;
	if (row >= num_rows) return;
	long base = row * max_len;
	long cur_len = lengths[row];
	long* cur = data_in + base;
	long* other = data_out + base;
	// Process each merge rule sequentially.
	for (int m = 0; m < num_merges; m++) {
	long a = merge_rules[3 * m];
	long b = merge_rules[3 * m + 1];
	long new_token = merge_rules[3 * m + 2];
	long out_idx = 0;
	for (int i = 0; i < cur_len; i++) {
	if (i < cur_len - 1 && cur[i] == a && cur[i+1] == b) {
	other[out_idx] = new_token;
	out_idx++;
	i++; // Skip the next token.
	} else {
	other[out_idx] = cur[i];
	out_idx++;
	}
	}
	cur_len = out_idx;
	// Swap pointers for the next merge.
	long* temp = cur;
	cur = other;
	other = temp;
	}
	lengths[row] = cur_len;
	// Pad the remaining positions with pad_val.
	for (int i = cur_len; i < max_len; i++) {
	cur[i] = pad_val;
	}
	// If the final result is not in data_in, copy back.
	if (cur != data_in + base) {
	for (int i = 0; i < cur_len; i++) {
	data_in[base + i] = cur[i];
	}
	}
	}
	'''
	fused_kernel = cp.RawKernel(fused_merge_kernel_code, 'fused_merge_kernel')

	###################################################################################

	def retokenize_train_data_fused_gpu(train_data, codes, pad_val=-1):
	"""
	Retokenize training data using the fully fused GPU kernel.

	The entire training dataset is first packed into GPU memory (using pack_sequences).
	All learned merge rules (provided in 'codes') are applied via a single kernel launch.
	Each GPU thread processes one sequence by applying all merge rules sequentially.

	Returns:
	tokenized_data: list of retokenized sequences.
	"""
	# Pack the data.
	batch, lengths = pack_sequences(train_data, pad_val)
	n, max_len = batch.shape
	# Build a flattened merge_rules array using CuPy.
	if len(codes) > 0:
	merge_rules_list = [[rule[0][0], rule[0][1], rule[1]] for rule in codes]
	merge_rules_gpu = cp.array(merge_rules_list, dtype=cp.int64)
	merge_rules_gpu = merge_rules_gpu.reshape(-1)
	else:
	merge_rules_gpu = cp.empty((0,), dtype=cp.int64)
	num_merges = merge_rules_gpu.shape[0] // 3
	# Preallocate a scratch buffer.
	scratch = cp.empty_like(batch)
	threads_per_block = 128
	blocks = (n + threads_per_block - 1) // threads_per_block
	# Launch the fused kernel.
	fused_kernel((blocks,), (threads_per_block,),
	(batch, scratch, lengths, cp.int64(pad_val),
	cp.int64(n), cp.int64(max_len), cp.int64(num_merges), merge_rules_gpu))
	final_batch = cp.asnumpy(batch)
	final_lengths = cp.asnumpy(lengths)
	tokenized_data = [final_batch[i, :final_lengths[i]].tolist() for i in range(n)]
	return tokenized_data

	###################################################################################

	def bpe_encode(seq, codes):
	"""
	Iteratively encodes a sequence using BPE merge rules provided in a dictionary.

	Args:
	seq (list): A list of tokens (e.g. integers) representing the input sequence.
	codes (dict): A dictionary mapping token pairs (a tuple of two tokens)
	to a merged token. For example:
	{ (1, 2): 100, (100, 3): 101 }

	Returns:
	list: The encoded sequence after applying all possible merges.

	The function repeatedly scans the entire sequence from left to right;
	whenever it finds a contiguous token pair that exists as a key in the
	codes dict, it replaces that pair with the merged token. This pass is
	repeated until no more merges are possible.
	"""

	if type(codes) == list:
	codes = dict(codes)

	encoded_seq = seq.copy() # work on a copy so as not to modify the original
	done = False
	while not done:
	new_seq = []
	i = 0
	changed = False
	while i < len(encoded_seq):
	# If a merge is possible, merge the two tokens.
	if i < len(encoded_seq) - 1 and (encoded_seq[i], encoded_seq[i + 1]) in codes:
	new_seq.append(codes[(encoded_seq[i], encoded_seq[i + 1])])
	i += 2 # Skip the next token as it was merged.
	changed = True
	else:
	new_seq.append(encoded_seq[i])
	i += 1
	# If no merges occurred in this pass, exit the loop.
	if not changed:
	done = True
	encoded_seq = new_seq
	return encoded_seq

	###################################################################################

	def bpe_decode(seq, codes):
	"""
	Decodes a sequence encoded with BPE merge rules defined in a codes dictionary.

	Args:
	seq (list): The encoded sequence (a list of tokens).
	codes (dict): A dictionary mapping token pairs to the merged token, used during encoding.

	Returns:
	list: The fully decoded sequence, with all merged tokens recursively expanded.

	The function constructs a reverse mapping that converts a merged token back into
	its constituent pair. Each token in the sequence is then recursively expanded.
	"""

	if type(codes) == list:
	codes = dict(codes)

	# Build the reverse mapping: key = merged token, value = tuple (original token pair)
	reverse_mapping = {merged: pair for pair, merged in codes.items()}

	def recursive_expand(token):
	# If the token is a merged token, expand it recursively.
	if token in reverse_mapping:
	a, b = reverse_mapping[token]
	return recursive_expand(a) + recursive_expand(b)
	else:
	return [token]

	decoded_seq = []
	for token in seq:
	decoded_seq.extend(recursive_expand(token))
	return decoded_seq

	###################################################################################

	def ensure_triplet(val: Any, name: str = "") -> Tuple[float, float, float]:
	"""
	Ensure the given parameter is returned as a triplet.
	If provided as a scalar, promote it to a triplet.
	"""
	if np.isscalar(val):
	return (float(val), float(val), float(val))
	elif isinstance(val, (list, tuple)) and len(val) == 3:
	return tuple(float(x) for x in val)
	else:
	raise ValueError(f"{name} must be a scalar or a sequence of 3 numbers.")

	###################################################################################

	REP_PENALTY = ensure_triplet(REPETITION_PENALTY, "REPETITION_PENALTY")
	SPIKE_STRENGTH = ensure_triplet(SPIKE_PENALTY_STRENGTH, "SPIKE_PENALTY_STRENGTH")
	SPIKE_SIG = ensure_triplet(SPIKE_SIGMA, "SPIKE_SIGMA")

	###################################################################################

	def sliding_window_view_alternative(a: np.ndarray, window_length: int) -> np.ndarray:
	"""
	Create a sliding-window view (without copying) of an array.
	Expected input shape: (n, L, d) and returns: (n, L - window_length + 1, window_length, d)
	"""
	n, L, d = a.shape
	new_shape = (n, L - window_length + 1, window_length, d)
	new_strides = (a.strides[0], a.strides[1], a.strides[1], a.strides[2])
	return np.lib.stride_tricks.as_strided(a, shape=new_shape, strides=new_strides)

	###################################################################################

	def build_ngram_mapping(data: np.ndarray, memory_len: int) -> Dict[Any, Dict[Any, int]]:
	"""
	Build an n-gram mapping from a context (a sequence of triplets) to candidate triplets with frequencies.
	"""
	n, L, d = data.shape
	window_length = memory_len + 1 # context (memory) + candidate
	windows = sliding_window_view_alternative(data, window_length)
	# windows shape: (n, L - window_length + 1, window_length, d)

	# Split windows into context (first memory_len triplets) and candidates (last triplet)
	contexts = windows[:, :, :memory_len, :] # shape: (n, num_windows, memory_len, d)
	candidates = windows[:, :, memory_len, :] # shape: (n, num_windows, d)

	# Flatten the batch and window dimensions.
	contexts_flat = contexts.reshape(-1, memory_len, d)
	candidates_flat = candidates.reshape(-1, d)

	mapping = defaultdict(lambda: defaultdict(int))
	total_windows = contexts_flat.shape[0]
	for context_arr, candidate_arr in tqdm.tqdm(
	zip(contexts_flat, candidates_flat),
	total=total_windows,
	desc="Building n-gram mapping"):
	context_key = tuple(map(tuple, context_arr)) # use a tuple of triplets as the key
	candidate_val = tuple(candidate_arr)
	mapping[context_key][candidate_val] += 1

	return {context: dict(candidates) for context, candidates in mapping.items()}

	###################################################################################

	def precompute_mapping_lookup(mapping: Dict[Any, Dict[Any, int]]) -> Dict[Any, Tuple[Tuple[Any, ...], np.ndarray]]:
	"""
	Converts the mapping into a lookup table: context -> (tuple(candidates), frequencies_array).
	"""
	mapping_lookup = {}
	for context, candidate_dict in tqdm.tqdm(mapping.items(), desc="Precomputing lookup"):
	candidates = tuple(candidate_dict.keys())
	frequencies = np.array(list(candidate_dict.values()), dtype=np.float64)
	mapping_lookup[context] = (candidates, frequencies)
	return mapping_lookup

	###################################################################################

	def build_training_sequences_set(data: np.ndarray) -> set:
	"""
	Build a set of training sequences (each as a tuple of triplets) for uniqueness checking.
	"""
	return {tuple(map(tuple, seq)) for seq in data}

	###################################################################################

	def generate_sequence_optimized(mapping_lookup: Dict[Any, Tuple[Tuple[Any, ...], np.ndarray]],
	training_set: set,
	memory_len: int,
	sequence_length: int = 24,
	max_attempts: int = 1000) -> Optional[Tuple[Tuple[float, float, float], ...]]:
	"""
	Autoregressively generate a new, unique sequence using the precomputed mapping lookup.
	The invariant maintained is: the second element of one triplet is never greater than the first element
	of the following triplet.

	Two dynamic adjustments are applied for candidate selection:

	1. Dynamic Repetition Penalty:
	For each candidate, count the occurrences of each element in the generated sequence.
	Rather than a fixed penalty, this repetition penalty scales with the ratio
	(current_length / sequence_length). In log-space, it subtracts:
	(current_length / sequence_length) * sum_k(count[k] * log(REP_PENALTY[k])
	2. Dynamic Spike (Variance) Penalty:
	For each candidate, compute the squared difference from the running average for each element.
	Use a dynamic sigma that is the maximum between the running standard deviation and the baseline.
	The penalty term for each element is:
	SPIKE_STRENGTH[k] * ((cand[k] - running_avg[k])^2) / (2 * dynamic_sigma[k]^2)
	The overall spike penalty is the sum of the three terms and is subtracted from the candidate’s log frequency.

	The resulting candidate log score is computed as:
	log(candidate_frequency) - rep_penalty_component - spike_penalty_component
	A numerical stable softmax is then applied over these scores to determine the probability for drawing a candidate.

	If no candidate passing the invariant is found, the attempt is aborted.

	Parameters:
	mapping_lookup: Precomputed lookup mapping (context → (candidates, frequencies)).
	training_set: Set of training sequences to ensure uniqueness.
	memory_len: Number of triplets used as context.
	sequence_length: Desired length of the generated sequence.
	max_attempts: Maximum number of generation attempts.

	Returns:
	A new unique sequence (tuple of triplets) that respects the invariant, or None if not found.
	"""
	mapping_keys = list(mapping_lookup.keys())
	num_keys = len(mapping_keys)

	for attempt in range(max_attempts):
	# Select a seed context randomly (from training data so that the invariant holds).
	seed = mapping_keys[np.random.randint(0, num_keys)]
	generated_sequence: List[Tuple[float, float, float]] = list(seed)
	valid_generation = True

	while len(generated_sequence) < sequence_length:
	last_triplet = generated_sequence[-1]
	current_context = tuple(generated_sequence[-memory_len:]) # context as tuple of triplets
	candidate_found = False

	if current_context in mapping_lookup:
	candidates, frequencies = mapping_lookup[current_context]
	# Filter candidates by invariant:
	# Candidate's first element must be >= last triplet's second element.
	valid_indices = [i for i, cand in enumerate(candidates) if cand[0] >= last_triplet[1]]
	if valid_indices:
	# Filter candidates and their associated frequencies.
	filtered_freqs = frequencies[valid_indices]
	filtered_candidates = [candidates[i] for i in valid_indices]

	# Convert candidates into a NumPy array for vectorized operations.
	candidate_array = np.array(filtered_candidates, dtype=np.float64) # shape: (n_candidates, 3)

	# Prepare generation history as array.
	generated_array = np.array(generated_sequence, dtype=np.float64) # shape: (T, 3)
	current_length = generated_array.shape[0]

	# Running average and standard deviation for dynamic spike adjustment.
	running_avg = np.mean(generated_array, axis=0) # shape: (3,)
	running_std = np.std(generated_array, axis=0) # shape: (3,)
	# Dynamic sigma: ensure a minimum sigma value.
	dynamic_sigma = np.maximum(running_std, np.array(SPIKE_SIG))

	# --- Compute Repetition Penalty ---
	# For each candidate, count the number of occurrences for each element along the corresponding column.
	rep_counts = np.array([
	[np.sum(generated_array[:, k] == candidate_array[i, k]) for k in range(3)]
	for i in range(candidate_array.shape[0])
	]) # shape: (n_candidates, 3)
	# The repetition penalty in log-space.
	rep_penalty_term = np.sum(rep_counts * np.log(np.array(REP_PENALTY)) *
	(current_length / sequence_length), axis=1) # shape: (n_candidates,)

	# --- Compute Spike (Variance) Penalty ---
	# Compute the difference per candidate from the running average.
	diff = candidate_array - running_avg # shape: (n_candidates, 3)
	spike_penalty_term = np.sum(np.array(SPIKE_STRENGTH) * (diff*2) / (2 (dynamic_sigma**2)),
	axis=1) # shape: (n_candidates,)

	# --- Compute Candidate Log-Scores ---
	# Use np.log on frequencies (they are positive by construction).
	log_freq = np.log(filtered_freqs)
	log_scores = log_freq - rep_penalty_term - spike_penalty_term

	# --- Softmax in Log-space (stable computation) ---
	max_log = np.max(log_scores)
	exp_scores = np.exp(log_scores - max_log)
	probabilities = exp_scores / np.sum(exp_scores)

	# Choose the next candidate using advanced probabilities.
	chosen_idx = np.random.choice(len(filtered_candidates), p=probabilities)
	next_triplet = filtered_candidates[chosen_idx]
	candidate_found = True

	if not candidate_found:
	# Abort this generation attempt if no valid candidate is available.
	valid_generation = False
	break

	generated_sequence.append(next_triplet)

	# Ensure the final sequence meets the invariant and is unique.
	if valid_generation and len(generated_sequence) == sequence_length:
	new_sequence = tuple(generated_sequence)
	invariant_ok = all(a[1] <= b[0] for a, b in zip(new_sequence, new_sequence[1:]))
	if invariant_ok and new_sequence not in training_set:
	return new_sequence

	return None

	###################################################################################

	def analyze_generated_sequence(sequence: tuple, mapping_lookup: dict, memory_len: int) -> tuple:
	"""
	Analyze the generated sequence and return several useful statistics
	as both a dictionary and as a nicely formatted string report.

	Statistics Computed:
	- unigram_diversity: Ratio of unique triplets to total triplets.
	- repetition_rate: Fraction of repeated triplets.
	- bigram_diversity: Ratio of unique consecutive pairs to total pairs.
	- max_consecutive_repetitions: Maximum number of identical consecutive triplets.
	- avg_candidate_probability (overfit rate): For the transitions (using a sliding window of size
	MEMORY_LEN as context followed by candidate), the average probability of the chosen candidate
	as per the training mapping.

	Additional Analytics:
	- element_stats: For each element (index 0, 1, 2) in a triplet, includes:
	* mean, standard deviation, minimum, maximum, and average consecutive absolute difference.
	- avg_transition_entropy: The average entropy of the candidate distributions (from mapping_lookup)
	for each transition context.
	- context_coverage: The fraction of transitions (based on context of length MEMORY_LEN) that are found
	in the mapping_lookup.

	Parameters:
	sequence: Generated sequence (tuple of triplets).
	mapping_lookup: Precomputed mapping lookup.
	memory_len: The context length used.

	Returns:
	A tuple containing:
	(stats_dict, stats_report_string)
	"""
	stats = {}
	seq_len = len(sequence)

	# --- Basic Statistics ---

	# Unigram.
	unique_triplets = len(set(sequence))
	stats["unigram_diversity"] = unique_triplets / seq_len
	stats["repetition_rate"] = 1 - (unique_triplets / seq_len)

	# Bigram.
	bigrams = [(sequence[i], sequence[i+1]) for i in range(seq_len - 1)]
	unique_bigrams = len(set(bigrams))
	stats["bigram_diversity"] = unique_bigrams / (seq_len - 1)

	# Maximum consecutive repetitions.
	max_consecutive = 1
	current_consecutive = 1
	for i in range(1, seq_len):
	if sequence[i] == sequence[i-1]:
	current_consecutive += 1
	if current_consecutive > max_consecutive:
	max_consecutive = current_consecutive
	else:
	current_consecutive = 1
	stats["max_consecutive_repetitions"] = max_consecutive

	# Avg Candidate Probability (Overfit Rate)
	overfit_probs = []
	for i in range(memory_len, seq_len):
	context = tuple(sequence[i - memory_len: i])
	candidate = sequence[i]
	if context in mapping_lookup:
	candidates, frequencies = mapping_lookup[context]
	total_freq = np.sum(frequencies)
	try:
	idx = candidates.index(candidate)
	cand_prob = frequencies[idx] / total_freq
	overfit_probs.append(cand_prob)
	except ValueError:
	pass
	stats["avg_candidate_probability"] = np.mean(overfit_probs) if overfit_probs else None

	# --- Additional Analytics ---

	# 1. Element-Level Statistics.
	seq_arr = np.array(sequence) # shape: (seq_len, 3)
	element_stats = {}
	for dim in range(seq_arr.shape[1]):
	values = seq_arr[:, dim]
	mean_val = np.mean(values)
	std_val = np.std(values)
	min_val = np.min(values)
	max_val = np.max(values)
	# Calculate average absolute difference between consecutive values:
	diffs = np.abs(np.diff(values))
	avg_diff = np.mean(diffs) if diffs.size > 0 else 0
	element_stats[f"element_{dim}"] = {
	"mean": mean_val,
	"std": std_val,
	"min": min_val,
	"max": max_val,
	"avg_consecutive_diff": avg_diff,
	}
	stats["element_stats"] = element_stats

	# 2. Transition Entropy:
	entropies = []
	valid_transitions = 0
	for i in range(memory_len, seq_len):
	context = tuple(sequence[i - memory_len: i])
	if context in mapping_lookup:
	candidates, freqs = mapping_lookup[context]
	total_freq = np.sum(freqs)
	if total_freq > 0:
	probs = freqs / total_freq
	# Add a very small constant to avoid log(0)
	epsilon = 1e-10
	entropy = -np.sum(probs * np.log(probs + epsilon))
	entropies.append(entropy)
	valid_transitions += 1
	stats["avg_transition_entropy"] = np.mean(entropies) if entropies else None

	# 3. Context Coverage:
	total_transitions = seq_len - memory_len
	stats["context_coverage"] = (valid_transitions / total_transitions) if total_transitions > 0 else None

	# --- Build a Pretty Report String ---
	sep_line = "-" * 60
	lines = []
	lines.append(sep_line)
	lines.append("Sequence Analytics Report:")
	lines.append(sep_line)
	lines.append("Overall Statistics:")
	lines.append(f" Unigram Diversity : {stats['unigram_diversity']:.3f}")
	lines.append(f" Repetition Rate : {stats['repetition_rate']:.3f}")
	lines.append(f" Bigram Diversity : {stats['bigram_diversity']:.3f}")
	lines.append(f" Max Consecutive Repetitions: {stats['max_consecutive_repetitions']}")
	cand_prob = stats["avg_candidate_probability"]
	cand_prob_str = f"{cand_prob:.3f}" if cand_prob is not None else "N/A"
	lines.append(f" Avg Candidate Probability : {cand_prob_str}")
	lines.append("")

	lines.append("Element-Level Statistics:")
	for dim in sorted(element_stats.keys()):
	ed = element_stats[dim]
	lines.append(f" {dim.capitalize()}:")
	lines.append(f" Mean : {ed['mean']:.3f}")
	lines.append(f" Std Dev : {ed['std']:.3f}")
	lines.append(f" Min : {ed['min']:.3f}")
	lines.append(f" Max : {ed['max']:.3f}")
	lines.append(f" Avg Consecutive Diff : {ed['avg_consecutive_diff']:.3f}")
	lines.append("")

	lines.append("Transition Statistics:")
	avg_entropy = stats["avg_transition_entropy"]
	entropy_str = f"{avg_entropy:.3f}" if avg_entropy is not None else "N/A"
	lines.append(f" Average Transition Entropy: {entropy_str}")
	cc = stats["context_coverage"]
	cc_str = f"{cc:.3f}" if cc is not None else "N/A"
	lines.append(f" Context Coverage : {cc_str}")
	lines.append(sep_line)

	stats_report = "\n".join(lines)

	# Return both the dictionary and the formatted report string.
	return stats, stats_report

	###################################################################################

	def autoregressive_generate(start_seq, mel_tones, trg_array, trg_matches_array, num_new_tokens, chunk_len=5):

	# Convert sequences to NumPy arrays.
	current_seq = np.array(start_seq, dtype=int) # Shape: (num_tokens, token_dim)
	trg_array = np.array(trg_array, dtype=int) # Shape: (num_candidates, 2, token_dim)
	start_len = len(start_seq)

	midx = start_len-1

	# Deque for sliding memory of candidate pairs (immutable tuples).
	recent_candidates = deque(maxlen=5)

	while (len(current_seq) - start_len) < num_new_tokens:

	midx += 1

	# Get the last two tokens as context.
	context = current_seq[-(chunk_len-1):] # Shape: (2, token_dim)

	sli = 0
	msize = 0

	ctx = context[:, :-1].reshape(1, -1)
	trg_mat_arr = trg_matches_array

	while msize < 8:

	print('=== Slice', sli)

	# Compare context with candidates in trg_array.
	match_mask = np.all(ctx == trg_mat_arr, axis=1)
	match_indices = np.where(match_mask)[0]

	msize = match_indices.size

	if msize < 8:
	sli += 1
	ctx = context[:, :-1].reshape(1, -1)[:, sli:]
	trg_mat_arr = trg_matches_array[:, :-sli]

	if match_indices.size == 0:
	if len(current_seq) > start_len:

	#tones_chord = sorted([mel_tones[midx], (mel_tones[midx]+7) % 12])
	tones_chord = sorted([mel_tones[midx]])
	new_tuple = [[mel_tones[midx], TMIDIX.ALL_CHORDS_SORTED.index(tones_chord)]]
	current_seq = np.concatenate((current_seq, new_tuple), axis=0)
	print('Subbed', midx)
	continue

	# From the matching candidates, filter out those whose candidate pair is in recent memory.
	available_candidates = []
	cseen = []
	for idx in match_indices:

	if idx not in recent_candidates:
	# Convert candidate pair to an immutable tuple
	candidate_pair = tuple(trg_array[idx].tolist())
	if candidate_pair[-1][0] == mel_tones[midx] and candidate_pair[-1][1] not in cseen:
	available_candidates.append((idx, candidate_pair))
	cseen.append(candidate_pair[-1][1])

	# If all candidates have recently been used, backtrack.
	if len(available_candidates) < 3:
	if len(current_seq) >= start_len:
	#tones_chord = sorted([mel_tones[midx], (mel_tones[midx]+7) % 12])
	tones_chord = sorted([mel_tones[midx]])
	new_tuple = [[mel_tones[midx], TMIDIX.ALL_CHORDS_SORTED.index(tones_chord)]]
	current_seq = np.concatenate((current_seq, new_tuple), axis=0)
	#rev_val = random.choice([-1, -2])
	#current_seq = current_seq[:rev_val]
	#print(midx)
	#midx = len(current_seq)
	#print('Reverted', midx, len(current_seq))
	continue

	else:
	print(len(available_candidates))
	# Choose one available candidate at random.
	chosen_idx, chosen_pair = available_candidates[np.random.choice(len(available_candidates))]
	new_token = trg_array[chosen_idx][-1] # The second token of the candidate pair.


	# Append the new token to the sequence.
	current_seq = np.concatenate((current_seq, new_token[None, :]), axis=0)

	recent_candidates.append(chosen_idx)

	print('Gen seq len', len(current_seq))

	return current_seq

	###################################################################################

	def minkowski_distance_vector_to_matrix(x: cp.ndarray, X: cp.ndarray, p: float = 3) -> cp.ndarray:

	"""
	Computes the Minkowski distance between a 1D CuPy array 'x' and each row of a 2D CuPy array 'X'.

	Parameters:
	x (cp.ndarray): A 1D array with shape (n_features,) representing a single vector.
	X (cp.ndarray): A 2D array with shape (n_samples, n_features) where each row is a vector.
	p (float): The order of the Minkowski distance.
	For instance:
	- p=1 yields the Manhattan distance,
	- p=2 yields the Euclidean distance,
	- p=3 yields the Minkowski distance and will use the cube-root implementation,
	- p=∞ (or cp.inf) gives the Chebyshev distance.

	Returns:
	cp.ndarray: A 1D array of length n_samples containing the Minkowski distance between 'x'
	and the corresponding row in 'X'.
	"""

	# Compute the element-wise absolute differences between x and every row in X.
	# Broadcasting x over the rows of X results in an array of shape (n_samples, n_features).
	diff = cp.abs(X - x)

	if p == float('inf') or p == cp.inf:
	# For the Chebyshev distance, use the maximum absolute difference along the feature axis.
	distances = cp.max(diff, axis=1)
	elif p == 3:
	# Instead of using the generic power operation (sum(diff3) (1/3)),
	# we use cp.cbrt for cube-root calculation when p is exactly 3.
	distances = cp.cbrt(cp.sum(diff ** 3, axis=1))
	else:
	# For general Minkowski distance with finite p,
	# compute the p-th power of differences, sum them, then take the p-th root.
	distances = cp.sum(diff p, axis=1) (1.0 / p)

	return distances

	###################################################################################

	def pairwise_minkowski_distance(X: cp.ndarray, p: float = 2) -> cp.ndarray:

	"""
	Computes pairwise Minkowski distances for a 2D CuPy array.

	Parameters:
	X (cp.ndarray): A 2D array of shape (n_samples, n_features), where each row represents a vector.
	p (float): The order of the Minkowski distance.
	For example:
	- p=1 is the Manhattan distance,
	- p=2 is the Euclidean distance,
	- p=∞ (e.g., float('inf') or cp.inf) is the Chebyshev distance.

	Returns:
	cp.ndarray: A 2D array of shape (n_samples, n_samples) containing the pairwise Minkowski distances.
	"""

	# Use broadcasting to compute the absolute difference between every pair of vectors.
	# The result of X[:, None, :] - X[None, :, :] will have shape (n_samples, n_samples, n_features).
	if p == float('inf') or p == cp.inf:
	# For the Chebyshev distance, take the maximum absolute difference along the feature axis.
	return cp.max(cp.abs(X[:, None, :] - X[None, :, :]), axis=-1)
	else:
	# Raise the absolute differences to the power p.
	diff_powered = cp.abs(X[:, None, :] - X[None, :, :]) ** p
	# Sum over the features for each pair (i, j) and then take the p-th root.
	distances = cp.sum(diff_powered, axis=-1) ** (1.0 / p)

	return distances

	###################################################################################

	def pairwise_cosine_similarity(X: cp.ndarray, eps: float = 1e-10) -> cp.ndarray:

	"""
	Computes the pairwise cosine similarity for a 2D CuPy array.

	Parameters:
	X (cp.ndarray): A 2D array of shape (n_samples, n_features) where each row represents a vector.
	eps (float): A small constant added to the denominator to prevent division by zero.

	Returns:
	cp.ndarray: A 2D array of shape (n_samples, n_samples) containing the pairwise cosine similarities.
	"""

	# Compute the dot product between every pair of rows.
	# This results in a matrix where element (i, j) is the dot product of X[i] and X[j].
	dot_product = cp.dot(X, X.T)

	# Compute the L2 norm (Euclidean norm) for each row vector.
	norms = cp.linalg.norm(X, axis=1)

	# Compute the outer product of the norms to form the denominator.
	# The element (i, j) in this matrix is norms[i] * norms[j].
	norm_matrix = cp.outer(norms, norms)

	# Compute the cosine similarity matrix.
	# Adding a small epsilon (eps) to the denominator prevents division by zero.
	cosine_similarity = dot_product / (norm_matrix + eps)

	return cosine_similarity

	###################################################################################

	def cosine_similarities(src_array, trg_array):

	"""
	Computes cosine similarities between 1D src array and 2D trg array
	"""

	src_norm = cp.linalg.norm(src_array)

	trg_norms = cp.linalg.norm(trg_array, axis=1)

	dot_products = cp.dot(trg_array, src_array)

	cosine_sims = dot_products / (src_norm * trg_norms + 1e-10)

	return cosine_sims

	###################################################################################

	def embeddings_topk_cosine_neighbors(embeddings,
	k=1,
	row_batch=4096,
	col_batch=4096
	):

	"""
	For each embedding, find the indices and similarities of its top-k neighbors,
	excluding itself, sorted by descending similarity.

	Args:
	embeddings (cp.ndarray): shape (N, D), float32 on GPU.
	k (int): how many neighbors to return (must be < N).
	row_batch (int): number of rows to process at once.
	col_batch (int): number of columns to process at once.

	Returns:
	top_idx (cp.ndarray): shape (N, k), int32 indices of nearest neighbors.
	top_sim (cp.ndarray): shape (N, k), float32 cosine similarities.
	Each row is sorted descending.
	"""

	# normalize embeddings to unit length
	norms = cp.linalg.norm(embeddings, axis=1, keepdims=True)
	embeddings /= norms

	N, D = embeddings.shape
	if not (1 <= k < N):
	raise ValueError(f"k must satisfy 1 ≤ k < N; got N={N}, k={k}")

	# placeholders for top-k
	top_sim = cp.full((N, k), -cp.inf, dtype=cp.float32)
	top_idx = cp.full((N, k), -1, dtype=cp.int32)

	for i in tqdm.tqdm(range(0, N, row_batch)):
	i_end = min(i + row_batch, N)
	rows = embeddings[i:i_end] # (rb, D)
	rb = i_end - i

	# per-block buffers
	best_s = top_sim[i:i_end] # (rb, k)
	best_i = top_idx[i:i_end] # (rb, k)
	row_ids = cp.arange(i, i_end) # (rb,)

	for j in range(0, N, col_batch):
	j_end = min(j + col_batch, N)
	cols = embeddings[j:j_end] # (cb, D)
	sims = rows.dot(cols.T) # (rb, cb)

	# mask out self-similarities
	# find rows whose global index ∈ [j, j_end)
	mask = (row_ids >= j) & (row_ids < j_end)
	if mask.any():
	local_rows = cp.where(mask)[0]
	local_cols = row_ids[mask] - j
	sims[local_rows, local_cols] = -cp.inf

	# get top-k within this block
	# argpartition to grab k largest in each row
	part = cp.argpartition(sims, -k, axis=1)[:, -k:] # (rb, k)
	blk_s = sims[cp.arange(rb)[:, None], part] # (rb, k)
	blk_i = part + j # (rb, k)

	# merge with running best
	cat_s = cp.concatenate([best_s, blk_s], axis=1) # (rb, 2k)
	cat_i = cp.concatenate([best_i, blk_i], axis=1)

	# select new top-k from the 2k candidates
	part2 = cp.argpartition(cat_s, -k, axis=1)[:, -k:]
	best_s = cat_s[cp.arange(rb)[:, None], part2]
	best_i = cat_i[cp.arange(rb)[:, None], part2]

	# write back
	top_sim[i:i_end] = best_s
	top_idx[i:i_end] = best_i

	# final sort per row so sims descend
	if k > 1:
	order = cp.argsort(-top_sim, axis=1)
	top_sim = cp.take_along_axis(top_sim, order, axis=1)
	top_idx = cp.take_along_axis(top_idx, order, axis=1)

	# if k==1, squeeze dims
	if k == 1:
	return top_idx.ravel(), top_sim.ravel()

	return top_idx, top_sim

	###################################################################################

	print('Module is loaded!')
	print('Enjoy! :)')
	print('=' * 70)

	###################################################################################
	# This is the end of the TCUPY Python module
	###################################################################################