Koyd111
/

testy

Model card Files Files and versions

testy / preprocess_dataset.py

Koyd111's picture

Upload 8 files

32b6996 verified about 1 year ago

history blame contribute delete

1.93 kB

	from cipher_8bit import *
	from french_dataset import get_full_dataset

	def get_frequency_ranks(encodings, symbols, sequence_len):
	freq_ranks_dict = [0] * len(symbols)
	encodings = encodings[:sequence_len]
	for encoding in encodings:
	freq_ranks_dict[encoding] += 1
	freq_ranks = [0] * (sequence_len)

	for i in range(len(encodings)):
	freq_ranks[i] = freq_ranks_dict[encodings[i]]
	return freq_ranks

	def get_proximity_array(encodings, sequence_len):
	distances = [0] * (sequence_len)
	encodings = encodings[:sequence_len]
	for i, encoding in enumerate(encodings):
	try:
	last_idx = encodings.index(encoding, 0, i)
	distances[i] = (i - last_idx)
	except ValueError:
	# If the encoding is not found in the indices, set the distance to 0
	distances[i] = 0
	return distances

	def preprocess_text(sequence_len=256):
	full_text = get_full_dataset()
	symbols = get_symbols(full_text, 256)
	symbols = load_or_save_symbols(symbols)
	substitution_rule = substitution_cipher(symbols, 1337)

	i = 0
	raw_length = sequence_len * 2 # Overshoot so when it encodes it takes atleast sequence_len
	processed_data = []
	while i * raw_length < len(full_text) - raw_length:
	i += 1
	sample_text = full_text[(i - 1) * raw_length: i * raw_length - 1]
	encodings_array, indices = encode_text_with_indices(substitution_rule, symbols, sample_text)
	if len(encodings_array) > sequence_len: encodings_array = encodings_array[:sequence_len]
	if len(indices) > sequence_len: indices = indices[:sequence_len]

	ranks = get_frequency_ranks(encodings_array, symbols, sequence_len)
	distances = get_proximity_array(encodings_array, sequence_len)
	processed_data.append([encodings_array, distances, indices])
	return processed_data