# File size: 1,932 bytes (revision 32b6996)
from cipher_8bit import *
from french_dataset import get_full_dataset
def get_frequency_ranks(encodings, symbols, sequence_len):
    """Return, for each position, how often its encoding occurs in the window.

    Only the first ``sequence_len`` encodings are considered. Despite the
    name, the values are raw occurrence counts, not sorted ranks.

    Args:
        encodings: Sequence of integer encodings; each one is used directly
            as an index into a table with ``len(symbols)`` slots.
        symbols: Symbol table; only its length is used here.
        sequence_len: Length of the returned list.

    Returns:
        A list of ``sequence_len`` integers. Entry ``i`` is the number of
        times ``encodings[i]`` appears within the window; positions past the
        end of ``encodings`` are zero-padded.

    Raises:
        IndexError: If an encoding is not a valid index for ``symbols``.
    """
    window = encodings[:sequence_len]

    # One counter slot per symbol, addressed by the encoding value itself.
    occurrences = [0] * len(symbols)
    for code in window:
        occurrences[code] += 1

    counts = [occurrences[code] for code in window]
    # Zero-pad so short inputs still yield exactly sequence_len entries.
    counts.extend([0] * (sequence_len - len(window)))
    return counts
def get_proximity_array(encodings, sequence_len):
    """Return, for each position, the distance back to its symbol's first use.

    Only the first ``sequence_len`` encodings are considered. The first time
    an encoding appears its distance is 0; every later appearance gets
    ``position - position_of_first_appearance``.

    NOTE(review): the distance is measured from the *first* occurrence in the
    window, not from the most recent one. Confirm this is the intended
    feature before changing it, since it alters the generated data.

    Args:
        encodings: Sequence of hashable encodings (integers in practice).
        sequence_len: Length of the returned list.

    Returns:
        A list of ``sequence_len`` integers; positions past the end of
        ``encodings`` are zero-padded.
    """
    window = encodings[:sequence_len]
    distances = [0] * sequence_len

    # Track where each encoding first appeared so the whole window is handled
    # in a single pass instead of re-scanning the prefix for every position.
    first_seen = {}
    for pos, code in enumerate(window):
        # setdefault returns `pos` itself on first sight, giving distance 0.
        distances[pos] = pos - first_seen.setdefault(code, pos)
    return distances
def preprocess_text(sequence_len=256):
    """Cut the dataset into fixed-size windows and encode each one.

    The full text is walked in consecutive windows of ``2 * sequence_len``
    characters. Each window is encoded with the substitution cipher, the
    encoded output is truncated to at most ``sequence_len`` items, and the
    proximity array is computed for it.

    Args:
        sequence_len: Maximum number of encoded items kept per sample.
            Must be positive.

    Returns:
        A list with one ``[encodings, distances, indices]`` entry per window,
        where ``distances`` is the output of ``get_proximity_array`` and is
        always ``sequence_len`` long.

    Raises:
        ValueError: If ``sequence_len`` is not positive; the window size
            would be zero or negative and the scan would never terminate.
    """
    if sequence_len <= 0:
        raise ValueError(f"sequence_len must be positive, got {sequence_len}")

    full_text = get_full_dataset()
    # NOTE(review): 256 is presumably the symbol-table size and 1337 the
    # cipher seed -- confirm against cipher_8bit.
    symbols = get_symbols(full_text, 256)
    symbols = load_or_save_symbols(symbols)
    substitution_rule = substitution_cipher(symbols, 1337)

    # Overshoot the raw window so that encoding it yields at least
    # sequence_len items.
    raw_length = sequence_len * 2

    processed_data = []
    for start in range(0, len(full_text) - raw_length, raw_length):
        # NOTE(review): the slice stops one character short of the window
        # end, and any window starting at or after len(full_text) - raw_length
        # is skipped. Both are kept as-is so the generated data is unchanged.
        sample_text = full_text[start:start + raw_length - 1]
        encodings_array, indices = encode_text_with_indices(
            substitution_rule, symbols, sample_text
        )

        # Slicing is a no-op when the sequence is already short enough.
        encodings_array = encodings_array[:sequence_len]
        indices = indices[:sequence_len]

        # Frequency counts (get_frequency_ranks) are not part of the output,
        # so they are not computed here.
        distances = get_proximity_array(encodings_array, sequence_len)
        processed_data.append([encodings_array, distances, indices])
    return processed_data
|