|
|
import os
|
|
|
import pickle
|
|
|
import random
|
|
|
from collections import Counter
|
|
|
|
|
|
def cut_string_into_pairs(text_corpus):
    """Split *text_corpus* into consecutive, non-overlapping two-character chunks.

    When the corpus has an odd length, the final lone character is padded
    with an underscore so that every returned chunk has length two.

    Args:
        text_corpus: The text to split.

    Returns:
        A list of two-character strings, in order of appearance.
    """
    total = len(text_corpus)
    # Stop one short of the end so an odd trailing character is not sliced
    # on its own; it is handled separately below.
    chunks = [text_corpus[start:start + 2] for start in range(0, total - 1, 2)]
    if total % 2:
        chunks.append(text_corpus[-1] + '_')
    return chunks
|
|
|
|
|
|
def get_symbols(text_corpus, max_characters=256):
    """Build a symbol vocabulary of single characters plus frequent pairs.

    Every distinct character of *text_corpus* is always included. Whatever
    room is left under *max_characters* is filled with the most frequent
    non-overlapping character pairs, as produced by
    ``cut_string_into_pairs``.

    Args:
        text_corpus: The text from which to derive the vocabulary.
        max_characters: Upper bound on the vocabulary size. If the corpus
            already has more distinct characters than this, no pairs are
            added (the single characters are still all returned).

    Returns:
        A list with the distinct single characters (in sorted order)
        followed by the selected pairs (most frequent first).
    """
    # Sort so the vocabulary order is reproducible across interpreter runs;
    # plain set iteration order varies with string hash randomisation.
    single_characters = sorted(set(text_corpus))

    # Honour the caller-supplied limit, and never ask for a negative count.
    pair_budget = max(0, max_characters - len(single_characters))

    pair_counts = Counter(cut_string_into_pairs(text_corpus))
    pairs = [pair for pair, _ in pair_counts.most_common(pair_budget)]

    return single_characters + pairs
|
|
|
|
|
|
def substitution_cipher(symbols, random_seed):
    """Assign each symbol a distinct integer code via a seeded permutation.

    Args:
        symbols: Sequence of hashable symbols to encode.
        random_seed: Seed passed to ``random.seed``.

    Returns:
        A dict mapping each symbol to an integer in
        ``range(len(symbols))``; the assignment is a random permutation
        of those integers determined by *random_seed*.
    """
    # This reseeds the module-level generator of ``random``.
    random.seed(random_seed)

    symbol_count = len(symbols)
    # Sampling the whole population yields a shuffled copy of 0..n-1.
    shuffled_codes = random.sample(range(symbol_count), symbol_count)

    return dict(zip(symbols, shuffled_codes))
|
|
|
|
|
|
def encode_text_with_indices(rule, symbols, text, unknown_code=256):
    """Greedily encode *text* with a substitution rule.

    At each position a two-character symbol is preferred when it exists in
    *rule*; otherwise the single character is used. Characters that are
    not in *rule* are represented by *unknown_code* in both outputs.

    Args:
        rule: Mapping from symbol to its integer encoding.
        symbols: Sequence of symbols; the position of a symbol in this
            sequence is what gets recorded in the returned indices.
        text: The text to encode.
        unknown_code: Value emitted (in both returned lists) for a
            character that is not present in *rule*. Defaults to 256.
            Pick a value outside the range of real codes when the
            vocabulary holds more than 256 symbols.

    Returns:
        A tuple ``(encoded_text, indices)`` of equal-length lists:
        the encodings taken from *rule*, and the positions of the matched
        symbols within *symbols*.

    Raises:
        KeyError: If a symbol is found in *rule* but is missing from
            *symbols*.
    """
    # Position of every symbol inside `symbols`, for constant-time lookup.
    index_dict = {symbol: position for position, symbol in enumerate(symbols)}

    encoded_text = []
    indices = []
    text_length = len(text)
    i = 0

    while i < text_length:
        # Try the two-character symbol first (greedy longest match).
        if i + 1 < text_length:
            pair = text[i] + text[i + 1]
            if pair in rule:
                encoded_text.append(rule[pair])
                indices.append(index_dict[pair])
                i += 2
                continue

        # Fall back to the single character, or the unknown marker.
        char = text[i]
        if char in rule:
            encoded_text.append(rule[char])
            indices.append(index_dict[char])
        else:
            encoded_text.append(unknown_code)
            indices.append(unknown_code)
        i += 1

    return encoded_text, indices
|
|
|
|
|
|
|
|
|
def load_or_save_symbols(symbols, pickle_file_path="symbols.pkl"):
    """Return the cached symbols from disk, creating the cache if absent.

    If *pickle_file_path* exists, its unpickled content is returned and
    *symbols* is ignored. Otherwise *symbols* is pickled to that path and
    returned unchanged.

    Args:
        symbols: The symbols to persist when no cache file exists yet.
        pickle_file_path: Location of the pickle cache file.

    Returns:
        The symbols loaded from the cache file, or *symbols* itself when
        the cache had to be created.
    """
    # First run: nothing cached yet, so persist what we were given.
    if not os.path.exists(pickle_file_path):
        print("Pickle file not found. Saving symbols...")
        with open(pickle_file_path, 'wb') as out_file:
            pickle.dump(symbols, out_file)
        return symbols

    with open(pickle_file_path, 'rb') as in_file:
        print("Loading symbols from pickle file...")
        # NOTE(review): unpickling can execute arbitrary code; only point
        # this at cache files from a trusted source.
        cached_symbols = pickle.load(in_file)
    return cached_symbols
|
|
|
|