# testy / invariants.py
# Koyd111's picture
# Upload 8 files
# 32b6996 verified
import json
import os
import pickle
from collections import Counter

from cipher_8bit import *
from french_dataset import get_full_dataset
def get_pattern_ranks(pattern_frequency_dict):
    """Rank pattern keys by descending frequency.

    Args:
        pattern_frequency_dict: Mapping of pattern key -> frequency count.

    Returns:
        dict: Pattern key -> 1-based rank, where rank 1 is the most frequent
        key. Keys with equal frequency keep the relative order they have in
        the input mapping, because the sort is stable.
    """
    most_frequent_first = sorted(
        pattern_frequency_dict,
        key=pattern_frequency_dict.get,
        reverse=True,
    )
    return {
        pattern: position
        for position, pattern in enumerate(most_frequent_first, start=1)
    }
def unique_pattern_identifiers(symbol_index_sequences, save=False, name="data"):
    """Count repeating patterns across sequences and rank them by frequency.

    Each sequence is scanned with ``find_patterns_and_indices`` (with
    ``remove_subsets=False``). A pattern is keyed by its symbols joined with
    "-", and the number of start indices found for it is accumulated over all
    sequences.

    Args:
        symbol_index_sequences: Iterable of sequences to scan.
        save: When true, ``<name>.json`` is used as a cache. If the file
            already exists, its stored frequencies are ranked and returned
            and ``symbol_index_sequences`` is not looked at; otherwise the
            freshly computed frequencies are written to it.
        name: Cache file name without the ".json" suffix.

    Returns:
        dict: Pattern key -> 1-based frequency rank, as built by
        ``get_pattern_ranks``.
    """
    cache_path = name + ".json"
    if os.path.exists(cache_path) and save:
        # Reuse the frequencies stored by an earlier run.
        with open(cache_path, "r") as cache_file:
            return get_pattern_ranks(json.load(cache_file))

    frequencies = {}
    for sequence in symbol_index_sequences:
        for found in find_patterns_and_indices(sequence, remove_subsets=False):
            key = "-".join(str(symbol) for symbol in found[0])
            frequencies[key] = frequencies.get(key, 0) + len(found[1])

    if save:
        with open(cache_path, "w") as cache_file:
            json.dump(frequencies, cache_file, indent=4)
    return get_pattern_ranks(frequencies)
def _invariate_sequence(sample, ids, vocab_size):
    """Encode ``sample`` as a stream of pattern tokens.

    Token values in the returned list:
        0        a symbol that occurs exactly once in ``sample``
        1        a position not covered by any emitted pattern or unique symbol
        length   prefix emitted before the id of a pattern longer than one
                 symbol (the pattern's length)
        id + 5   rank of a repeating pattern taken from ``ids``, offset by 5

    Positions after the last pattern/unique symbol are not emitted.

    Args:
        sample: Sequence of encoded symbols.
        ids: Mapping of "-"-joined pattern key -> rank, as returned by
            ``unique_pattern_identifiers``.
        vocab_size: Number of entries in ``ids``.

    Returns:
        list[int]: The token stream described above.
    """
    # Each span is (start position, pattern id, pattern length).
    spans = []
    for pattern_symbols, starts in find_patterns_and_indices(sample):
        pattern_id = ids["-".join(map(str, pattern_symbols))]
        for start in starts:
            spans.append((start, pattern_id, len(pattern_symbols)))
    for _, positions in find_unique_singles(sample):
        # Length 0 marks a unique symbol; its id slot is never emitted.
        spans.append((positions[0], vocab_size + 1, 0))
    spans.sort(key=lambda span: span[0])

    tokens = []
    position = 0
    for start, pattern_id, length in spans:
        if start > position:
            # Fill the gap of uncovered positions before this span.
            tokens.extend([1] * (start - position))
            position = start
        if length == 0:
            tokens.append(0)
            position += 1
            continue
        if length > 1:
            tokens.append(length)
        # The id goes in the current slot. Writing it one slot ahead for
        # single-symbol patterns let the following token overwrite it (or the
        # final truncation drop it), so those patterns were lost.
        tokens.append(pattern_id + 5)
        position += length
    return tokens


def get_data_pairs(full_text):
    """Build (or load from cache) the list of training pairs for ``full_text``.

    If ``data_pairs.pkl`` exists it is unpickled and returned unchanged.
    Otherwise the text is cut into chunks, each chunk is encoded with
    ``encode_text_with_indices`` using a substitution rule seeded with 1337,
    both outputs are truncated to 512 entries, and the encoded symbols are
    turned into a pattern-token stream. The result is pickled to
    ``data_pairs.pkl`` before being returned.

    Args:
        full_text: The complete text to chunk and encode.

    Returns:
        list: ``[encoding_list, sample_indices]`` pairs, one per kept chunk.
    """
    cache_path = "data_pairs.pkl"
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            print("Loading training pairs from pickle file...")
            # NOTE: unpickling can execute arbitrary code from the file; only
            # load a data_pairs.pkl that this function wrote.
            return pickle.load(f)

    chunk_len = 1500
    print("starting chunking text")
    # Each chunk holds chunk_len - 1 characters (exclusive slice end), and the
    # trailing remainder of the text is not chunked.
    text_chunks = [
        full_text[start:start + chunk_len - 1]
        for start in range(0, len(full_text) - chunk_len - 1, chunk_len)
    ]

    symbols = ['b', 'j', '\r', 'J', '”', ')', 'Â', 'É', 'ê', '5', 't', '9', 'Y', '%', 'N', 'B', 'V', '\ufeff', 'Ê', '?', '’', 'i', ':', 's', 'C', 'â', 'ï', 'W', 'y', 'p', 'D', '—', '«', 'º', 'A', '3', 'n', '0', 'q', '4', 'e', 'T', 'È', '$', 'U', 'v', '»', 'l', 'P', 'X', 'Z', 'À', 'ç', 'u', '…', 'î', 'L', 'k', 'E', 'R', '2', '_', '8', 'é', 'O', 'Î', '‘', 'a', 'F', 'H', 'c', '[', '(', "'", 'è', 'I', '/', '!', ' ', '°', 'S', '•', '#', 'x', 'à', 'g', '*', 'Q', 'w', '1', 'û', '7', 'G', 'm', '™', 'K', 'z', '\n', 'o', 'ù', ',', 'r', ']', '.', 'M', 'Ç', '“', 'h', '-', 'f', 'ë', '6', ';', 'd', 'ô', 'e ', 's ', 't ', 'es', ' d', '\r\n', 'en', 'qu', ' l', 're', ' p', 'de', 'le', 'nt', 'on', ' c', ', ', ' e', 'ou', ' q', ' s', 'n ', 'ue', 'an', 'te', ' a', 'ai', 'se', 'it', 'me', 'is', 'oi', 'r ', 'er', ' m', 'ce', 'ne', 'et', 'in', 'ns', ' n', 'ur', 'i ', 'a ', 'eu', 'co', 'tr', 'la', 'ar', 'ie', 'ui', 'us', 'ut', 'il', ' t', 'pa', 'au', 'el', 'ti', 'st', 'un', 'em', 'ra', 'e,', 'so', 'or', 'l ', ' f', 'll', 'nd', ' j', 'si', 'ir', 'e\r', 'ss', 'u ', 'po', 'ro', 'ri', 'pr', 's,', 'ma', ' v', ' i', 'di', ' r', 'vo', 'pe', 'to', 'ch', '. ', 've', 'nc', 'om', ' o', 'je', 'no', 'rt', 'à ', 'lu', "'e", 'mo', 'ta', 'as', 'at', 'io', 's\r', 'sa', "u'", 'av', 'os', ' à', ' u', "l'", "'a", 'rs', 'pl', 'é ', '; ', 'ho', 'té', 'ét', 'fa', 'da', 'li', 'su', 't\r', 'ée', 'ré', 'dé', 'ec', 'nn', 'mm', "'i", 'ca', 'uv', '\n\r', 'id', ' b', 'ni', 'bl']
    symbols = load_or_save_symbols(symbols)
    substitution_rule = substitution_cipher(symbols, 1337)

    dataset = []
    print(len(text_chunks))
    for chunk_number, chunk in enumerate(text_chunks, start=1):
        if chunk_number % 100 == 0:
            print(chunk_number)  # progress marker every 100 chunks
        sample_encodings, sample_indices = encode_text_with_indices(substitution_rule, symbols, chunk)
        sample_encodings = sample_encodings[:512]
        sample_indices = sample_indices[:512]
        # Chunks containing index 256 are skipped.
        # NOTE(review): presumably 256 marks a symbol outside the table —
        # confirm against encode_text_with_indices.
        if 256 in sample_indices:
            continue
        encodings_identifiers = unique_pattern_identifiers([sample_encodings], False)
        encoding_list = _invariate_sequence(
            sample_encodings, encodings_identifiers, len(encodings_identifiers)
        )
        dataset.append([encoding_list, sample_indices])

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def filter_subset_pairs(pairs):
    """Drop every pair that is strictly contained in another pair.

    A pair ``[a, b]`` is contained in ``[c, d]`` when ``set(a)`` is a subset
    of ``set(c)`` and ``set(b)`` is a subset of ``set(d)``, with at least one
    of the two sets strictly smaller than its counterpart.

    Args:
        pairs: List of entries whose first two items are iterables of
            hashable values.

    Returns:
        list: A copy of ``pairs`` without the contained entries, in the
        original order. ``pairs`` itself is left untouched.
    """
    def strictly_inside(inner, outer):
        inner_a, inner_b = set(inner[0]), set(inner[1])
        outer_a, outer_b = set(outer[0]), set(outer[1])
        if not (inner_a <= outer_a and inner_b <= outer_b):
            return False
        return len(inner_a) < len(outer_a) or len(inner_b) < len(outer_b)

    survivors = pairs.copy()
    # Walk backwards so deleting an entry never shifts the ones still to visit.
    for index in range(len(survivors) - 1, -1, -1):
        candidate = survivors[index]
        dominated = any(
            other_index != index and strictly_inside(candidate, other)
            for other_index, other in enumerate(survivors)
        )
        if dominated:
            del survivors[index]
    return survivors
def find_patterns_and_indices(sequence, remove_subsets=True):
    """
    Find runs of 1 to 4 consecutive items that repeat, and where they start.

    Args:
        sequence: Sliceable sequence of hashable items.
        remove_subsets (bool): When true, patterns are handled longest first
            and an occurrence is dropped if any of its positions was already
            claimed by a previously accepted occurrence. Claimed positions
            stay claimed even when the claiming pattern ends up with fewer
            than two occurrences and is left out of the result. When false,
            every start index is kept.

    Returns:
        list: ``[pattern, start_indices]`` pairs (pattern as a list), longest
        patterns first, limited to patterns with at least two kept start
        indices.
    """
    max_length = 4
    total = len(sequence)

    # Group the start index of every window of width 1..max_length by content.
    starts_by_pattern = {}
    for width in range(1, max_length + 1):
        for start in range(total - width + 1):
            window = tuple(sequence[start:start + width])
            starts_by_pattern.setdefault(window, []).append(start)

    # Keep only windows seen more than once, longest first (stable sort, so
    # equal-length patterns stay in first-seen order).
    repeated = [
        (pattern, starts)
        for pattern, starts in starts_by_pattern.items()
        if len(starts) > 1
    ]
    repeated.sort(key=lambda entry: len(entry[0]), reverse=True)

    claimed = set()
    found = []
    for pattern, starts in repeated:
        if remove_subsets:
            kept = []
            for start in starts:
                span = range(start, start + len(pattern))
                if any(position in claimed for position in span):
                    continue
                kept.append(start)
                claimed.update(span)
        else:
            kept = starts
        if len(kept) > 1:
            found.append([list(pattern), kept])
    return found
def find_unique_singles(sequence):
    """Return every item that occurs exactly once, together with its position.

    Occurrences are counted in a single pass instead of calling
    ``sequence.count`` for every element, so the scan is linear in the
    sequence length.

    Args:
        sequence: Re-iterable sequence of hashable items.

    Returns:
        list: ``[[item], [index]]`` entries, in order of appearance, for each
        item that appears only once in ``sequence``.
    """
    occurrences = Counter(sequence)
    return [
        [[item], [index]]
        for index, item in enumerate(sequence)
        if occurrences[item] == 1
    ]