# Project star-import kept first so the explicit stdlib imports below win
# any name collision (preserves the original shadowing behavior).
from cipher_8bit import *
from french_dataset import get_full_dataset

import json
import os  # used by unique_pattern_identifiers / get_data_pairs
import pickle
from collections import Counter, defaultdict
def get_pattern_ranks(pattern_frequency_dict):
    """Map each pattern key to its 1-based rank, most frequent first.

    Ties keep the dictionary's insertion order (sorted() is stable).

    Args:
        pattern_frequency_dict (dict): key -> occurrence count.

    Returns:
        dict: key -> rank (1 = highest frequency).
    """
    keys_by_frequency = sorted(
        pattern_frequency_dict,
        key=pattern_frequency_dict.get,
        reverse=True,
    )
    return {key: rank for rank, key in enumerate(keys_by_frequency, start=1)}
def unique_pattern_identifiers(symbol_index_sequences, save=False, name="data"):
    """Assign frequency ranks to every repeating pattern in the sequences.

    A pattern's key is its elements joined with "-"; its frequency is the
    total number of occurrences across all sequences (subsets included).
    When ``save`` is True the frequency table is cached to ``<name>.json``
    and loaded from there on subsequent calls.

    Args:
        symbol_index_sequences (list): Sequences to scan for patterns.
        save (bool): Enable the JSON frequency cache.
        name (str): Base name of the JSON cache file.

    Returns:
        dict: pattern key -> 1-based rank, most frequent first.
    """
    json_path = name + ".json"
    # Cache hit: reuse the previously saved frequency table. Checking
    # `save` first avoids a needless filesystem hit when caching is off.
    if save and os.path.exists(json_path):
        with open(json_path, "r") as json_file:
            return get_pattern_ranks(json.load(json_file))
    freq_dict = {}
    for sequence in symbol_index_sequences:
        # Count every occurrence of every repeating pattern (subsets kept
        # so frequencies reflect all overlapping matches).
        for pattern, indices in find_patterns_and_indices(sequence, remove_subsets=False):
            key = "-".join(map(str, pattern))
            freq_dict[key] = freq_dict.get(key, 0) + len(indices)
    if save:
        with open(json_path, "w") as json_file:
            json.dump(freq_dict, json_file, indent=4)
    return get_pattern_ranks(freq_dict)
def get_data_pairs(full_text):
    """Build [encoding_list, sample_indices] training pairs from raw text.

    The result is cached in "data_pairs.pkl": if that file exists it is
    unpickled and returned as-is; otherwise the full pipeline (chunking,
    encoding, pattern extraction) runs and the result is pickled.

    Args:
        full_text (str): The complete corpus to chunk and encode.

    Returns:
        list: [encoding_list, sample_indices] pairs, one per kept chunk.
    """
    # Fast path: reuse previously computed pairs from the pickle cache.
    if os.path.exists("data_pairs.pkl"):
        with open("data_pairs.pkl", 'rb') as f:
            print("Loading training pairs from pickle file...")
            return pickle.load(f)
    text_chunks = []
    chunk_len = 1500
    i=0
    print("starting chunking text")
    # Split the corpus into consecutive chunks. NOTE(review): the slice end
    # `i * chunk_len - 1` drops the last character of every chunk (chunks
    # are 1499 chars) — confirm whether this off-by-one is intentional.
    while i * chunk_len < len(full_text) - chunk_len - 1:
        i += 1
        sample_text = full_text[(i - 1) * chunk_len: i * chunk_len - 1]
        text_chunks.append(sample_text)
    symbol_index_sequences = []  # NOTE(review): never read below — dead?
    # Symbol vocabulary: single characters followed by frequent French
    # bigrams. Runtime data — do not edit.
    symbols = ['b', 'j', '\r', 'J', '”', ')', 'Â', 'É', 'ê', '5', 't', '9', 'Y', '%', 'N', 'B', 'V', '\ufeff', 'Ê', '?', '’', 'i', ':', 's', 'C', 'â', 'ï', 'W', 'y', 'p', 'D', '—', '«', 'º', 'A', '3', 'n', '0', 'q', '4', 'e', 'T', 'È', '$', 'U', 'v', '»', 'l', 'P', 'X', 'Z', 'À', 'ç', 'u', '…', 'î', 'L', 'k', 'E', 'R', '2', '_', '8', 'é', 'O', 'Î', '‘', 'a', 'F', 'H', 'c', '[', '(', "'", 'è', 'I', '/', '!', ' ', '°', 'S', '•', '#', 'x', 'à', 'g', '*', 'Q', 'w', '1', 'û', '7', 'G', 'm', '™', 'K', 'z', '\n', 'o', 'ù', ',', 'r', ']', '.', 'M', 'Ç', '“', 'h', '-', 'f', 'ë', '6', ';', 'd', 'ô', 'e ', 's ', 't ', 'es', ' d', '\r\n', 'en', 'qu', ' l', 're', ' p', 'de', 'le', 'nt', 'on', ' c', ', ', ' e', 'ou', ' q', ' s', 'n ', 'ue', 'an', 'te', ' a', 'ai', 'se', 'it', 'me', 'is', 'oi', 'r ', 'er', ' m', 'ce', 'ne', 'et', 'in', 'ns', ' n', 'ur', 'i ', 'a ', 'eu', 'co', 'tr', 'la', 'ar', 'ie', 'ui', 'us', 'ut', 'il', ' t', 'pa', 'au', 'el', 'ti', 'st', 'un', 'em', 'ra', 'e,', 'so', 'or', 'l ', ' f', 'll', 'nd', ' j', 'si', 'ir', 'e\r', 'ss', 'u ', 'po', 'ro', 'ri', 'pr', 's,', 'ma', ' v', ' i', 'di', ' r', 'vo', 'pe', 'to', 'ch', '. ', 've', 'nc', 'om', ' o', 'je', 'no', 'rt', 'à ', 'lu', "'e", 'mo', 'ta', 'as', 'at', 'io', 's\r', 'sa', "u'", 'av', 'os', ' à', ' u', "l'", "'a", 'rs', 'pl', 'é ', '; ', 'ho', 'té', 'ét', 'fa', 'da', 'li', 'su', 't\r', 'ée', 'ré', 'dé', 'ec', 'nn', 'mm', "'i", 'ca', 'uv', '\n\r', 'id', ' b', 'ni', 'bl']
    symbols = load_or_save_symbols(symbols)
    # Deterministic substitution cipher over the vocabulary, seed 1337.
    substitution_rule = substitution_cipher(symbols, 1337)
    def invariate_sequence(sample, ids, vocab_size):
        # Flatten a sample into an "invariant" token list that records the
        # repeated-pattern structure of the sequence rather than its raw
        # symbols.
        fill_in = []
        p = find_patterns_and_indices(sample)
        u = find_unique_singles(sample)
        # Each fill_in entry is [start_index, pattern_id, pattern_length];
        # unique singletons use the sentinel id vocab_size + 1 and length 0.
        for pattern in p:
            value = "-".join(map(str, pattern[0]))
            for index in pattern[1]:
                fill_in.append([index, ids[value], len(pattern[0])])
        for unique in u:
            fill_in.append([unique[1][0], vocab_size + 1, 0])
        fill_in.sort(key=lambda x: x[0])
        total_list = [0] * 1024
        i=0            # write position into total_list
        tally = 0      # read position in the original sample
        pattern_count = 0
        while pattern_count < len(fill_in):
            # Position not covered by any recorded pattern: emit marker 1.
            if tally != fill_in[pattern_count][0]:
                total_list[i] = 1
                i+=1
                tally+=1
                continue
            # Unique singleton: emit marker 0.
            if fill_in[pattern_count][2] == 0:
                total_list[i] = 0
                i+=1
                tally+=1
                pattern_count +=1
                continue
            if fill_in[pattern_count][2] == 1:
                # NOTE(review): writes the id at i+1 but advances i by only
                # 1, so the id is either overwritten by the next write or
                # cut off by the final truncation — looks like a bug;
                # confirm intended encoding for length-1 patterns.
                total_list[i+1] = fill_in[pattern_count][1] + 5
                i+=1
            else:
                # Longer pattern: emit [length, id + 5]. The +5 offset
                # presumably keeps ids clear of the 0/1 markers and small
                # length tokens — confirm.
                total_list[i] = fill_in[pattern_count][2]
                total_list[i+1] = fill_in[pattern_count][1] + 5
                i+=2
            tally += fill_in[pattern_count][2]
            pattern_count += 1
        # Trim the preallocated 1024-slot buffer to the written length.
        total_list = total_list[:i]
        return total_list
    dataset = []
    print(len(text_chunks))
    i5 = 0
    for text_i in range(len(text_chunks)):
        i5 += 1
        if i5 % 100 == 0:
            print(i5)  # progress report every 100 chunks
        sample_encodings, sample_indices = encode_text_with_indices(substitution_rule, symbols, text_chunks[text_i])
        # Truncate both views of the sample to 512 tokens.
        sample_encodings = sample_encodings[:512]
        sample_indices = sample_indices[:512]
        # Skip chunks containing index 256 (presumably an out-of-vocabulary
        # sentinel from encode_text_with_indices — TODO confirm).
        if sample_indices.count(256) > 0:
            continue
        # Rank the repeating patterns of this sample (no JSON caching).
        encodings_identifiers = unique_pattern_identifiers([sample_encodings], False)
        encodings_vocab = len(encodings_identifiers.items())
        encoding_list = invariate_sequence(sample_encodings, encodings_identifiers, encodings_vocab)
        dataset.append([encoding_list, sample_indices])
    # Persist the pairs so subsequent calls take the fast path above.
    with open("data_pairs.pkl", 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def filter_subset_pairs(pairs):
    """Remove pairs strictly subsumed by another pair in the list.

    A pair is dropped when both of its sides are (set-wise) subsets of
    another pair's sides and at least one side is strictly smaller.
    Scanning runs back-to-front so removals never disturb unvisited items.

    Args:
        pairs (list): [first, second] pairs of hashable-element sequences.

    Returns:
        list: A new list with subsumed pairs removed.
    """
    def _subsumed_by(candidate, other):
        # True when `candidate` is a strict (set-wise) subset of `other`.
        first_a, second_a = set(candidate[0]), set(candidate[1])
        first_b, second_b = set(other[0]), set(other[1])
        if not (first_a <= first_b and second_a <= second_b):
            return False
        return len(first_a) < len(first_b) or len(second_a) < len(second_b)

    kept = list(pairs)
    for idx in range(len(kept) - 1, -1, -1):
        if any(
            pos != idx and _subsumed_by(kept[idx], other)
            for pos, other in enumerate(kept)
        ):
            kept.pop(idx)
    return kept
def find_patterns_and_indices(sequence, remove_subsets=True):
    """Find repeating subsequences (lengths 1-4) and their start indices.

    When ``remove_subsets`` is True, occurrences overlapping a position
    already claimed by a longer repeating subsequence are excluded, and a
    subsequence is only reported if at least two occurrences survive.

    Args:
        sequence (list): Input sequence of numbers.
        remove_subsets (bool): Exclude occurrences covered by longer
            patterns (default True).

    Returns:
        list: [subsequence, indices] pairs for repeating subsequences,
        longest subsequences first.
    """
    n = len(sequence)
    # Collect start indices of every subsequence of length 1 through 4.
    # (The previous inline comment claimed the scan started at length 2;
    # it has always included single elements.)
    subsequence_indices = defaultdict(list)
    for length in range(1, 5):
        for start in range(n - length + 1):
            subsequence_indices[tuple(sequence[start:start + length])].append(start)
    # Keep only subsequences occurring more than once, longest first.
    # sorted() is stable, so equal-length patterns stay in discovery order,
    # matching the original insertion-ordered behavior.
    repeating = sorted(
        ((subseq, idxs) for subseq, idxs in subsequence_indices.items() if len(idxs) > 1),
        key=lambda item: len(item[0]),
        reverse=True,
    )
    result = []
    used_positions = set()  # positions claimed by longer patterns
    for subseq, indices in repeating:
        if remove_subsets:
            valid_indices = []
            for idx in indices:
                span = range(idx, idx + len(subseq))
                # Drop occurrences that overlap an already-claimed position;
                # otherwise claim this occurrence's positions.
                if any(pos in used_positions for pos in span):
                    continue
                valid_indices.append(idx)
                used_positions.update(span)
        else:
            valid_indices = indices
        # Only report patterns that still repeat after filtering.
        if len(valid_indices) > 1:
            result.append([list(subseq), valid_indices])
    return result
def find_unique_singles(sequence):
    """Return [[element], [index]] for each element occurring exactly once.

    Uses a single Counter pass (O(n)) instead of calling
    ``sequence.count`` per element (O(n^2)); output order is unchanged
    (index order).

    Args:
        sequence (list): Input sequence of hashable elements.

    Returns:
        list: [[element], [index]] pairs for unique elements.
    """
    counts = Counter(sequence)
    return [
        [[element], [i]]
        for i, element in enumerate(sequence)
        if counts[element] == 1
    ]