import json
import os
import pickle
from collections import Counter

from cipher_8bit import *  # provides substitution_cipher, encode_text_with_indices, load_or_save_symbols
from french_dataset import get_full_dataset


def get_pattern_ranks(pattern_frequency_dict):
    """Map each pattern key to its frequency rank (1 = most frequent)."""
    sorted_items = sorted(pattern_frequency_dict.items(), key=lambda x: x[1], reverse=True)
    # Assign ranks starting from 1; ties keep insertion order (sort is stable)
    ranked_dict = {}
    for rank, (key, _value) in enumerate(sorted_items, start=1):
        ranked_dict[key] = rank
    return ranked_dict
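# Worked example (hypothetical frequencies):
#   >>> get_pattern_ranks({"4-7": 3, "9": 8, "2-2": 5})
#   {'9': 1, '2-2': 2, '4-7': 3}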


def unique_pattern_identifiers(symbol_index_sequences, save=False, name="data"):
    """Count every repeating pattern across the sequences and return frequency ranks."""
    # When caching is enabled and a cache file exists, reuse the saved frequencies
    if save and os.path.exists(name + ".json"):
        with open(name + ".json", "r") as json_file:
            loaded_dict = json.load(json_file)
        return get_pattern_ranks(loaded_dict)
    freq_dict = {}
    for sequence in symbol_index_sequences:
        raw_pattern_data = find_patterns_and_indices(sequence, remove_subsets=False)
        for pattern in raw_pattern_data:
            key = "-".join(map(str, pattern[0]))
            # Each pattern contributes one count per occurrence index
            freq_dict[key] = freq_dict.get(key, 0) + len(pattern[1])
    if save:
        with open(name + ".json", "w") as json_file:
            json.dump(freq_dict, json_file, indent=4)
    return get_pattern_ranks(freq_dict)
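# Worked example: in the single sequence [1, 2, 1, 2] every repeating pattern
# ((1, 2), (1,), and (2,)) occurs twice, so ranks follow first-seen
# (longest-first) order:
#   >>> unique_pattern_identifiers([[1, 2, 1, 2]])
#   {'1-2': 1, '1': 2, '2': 3}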


def get_data_pairs(full_text):
    """Build (invariant-encoded ciphertext, symbol-index) training pairs from raw text."""
    # Reuse previously built pairs if they were pickled on an earlier run
    if os.path.exists("data_pairs.pkl"):
        with open("data_pairs.pkl", 'rb') as f:
            print("Loading training pairs from pickle file...")
            return pickle.load(f)
    text_chunks = []
    chunk_len = 1500
    i = 0
    print("starting chunking text")
    # Slice the corpus into consecutive chunks of chunk_len - 1 characters
    while i * chunk_len < len(full_text) - chunk_len - 1:
        i += 1
        sample_text = full_text[(i - 1) * chunk_len: i * chunk_len - 1]
        text_chunks.append(sample_text)
    symbols = ['b', 'j', '\r', 'J', '”', ')', 'Â', 'É', 'ê', '5', 't', '9', 'Y', '%', 'N', 'B', 'V', '\ufeff', 'Ê', '?', '’', 'i', ':', 's', 'C', 'â', 'ï', 'W', 'y', 'p', 'D', '—', '«', 'º', 'A', '3', 'n', '0', 'q', '4', 'e', 'T', 'È', '$', 'U', 'v', '»', 'l', 'P', 'X', 'Z', 'À', 'ç', 'u', '…', 'î', 'L', 'k', 'E', 'R', '2', '_', '8', 'é', 'O', 'Î', '‘', 'a', 'F', 'H', 'c', '[', '(', "'", 'è', 'I', '/', '!', ' ', '°', 'S', '•', '#', 'x', 'à', 'g', '*', 'Q', 'w', '1', 'û', '7', 'G', 'm', '™', 'K', 'z', '\n', 'o', 'ù', ',', 'r', ']', '.', 'M', 'Ç', '“', 'h', '-', 'f', 'ë', '6', ';', 'd', 'ô', 'e ', 's ', 't ', 'es', ' d', '\r\n', 'en', 'qu', ' l', 're', ' p', 'de', 'le', 'nt', 'on', ' c', ', ', ' e', 'ou', ' q', ' s', 'n ', 'ue', 'an', 'te', ' a', 'ai', 'se', 'it', 'me', 'is', 'oi', 'r ', 'er', ' m', 'ce', 'ne', 'et', 'in', 'ns', ' n', 'ur', 'i ', 'a ', 'eu', 'co', 'tr', 'la', 'ar', 'ie', 'ui', 'us', 'ut', 'il', ' t', 'pa', 'au', 'el', 'ti', 'st', 'un', 'em', 'ra', 'e,', 'so', 'or', 'l ', ' f', 'll', 'nd', ' j', 'si', 'ir', 'e\r', 'ss', 'u ', 'po', 'ro', 'ri', 'pr', 's,', 'ma', ' v', ' i', 'di', ' r', 'vo', 'pe', 'to', 'ch', '. ', 've', 'nc', 'om', ' o', 'je', 'no', 'rt', 'à ', 'lu', "'e", 'mo', 'ta', 'as', 'at', 'io', 's\r', 'sa', "u'", 'av', 'os', ' à', ' u', "l'", "'a", 'rs', 'pl', 'é ', '; ', 'ho', 'té', 'ét', 'fa', 'da', 'li', 'su', 't\r', 'ée', 'ré', 'dé', 'ec', 'nn', 'mm', "'i", 'ca', 'uv', '\n\r', 'id', ' b', 'ni', 'bl']
    symbols = load_or_save_symbols(symbols)
    substitution_rule = substitution_cipher(symbols, 1337)
    def invariate_sequence(sample, ids, vocab_size):
        """Re-encode a sample as a cipher-invariant token stream.

        Output tokens: 1 marks a position covered by no recorded occurrence,
        0 marks a unique symbol, and a pattern is emitted as its length
        followed by its rank id + 5 (the offset keeps ids clear of the
        small structural tokens).
        """
        fill_in = []
        p = find_patterns_and_indices(sample)
        u = find_unique_singles(sample)
        # Record [position, rank id, pattern length] for every pattern occurrence
        for pattern in p:
            value = "-".join(map(str, pattern[0]))
            for index in pattern[1]:
                fill_in.append([index, ids[value], len(pattern[0])])
        # Unique symbols get a shared out-of-pattern id and length 0
        for unique in u:
            fill_in.append([unique[1][0], vocab_size + 1, 0])
        # Process occurrences in order of their position in the sample
        fill_in.sort(key=lambda x: x[0])
        total_list = [0] * 1024
        i = 0
        tally = 0
        pattern_count = 0
        while pattern_count < len(fill_in):
            if tally != fill_in[pattern_count][0]:
                # Position covered by no recorded occurrence
                total_list[i] = 1
                i += 1
                tally += 1
                continue
            if fill_in[pattern_count][2] == 0:
                # Unique single symbol
                total_list[i] = 0
                i += 1
                tally += 1
                pattern_count += 1
                continue
            if fill_in[pattern_count][2] == 1:
                # Length-1 pattern: emit only the rank id
                total_list[i] = fill_in[pattern_count][1] + 5
                i += 1
            else:
                # Longer pattern: emit its length followed by its rank id
                total_list[i] = fill_in[pattern_count][2]
                total_list[i + 1] = fill_in[pattern_count][1] + 5
                i += 2
            tally += fill_in[pattern_count][2]
            pattern_count += 1
        return total_list[:i]
    dataset = []
    print(f"{len(text_chunks)} chunks to encode")
    for text_i in range(len(text_chunks)):
        # Progress indicator every 100 chunks
        if (text_i + 1) % 100 == 0:
            print(text_i + 1)
        sample_encodings, sample_indices = encode_text_with_indices(substitution_rule, symbols, text_chunks[text_i])
        sample_encodings = sample_encodings[:512]
        sample_indices = sample_indices[:512]
        # Skip chunks containing symbol index 256 (outside the 8-bit vocabulary)
        if sample_indices.count(256) > 0:
            continue
        encodings_identifiers = unique_pattern_identifiers([sample_encodings], save=False)
        encodings_vocab = len(encodings_identifiers)
        encoding_list = invariate_sequence(sample_encodings, encodings_identifiers, encodings_vocab)
        dataset.append([encoding_list, sample_indices])
    with open("data_pairs.pkl", 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
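# Usage sketch (assumes get_full_dataset returns the corpus as a single string,
# as implied by the slicing above):
#   full_text = get_full_dataset()
#   pairs = get_data_pairs(full_text)  # [[invariant_tokens, symbol_indices], ...]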


def filter_subset_pairs(pairs):
    """Drop any pair whose two elements are both subsets of another pair's."""
    def is_subset_pair(pair1, pair2):
        first1, second1 = set(pair1[0]), set(pair1[1])
        first2, second2 = set(pair2[0]), set(pair2[1])
        return (first1.issubset(first2) and second1.issubset(second2) and
                (len(first1) < len(first2) or len(second1) < len(second2)))
    result = pairs.copy()
    # Iterate backwards so removals do not shift unvisited indices
    i = len(result) - 1
    while i >= 0:
        should_remove = False
        for j, pair2 in enumerate(result):
            if i != j and is_subset_pair(result[i], pair2):
                should_remove = True
                break
        if should_remove:
            result.pop(i)
        i -= 1
    return result
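# Worked example: the first pair below is element-wise a subset of the second,
# so it is filtered out:
#   >>> filter_subset_pairs([[[1, 2], [3]], [[1, 2, 4], [3, 5]]])
#   [[[1, 2, 4], [3, 5]]]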


def find_patterns_and_indices(sequence, remove_subsets=True):
    """
    Find all repeating subsequences in a sequence and their indices.
    Excludes indices of subsequences when they are part of a larger repeating
    subsequence (unless remove_subsets is False).
    Args:
        sequence (list): Input sequence of numbers
        remove_subsets (bool): Drop occurrences already covered by a longer pattern
    Returns:
        list: List of [subsequence, indices] pairs for repeating subsequences
    """
    n = len(sequence)
    result = []
    # Collect every subsequence of length 1 through 4 and where it occurs
    # (tuples serve as hashable dictionary keys)
    subsequence_indices = {}
    for length in range(1, 5):
        for i in range(n - length + 1):
            subseq = tuple(sequence[i:i + length])
            if subseq not in subsequence_indices:
                subsequence_indices[subseq] = []
            subsequence_indices[subseq].append(i)
    # Keep only subsequences that occur more than once
    repeating_subsequences = {
        subseq: indices
        for subseq, indices in subsequence_indices.items()
        if len(indices) > 1
    }
    # Sort subsequences by length (longest first)
    sorted_subsequences = sorted(
        repeating_subsequences.items(),
        key=lambda x: len(x[0]),
        reverse=True
    )
    # Track positions already claimed by longer subsequences
    used_indices = set()
    # Process subsequences from longest to shortest
    for subseq, indices in sorted_subsequences:
        valid_indices = []
        if remove_subsets:
            for idx in indices:
                # Reject this occurrence if any of its positions is already used
                overlap = any(pos in used_indices
                              for pos in range(idx, idx + len(subseq)))
                if not overlap:
                    valid_indices.append(idx)
                    # Mark all positions in this occurrence as used
                    for pos in range(idx, idx + len(subseq)):
                        used_indices.add(pos)
        else:
            valid_indices = indices
        # Only keep subsequences that still have multiple valid occurrences
        if len(valid_indices) > 1:
            result.append([list(subseq), valid_indices])
    return result
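# Worked example: in [1, 2, 1, 2, 3] the pair (1, 2) repeats at indices 0 and 2
# and claims those positions, so the shorter patterns (1,) and (2,) are dropped:
#   >>> find_patterns_and_indices([1, 2, 1, 2, 3])
#   [[[1, 2], [0, 2]]]
#   >>> find_patterns_and_indices([1, 2, 1, 2, 3], remove_subsets=False)
#   [[[1, 2], [0, 2]], [[1], [0, 2]], [[2], [1, 3]]]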


def find_unique_singles(sequence):
    """Return [[element], [index]] for every element that occurs exactly once."""
    # Count occurrences once up front instead of calling sequence.count per element
    counts = Counter(sequence)
    arr = []
    for i, element in enumerate(sequence):
        if counts[element] == 1:
            arr.append([[element], [i]])
    return arr
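

# Minimal smoke test for the pattern helpers; the sequence below is made up.
if __name__ == "__main__":
    demo = [1, 2, 1, 2, 3]
    print("patterns:", find_patterns_and_indices(demo))  # [[[1, 2], [0, 2]]]
    print("singles:", find_unique_singles(demo))         # [[[3], [4]]]
    print("ranks:", unique_pattern_identifiers([demo]))  # {'1-2': 1, '1': 2, '2': 3}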