File size: 9,342 Bytes
32b6996
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import json
import os
import pickle

from cipher_8bit import *
from french_dataset import get_full_dataset

def get_pattern_ranks(pattern_frequency_dict):
    """Map each pattern key to its 1-based rank by descending frequency.

    The most frequent pattern gets rank 1; ties keep insertion order
    (Python's sort is stable).
    """
    by_frequency = sorted(
        pattern_frequency_dict,
        key=pattern_frequency_dict.get,
        reverse=True,
    )
    return {pattern: rank for rank, pattern in enumerate(by_frequency, start=1)}

def unique_pattern_identifiers(symbol_index_sequences, save=False, name="data"):
    """Rank repeating patterns found across *symbol_index_sequences*.

    Every repeating subsequence (from find_patterns_and_indices with
    remove_subsets=False) is keyed by its elements joined with "-", and its
    occurrence counts are summed over all sequences.  The frequency table is
    then converted to 1-based ranks (most frequent first).

    Args:
        symbol_index_sequences: iterable of sequences to scan.
        save: when True, load the frequency table from "<name>.json" if it
            exists, otherwise compute it and write it there.
        name: basename of the JSON cache file.

    Returns:
        dict: pattern key -> 1-based frequency rank.
    """
    cache_path = name + ".json"
    # NOTE(review): the cache only engages when save=True; with save=False an
    # existing cache file is ignored and frequencies are always recomputed.
    if save and os.path.exists(cache_path):
        with open(cache_path, "r") as json_file:
            return get_pattern_ranks(json.load(json_file))

    freq_dict = {}
    for sequence in symbol_index_sequences:
        # Count every occurrence of every repeating pattern (subsets kept).
        for pattern, indices in find_patterns_and_indices(sequence, remove_subsets=False):
            key = "-".join(map(str, pattern))
            freq_dict[key] = freq_dict.get(key, 0) + len(indices)

    if save:
        with open(cache_path, "w") as json_file:
            json.dump(freq_dict, json_file, indent=4)

    return get_pattern_ranks(freq_dict)

def get_data_pairs(full_text):
    """Build training pairs [invariant_encoding, symbol_indices] from raw text.

    The text is cut into fixed-size chunks, each chunk is encoded with a
    seeded substitution cipher, and the repeating-pattern structure of the
    encoding is converted into a pattern-invariant integer sequence.
    Results are cached in "data_pairs.pkl"; when that file exists it is
    loaded and returned without recomputation.
    """
    if os.path.exists("data_pairs.pkl"):
        with open("data_pairs.pkl", 'rb') as f:
            print("Loading training pairs from pickle file...")
            return pickle.load(f)
    text_chunks = []
    chunk_len = 1500
    i=0
    print("starting chunking text")
    # NOTE(review): the slice end "i * chunk_len - 1" makes every chunk
    # chunk_len - 1 characters long and drops one character between chunks —
    # confirm whether this off-by-one is intentional.
    while i * chunk_len < len(full_text) - chunk_len - 1:
        i += 1
        sample_text = full_text[(i - 1) * chunk_len: i * chunk_len - 1]
        text_chunks.append(sample_text)
    symbol_index_sequences = []  # NOTE(review): assigned but never used below.
    # Vocabulary: single characters followed by frequent French bigrams.
    symbols = ['b', 'j', '\r', 'J', '”', ')', 'Â', 'É', 'ê', '5', 't', '9', 'Y', '%', 'N', 'B', 'V', '\ufeff', 'Ê', '?', '’', 'i', ':', 's', 'C', 'â', 'ï', 'W', 'y', 'p', 'D', '—', '«', 'º', 'A', '3', 'n', '0', 'q', '4', 'e', 'T', 'È', '$', 'U', 'v', '»', 'l', 'P', 'X', 'Z', 'À', 'ç', 'u', '…', 'î', 'L', 'k', 'E', 'R', '2', '_', '8', 'é', 'O', 'Î', '‘', 'a', 'F', 'H', 'c', '[', '(', "'", 'è', 'I', '/', '!', ' ', '°', 'S', '•', '#', 'x', 'à', 'g', '*', 'Q', 'w', '1', 'û', '7', 'G', 'm', '™', 'K', 'z', '\n', 'o', 'ù', ',', 'r', ']', '.', 'M', 'Ç', '“', 'h', '-', 'f', 'ë', '6', ';', 'd', 'ô', 'e ', 's ', 't ', 'es', ' d', '\r\n', 'en', 'qu', ' l', 're', ' p', 'de', 'le', 'nt', 'on', ' c', ', ', ' e', 'ou', ' q', ' s', 'n ', 'ue', 'an', 'te', ' a', 'ai', 'se', 'it', 'me', 'is', 'oi', 'r ', 'er', ' m', 'ce', 'ne', 'et', 'in', 'ns', ' n', 'ur', 'i ', 'a ', 'eu', 'co', 'tr', 'la', 'ar', 'ie', 'ui', 'us', 'ut', 'il', ' t', 'pa', 'au', 'el', 'ti', 'st', 'un', 'em', 'ra', 'e,', 'so', 'or', 'l ', ' f', 'll', 'nd', ' j', 'si', 'ir', 'e\r', 'ss', 'u ', 'po', 'ro', 'ri', 'pr', 's,', 'ma', ' v', ' i', 'di', ' r', 'vo', 'pe', 'to', 'ch', '. ', 've', 'nc', 'om', ' o', 'je', 'no', 'rt', 'à ', 'lu', "'e", 'mo', 'ta', 'as', 'at', 'io', 's\r', 'sa', "u'", 'av', 'os', ' à', ' u', "l'", "'a", 'rs', 'pl', 'é ', '; ', 'ho', 'té', 'ét', 'fa', 'da', 'li', 'su', 't\r', 'ée', 'ré', 'dé', 'ec', 'nn', 'mm', "'i", 'ca', 'uv', '\n\r', 'id', ' b', 'ni', 'bl']
    symbols = load_or_save_symbols(symbols)
    # Deterministic substitution cipher built from a fixed seed (1337).
    substitution_rule = substitution_cipher(symbols, 1337)

    def invariate_sequence(sample, ids, vocab_size):
        # Translate a symbol sequence into a pattern-invariant representation.
        # Each fill_in entry is [start_index, pattern_id, pattern_length];
        # unique one-off symbols get sentinel id vocab_size + 1 and length 0.
        fill_in = []
        p = find_patterns_and_indices(sample)
        u = find_unique_singles(sample)
        for pattern in p:
            value = "-".join(map(str, pattern[0]))
            for index in pattern[1]:
                fill_in.append([index, ids[value], len(pattern[0])])
        for unique in u:
            fill_in.append([unique[1][0], vocab_size + 1, 0])
        fill_in.sort(key=lambda x: x[0])
        total_list = [0] * 1024  # output buffer, truncated to i at the end
        i=0
        tally = 0          # current position within the source sample
        pattern_count = 0  # cursor into fill_in
        while pattern_count < len(fill_in):
            # Position not claimed by the next fill_in entry: emit marker 1.
            if tally != fill_in[pattern_count][0]:
                total_list[i] = 1
                i+=1
                tally+=1
                continue
            # Unique single (length 0): emit marker 0.
            if fill_in[pattern_count][2] == 0:
                total_list[i] = 0
                i+=1
                tally+=1
                pattern_count +=1
                continue

            # NOTE(review): in the length-1 branch the id is written at i+1
            # while i only advances by 1, so total_list[i] keeps its previous
            # value and the next iteration overwrites i+1 — verify intended.
            if fill_in[pattern_count][2] == 1:
                    total_list[i+1] = fill_in[pattern_count][1] + 5
                    i+=1
            else:
                # Longer pattern: emit its length, then its id offset by 5.
                total_list[i] = fill_in[pattern_count][2]
                total_list[i+1] = fill_in[pattern_count][1] + 5
                i+=2
            
            tally += fill_in[pattern_count][2]
            pattern_count += 1

        total_list = total_list[:i]
        return total_list

    dataset = []
    print(len(text_chunks))
    i5 = 0
    for text_i in range(len(text_chunks)):
        i5 += 1
        if i5 % 100 == 0:
            print(i5)  # progress indicator every 100 chunks
        sample_encodings, sample_indices = encode_text_with_indices(substitution_rule, symbols, text_chunks[text_i])
        # Truncate both views of the chunk to 512 positions.
        sample_encodings = sample_encodings[:512]
        sample_indices = sample_indices[:512]
        # Skip chunks containing index 256 — presumably an out-of-vocabulary
        # marker from encode_text_with_indices; confirm against cipher_8bit.
        if sample_indices.count(256) > 0:
            continue
        
        encodings_identifiers = unique_pattern_identifiers([sample_encodings], False)
        encodings_vocab = len(encodings_identifiers.items())

        encoding_list = invariate_sequence(sample_encodings, encodings_identifiers, encodings_vocab)
        dataset.append([encoding_list, sample_indices])
    with open("data_pairs.pkl", 'wb') as f:
        pickle.dump(dataset, f)
    return dataset



def filter_subset_pairs(pairs):
    """Drop every pair that is a strict subset-pair of another surviving pair.

    A pair [A, B] is dominated by [C, D] when set(A) ⊆ set(C) and
    set(B) ⊆ set(D), with at least one containment strict.  Candidates are
    examined from the end of the list and removed in place.
    """

    def _dominated_by(small, large):
        a0, a1 = set(small[0]), set(small[1])
        b0, b1 = set(large[0]), set(large[1])
        if not (a0 <= b0 and a1 <= b1):
            return False
        return len(a0) < len(b0) or len(a1) < len(b1)

    survivors = pairs.copy()
    for idx in range(len(survivors) - 1, -1, -1):
        candidate = survivors[idx]
        if any(
            k != idx and _dominated_by(candidate, other)
            for k, other in enumerate(survivors)
        ):
            survivors.pop(idx)

    return survivors

def find_patterns_and_indices(sequence, remove_subsets=True):
    """Find repeating subsequences (lengths 1-4) and their start indices.

    Scans every window of length 1 through 4 and keeps those that occur more
    than once.  When remove_subsets is True, occurrences whose positions
    overlap a longer, already-claimed repeating subsequence are dropped, and
    a subsequence is reported only if at least two occurrences remain.

    Args:
        sequence (list): input sequence of hashable elements.
        remove_subsets (bool): drop occurrences covered by longer patterns.

    Returns:
        list: [subsequence, start_indices] pairs, longest subsequences first
        (ties keep discovery order — Python's sort is stable).
    """
    n = len(sequence)

    # Collect every start index of every window, lengths 1 through 4.
    # (Length-1 "patterns" are included on purpose — the original comment
    # claiming "start from length 2" was wrong.)
    occurrences = {}
    for length in range(1, 5):
        for start in range(n - length + 1):
            window = tuple(sequence[start:start + length])
            occurrences.setdefault(window, []).append(start)

    # Keep only windows that repeat; sort longest first (stable on ties).
    repeating = [(subseq, idxs) for subseq, idxs in occurrences.items() if len(idxs) > 1]
    repeating.sort(key=lambda item: len(item[0]), reverse=True)

    result = []
    claimed = set()  # positions already covered by a longer pattern
    for subseq, indices in repeating:
        if remove_subsets:
            valid = []
            for idx in indices:
                span = range(idx, idx + len(subseq))
                # Skip this occurrence if any position is already claimed.
                if any(pos in claimed for pos in span):
                    continue
                valid.append(idx)
                claimed.update(span)
        else:
            valid = indices

        # Only report patterns that still repeat after filtering.
        if len(valid) > 1:
            result.append([list(subseq), valid])

    return result

def find_unique_singles(sequence):
    """Return [[element], [index]] for each element occurring exactly once.

    Single counting pass replaces the original O(n^2)
    sequence.count-per-element scan; output order and shape are unchanged.
    """
    counts = {}
    for element in sequence:
        counts[element] = counts.get(element, 0) + 1
    return [[[element], [i]] for i, element in enumerate(sequence) if counts[element] == 1]