cryptocalypse
committed on
Create nos.py
nos.py ADDED
@@ -0,0 +1,326 @@
import sys
import math
import re

import heapq
from collections import defaultdict, Counter
from typing import List, Tuple, Dict, Optional


class TextProcessor:
    def __init__(self, texto):
        self.texto = texto
    def entropy(self):
        # Shannon entropy (in bits per symbol) of the character distribution.
        simbolos = {}
        total_caracteres = len(self.texto)

        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

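    # Worked example (added; not in the original commit): for the text "aabb",
    # each of the two symbols has probability 0.5, so the entropy is
    # -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0 bit, and
    # TextProcessor("aabb").entropy() returns ({'a': 2, 'b': 2}, 1.0).
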
    def common_string(self, cadena1, cadena2):
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        subcadenas_comunes = []

        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2 and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                if k > 0:
                    subcadenas_comunes.append(cadena1[i:i + k])

        if subcadenas_comunes:
            comun = max(subcadenas_comunes, key=len)

        return comun

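    # Note (added): this is the brute-force longest-common-substring scan,
    # roughly O(len(cadena1) * len(cadena2) * match_length). For example,
    # common_string("aburres", "aburro") returns "aburr".
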
    def magic_split(self):
        # Choose the "splitter" symbol: for every symbol, measure the gaps
        # between its consecutive occurrences, then pick the symbol whose gap
        # variation (max gap - min gap) is the smallest value other than 0 or 1.
        unique_symbols = set(self.texto)
        symbol_distances = {}
        for symbol in unique_symbols:
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                distances = [indices[i + 1] - indices[i] for i in range(len(indices) - 1)]
                symbol_distances[symbol] = distances

        variation = {symbol: max(distances) - min(distances)
                     for symbol, distances in symbol_distances.items() if distances}

        mins = {}
        for v in variation:
            if variation[v] != 0 and variation[v] != 1:
                mins[v] = variation[v]

        best_symbol = min(mins, key=mins.get)

        return best_symbol

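    # Note (added): on natural-language text the most evenly spaced qualifying
    # symbol is usually the space, so magic_split() tends to return " ".
    # Beware that min() raises ValueError if no symbol qualifies.
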
    def rotate_string(self, string, n):
        indice = n % len(string)
        string_rotado = string[indice:] + string[:indice]
        return string_rotado

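    # Worked example (added): rotate_string("abcde", 2) == "cdeab",
    # i.e. a left rotation by n positions.
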
    def rotate_compare(self, tokiA, tokiB):
        # Rotate the longer token against the shorter one and keep the longest
        # substring they share under any rotation.
        if len(tokiA) >= len(tokiB):
            tokA = tokiA
            tokB = tokiB
        else:
            tokA = tokiB
            tokB = tokiA
        ltokA = len(tokA)

        i = 0
        rotations = {}
        while i < ltokA:
            tokrotated = self.rotate_string(tokA, i)
            rotations[str(i)] = self.common_string(tokrotated, tokB)
            i += 1

        best_r = ""
        for x in rotations:
            lb = len(best_r)
            rot = rotations[x]
            lrot = len(rot)
            if lrot > 1 and lrot < ltokA and lrot > lb:
                best_r = rot

        return best_r

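    # Worked example (added): rotate_compare("corres", "corro") compares each
    # rotation of "corres" against "corro"; the best shared fragment is the
    # stem "corr", so related word forms surface their common morpheme.
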
    def get_subTokens(self, spl):
        # Split the text on spl and collect the best rotated overlap between
        # every pair of distinct chunks.
        sub_tokens = self.texto.split(spl)
        toks = []
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks.append(self.rotate_compare(tok, tok2))

        return list(set(toks))

    def tokenize(self, spliter_optimo):
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        chunk = self.texto.split(spliter_optimo)
        for txt in chunk:
            best_split = ""
            if len(txt) < 3:
                tokenized_sentence[txt] = txt
            else:
                # Greedily keep the longest known sub-token that splits this
                # chunk, recording the chunk as "prefix-token-suffix".
                for tok in tokens:
                    if tok != "":
                        lt = len(tok)
                        lb = len(best_split)
                        spltxt = txt.split(tok)
                        if len(spltxt) > 1:
                            if lt < len(txt) and lt > lb:
                                best_split = tok
                                tokenized_sentence[txt] = " " + spltxt[0] + "-" + tok + "-" + spltxt[1]

        return tokenized_sentence

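    # Note (added): tokenize() maps each splitter-separated chunk to a marked
    # split, e.g. a chunk "aburres" with known sub-token "aburr" would become
    # " -aburr-es"; chunks shorter than 3 characters pass through unchanged.
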
    def symbol_distances(self, texto, tokens):
        # Wrap every known token occurrence in "-" delimiters, then split on
        # "-" to obtain the token/residue sequence. (Tokens are applied in
        # arbitrary set order, so longer tokens are not guaranteed to win
        # overlapping matches.)
        txt = texto
        for tok in tokens:
            if tok != '':
                txt = txt.replace(tok, "-" + tok + "-")

        arr = txt.split("-")
        return [elem for elem in arr if elem != '']

    def distances(self, tokens):
        # Despite the name, this records the positions at which each unique
        # token occurs in the token sequence.
        tokens_unicos = {}
        for i, token in enumerate(tokens):
            if token not in tokens_unicos:
                tokens_unicos[token] = [i]
            else:
                tokens_unicos[token].append(i)

        return tokens_unicos

    def from_distances(self, tokens_distancias):
        # Invert the token->positions map into position->token, then recode
        # each position as the token's index in the dictionary.
        rebuild = {}
        recoded_dic = {}
        for tok in tokens_distancias:
            for dis in tokens_distancias[tok]:
                rebuild[dis] = tok
                recoded_dic[dis] = gindex(tokens_distancias, tok)

        enc = {k: recoded_dic[k] for k in sorted(recoded_dic)}
        rebu = {k: rebuild[k] for k in sorted(rebuild)}  # position->token, currently unused

        dic_str = ""
        for d in tokens_distancias:
            dic_str += "," + d

        enc_str = ""
        for e in enc:
            enc_str += "," + str(enc[e])

        return dic_str, enc_str


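# Note (added): from_distances() serializes the model as two comma-separated
# strings: dic_str lists the unique tokens (the dictionary), and enc_str lists,
# in text order, the dictionary index of the token at each position, so the
# token stream can be rebuilt from the pair.
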
def gindex(obj, key):
    keys = list(obj.keys())
    try:
        index = keys.index(key)
        return index
    except ValueError:
        return None  # Key not found in the dictionary


# Example usage:
texto_ejemplo = "cuando te digo vete , te aburres , corres o andas ? cuando me dices vete , me aburro, corro y ando"
processor = TextProcessor(texto_ejemplo)
spliter_optimo = processor.magic_split()
tokenized_sentence = processor.tokenize(spliter_optimo)

token_txt = ""

for token in tokenized_sentence:
    token_txt += "-" + tokenized_sentence[token]

tokens = set(token_txt.split("-"))
symb = processor.symbol_distances(texto_ejemplo, tokens)

print("Tokens")
print(tokens)

print("Number of symbols in tokens:")
print(len(tokens))

print("Number of symbols in chars:")
print(len(set(texto_ejemplo)))
print("Length of text", len(texto_ejemplo))

print("Original text:", texto_ejemplo)
print("Optimal splitter:", spliter_optimo)
print("Tokenized sentence:", tokenized_sentence)
print("Length tokenized", len(tokenized_sentence))
print("Token Sentences", symb)
print("Length Token Sentence", len(symb))
print("Length Symbols Token Dictionary", len(set(symb)))
distances = processor.distances(symb)

print("Token Distances", distances)
print("Token Distance Length", len(distances))

print(gindex(distances, "cu"))
dic_str, enc_str = processor.from_distances(distances)
print(dic_str, enc_str)


class HuffmanNode:
    def __init__(self, char: Optional[str], freq: int):
        # char is None for internal (merged) nodes.
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        # Ordering by frequency lets heapq pop the least frequent node first.
        return self.freq < other.freq

def build_huffman_tree(text: str) -> HuffmanNode:
    frequency = Counter(text)
    priority_queue = [HuffmanNode(char, freq) for char, freq in frequency.items()]
    heapq.heapify(priority_queue)

    while len(priority_queue) > 1:
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)

        merged_node = HuffmanNode(None, left.freq + right.freq)
        merged_node.left = left
        merged_node.right = right

        heapq.heappush(priority_queue, merged_node)

    return priority_queue[0]

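# Worked example (added): for "aaab", the frequencies are {'a': 3, 'b': 1};
# the two leaves merge once into a root of weight 4, yielding one-bit codes
# such as {'a': '1', 'b': '0'} (the exact bits depend on heap tie-breaking).
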
def encode_huffman_tree(node: HuffmanNode, prefix: str = "") -> Dict[str, str]:
    if node is None:
        return {}

    if node.char is not None:
        return {node.char: prefix}

    encoding = {}
    encoding.update(encode_huffman_tree(node.left, prefix + "0"))
    encoding.update(encode_huffman_tree(node.right, prefix + "1"))

    return encoding

def huffman_encode(text: str) -> Tuple[Dict[str, str], bytes]:
    root = build_huffman_tree(text)
    encoding_map = encode_huffman_tree(root)
    encoded_text = ''.join(encoding_map[char] for char in text)

    # Pad the bit string to a multiple of 8 so it converts cleanly to bytes.
    remainder = len(encoded_text) % 8
    if remainder != 0:
        encoded_text += '0' * (8 - remainder)

    # Convert the binary string to bytes.
    encoded_bytes = bytes(int(encoded_text[i:i + 8], 2) for i in range(0, len(encoded_text), 8))

    return encoding_map, encoded_bytes

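# Note (added): the zero-bit padding length is not recorded anywhere, so on
# decode the trailing pad bits may themselves decode to spurious extra
# characters; storing the original bit length (or text length) alongside the
# bytes would remove the ambiguity.
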
def huffman_decode(encoding_map: Dict[str, str], encoded_bytes: bytes) -> str:
    # Convert the bytes back to a binary string.
    encoded_text = ''.join(format(byte, '08b') for byte in encoded_bytes)

    decoding_map = {code: char for char, code in encoding_map.items()}
    decoded_text = ""
    current_code = ""
    for bit in encoded_text:
        current_code += bit
        if current_code in decoding_map:
            decoded_text += decoding_map[current_code]
            current_code = ""
    return decoded_text

def guardar_binarios_en_archivo(binarios: List[bytes], nombre_archivo: str):
    with open(nombre_archivo, 'wb') as archivo:
        for binario in binarios:
            archivo.write(binario)
            # Separator between the binary blobs. Note that b'\n' (0x0a) can
            # also occur inside the encoded bytes, so this separator is not
            # safe to split on when reading the file back.
            archivo.write(b'\n')
    print(f"Binary data saved to file '{nombre_archivo}'")

# Example usage
cadena1 = dic_str
cadena2 = enc_str

# Encode cadena1 and cadena2
encoding_map1, encoded_bytes1 = huffman_encode(cadena1)
encoding_map2, encoded_bytes2 = huffman_encode(cadena2)

# Save both binaries to a single file
guardar_binarios_en_archivo([encoded_bytes1, encoded_bytes2], "text.txt.nos")
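
# Illustrative round-trip check (added; not part of the original commit).
# Huffman codes are prefix-free, so decoding recovers each input string,
# possibly followed by spurious characters decoded from the zero-bit padding.
decoded1 = huffman_decode(encoding_map1, encoded_bytes1)
decoded2 = huffman_decode(encoding_map2, encoded_bytes2)
assert decoded1.startswith(cadena1)
assert decoded2.startswith(cadena2)
print("Huffman round-trip OK")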