Spaces:

torahCodes
/

Torah_Codes

Runtime error

App Files Files Community

cryptocalypse committed on May 5, 2024

Commit

103c053

1 Parent(s): e865108

libs entropy and read files

Browse files

Files changed (2) hide show

lib/entropy.py +131 -0
lib/files.py +31 -0

lib/entropy.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import sys
+import math
class TextProcessor:
    """Character-statistics helpers for splitting and tokenising one text.

    The instance wraps a single string (``texto``).  ``magic_split`` picks a
    separator symbol from the spacing of repeated characters, and
    ``tokenize`` marks, inside each separated chunk, the longest fragment it
    shares with another chunk.
    """

    def __init__(self, texto):
        """Store the text that every other method operates on."""
        self.texto = texto

    def entropy(self):
        """Return ``(counts, entropy)`` for the stored text.

        ``counts`` maps each character to its number of occurrences and
        ``entropy`` is the Shannon entropy in bits per character.  An empty
        text yields ``({}, 0)``.
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1
        total_caracteres = len(self.texto)
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)
        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by ``cadena1`` and ``cadena2``.

        When several substrings share the maximal length, the one found first
        (scanning ``cadena1`` left to right, then ``cadena2``) wins.  Returns
        ``''`` when the strings have no character in common.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        for i in range(longitud1):
            for j in range(longitud2):
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strict '>' keeps the earliest of equally long candidates,
                # without storing every candidate in a list first.
                if k > len(comun):
                    comun = cadena1[i:i + k]
        return comun

    def magic_split(self):
        """Choose the separator symbol for the stored text.

        For every character occurring at least twice, the gaps between
        consecutive occurrences are measured; the character's score is
        ``max(gap) - min(gap)``.  Scores of 0 and 1 are discarded and the
        character with the smallest remaining score is returned.  Ties go to
        the character that appears first in the text, so the result is
        reproducible between runs.

        Raises:
            ValueError: if no character has a score greater than 1.
        """
        # One pass; dict insertion order == order of first appearance.
        positions = {}
        for index, char in enumerate(self.texto):
            positions.setdefault(char, []).append(index)

        candidates = {}
        for symbol, indices in positions.items():
            if len(indices) < 2:
                continue
            gaps = [after - before for before, after in zip(indices, indices[1:])]
            spread = max(gaps) - min(gaps)
            if spread > 1:
                candidates[symbol] = spread

        if not candidates:
            raise ValueError(
                "magic_split: no symbol repeats with irregular enough spacing "
                "to be used as a splitter"
            )
        return min(candidates, key=candidates.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions (modulo its length).

        An empty string is returned unchanged.
        """
        if not string:
            # Avoid 'n % 0'.
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Return the best fragment shared by one token's rotations and the other.

        The lexicographically greater token (the first one on a tie) is
        rotated through every offset and each rotation is compared with the
        other token via ``common_string``.  A fragment qualifies when it is
        longer than one character and shorter than the rotated token; the
        first longest qualifying fragment is returned, or ``''`` if none.
        """
        # NOTE(review): the ordering is by string comparison, not by length --
        # confirm that this is the intended way to pick the token to rotate.
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        # Length of the token that is actually rotated, so the rotation count
        # and the filter below do not depend on the argument order.
        ltokA = len(tokA)

        best_r = ""
        for shift in range(ltokA):
            rot = self.common_string(self.rotate_string(tokA, shift), tokB)
            if 1 < len(rot) < ltokA and len(rot) > len(best_r):
                best_r = rot
        return best_r

    def get_subTokens(self, spl):
        """Return the distinct fragments shared between chunks of the text.

        The text is split on ``spl`` and every ordered pair of different
        chunks is passed to ``rotate_compare``.  The results are de-duplicated
        while keeping the order in which they were first produced; the list
        may contain ``''`` when some pair shares no qualifying fragment.
        """
        # Repeated chunks would only repeat the same comparisons.
        sub_tokens = list(dict.fromkeys(self.texto.split(spl)))
        toks = {}
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks[self.rotate_compare(tok, tok2)] = None
        return list(toks)

    def tokenize(self, spliter_optimo):
        """Mark the longest shared fragment inside each chunk of the text.

        Returns a dict mapping each chunk (text split on ``spliter_optimo``)
        to ``" <before>-<fragment>-<after>"``, where ``fragment`` is the
        longest sub-token from ``get_subTokens`` that occurs in the chunk and
        is shorter than it.  The split happens at the first occurrence of the
        fragment and ``after`` keeps the whole remainder of the chunk.
        Chunks containing no such fragment are left out of the result.
        """
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                if tok == "":
                    continue
                lt = len(tok)
                if not (len(best_split) < lt < len(txt)):
                    continue
                # partition() cuts at the first occurrence only, so nothing
                # after a second occurrence of 'tok' is lost.
                before, found, after = txt.partition(tok)
                if found:
                    best_split = tok
                    tokenized_sentence[txt] = " " + before + "-" + tok + "-" + after
        return tokenized_sentence
# Example usage: python entropy.py "<text to analyse>"
# Guarded so that importing this module as a library neither reads the host
# program's command line nor prints anything.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: python entropy.py <text>")
    texto_ejemplo = sys.argv[1]
    text_processor = TextProcessor(texto_ejemplo)
    spliter_optimo = text_processor.magic_split()
    print("Spliter óptimo:", spliter_optimo)
    print(text_processor.entropy())
    print(text_processor.tokenize(spliter_optimo))

lib/files.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import os
class TextFinder:
    """Search the regular files directly inside one folder for a text."""

    def __init__(self, folder):
        """Remember the folder whose files ``find_matches`` will scan."""
        self.folder = folder

    def find_matches(self, text):
        """Return the sentence or line around every occurrence of ``text``.

        Each regular file directly inside the folder (sub-directories are not
        entered) is read as UTF-8, in sorted file-name order.  For every
        occurrence, the snippet runs from just after the previous newline or
        '.' (or the start of the file) up to the next newline or '.' (or the
        end of the file), with surrounding whitespace stripped.  A snippet is
        appended once per occurrence, so it repeats when ``text`` occurs
        several times inside it.

        Args:
            text: the string to look for; an empty string yields no matches.

        Returns:
            A list of snippets, possibly empty.

        Raises:
            OSError: if the folder or one of its files cannot be read.
            UnicodeDecodeError: if a file is not valid UTF-8.
        """
        matches = []
        if not text:
            # An empty needle "matches" at every offset; treat it as no query.
            return matches

        # os.listdir() order is arbitrary; sort for reproducible results.
        for name in sorted(os.listdir(self.folder)):
            file_path = os.path.join(self.folder, name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            index = content.find(text)
            while index != -1:
                # -1 (no delimiter before the match) makes start + 1 == 0,
                # i.e. the snippet begins at the start of the file.
                start = max(content.rfind('\n', 0, index), content.rfind('.', 0, index))
                # Ignore delimiters that were not found instead of letting a
                # -1 win the min(); with none left, run to the end of file.
                ends = [pos for pos in (content.find('\n', index), content.find('.', index))
                        if pos != -1]
                end = min(ends) if ends else len(content)
                matches.append(content[start + 1:end].strip())
                index = content.find(text, index + 1)
        return matches
# Demo, only when executed as a script: print every snippet found in
# 'example_folder' that contains 'text_to_find'.
if __name__ == "__main__":
    finder = TextFinder('example_folder')
    print(finder.find_matches('text_to_find'))