Spaces:
Runtime error
Runtime error
Commit
·
103c053
1
Parent(s):
e865108
libs entropy and read files
Browse files
- lib/entropy.py +131 -0
- lib/files.py +31 -0
lib/entropy.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import math
|
| 3 |
+
|
| 4 |
+
class TextProcessor:
    """Character statistics and heuristic tokenization helpers for a text.

    Attributes:
        texto: The text every text-reading method (``entropy``,
            ``magic_split``, ``get_subTokens``, ``tokenize``) operates on.
    """

    def __init__(self, texto):
        self.texto = texto

    def entropy(self):
        """Count the symbols of the text and compute its Shannon entropy.

        Returns:
            A ``(counts, entropy)`` tuple. ``counts`` maps each character to
            its number of occurrences; ``entropy`` is the base-2 entropy of
            the character distribution (``0`` for an empty text).
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        total_caracteres = len(self.texto)
        entropia = 0
        # An empty text has no symbols, so the division below never runs.
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by ``cadena1`` and ``cadena2``.

        When several common substrings have the maximum length, the one
        found first while scanning ``cadena1`` (then ``cadena2``) from the
        left is returned. Returns ``''`` when nothing is shared.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''

        for i in range(longitud1):
            for j in range(longitud2):
                # Extend the match starting at (i, j) as far as it goes.
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strictly longer only: keeps the first of equal-length
                # candidates without storing every candidate.
                if k > len(comun):
                    comun = cadena1[i:i + k]

        return comun

    def magic_split(self):
        """Choose a separator symbol from how regularly its occurrences repeat.

        For every symbol occurring at least twice, the gaps between
        consecutive occurrences are measured and their spread
        (``max(gap) - min(gap)``) is computed. Symbols whose spread is 0 or 1
        are ignored; among the rest, the symbol with the smallest spread is
        returned. Ties are resolved in favour of the symbol that appears
        first in the text, so the result is deterministic.

        Returns:
            The selected one-character symbol.

        Raises:
            ValueError: If no symbol has a gap spread greater than 1.
        """
        # One pass collects the positions of every symbol, in order of
        # first appearance (dicts preserve insertion order).
        positions = {}
        for index, char in enumerate(self.texto):
            positions.setdefault(char, []).append(index)

        best_symbol = None
        best_variation = None
        for symbol, indices in positions.items():
            if len(indices) < 2:
                continue
            gaps = [after - before
                    for before, after in zip(indices, indices[1:])]
            variation = max(gaps) - min(gaps)
            if variation == 0 or variation == 1:
                continue
            if best_variation is None or variation < best_variation:
                best_symbol = symbol
                best_variation = variation

        if best_symbol is None:
            raise ValueError(
                "no split symbol found: no symbol has a gap spread above 1")

        return best_symbol

    def rotate_string(self, string, n):
        """Rotate ``string`` left by ``n`` positions (modulo its length).

        An empty string is returned unchanged.
        """
        if not string:
            # Avoid a modulo-by-zero on empty input.
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Find the best common substring between rotations of two tokens.

        The lexicographically greater token is rotated and each rotation is
        compared with the other token via ``common_string``. A candidate is
        accepted only if it is longer than one character and shorter than
        ``len(tokiA)``; the first longest accepted candidate is returned, or
        ``''`` when there is none.
        """
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        # NOTE(review): both the rotation count and the upper length bound
        # are len(tokiA), the FIRST argument, regardless of which token is
        # rotated, and the ordering above is lexicographic rather than by
        # length. Kept as-is to preserve results; confirm the intent.
        ltokA = len(tokiA)

        best_r = ""
        for i in range(ltokA):
            rot = self.common_string(self.rotate_string(tokA, i), tokB)
            if 1 < len(rot) < ltokA and len(rot) > len(best_r):
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        """Collect the distinct sub-tokens shared between chunks of the text.

        The text is split on ``spl`` and every ordered pair of different
        chunks is passed to ``rotate_compare``.

        Returns:
            The distinct results (possibly including ``''``), in order of
            first appearance.
        """
        # Duplicate chunks would only repeat identical comparisons.
        sub_tokens = list(dict.fromkeys(self.texto.split(spl)))

        # A dict is used as an ordered set so the output order is stable.
        toks = {}
        for tok in sub_tokens:
            for tok2 in sub_tokens:
                if tok != tok2:
                    toks[self.rotate_compare(tok, tok2)] = None

        return list(toks)

    def tokenize(self, spliter_optimo):
        """Split each chunk of the text around its longest known sub-token.

        The text is split on ``spliter_optimo``. For each chunk, the longest
        sub-token (from ``get_subTokens``) that occurs in the chunk and is
        shorter than it is selected, and the chunk is cut at the first
        occurrence of that sub-token.

        Returns:
            A dict mapping each chunk that could be split to the string
            ``" <head>-<token>-<tail>"``. Chunks without a usable sub-token
            are absent from the result.
        """
        tokens = [tok for tok in self.get_subTokens(spliter_optimo)
                  if tok != ""]
        tokenized_sentence = {}

        for txt in self.texto.split(spliter_optimo):
            best_len = 0
            for tok in tokens:
                if len(tok) <= best_len or len(tok) >= len(txt):
                    continue
                # partition() cuts at the first occurrence only, so text
                # after a repeated token is kept in the tail.
                head, found, tail = txt.partition(tok)
                if found:
                    best_len = len(tok)
                    tokenized_sentence[txt] = (
                        " " + head + "-" + tok + "-" + tail)

        return tokenized_sentence
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# Example usage: python entropy.py "<text to analyse>"
# Guarded so that importing this module neither reads sys.argv nor prints.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: entropy.py TEXT")
    texto_ejemplo = sys.argv[1]

    text_processor = TextProcessor(texto_ejemplo)
    try:
        spliter_optimo = text_processor.magic_split()
    except ValueError as exc:
        # magic_split raises ValueError when no symbol qualifies.
        sys.exit("No split symbol found: " + str(exc))
    print("Spliter óptimo:", spliter_optimo)
    print(text_processor.entropy())
    print(text_processor.tokenize(spliter_optimo))
|
| 131 |
+
|
lib/files.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
class TextFinder:
    """Search the files of a folder for sentences containing a given text.

    Attributes:
        folder: Path of the directory whose regular files are scanned
            (sub-directories are not descended into).
    """

    def __init__(self, folder):
        self.folder = folder

    def find_matches(self, text):
        """Return the sentence surrounding every occurrence of ``text``.

        Each regular file directly inside ``self.folder`` is read as UTF-8.
        For every occurrence of ``text``, the enclosing span is cut out:
        it starts after the previous ``'\\n'`` or ``'.'`` (or at the start of
        the file) and ends before the next ``'\\n'`` or ``'.'`` following the
        match (or at the end of the file). Surrounding whitespace is
        stripped.

        Args:
            text: The substring to look for. An empty string yields no
                matches.

        Returns:
            A list with one entry per occurrence, ordered by file name and
            then by position within the file.

        Raises:
            OSError: If the folder or one of its files cannot be read.
            UnicodeDecodeError: If a file is not valid UTF-8.
        """
        matches = []
        if not text:
            # An empty needle matches at every offset; treat it as no query.
            return matches

        # Sorted so the result does not depend on directory listing order.
        for name in sorted(os.listdir(self.folder)):
            file_path = os.path.join(self.folder, name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            index = content.find(text)
            while index != -1:
                # rfind() gives -1 when no delimiter precedes the match;
                # adding 1 then correctly yields the start of the file.
                start = max(content.rfind('\n', 0, index),
                            content.rfind('.', 0, index)) + 1
                # Look for the closing delimiter after the match itself so a
                # '.' or newline inside ``text`` does not truncate the span.
                after = index + len(text)
                candidates = [pos
                              for pos in (content.find('\n', after),
                                          content.find('.', after))
                              if pos != -1]
                end = min(candidates) if candidates else len(content)
                matches.append(content[start:end].strip())
                index = content.find(text, index + 1)

        return matches
|
| 25 |
+
|
| 26 |
+
# Example usage: scan 'example_folder' for 'text_to_find' and show the hits.
if __name__ == "__main__":
    print(TextFinder('example_folder').find_matches('text_to_find'))
|
| 31 |
+
|