import collections
import math
import sys
|
|
class TextProcessor:
    """Heuristic text analysis helpers.

    Offers character-entropy measurement, longest-common-substring search,
    rotation-based token comparison, and a crude sub-token splitter built on
    top of those primitives.
    """

    def __init__(self, texto):
        # Raw text under analysis, kept verbatim.
        self.texto = texto

    def entropy(self):
        """Return ``(counts, H)`` for the stored text.

        *counts* maps each character to its frequency; *H* is the Shannon
        entropy in bits per character.  Empty text yields ``({}, 0.0)``
        (the original crashed with ``ZeroDivisionError``).
        """
        total_caracteres = len(self.texto)
        if total_caracteres == 0:
            return {}, 0.0
        simbolos = collections.Counter(self.texto)
        entropia = -sum(
            (count / total_caracteres) * math.log2(count / total_caracteres)
            for count in simbolos.values()
        )
        return dict(simbolos), entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest common substring of *cadena1* and *cadena2*.

        On ties the substring that starts earliest in *cadena1* wins (same
        tie-break as the original list-then-``max`` version, without
        materialising every candidate).  Returns ``''`` when nothing matches.
        """
        longitud1 = len(cadena1)
        longitud2 = len(cadena2)
        comun = ''
        for i in range(longitud1):
            for j in range(longitud2):
                # Length of the common run starting at (i, j).
                k = 0
                while (i + k < longitud1 and j + k < longitud2
                       and cadena1[i + k] == cadena2[j + k]):
                    k += 1
                # Strict '>' keeps the first longest match, like max(key=len).
                if k > len(comun):
                    comun = cadena1[i:i + k]
        return comun

    def magic_split(self):
        """Return the symbol whose repeat-gap variation is smallest.

        For every symbol occurring more than once, compute the gaps between
        consecutive occurrences; the symbol with the smallest (max - min) gap
        spread is returned, ignoring spreads of 0 and 1.

        Raises:
            ValueError: if no symbol qualifies (the original raised the same
                exception type via ``min()`` on an empty dict, but with an
                unhelpful message).
        """
        symbol_distances = {}
        for symbol in set(self.texto):
            indices = [i for i, char in enumerate(self.texto) if char == symbol]
            if len(indices) > 1:
                symbol_distances[symbol] = [
                    b - a for a, b in zip(indices, indices[1:])
                ]

        variation = {
            symbol: max(distances) - min(distances)
            for symbol, distances in symbol_distances.items()
        }

        # Spreads of 0 (perfectly regular) and 1 are considered uninformative.
        mins = {s: v for s, v in variation.items() if v not in (0, 1)}
        if not mins:
            raise ValueError("no symbol has a usable repeat-gap variation")
        # NOTE(review): ties are broken by dict iteration order, which here
        # derives from set(self.texto) and is hash-order dependent.
        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Rotate *string* left by ``n`` positions (``n`` taken mod length).

        The empty string rotates to itself (the original raised
        ``ZeroDivisionError`` on ``n % 0``).
        """
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Compare two tokens under rotation.

        One token (chosen lexicographically) is rotated through all of its
        shifts; each rotation is matched against the other token with
        :meth:`common_string`.  Returns the longest common substring found
        whose length is > 1 and strictly shorter than the rotated token,
        or ``''`` if none qualifies.
        """
        if tokiA >= tokiB:  # lexicographic choice of which token to rotate
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        # BUG FIX: the original's else-branch set ltokA = len(tokB), but the
        # rotation loop and length bound below both need len(tokA).
        ltokA = len(tokA)

        best_r = ""
        for shift in range(ltokA):
            candidate = self.common_string(self.rotate_string(tokA, shift), tokB)
            # Reject trivial (<=1 char) and degenerate (whole-token) matches.
            if 1 < len(candidate) < ltokA and len(candidate) > len(best_r):
                best_r = candidate
        return best_r

    def get_subTokens(self, spl):
        """Split the text on *spl* and rotate-compare every ordered pair of
        distinct chunks; return the unique results (order unspecified)."""
        chunks = self.texto.split(spl)
        found = {
            self.rotate_compare(tok, tok2)
            for tok in chunks
            for tok2 in chunks
            if tok != tok2  # value inequality, as in the original
        }
        return list(found)

    def tokenize(self, spliter_optimo):
        """Split each chunk of the text around its best-matching sub-token.

        Returns a dict mapping each chunk to ``" <before>-<token>-<after>"``
        for the longest sub-token (shorter than the chunk) that splits it.
        Chunks no token splits are omitted.

        NOTE(review): token order comes from a set, so when two tokens of
        equal length both split a chunk the winner is hash-order dependent —
        this matches the original behavior.
        """
        tokens = self.get_subTokens(spliter_optimo)
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_split = ""
            for tok in tokens:
                if not tok:
                    continue
                spltxt = txt.split(tok)
                # Token must actually split the chunk, be shorter than it,
                # and beat the best candidate seen so far.
                if len(spltxt) > 1 and len(best_split) < len(tok) < len(txt):
                    best_split = tok
                    tokenized_sentence[txt] = (
                        " " + spltxt[0] + "-" + tok + "-" + spltxt[1]
                    )
        return tokenized_sentence
|
|
|
|
|
|