Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import nltk | |
| from nltk import word_tokenize | |
| from nltk.util import ngrams | |
| from unidecode import unidecode | |
| nltk.download('punkt') | |
| import re | |
# Load the entity dictionary: keep only the two columns we need and discard
# incomplete rows.
diccionario = pd.read_csv(
    'diccionario.csv',
    encoding='utf-8-sig',
    usecols=['Entidad', 'Categoria'],
).dropna()
# Discard rows categorised as 'Año', then skip the first remaining row.
diccionario = diccionario.loc[diccionario['Categoria'] != 'Año'].iloc[1:]
# One single-entry {entity: category} mapping per row.
all_dicts = diccionario.apply(
    lambda row: {row['Entidad']: row['Categoria']},
    axis=1,
)

# Flatten the per-row mappings into one lookup table; when an entity is
# repeated, the last row wins.
entities_dict = {}
for row_mapping in all_dicts:
    entities_dict.update(row_mapping)
def f_remove_accents(old: str):
    """Return ``old`` with accented lowercase vowels replaced by plain vowels.

    Only the lowercase variants listed in the patterns below are rewritten;
    every other character (uppercase accented letters, 'ñ', ...) is left
    untouched.

    old: text to clean (str)
    """
    replacements = (
        (r'[àáâãäå]', 'a'),
        (r'[èéêë]', 'e'),
        (r'[ìíîï]', 'i'),
        (r'[òóôõö]', 'o'),
        (r'[ùúûü]', 'u'),
    )
    cleaned = old
    for pattern, plain_vowel in replacements:
        cleaned = re.sub(pattern, plain_vowel, cleaned)
    return cleaned
def _load_model(path):
    """Unpickle and return the classifier stored at ``path``."""
    # NOTE: unpickling runs arbitrary code; only load model files that come
    # from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _class_probabilities(clf, features):
    """Map each class label of ``clf`` to its probability for the first row of ``features``."""
    probas = clf.predict_proba(features)
    return {label: float(proba) for label, proba in zip(clf.classes_, probas[0])}


def _merge_entity(tokens, entity_tokens, marker):
    """Return ``tokens`` with every occurrence of ``entity_tokens`` collapsed into ``marker``.

    The scan goes left to right and matches never overlap. Tokens that are
    not part of a match are copied through unchanged.
    """
    size = len(entity_tokens)
    merged = []
    pos = 0
    while pos < len(tokens):
        if tuple(tokens[pos:pos + size]) == entity_tokens:
            merged.append(marker)
            pos += size
        else:
            merged.append(tokens[pos])
            pos += 1
    return merged


def predict(text: str, goal = ''):
    """Label dictionary entities in ``text`` and classify the complaint.

    Uses the module-level ``entities_dict`` (entity -> category) and
    ``f_remove_accents`` for normalisation, and loads the pickled classifiers
    from the working directory on every call.

    text: complaint text to analyse (str)
    goal: optional objective text; the objective classifier only runs when it
        is not the empty string

    Returns a 5-tuple:
        labeled_tokens: list of ``(token, category_or_None)`` pairs
        sernac_probas: class -> probability from 'sernac_model.pkl'
        sernac_categories: class -> probability from
            'sernac_categories_model.pkl' (empty unless the first model
            predicts 'SERNAC')
        other_categories: class -> probability from
            'other_categories_model.pkl' (empty when the first model predicts
            'SERNAC')
        objective_categories: class -> probability from 'objective_model.pkl'
            (empty when ``goal`` is '')
    """
    tokens = word_tokenize(text, language='spanish')
    # Lowercased, accent-stripped tokens used for every dictionary comparison.
    tokens_lower = [f_remove_accents(token.lower()) for token in tokens]
    # normalised token -> original token (the last occurrence wins)
    dict_tokens = dict(zip(tokens_lower, tokens))
    # normalised dictionary key -> original dictionary key
    dict_keys = {f_remove_accents(key.lower()): key for key in entities_dict}

    # Detect multi-token entities: for every n-gram size, keep the n-grams of
    # the text that are dictionary keys, in order of first appearance so the
    # result is deterministic.
    ngram_range = 5  # largest n-gram size to evaluate
    grams_detected = {}
    for size in range(2, ngram_range + 1):
        candidates = (' '.join(gram) for gram in nltk.ngrams(tokens_lower, size))
        found = [gram for gram in dict.fromkeys(candidates) if gram in dict_keys]
        if found:
            grams_detected[size] = found

    # Collapse detected entities into (size, index) markers, longest first, so
    # a shorter entity can never split a longer one. Tuple markers cannot
    # collide with real tokens, which are always strings.
    merged = list(tokens_lower)
    for size in range(ngram_range, 1, -1):
        for index, entity in enumerate(grams_detected.get(size, [])):
            # The entity was built by joining tokens with a single space, so
            # splitting on it recovers the exact token sequence.
            merged = _merge_entity(merged, tuple(entity.split(' ')), (size, index))

    labeled_tokens = []
    if not grams_detected:
        # Only single-token entities are possible: label token by token and
        # show matched entities with their original spelling.
        for token in tokens_lower:
            if token in dict_keys:
                labeled_tokens.append((dict_tokens[token], entities_dict[dict_keys[token]]))
            else:
                labeled_tokens.append((token, None))
    else:
        # At least one multi-token entity: emit the merged sequence.
        for item in merged:
            if isinstance(item, tuple):
                size, index = item
                entity = grams_detected[size][index]
                labeled_tokens.append((entity, entities_dict[dict_keys[entity]]))
            elif item in dict_keys:
                labeled_tokens.append((item, entities_dict[dict_keys[item]]))
            else:
                labeled_tokens.append((item, None))

    # CLASSIFICATION
    # A single row holding [text, goal].
    features = np.array([text, goal], ndmin=2)

    # SERNAC CLASSIFICATION
    sernac_model = _load_model('sernac_model.pkl')
    sernac_probas = _class_probabilities(sernac_model, features)

    sernac_categories, other_categories = {}, {}
    if sernac_model.predict(features)[0] == 'SERNAC':
        # SERNAC CATEGORIES CLASSIFICATION
        sernac_categories = _class_probabilities(
            _load_model('sernac_categories_model.pkl'), features)
    else:
        # OTHER CATEGORIES CLASSIFICATION
        other_categories = _class_probabilities(
            _load_model('other_categories_model.pkl'), features)

    objective_categories = {}
    if goal != '':
        objective_categories = _class_probabilities(
            _load_model('objective_model.pkl'), features)

    # RETURN
    return labeled_tokens, sernac_probas, sernac_categories, other_categories, objective_categories
# DEMO
# Gradio UI: two text inputs feed predict(); its five return values are shown
# as highlighted entities plus four label/probability panels. The outputs use
# the top-level component classes (gr.HighlightedText / gr.Label), matching
# the gr.Textbox inputs, instead of the deprecated gr.Highlightedtext and
# gr.outputs.Label aliases.
demo = gr.Interface(
    predict,
    inputs=[
        gr.Textbox(placeholder="Ingresa el reclamo acá", label='Reclamo'),
        gr.Textbox(placeholder="Ingresa el objetivo acá (opcional)", label='Objetivo'),
    ],
    outputs=[
        gr.HighlightedText(label='Entidades detectadas'),
        gr.Label(label='Clasificación SERNAC'),
        gr.Label(label='Clasificación categorías SERNAC'),
        gr.Label(label='Clasificación categorías No SERNAC'),
        gr.Label(label='Clasificación objetivo'),
    ],
    examples=[
        ['este septiembre iremos manejando a tEmUco en un tóyòtA para pasar las fiestas patrias', 'ir a temuco'],
        ['no puedo, tengo que ir desde san pedro hasta la reina y luego hasta san pedro de la paz', ''],
        ['Buenas tardes, hace unas semanas compre un suzuki swift a derco de santiago, llevaba 2 semanas y la caja de cambios se echó a perder. Tengo asegurado el auto con BCI, pero aun no obtengo respuesta.', 'exijo una explicación!'],
        ['Tengo un toyota urban cruiser 1.3 año 2010 el cual consume mucho aceite y nunca me han respondido si tiene alguna solución o garantía me gustaría que fueran más concretas las respuestas gracias', 'Obtener una solucion Que reparación hay que hacer o si tiene garantía?'],
        ['Mi auto del año presenta Falla de motor y sensores siendo que lo compre nuevo 0km y tiene recién 5400kms.. Es un Peugeot 2008 gti... El servicio es como las pelotas.. Me mandaron a un servicio técnico en Calama que estaba cerrado', ''],
    ],
    title='Demo ML',
)
demo.launch()