# NOTE: the original file began with Hugging Face Spaces page text
# ("Spaces", "Sleeping") — a copy/paste artifact, kept here as a comment
# so the module parses.
| import gradio as gr | |
| import torch | |
| import json | |
| import pandas as pd | |
| import numpy as np | |
| from transformers import AutoTokenizer, RobertaForTokenClassification | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from json import JSONEncoder | |
| from faker import Faker | |
class out_json():
    """Word/label pair for one NER finding.

    The attribute names (`word`, `label`) become the JSON keys when the
    object is serialized through an encoder that reads ``__dict__``.
    """

    def __init__(self, w, l):
        self.word, self.label = w, l
class MyEncoder(JSONEncoder):
    """JSON encoder that serializes otherwise-unserializable objects
    from their instance attribute mapping instead of raising TypeError."""

    def default(self, o):
        return vars(o)  # equivalent to o.__dict__
class Model:
    """Language-aware NER anonymization pipeline.

    Flow: ``identificacion_idioma()`` detects the input language and picks
    the NER model + Faker locale; ``predict()`` tags the text, serializes
    the sensitive tokens to JSON and produces a version with fake
    substitutes for PER/ORG/LOC entities.
    """

    def __init__(self):
        self.texto = ""
        self.idioma = ""           # 'es' or 'en', set by identificacion_idioma()
        self.modelo_ner = ""       # HF model id chosen for the detected language
        self.categoria_texto = ""  # news category, set by categorizar_texto()

    def identificacion_idioma(self, text):
        """Detect the language of *text*, then configure the matching NER
        model and Faker locale as side effects (self.idioma, self.modelo_ner,
        self.faker_, self.model)."""
        self.texto = text
        tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
        model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # Keep the language with the highest probability.
        maximo = vals.max()
        idioma = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                idioma, porcentaje = id2lang[k.item()], v.item()
        if idioma == 'es':
            self.idioma = "es"
            self.modelo_ner = 'BSC-LT/roberta_model_for_anonimization'
            self.faker_ = Faker('es_MX')
            self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
        else:
            # Any non-Spanish language falls back to the English pipeline.
            self.idioma = "en"
            self.faker_ = Faker('en_US')
            self.modelo_ner = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
            self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
        # predict() recomputes the category; kept here so callers that only
        # run language detection still get self.categoria_texto populated.
        self.categorizar_texto(self.texto)

    def reordenacion_tokens(self, tokens):
        """Merge SentencePiece sub-tokens back into words ('▁' marks a word start).

        Returns (merged_tokens, ignored_indices), where ignored_indices are
        the positions of continuation pieces folded into the previous word.
        """
        new_tokens = []
        ig_tokens = []  # indices of sub-tokens merged into the previous word
        for i, token in enumerate(tokens):
            if token.startswith("▁"):
                new_tokens.append(token)
            else:
                # Continuation piece: glue onto the last emitted word.
                new_tokens[-1] = new_tokens[-1] + token
                ig_tokens.append(i)
        return new_tokens, ig_tokens

    def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes):
        """Drop the NER labels whose token index was merged away by
        reordenacion_tokens(), keeping labels aligned with merged words."""
        return [
            token
            for x, token in enumerate(predicted_tokens_classes)
            if x not in ig_tokens
        ]

    def salida_json(self, tokens, pre_tokens):
        """Serialize every non-'O' token with its label as a JSON array string."""
        hallazgos = []  # renamed: the original shadowed the builtin `list`
        for t, label in zip(tokens, pre_tokens):
            if label != 'O':
                hallazgos.append(out_json(t.replace('▁', '').replace('Ġ', ''),
                                          label.replace('▁', '')))
        return MyEncoder().encode(hallazgos)

    def salida_texto(self, tokens, pre_tokens):
        """Rebuild text, replacing each labelled token by its identifier.

        'O' and MISC tokens keep their surface form; everything else is
        replaced by the corresponding entry of pre_tokens (typically already
        fake-substituted by reemplazo_fake()).
        """
        parts = []
        for token, label in zip(tokens, pre_tokens):
            if label == 'O' or 'MISC' in label:
                parts.append(' ' + token.replace('▁', ''))
            else:
                parts.append(' ' + label)
        return ''.join(parts)  # join instead of quadratic `+=`

    def salida_texto_anonimizado(self, ids, pre_tokens):
        """Rebuild the Spanish text: decode unlabelled token ids, substitute
        labelled ones with their (fake) identifier."""
        parts = []
        for i, identificador in enumerate(pre_tokens):
            if identificador == 'O' or 'OTH' in identificador:
                parts.append(self.tokenizer.decode(ids[i]))
            else:
                parts.append(' ' + identificador)
        return ''.join(parts)

    def formato_salida(self, out):
        """Strip sub-word markers/spaces and join tokens, one space after each."""
        return ''.join(i.replace('▁', '').replace(' ', '') + ' ' for i in out)

    # --- thin wrappers around Faker so replacements follow the detected locale ---
    def fake_pers(self):
        # BUG FIX: the original called self.faker_.name(self), passing the
        # Model instance as an argument to Faker's name().
        return self.faker_.name()

    def fake_word(self):
        return self.faker_.word()

    def fake_first_name(self):
        return self.faker_.first_name()

    def fake_last_name(self):
        return self.faker_.last_name()

    def fake_address(self):
        return self.faker_.address()

    def fake_sentence(self, n):
        return self.faker_.sentence(nb_words=n)

    def fake_text(self):
        return self.faker_.text()

    def fake_company(self):
        return self.faker_.company()

    def fake_city(self):
        return self.faker_.city()

    def reemplazo_fake(self, identificadores):
        """Map NER labels to fake surrogates: PER -> first name,
        ORG -> company, LOC -> city; anything else passes through."""
        new_iden = []
        for etiqueta in identificadores:  # renamed: `id` shadowed the builtin
            if 'PER' in etiqueta:
                new_iden.append(self.fake_first_name())
            elif 'ORG' in etiqueta:
                new_iden.append(self.fake_company())
            elif 'LOC' in etiqueta:
                new_iden.append(self.fake_city())
            else:
                new_iden.append(etiqueta)
        return new_iden

    def categorizar_texto(self, texto):
        """Classify *texto* into a news category; returns (category, probability).

        NOTE(review): the classifier is an English BERT model but is applied
        to Spanish input too — results for 'es' text may be unreliable.
        """
        name = "elozano/bert-base-cased-news-category"
        tokenizer = AutoTokenizer.from_pretrained(name)
        model_ = AutoModelForSequenceClassification.from_pretrained(name)
        inputs_ = tokenizer(texto, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model_(**inputs_).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model_.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # Keep the category with the highest probability.
        maximo = vals.max()
        cat = ''
        self.categoria_texto = ''
        porcentaje = 0
        for k, v in zip(idxs, vals):
            if v.item() == maximo:
                cat, porcentaje = id2lang[k.item()], v.item()
                self.categoria_texto = cat
        return cat, porcentaje

    def predict(self):
        """Run NER over self.texto; returns (truncated input, findings JSON,
        anonymized text). Requires identificacion_idioma() to have run first."""
        categoria, porcentaje = self.categorizar_texto(self.texto)
        print(categoria, porcentaje)
        self.tokenizer = AutoTokenizer.from_pretrained(self.modelo_ner)
        tokens = self.tokenizer.tokenize(self.texto)
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor([ids])
        with torch.no_grad():
            logits = self.model(input_ids).logits
        predicted_token_class_ids = logits.argmax(-1)
        predicted_tokens_classes = [
            self.model.config.id2label[t.item()]
            for t in predicted_token_class_ids[0]
        ]
        # (The original also ran a second forward pass to compute a training
        # loss that was never used; removed.)
        if self.idioma == 'es':
            out1 = self.salida_json(tokens, predicted_tokens_classes)  # sensitive words only
            out2 = self.salida_texto_anonimizado(ids, self.reemplazo_fake(predicted_tokens_classes))  # full text
        else:
            new_tokens, ig_tokens = self.reordenacion_tokens(tokens)
            new_identificadores = self.reordenacion_identificadores(ig_tokens, predicted_tokens_classes)
            # BUG FIX: a stray trailing comma in the original made out1 a 1-tuple here.
            out1 = self.salida_json(new_tokens, new_identificadores)
            out2 = self.salida_texto(new_tokens, self.reemplazo_fake(new_identificadores))
        return (
            self.texto[:1869],
            out1,
            str(out2),
        )
# Module-level singleton so the Gradio callback reuses one pipeline instance.
model = Model()
def get_model():
    # Accessor for the shared Model instance (for external callers/tests).
    return model
def procesar(texto):
    # Gradio callback: detect the language of the input (truncated to 1869
    # chars, matching the truncation of predict()'s returned text), then run
    # the NER/anonymization pipeline.
    # Returns (input text, identifiers JSON, processed text).
    model.identificacion_idioma(texto[:1869])
    return model.predict()
# Wire the pipeline into a Gradio UI: one text input, three text outputs.
demo = gr.Interface(fn=procesar, inputs="text", outputs=[gr.Textbox(label="texto in"),gr.Textbox(label="identificadores"),gr.Textbox(label="texto procesado")])
# share=True additionally exposes a public tunnel URL beside the local server.
demo.launch(share=True)