Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,6 +22,22 @@ nltk.download('punkt')
|
|
| 22 |
|
| 23 |
docs = None
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 26 |
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
|
| 27 |
tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
|
|
@@ -41,6 +57,35 @@ def validate_dataset(dataset):
|
|
| 41 |
else:
|
| 42 |
return "⚠️Esperando documentos..."
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def request_pathname(files):
|
| 45 |
if files is None:
|
| 46 |
return [[]]
|
|
@@ -101,8 +146,9 @@ def encode_docs(docs,maxlen = 64, stride = 32):
|
|
| 101 |
return embeddings, spans, file_names
|
| 102 |
|
| 103 |
def predict(query,data):
|
|
|
|
| 104 |
name_to_save = data.name.split("/")[-1].split(".")[0][:-8]
|
| 105 |
-
k=
|
| 106 |
st = str([query,name_to_save])
|
| 107 |
st_hashed = str(hashlib.sha256(st.encode()).hexdigest()) #just to speed up examples load
|
| 108 |
hist = st + " " + st_hashed
|
|
@@ -114,7 +160,9 @@ def predict(query,data):
|
|
| 114 |
list_outputs = []
|
| 115 |
for i in range(k):
|
| 116 |
temp = [df.iloc[n] for n in range(k)][i]
|
| 117 |
-
tupla = (temp.Respuesta,
|
|
|
|
|
|
|
| 118 |
# text = ''
|
| 119 |
# text += 'Probabilidades: '+ temp.Probabilidades + '\n\n'
|
| 120 |
# text += 'Respuesta: ' +temp.Respuesta + '\n\n'
|
|
@@ -139,6 +187,8 @@ def predict(query,data):
|
|
| 139 |
text = text.replace("\r", " ")
|
| 140 |
text = text.replace("\n", " ")
|
| 141 |
text = text.replace(" . "," ")
|
|
|
|
|
|
|
| 142 |
|
| 143 |
doc_emb, doc_text, file_names = encode_docs((name_to_save,text),maxlen = 64, stride = 32)
|
| 144 |
|
|
@@ -194,7 +244,9 @@ def predict(query,data):
|
|
| 194 |
list_outputs = []
|
| 195 |
for i in range(k):
|
| 196 |
temp = [df.iloc[n] for n in range(k)][i]
|
| 197 |
-
tupla = (temp.Respuesta,
|
|
|
|
|
|
|
| 198 |
# text = ''
|
| 199 |
# text += 'Probabilidades: '+ temp.Probabilidades + '\n\n'
|
| 200 |
# text += 'Respuesta: ' +temp.Respuesta + '\n\n'
|
|
@@ -248,7 +300,7 @@ with gr.Blocks() as demo:
|
|
| 248 |
ask.click(fn=predict, inputs=[query,
|
| 249 |
file], outputs=[answer, context, prob])
|
| 250 |
|
| 251 |
-
examples = ["¿Cuándo suelen comenzar las adicciones?","Entrevista Miguel Ruiz.txt"]
|
| 252 |
|
| 253 |
demo.queue(concurrency_count=20)
|
| 254 |
demo.launch(show_error=True)
|
|
|
|
| 22 |
|
| 23 |
docs = None
|
| 24 |
|
| 25 |
+
# Definimos los modelos:
|
| 26 |
+
# Traducción
|
| 27 |
+
mname = "Helsinki-NLP/opus-mt-es-en"
|
| 28 |
+
tokenizer_es_en = MarianTokenizer.from_pretrained(mname)
|
| 29 |
+
model_es_en = MarianMTModel.from_pretrained(mname)
|
| 30 |
+
model_es_en.to(device)
|
| 31 |
+
|
| 32 |
+
mname = "Helsinki-NLP/opus-mt-en-es"
|
| 33 |
+
tokenizer_en_es = MarianTokenizer.from_pretrained(mname)
|
| 34 |
+
model_en_es = MarianMTModel.from_pretrained(mname)
|
| 35 |
+
model_en_es.to(device)
|
| 36 |
+
|
| 37 |
+
lt = LineTokenizer()
|
| 38 |
+
|
| 39 |
+
# Responder preguntas
|
| 40 |
+
|
| 41 |
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
|
| 42 |
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1").to(device).eval()
|
| 43 |
tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
|
|
|
|
| 57 |
else:
|
| 58 |
return "⚠️Esperando documentos..."
|
| 59 |
|
| 60 |
+
def traducir_parrafos(parrafos, tokenizer, model, tam_bloque=8):
    """Translate a list of paragraphs, batching the sentences of each one.

    Each paragraph is split into sentences with ``sent_tokenize``. The
    sentences are sent to ``model.generate`` in groups of at most
    ``tam_bloque`` and the decoded outputs of one paragraph are joined back
    together with single spaces.

    Args:
        parrafos: Iterable of paragraph strings.
        tokenizer: Callable that tokenizes a list of sentences (its result
            must support ``.to(device)``) and that exposes ``decode``. The
            callers in this file pass a ``MarianTokenizer``.
        model: Object exposing ``generate``. The callers in this file pass a
            ``MarianMTModel``.
        tam_bloque: Maximum number of sentences per ``generate`` call.
            Must be at least 1.

    Returns:
        A list with one translated string per input paragraph, in the same
        order. A paragraph that yields no sentences becomes ``""``.

    Raises:
        ValueError: If ``tam_bloque`` is smaller than 1.
    """
    # A zero block size used to surface as an unrelated ZeroDivisionError and
    # a negative one silently produced empty translations; fail loudly instead.
    if tam_bloque < 1:
        raise ValueError(f"tam_bloque must be >= 1, got {tam_bloque!r}")

    parrafos_traducidos = []
    for parrafo in parrafos:
        frases = sent_tokenize(parrafo)
        traducido = []
        # Step through the sentences one batch at a time.
        for inicio in range(0, len(frases), tam_bloque):
            bloque_enviado = frases[inicio:inicio + tam_bloque]
            model_inputs = tokenizer(
                bloque_enviado,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=500,
            ).to(device)
            # Inference only: no gradients are needed for generation.
            with torch.no_grad():
                bloque_traducido = model.generate(**model_inputs)
            # Decode right away so the generated tensors of earlier batches
            # do not have to be kept around until the paragraph is finished.
            traducido.extend(
                tokenizer.decode(t, skip_special_tokens=True)
                for t in bloque_traducido
            )
        parrafos_traducidos.append(" ".join(traducido))
    return parrafos_traducidos
|
| 78 |
+
|
| 79 |
+
def traducir_es_en(texto):
    """Translate ``texto`` using the ``Helsinki-NLP/opus-mt-es-en`` model.

    The text is split with the module-level ``lt`` tokenizer, every piece is
    translated through ``traducir_parrafos`` with the es->en tokenizer/model
    pair, and the translated pieces are joined with newlines.
    """
    lineas_traducidas = traducir_parrafos(
        lt.tokenize(texto), tokenizer_es_en, model_es_en
    )
    return "\n".join(lineas_traducidas)
|
| 83 |
+
|
| 84 |
+
def traducir_en_es(texto):
    """Translate ``texto`` using the ``Helsinki-NLP/opus-mt-en-es`` model.

    The text is split with the module-level ``lt`` tokenizer, every piece is
    translated through ``traducir_parrafos`` with the en->es tokenizer/model
    pair, and the translated pieces are joined with newlines.
    """
    lineas_traducidas = traducir_parrafos(
        lt.tokenize(texto), tokenizer_en_es, model_en_es
    )
    return "\n".join(lineas_traducidas)
|
| 88 |
+
|
| 89 |
def request_pathname(files):
|
| 90 |
if files is None:
|
| 91 |
return [[]]
|
|
|
|
| 146 |
return embeddings, spans, file_names
|
| 147 |
|
| 148 |
def predict(query,data):
|
| 149 |
+
query = traducir_es_en(query)
|
| 150 |
name_to_save = data.name.split("/")[-1].split(".")[0][:-8]
|
| 151 |
+
k=2
|
| 152 |
st = str([query,name_to_save])
|
| 153 |
st_hashed = str(hashlib.sha256(st.encode()).hexdigest()) #just to speed up examples load
|
| 154 |
hist = st + " " + st_hashed
|
|
|
|
| 160 |
list_outputs = []
|
| 161 |
for i in range(k):
|
| 162 |
temp = [df.iloc[n] for n in range(k)][i]
|
| 163 |
+
tupla = (traducir_en_es(temp.Respuesta),
|
| 164 |
+
traducir_en_es(temp.Contexto),
|
| 165 |
+
traducir_en_es(temp.Probabilidades))
|
| 166 |
# text = ''
|
| 167 |
# text += 'Probabilidades: '+ temp.Probabilidades + '\n\n'
|
| 168 |
# text += 'Respuesta: ' +temp.Respuesta + '\n\n'
|
|
|
|
| 187 |
text = text.replace("\r", " ")
|
| 188 |
text = text.replace("\n", " ")
|
| 189 |
text = text.replace(" . "," ")
|
| 190 |
+
|
| 191 |
+
text = traducir_es_en(text)
|
| 192 |
|
| 193 |
doc_emb, doc_text, file_names = encode_docs((name_to_save,text),maxlen = 64, stride = 32)
|
| 194 |
|
|
|
|
| 244 |
list_outputs = []
|
| 245 |
for i in range(k):
|
| 246 |
temp = [df.iloc[n] for n in range(k)][i]
|
| 247 |
+
tupla = (traducir_en_es(temp.Respuesta),
|
| 248 |
+
traducir_en_es(temp.Contexto),
|
| 249 |
+
traducir_en_es(temp.Probabilidades))
|
| 250 |
# text = ''
|
| 251 |
# text += 'Probabilidades: '+ temp.Probabilidades + '\n\n'
|
| 252 |
# text += 'Respuesta: ' +temp.Respuesta + '\n\n'
|
|
|
|
| 300 |
ask.click(fn=predict, inputs=[query,
|
| 301 |
file], outputs=[answer, context, prob])
|
| 302 |
|
| 303 |
+
gr.Interface.load(examples = ["¿Cuándo suelen comenzar las adicciones?","Entrevista Miguel Ruiz.txt"])
|
| 304 |
|
| 305 |
demo.queue(concurrency_count=20)
|
| 306 |
demo.launch(show_error=True)
|