tecuhtli committed on
Commit 8a611c0 · verified · 1 Parent(s): 98d715a

Update app.py

Files changed (1)
  1. app.py +606 -123
app.py CHANGED
@@ -95,177 +95,660 @@ def load_model(path_str):
95
  return model, tokenizer
96
 
97
 
98
- # Function to classify the user's questions by determining their context
99
- def classify_context(question, label_classes, model, tokenizer, device):
100
 
101
- '''
102
- inputs:
103
 
104
- question --> Question asked by the user
105
- label_classes --> Label encoder classes used to decode inferences
106
- model --> Classifier used to determine the context of the questions
107
- tokenizer --> Tokenizer used to classify contexts
108
- device --> Use the GPU or the CPU depending on availability
109
 
110
- outputs:
111
 
112
- predicted_label --> Classification of the question into different contexts (classes)
113
 
114
- '''
 
 
115
 
116
- # Moving the model to the available device
117
- model = model.to(device)
118
-
119
- # Processing the user input
120
- inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=128)
121
- inputs = {key: val.to(device) for key, val in inputs.items()}
122
 
123
- # Classifying the user's question into contexts
124
- with torch.no_grad():
125
- outputs = model(**inputs)
126
- logits = outputs.logits
127
 
128
- # Classifier inference
129
- pred_intent = torch.argmax(logits, dim=1).item()
130
- predicted_label = label_classes[pred_intent]
131
 
132
- return predicted_label
133
 
134
135
 
136
- # Function to generate Mori's technical answers
137
- def technical_asnwer(question, context, model, tokenizer, device):
138
 
139
- '''
140
- inputs:
141
 
142
- question --> Question asked by the user
143
- context --> Context of the user's question, as determined by the classifier
144
- model --> Mori model used to answer technical questions
145
- tokenizer --> Tokenizer used to process inputs and decode answers
146
- device --> Use the GPU or the CPU depending on availability
147
 
148
- outputs:
149
 
150
- response --> Answer from the technical Mori (technical model)
151
 
152
- '''
153
 
154
- # Moving the model to the available device
155
- model = model.to(device)
156
 
157
- # Prompt engineering to help Mori find the best answer
158
- input_text = f"Context: {context} [SEP] Question: {question}"
 
 
 
 
 
 
 
 
 
159
 
160
- # Tokenizing the input text
161
- inputs = tokenizer(input_text, return_tensors="pt").to(device)
162
 
163
- # Generating the response
164
- summary_ids = model.generate(inputs['input_ids'], max_length=150, num_beams=5, early_stopping=True)
165
 
166
- # Decoding the response
167
- response = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
168
 
169
- return "🧠 [Mori Técnico] " + response.strip()
170
 
 
171
 
172
- # Function to generate Mori's social answers
173
- def social_asnwer(question, model, tokenizer, device):
174
 
175
- '''
176
- inputs:
 
177
 
178
- question --> Question asked by the user
179
- model --> Mori model used to answer social questions
180
- tokenizer --> Tokenizer used to process inputs and decode answers
181
- device --> Use the GPU or the CPU depending on availability
182
 
183
- outputs:
 
 
184
 
185
- response --> Answer from the social Mori (social model)
186
 
187
- '''
188
 
189
- # Moving the model to the available device
190
  model = model.to(device)
191
 
192
- # Tokenizing the user input without explicitly adding <eos>
193
- inputs = tokenizer(
194
- question, # without adding eos_token
195
- return_tensors="pt",
196
- padding=True,
197
- truncation=True,
198
- max_length=128 # ✅ specified to avoid a warning
199
- ).to(device)
200
-
201
- # Generating a response using sampling
202
- output_ids = model.generate(
203
- input_ids=inputs["input_ids"],
204
- attention_mask=inputs["attention_mask"], # ✅ FIX added
205
- max_length=50,
206
- pad_token_id= tokenizer.eos_token_id,
207
- do_sample=True,
208
- top_p=0.95,
209
- top_k=50)
210
-
211
- # Decoding and cleaning the response
212
- response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
213
 
214
- return "🤝 [Mori Social] " + response.strip()
215
 
216
 
217
- # Function to generate Mori's general answer
218
- def contextual_asnwer(question, label_classes, context_model, cont_tok, tec_model, tec_tok, soc_model, soc_tok, device):
 
219
 
220
- '''
221
- inputs:
222
 
223
- question --> Question asked by the user
224
- label_classes --> Label encoder classes used to decode inferences
225
- context_model --> Classifier used to determine the context of the questions
226
- cont_tok --> Tokenizer used to classify contexts
227
- tec_model --> Mori model used to answer technical questions
228
- tec_tok --> Tokenizer used by technical Mori
229
- soc_model --> Mori model used to answer social questions
230
- soc_tok --> Tokenizer used by social Mori
231
- device --> Use the GPU or the CPU depending on availability
232
 
233
- outputs:
234
 
235
- response --> Answer from the general Mori (answer built with prompt engineering)
 
 
236
 
237
- '''
238
 
239
- # Detect the context using the classifier
240
  context = classify_context(question, label_classes, context_model, cont_tok, device)
 
241
 
242
- context_icons = {"social": "💬",
243
- "modelos": "🔧",
244
- "evaluación": "📏",
245
- "optimización": "⚙️",
246
- "visualización": "📈",
247
- "aprendizaje": "🧠",
248
- "vida digital" : "🧑‍💻",
249
- "estadística": "📊",
250
- "infraestructura": "🖥",
251
- "datos": "📂",
252
- "transformación digital": "🌀"}
253
-
254
- icon = context_icons.get(context, "🧠")
255
- #print(f"{icon} Contexto detectado: {context}") # (opcional para debug)
256
- st.markdown(f"**{icon} Contexto detectado:** `{context}`")
257
 
258
- if context == 'social':
259
-
260
- # Generate a contextual response using the social model
261
- response = social_asnwer(question, soc_model,soc_tok, device)
262
 
263
- else:
264
 
265
- # Generate a contextual response using the technical model
266
- response = technical_asnwer(question, context, tec_model, tec_tok, device)
267
 
268
- return response, context
269
 
270
 
271
 
 
95
  return model, tokenizer
96
 
97
 
98
+ #-------------------------------------------------------------------------
99
+ #Function to correct Spanish sentences' punctuation and missing characters
100
+ #-------------------------------------------------------------------------
101
+
102
+ def polish_spanish(s: str) -> str:
103
+
104
+ """Correcting Spanish sentences
105
+
106
+ Args:
107
+ s (str): Input Spanish sentence.
108
+
109
+ Returns:
110
+ str: A corrected and polished version of the input.
111
+ """
112
+
113
+ # Normalizing the input to NFC Unicode form and trimming whitespace
114
+ s = unicodedata.normalize("NFC", s).strip()
115
+
116
+ # Removing model names if they leak into the generated text
117
+ s = re.sub(r'\s*[\[\(]\s*Mori\s+(?:Social|T[eé]nico|T[eé]cnico)\s*[\]\)]\s*', '', s, flags=re.I)
118
+
119
+ # Correcting missing or misspelled words
120
+ fixes = [
121
+ (r'(?i)(^|\W)T\s+puedes(?P<p>[^\w]|$)', r'\1Tú puedes\g<p>'),
122
+ (r'(?i)(^|\W)T\s+(ya|eres|estas|estás|tienes|puedes)\b', r'\1Tú \2'),
123
+ (r'(?i)\bclaro que s(?:i|í)?\b(?P<p>[,.\!?…])?', r'Claro que sí\g<p>'),
124
+ (r'(?i)(^|\s)si,', r'\1Sí,'),
125
+ (r'(?i)(\beso\s+)s(\s+est[áa]\b)', r'\1sí\2'),
126
+ (r'(?i)(^|[\s,;:])s(\s+es\b)', r'\1sí\2'),
127
+ (r'(?i)\btiles\b', 'útiles'),
128
+ (r'(?i)\butiles\b', 'útiles'),
129
+ (r'(?i)\butil\b', 'útil'),
130
+ (r'(?i)\baqui\b', 'aquí'),
131
+ (r'(?i)\baqu\b(?=\s+estoy\b)', 'aquí'),
132
+ (r'(?i)\balgn\b', 'algún'),
133
+ (r'(?i)\balgun\b', 'algún'),
134
+ (r'(?i)\bAnimo\b', 'Ánimo'),
135
+ (r'(?i)\bcario\b', 'cariño'),
136
+ (r'(?i)\baprendisaje\b', 'aprendizaje'),
137
+ (r'(?i)\bmanana\b', 'mañana'),
138
+ (r'(?i)\bmaana\b', 'mañana'),
139
+ (r'(?i)\benergia\b', 'energía'),
140
+ (r'(?i)\benerga\b', 'energía'),
141
+ (r'(?i)\bextrano\b', 'extraño'),
142
+ (r'(?i)\bextrana\b', 'extraña'),
143
+ (r'(?i)\bextranar\b', 'extrañar'),
144
+ (r'(?i)\bextranarte\b', 'extrañarte'),
145
+ (r'(?i)\bextranas\b', 'extrañas'),
146
+ (r'(?i)\bextranos\b', 'extraños'),
147
+ (r'(?i)\baqu\b', 'aquí'),
148
+ (r'(?i)\baqui\b', 'aquí'),
149
+ (r'(?i)\bestare\b', 'estaré'),
150
+ (r'(?i)\bclarn\b', 'clarín'),
151
+ (r'(?i)\bclarin\b', 'clarín'),
152
+ (r'(?i)\bclar[íi]n\s+cornetas\b', 'clarín cornetas'),
153
+ (r'(?i)(^|\s)s([,.;:!?])', r'\1Sí\2'),
154
+ (r'(?i)\bfutbol\b', 'fútbol'),
155
+ (r'(?i)(^|\s)as(\s+se\b)', r'\1Así\2'),
156
+ (r'(?i)(^|\s)s(\s+orientarte\b)', r'\1sí\2'),
157
+ (r'(?i)\bbuen dia\b', 'buen día'),
158
+ (r'(?i)\bgran dia\b', 'gran día'),
159
+ (r'(?i)\bdias\b', 'días'),
160
+ (r'(?i)\bdia\b', 'día'),
161
+ (r'(?i)\bgran da\b', 'gran día'),
162
+ (r'(?i)\bacompa?a(r|rte|do|da|dos|das)?\b', r'acompaña\1'),
163
+ (r'(?i)(^|\s)as([,.;:!?]|\s|$)', r'\1así\2'),
164
+ (r'(?i)(^|\s)S lo se\b', r'\1Sí lo sé'),
165
+ (r'(?i)(^|\s)S lo sé\b', r'\1Sí lo sé'),
166
+ (r'(?i)\bcudese\b', 'cuídese'),
167
+ (r'(?i)\bpequeo\b', 'pequeño'),
168
+ (r'(?i)\bpequea\b', 'pequeña'),
169
+ (r'(?i)\bpequeos\b', 'pequeños'),
170
+ (r'(?i)\bpequeas\b', 'pequeñas'),
171
+ (r'(?i)\bunico\b', 'único'),
172
+ (r'(?i)\bunica\b', 'única'),
173
+ (r'(?i)\bunicos\b', 'únicos'),
174
+ (r'(?i)\bunicas\b', 'únicas'),
175
+ (r'(?i)\bnico\b', 'único'),
176
+ (r'(?i)\bnica\b', 'única'),
177
+ (r'(?i)\bnicos\b', 'únicos'),
178
+ (r'(?i)\bnicas\b', 'únicas'),
179
+ (r'(?i)\bestadstico\b', 'estadístico'),
180
+ (r'(?i)\bestadstica\b', 'estadística'),
181
+ (r'(?i)\bestadsticos\b', 'estadísticos'),
182
+ (r'(?i)\bestadsticas\b', 'estadísticas'),
183
+ (r'(?i)\bcudate\b', 'cuídate'),
184
+ (r'(?i)\bcuidate\b', 'cuídate'),
185
+ (r'(?i)\bcuidese\b', 'cuídese'),
186
+ (r'(?i)\bcudese\b', 'cuídese'),
187
+ (r'(?i)\bcuidense\b', 'cuídense'),
188
+ (r'(?i)\bcudense\b', 'cuídense'),
189
+ (r'(?i)\bgracias por confiar en m\b', 'gracias por confiar en mí'),
190
+ (r'(?i)\bcada dia\b', 'cada día'),
191
+ (r'(?i)\bcada da\b', 'cada día'),
192
+ (r'(?i)\bsegun\b', 'según'),
193
+ (r'(?i)\bcaracteristica(s)?\b', r'característica\1'),
194
+ (r'(?i)\bcaracterstica(s)?\b', r'característica\1'),
195
+ (r'(?i)\b([a-záéíóúñ]+)cion\b', r'\1ción'),
196
+ (r'(?i)\bdeterminacio\b', 'determinación'),]
197
+
198
+ for pat, rep in fixes:
199
+ s = re.sub(pat, rep, s)
200
 
201
+ # Opening exclamation mark for "Eso es todo!" (when the leading ¡ is missing)
202
+ s = re.sub(r'(?i)^eso es todo!(?P<r>(\s|$).*)', r'¡Eso es todo!\g<r>', s)
203
+
204
+ # Adds the ¿ character in case it is missing
205
+ def add_opening_q(m):
206
+ cuerpo = m.group('qbody')
207
+ # avoid duplicating if it already contains '¿'
208
+ if '¿' in cuerpo:
209
+ return m.group(0)
210
+ return f"{m.group('pre')}¿{cuerpo}"
211
 
212
+ s = re.sub(r'(?P<pre>(^|[\.!\…]\s+))(?P<qbody>[^?]*\?)', add_opening_q, s)
213
 
214
+ # Adds the ¡ character in case it is missing
215
+ def _open_exclam(m):
216
+ palabra = m.group('w')
217
+ resto = m.group('r') or ''
218
+ return f'¡{palabra}!{resto}'
219
+
220
+ s = re.sub(r'(?i)^(?P<w>(hola|gracias|genial|perfecto|claro|por supuesto|con gusto|listo|vaya|wow|tu puedes|tú puedes|clarín|clarin|clarín cornetas))!(?P<r>(\s|$).*)',_open_exclam, s)
221
 
222
+ # Final cleaning
223
+ s = re.sub(r'\s+', ' ', s).strip()
224
+ if s and s[-1] not in ".!?…":
225
+ s += "."
226
+
227
+ return s
228
 
229
+ #-------------------------------------------------------------------------
230
+ # Function to remove repeated input in the Model answer
231
+ #-------------------------------------------------------------------------
232
 
233
+
234
+ def anti_echo(response: str, user_text: str) -> str:
235
 
236
+ """Removing duplicating words
237
+
238
+ Args:
239
+ response (str): Model response
240
+ user_text (str): Input Spanish sentence.
241
+
242
+ Returns:
243
+ str: Model response without duplicated input sentence words
244
+ """
245
+
246
+ # Normalizing sentences
247
+ rn = normalize_for_route(response)
248
+ un = normalize_for_route(user_text)
249
+
250
+ # Removing initial unexpected extra characters
251
+ def _clean_leading(s: str) -> str:
252
+ s = re.sub(r'^\s*[,;:\-–—]\s*', '', s)
253
+ s = re.sub(r'^\s+', '', s)
254
+ return s
255
+
256
+ # Removing user input text repeated within model response
257
+ if len(un) >= 4 and rn.startswith(un):
258
+ # Removing the first sentence, before the defined separator
259
+ cut = re.sub(r'^\s*[^,;:\.\!\?]{0,120}[,;:\-]\s*', '', response).lstrip()
260
+ if cut and cut != response:
261
+ return _clean_leading(cut)
262
+
263
+ return _clean_leading(response[len(user_text):])
264
+
265
+ return response
266
+
267
+
268
+ #-------------------------------------------------------------------------
269
+ # Function to remove unwanted characters and normalize sentences
270
+ #-------------------------------------------------------------------------
271
+
272
+ def normalize_for_route(s: str) -> str:
273
+
274
+ """Function to standardize sentences
275
+
276
+ Args:
277
+ s (str): Sentence
278
+
279
+ Returns:
280
+ str: Corrected or Standardized Sentence
281
+ """
282
+
283
+ # Standardizing
284
+ s = unicodedata.normalize("NFKD", s)
285
+ s = "".join(ch for ch in s if not unicodedata.combining(ch))
286
+ s = re.sub(r"[^\w\s-]", " ", s, flags=re.UNICODE)
287
+ s = re.sub(r"\s+", " ", s).strip().lower()
288
 
289
+ return s
290
+
291
+
292
+ _Q_STARTERS = {
293
+ "como","que","quien","quienes","cuando","donde","por que","para que",
294
+ "cual","cuales","cuanto","cuantos","cuanta","cuantas"
295
+ }
296
+ _EXC_TRIGGERS = {"motiva","motivame","animate","animame","animo","ayudame","ayudame porfa", "clarin", "clarín", "clarinete", "clarin cornetas"}
297
+
298
+ SPECIAL_NOPUNCT = {"kiubo", "quiubo", "que chido", "qué chido", "que buena onda"}
299
+
300
+ # Second-person verbs at the start of the input -> short question
301
+ _Q_VERB_STARTERS = {"eres","estas","estás","puedes","sabes","tienes","quieres","conoces",
302
+ "crees","piensas","dirias","dirías","podrias","podrías","podras","podrás"}
303
+
304
+ #-------------------------------------------------------------------------
305
+ # Function to determine if a ¿ character needs to be added
306
+ #-------------------------------------------------------------------------
307
+
308
+ def needs_question_marks(norm: str) -> bool:
309
+
310
+ """Function to standardize sentences
311
+
312
+ Args:
313
+ norm (str): User text input
314
+
315
+ Returns:
316
+ bool: Whether the opening ¿ character needs to be added
317
+ """
318
 
319
+ if "?" in norm: return False
320
+ for w in _Q_STARTERS:
321
+ if norm.startswith(w + " ") or norm == w:
322
+ return True
323
+ return False
324
 
325
 
326
+ #-------------------------------------------------------------------------
327
+ # Function to determine if a ¡ character needs to be added
328
+ #-------------------------------------------------------------------------
329
 
 
 
330
 
331
+ def needs_exclam(norm: str) -> bool:
332
+
333
+ """Function to standardize sentences
334
+
335
+ Args:
336
+ norm (str): User text input
337
+
338
+ Returns:
339
+ bool: Whether the opening ¡ character needs to be added
340
+ """
341
 
342
+ if "!" in norm: return False
 
 
 
 
343
 
344
+ return any(t in norm for t in _EXC_TRIGGERS)
345
 
 
346
 
347
+ #-------------------------------------------------------------------------
348
+ # Function that detects greetings in slang form
349
+ #-------------------------------------------------------------------------
350
+
351
+
352
+ def is_slang_greeting(norm: str) -> bool:
353
+
354
+
355
+ """Recognizing slang greetings
356
+
357
+ Args:
358
+ norm (str): User text input
359
+
360
+ Returns:
361
+ bool: Whether the input is a slang greeting
362
+ """
363
 
364
+ # Defining slang greetings
365
+ SHORT = {
366
+ "que pex", "que onda", "ke pex", "k pex", "q onda",
367
+ "kiubo", "quiubo", "quiubole", "quiubole", "kionda", "q onda", "k onda",
368
+ "que rollo", "ke onda", "que show", "que tranza"}
369
+
370
+ # Finding greetings within the input
371
+ if norm in SHORT:
372
+ return True
373
+
374
+ # Looking for more specific forms
375
+ if re.match(r"^(q|k|ke|que)\s+(pex|onda|rollo|show|tranza)\b", norm):
376
+ return True
377
+
378
+ # Looking for more specific forms
379
+ if re.match(r"^(kiubo|quiubo|quiubole|quiúbole|quiubol[e]?)\b", norm):
380
+ return True
381
+
382
+ return False
383
+
384
+ #-------------------------------------------------------------------------
385
+ # Function to capitalize the model response
386
+ #-------------------------------------------------------------------------
387
+
388
 
389
+ def capitalize_spanish(s: str) -> str:
390
+
391
+
392
+ """Recognizing slang greetings
393
+
394
+ Args:
395
+ s (str): User text input
396
+
397
+ Returns:
398
+ str: Capitalized user text input
399
+ """
400
 
401
+ s = s.strip()
402
+ i = 0
403
+ while i < len(s) and not s[i].isalpha():
404
+ i += 1
405
+ if i < len(s):
406
+ s = s[:i] + s[i].upper() + s[i+1:]
407
+ return s
408
+
409
+
410
+ #-------------------------------------------------------------------------
411
+ # Function to correct Spanish sentences grammar and punctuation
412
+ #-------------------------------------------------------------------------
413
+
414
+
415
+ def smart_autopunct(user_text: str) -> str:
416
+
417
+
418
+ """Correcting grammar and punctuation
419
+
420
+ Args:
421
+ user_text (str): User text input
422
+
423
+ Returns:
424
+ str: Corrected user text input
425
+ """
426
+
427
+ # Longer inputs are left as-is except for capitalization
428
+ s = user_text.strip()
429
+ if len(s) > 20:
430
+ return capitalize_spanish(s)
431
+
432
+ # Normalizing the input to decide which punctuation rule applies
433
+ norm = normalize_for_route(s)
434
+
435
+ # Removing unexpected signs from specific slang
436
+ if norm in SPECIAL_NOPUNCT:
437
+ # remove any marks the user typed and capitalize
438
+ s = re.sub(r'[¿?!¡]+', '', s).strip()
439
+ return capitalize_spanish(s)
440
+
441
+ # Adding question marks to specific user input text, in case they are missed
442
+ if norm.startswith("y si "):
443
+ s = f"¿{s}?"
444
+ return capitalize_spanish(s)
445
+
446
+ # Completing missing question marks or exclamation marks
447
+ if "?" in s and "¿" not in s:
448
+ s = "¿" + s
449
+ return capitalize_spanish(s)
450
+ if "!" in s and "¡" not in s:
451
+ s = "¡" + s
452
+ return capitalize_spanish(s)
453
 
454
+ # Adding exclamation marks to slang greetings
455
+ if is_slang_greeting(norm):
456
+ s = f"¡{s}!"
457
+ return capitalize_spanish(s)
458
+
459
+ # Adding question marks to expected expressions
460
+ if needs_question_marks(norm):
461
+ s = f"¿{s}?"
462
+ return capitalize_spanish(s)
463
+
464
+ # Adding question marks to sentences with an expected form
465
+ toks = norm.split()
466
+ if toks and toks[0] in _Q_VERB_STARTERS:
467
+ s = f"¿{s}?"
468
+ return capitalize_spanish(s)
469
+
470
+ # Adding question marks to specific expressions
471
+ if re.match(r"^(me\s+ayudas?|me\s+puedes|podrias?|podras?)\b", norm):
472
+ s = f"¿{s}?"
473
+ return capitalize_spanish(s)
474
+
475
+ # Adding exclamation marks to specific expressions
476
+ if needs_exclam(norm):
477
+ s = f"¡{s}!"
478
+ return capitalize_spanish(s)
479
+
480
+ # Capitalizing the output
481
+ return capitalize_spanish(s)
482
+
483
+
484
+ #-------------------------------------------------------------------------
485
+ # Generating a social prompt from user input - RAG can be implemented here
486
+ #-------------------------------------------------------------------------
487
+
488
+
489
+ def build_prompt_social(user_text: str) -> str:
490
+
491
+ """Generating a social prompt from user input
492
+
493
+ Args:
494
+ user_text (str): User text input
495
+
496
+ Returns:
497
+ str: Generated prompt
498
+ """
499
 
500
+ # Generating prompt
501
+ fixed = smart_autopunct(user_text)
502
+
503
+ return f"respuesta social: {fixed}"
504
+
505
+
506
+ #-------------------------------------------------------------------------
507
+ # Function to set the random seed for reproducibility of results
508
+ #-------------------------------------------------------------------------
509
 
 
510
 
511
+ def set_seeds(seed: int = 42):
512
 
513
+ """Function to set the random seed for reproducibility of results
 
514
 
515
+ Args:
516
+ seed (int): Random seed
517
+ """
518
 
519
+ random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
520
+ if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
521
+ torch.backends.cudnn.deterministic = True
522
+ torch.backends.cudnn.benchmark = False
523
 
524
+ #-------------------------------------------------------------------------
525
+ # Function that classify the user input context, social or technical
526
+ #-------------------------------------------------------------------------
527
 
528
+ def classify_context(question, label_classes, model, tokenizer, device):
529
+
530
+ """
531
+ Classify the context of a user input text using a Hugging Face model.
532
 
533
+ Args:
534
+ question (str): User input text, which will be classified as a specific context
535
+ label_classes (List[str]): List of all possible classes used to classify the user input
536
+ model (transformers.PreTrainedModel): Huggingface pretrained model, with fine-tuning
537
+ tokenizer (transformers.PreTrainedTokenizer): Tokenizer, corresponding to the input model
538
+ device (torch.device): The device where the model will be running, torch.device("cuda") or torch.device("cpu")
539
 
540
+ Returns:
541
+ str: Context related to the user input text
542
+ """
543
+
544
+ # Running the model on the selected device
545
  model = model.to(device)
546
 
547
+ # Generating tokens from user input
548
+ inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=128)
549
+ inputs = {k: v.to(device) for k, v in inputs.items()}
550
+
551
+ # Inferring the context of the user input
552
+ with torch.no_grad():
553
+ outputs = model(**inputs)
554
+ logits = outputs.logits
555
+
556
+ # Determining the user input context
557
+ pred_intent = torch.argmax(logits, dim=1).item()
558
+ predicted_label = label_classes[pred_intent]
559
 
560
+ return predicted_label
561
 
562
 
563
+ #-------------------------------------------------------------------------
564
+ # Chatbot response for technical contexts using a Hugging Face model
565
+ #-------------------------------------------------------------------------
566
 
567
+ def technical_asnwer(question, context, model, tokenizer, device):
 
568
 
569
+ """
570
+ Generate a chatbot response for technical contexts using a Hugging Face model.
571
 
572
+ Args:
573
+ question (str): User input text
574
+ context (str): Technical context
575
+ model (transformers.PreTrainedModel): Huggingface pretrained model, with fine-tuning
576
+ tokenizer (transformers.PreTrainedTokenizer): Tokenizer, corresponding to the input model
577
+ device (torch.device): The device where the model will be running, torch.device("cuda") or torch.device("cpu")
578
 
579
+ Returns:
580
+ str: The model's generated answer within the technical context
581
+ """
582
 
583
+ # Running the model on the selected device
584
+ model = model.to(device)
585
+ # Setting the model to eval mode
586
+ model.eval()
587
+
588
+ # Generating the prompt input for the technical model
589
+ #input_text = f"Context: {context} [SEP] Question: {question}"
590
+ input_text = f"definir: responde con la definición canónica exacta. Contexto={context} ; Pregunta={question}"
591
+ # Tokenizing the technical user input
592
+ enc = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
593
+
594
+ # Avoiding responses containing the following characters
595
+ bad_words = ["["]
596
+ bad_ids = [tokenizer(bw, add_special_tokens=False).input_ids for bw in bad_words]
597
+
598
+ # Generating responses from the technical model
599
+ out_ids = model.generate(
600
+ input_ids=enc["input_ids"],
601
+ attention_mask=enc["attention_mask"],
602
+ num_beams=4, do_sample=False,
603
+ max_new_tokens=160, min_new_tokens=24,
604
+ no_repeat_ngram_size=3,
605
+ bad_words_ids=bad_ids,
606
+ eos_token_id=tokenizer.eos_token_id,
607
+ pad_token_id=tokenizer.pad_token_id
608
+ )
609
+
610
+ # Decoding the model-generated answer into readable text
611
+ text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
612
+
613
+ return polish_spanish(text)
614
+
615
+
616
+ #-------------------------------------------------------------------------
617
+ # Chatbot response for social contexts using a Hugging Face model
618
+ #-------------------------------------------------------------------------
619
 
620
+
621
+ def social_asnwer(question, model, tokenizer, device):
622
+
623
+ """
624
+ Generate a chatbot response for social contexts using a Hugging Face model.
625
+
626
+ Args:
627
+ question (str): User input text
628
+ model (transformers.PreTrainedModel): Huggingface pretrained model, with fine-tuning
629
+ tokenizer (transformers.PreTrainedTokenizer): Tokenizer, corresponding to the input model
630
+ device (torch.device): The device where the model will be running, torch.device("cuda") or torch.device("cpu")
631
+
632
+ Returns:
633
+ str: The model's generated answer within the social context
634
+ """
635
+
636
+ # Running the model on the selected device
637
+ model = model.to(device)
638
+ # Setting the model to eval mode
639
+ model.eval()
640
+
641
+ # Generating the prompt input for the social model
642
+ prompt = build_prompt_social(question)
643
+ enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=192).to(device)
644
+
645
+ # Avoiding responses containing the following characters
646
+ bad_words = ["[", "Thanks", "thank you", "website", "http", "www", ".com"]
647
+ bad_ids = [tokenizer(bw, add_special_tokens=False).input_ids for bw in bad_words]
648
+
649
+ # Generating responses from the social model
650
+ out_ids = model.generate(
651
+ input_ids=enc["input_ids"],
652
+ attention_mask=enc["attention_mask"],
653
+ num_beams=4, do_sample=False, # deterministic
654
+ max_new_tokens=64, min_new_tokens=16, # avoids very short replies such as “¡Descansa!”
655
+ no_repeat_ngram_size=3,
656
+ bad_words_ids=bad_ids,
657
+ eos_token_id=tokenizer.eos_token_id,
658
+ pad_token_id=tokenizer.pad_token_id
659
+ )
660
+
661
+ # Decoding the model-generated answer into readable text
662
+ text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
663
+
664
+ # Anti-echo step: removes echoed input words from the model answer
665
+ text = anti_echo(text, question)
666
+
667
+ # Improving the Spanish response grammatically and restoring special characters
668
+ text = polish_spanish(text)
669
+
670
+ # Capitalizing the model response
671
+ text = capitalize_spanish(text)
672
+
673
+ return text
674
+
675
+
676
+ #-------------------------------------------------------------------------
677
+ # Function to override the contextual classifier if the user input is short
678
+ #-------------------------------------------------------------------------
679
+
680
+
681
+ def rule_intent_override(user_text: str, predicted_label: str) -> str:
682
+
683
+
684
+ """
685
+ Function to override the contextual classifier
686
+
687
+ Args:
688
+ user_text (str): User input text
689
+ predicted_label (str): Context label predicted by the classifier
690
+
691
+ Returns:
692
+ str: Overridden context for the user input text
693
+ """
694
+
695
+ # Standardizing the user input
696
+ n = normalize_for_route(user_text)
697
+
698
+ # Overriding the classified context, in case the input is too short
699
+ if re.fullmatch(r"(motivame|motiva|animame|animo|ayudame|que tranza|qué tranza|que tranza mori|qué tranza mori)", n):
700
+ return "social"
701
+ return predicted_label
702
+
703
+
704
+ #-------------------------------------------------------------------------
705
+ # Function to determine the context of the user input, technical or social
706
+ #-------------------------------------------------------------------------
707
+
708
+ def contextual_asnwer(question, label_classes, context_model, cont_tok,
709
+ tec_model, tec_tok, soc_model, soc_tok, device):
710
+
711
+
712
+ """
713
+ Function to classify the user input and route it to the social or technical model
714
+
715
+ Args:
716
+ question (str): User input text, which will be classified as a specific context
717
+ label_classes (List[str]): List of all possible classes used to classify the user input
718
+ context_model (transformers.PreTrainedModel): Model, with fine-tuning, for classifying the input user into social or technical contexts
719
+ cont_tok (transformers.PreTrainedTokenizer): Tokenizer, corresponding to the context classifier model
720
+ tec_model (transformers.PreTrainedModel): Model, with fine-tuning, for generating technical responses
721
+ tec_tok (transformers.PreTrainedTokenizer): Tokenizer, corresponding to the technical model
722
+ soc_model (transformers.PreTrainedModel): Model, with fine-tuning, for generating social responses
723
+ soc_tok (transformers.PreTrainedTokenizer): Tokenizer, corresponding to the social model
724
+ device (torch.device): The device where the model will be running, torch.device("cuda") or torch.device("cpu")
725
+
726
+ Returns:
727
+ tuple: The generated response and the context detected for the user input
728
+ """
729
+
730
+ # Classifying user input text into a social or technical context
731
  context = classify_context(question, label_classes, context_model, cont_tok, device)
732
+ context = rule_intent_override(question, context)
733
 
734
+ # Icons used to improve the user interface experience
735
+ context_icons = {
736
+ "social": "💬", "modelos": "🔧", "evaluación": "📏", "optimización": "⚙️",
737
+ "visualización": "📈", "aprendizaje": "🧠", "vida digital": "🧑‍💻",
738
+ "estadística": "📊", "infraestructura": "🖥", "datos": "📂", "transformación digital": "🌀"}
739
 
740
+ # Showing the context related to the user input text
741
+ icon = context_icons.get(context, "🧠")
742
+ print(f"{icon} Contexto detectado: {context}")
 
743
 
744
+ #return technical_asnwer(question, context, tec_model, tec_tok, device)
745
 
746
+ # Generating the Chatbot answer based on the trained models
747
+ if context == "social":
748
+ return social_asnwer(question, soc_model, soc_tok, device), context
749
+ else:
750
+ return technical_asnwer(question, context, tec_model, tec_tok, device), context
751
 
 
752
 
753
 
754
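
For reference, a minimal sketch of how the functions added in this commit might be wired together in the Streamlit app. The model paths, the label_classes list, and the Streamlit widgets below are illustrative assumptions, not part of the commit; load_model(path_str), set_seeds, and contextual_asnwer are the functions shown in this diff.

    import torch
    import streamlit as st

    # Hypothetical wiring: pick the GPU when available, otherwise the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    set_seeds(42)  # defined in this commit; makes generation reproducible

    # load_model(path_str) returns (model, tokenizer); the three paths are placeholders
    context_model, cont_tok = load_model("models/context_classifier")
    tec_model, tec_tok = load_model("models/mori_tecnico")
    soc_model, soc_tok = load_model("models/mori_social")

    # Example label set for the context classifier (assumed, not from this commit)
    label_classes = ["social", "modelos", "datos", "estadística"]

    question = st.text_input("Pregunta para Mori")
    if question:
        response, context = contextual_asnwer(
            question, label_classes, context_model, cont_tok,
            tec_model, tec_tok, soc_model, soc_tok, device)
        st.markdown(response)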