tecuhtli committed on
Commit
20e2189
·
1 Parent(s): 4c7269f

Actualización del modelo / README / pesos / etc.

Browse files
Prompts/prompts_social.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "meta": {
3
- "task": "social_response",
4
- "version": "1.0"
5
- },
6
- "modes": {
7
- "zero_shot": {
8
- "instruction": "Responder de forma amigable y cercana al usuario.",
9
- "rules": [
10
- "Responde de forma natural y amistosa, con un toque reflexivo; agrega ejemplos sencillos o analogías cortas. ",
11
- "Evita sonar formal y permite algo de color humano."
12
- ],
13
- "output_format": "1–2 oraciones relajadas y naturales.",
14
- "tone": "Amigable y entusiasta.",
15
- "examples": []
16
- },
17
- "one_shot": {
18
- "instruction": "Responder de forma amigable y cercana al usuario.",
19
- "rules": [
20
- "Responde de forma natural y amistosa, con un toque reflexivo; agrega ejemplos sencillos o analogías cortas. ",
21
- "Evita sonar formal y permite algo de color humano."
22
- ],
23
- "output_format": "1–2 oraciones relajadas y naturales.",
24
- "tone": "Amigable y entusiasta.",
25
- "examples": [
26
- {
27
- "input": "¿Puedo hacerte una pregunta?",
28
- "output": "¡Claro! Dime cuál es tu duda y lo resolvemos juntos."
29
- }
30
- ]
31
- },
32
- "few_shot_3": {
33
- "instruction": "Responder de forma amigable y cercana al usuario.",
34
- "rules": [
35
- "Responde de forma natural y amistosa, con un toque reflexivo; agrega ejemplos sencillos o analogías cortas. ",
36
- "Evita sonar formal y permite algo de color humano."
37
- ],
38
- "output_format": "1–2 oraciones relajadas y naturales.",
39
- "tone": "Amigable y entusiasta.",
40
- "examples": [
41
- {
42
- "input": "¿Cuál es el significado de la vida?",
43
- "output": "Pregunta enorme; por ahora enfoquémonos en tus metas de hoy. ¿Qué te gustaría avanzar?"
44
- },
45
- {
46
- "input": "No entiendo nada 😩",
47
- "output": "Tranquilo/a, aprender toma tiempo. Dime qué parte se atoró y la vemos paso a paso."
48
- },
49
- {
50
- "input": "¿Qué opinas del clima?",
51
- "output": "No tengo sensores, pero si el clima ayuda, nos aventamos otro tema 😄."
52
- }
53
- ]
54
- }
55
- }
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Prompts/prompts_technical.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "meta": {
3
- "task": "technical_definition",
4
- "version": "1.0"
5
- },
6
- "modes": {
7
- "zero_shot": {
8
- "instruction": "Define con precisión.",
9
- "rules": [
10
- "Identifica correctamente el concepto principal mencionado en la pregunta.",
11
- "Proporciona una definición canónica, exacta y concisa del concepto."
12
- ],
13
- "output_format": "Respuesta corta y concisa.",
14
- "tone": "Directo, técnico y formal.",
15
- "examples": []
16
- },
17
- "one_shot": {
18
- "instruction": "Define con precisión.",
19
- "rules": [
20
- "Identifica correctamente el concepto principal mencionado en la pregunta.",
21
- "Proporciona una definición canónica, exacta y concisa del concepto."
22
- ],
23
- "output_format": "Respuesta corta y concisa.",
24
- "tone": "Directo, técnico y formal.",
25
- "examples": [
26
- {
27
- "input": "¿Que es la tecnologia?",
28
- "output": "Conjunto de teorías y de técnicas que permiten el aprovechamiento práctico del conocimiento científico."
29
- }
30
- ]
31
- },
32
- "few_shot_3": {
33
- "instruction": "Define con precisión.",
34
- "rules": [
35
- "Identifica correctamente el concepto principal mencionado en la pregunta.",
36
- "Proporciona una definición canónica, exacta y concisa del concepto."
37
- ],
38
- "output_format": "Respuesta corta y concisa.",
39
- "tone": "Directo, técnico y formal.",
40
- "examples": [
41
- {
42
- "input": "¿Qué es un algoritmo?",
43
- "output": "Conjunto ordenado de pasos o instrucciones que permiten resolver un problema o realizar una tarea de forma sistemática."
44
- },
45
- {
46
- "input": "¿Cómo se define la inteligencia artificial?",
47
- "output": "Campo de la informática que busca crear sistemas capaces de realizar tareas que requieren inteligencia humana, como razonar, aprender o reconocer patrones."
48
- },
49
- {
50
- "input": "Define el término base de datos.",
51
- "output": "Conjunto organizado de información que se almacena y gestiona electrónicamente para facilitar su acceso, consulta y actualización."
52
- }
53
- ]
54
- }
55
- }
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -10,12 +10,12 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- # 🧠 Mori: Your Friendly Data Science Assistant
14
 
15
 
16
- # 🧠 Mori: Your Friendly Data Science Assistant
17
 
18
- **Mori** is a conversational assistant trained to answer questions about data science, AI concepts, and related topics.
19
  It now runs **tech-only**: just the technical model + optional RAG (FAISS on HF), no social model, no classifier.
20
 
21
  ---
 
10
  license: mit
11
  ---
12
 
13
+ # 🧠 Your Friendly Data Science Assistant
14
 
15
 
16
+ # 🧠 Your Friendly Data Science Assistant
17
 
18
+ **This assistant** is a conversational tool trained to answer questions about data science, AI concepts, and related topics.
19
  It now runs **tech-only**: just the technical model + optional RAG (FAISS on HF), no social model, no classifier.
20
 
21
  ---
Statistics/conversaciones_log.csv DELETED
The diff for this file is too large to render. See raw diff
 
Statistics/conversaciones_log.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -18,22 +18,182 @@ from sentence_transformers import SentenceTransformer # RAG embeddings
18
  # Configuración general
19
  # =========================
20
  HF_TOKEN = os.environ.get("HF_TOKEN") # Token privado (colócalo en Secrets o variable de entorno)
21
- RAG_REPO_ID = "tecuhtli/Mori_FAISS_Full" # Dataset privado con mori.faiss, mori_ids.npy, mori_metas.json
22
 
23
- # =========================
24
- # Utilidades de texto
25
- # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def truncate_sentences(text: str, max_sentences: int = 4) -> str:
27
  _SENT_SPLIT = re.compile(r'(?<=[\.\!\?…])\s+')
28
  s = text.strip()
29
- if not s:
30
- return s
31
  parts = _SENT_SPLIT.split(s)
32
  cut = " ".join(parts[:max_sentences]).strip()
33
- if cut and cut[-1] not in ".!?…":
34
- cut += "."
35
  return cut
36
 
 
37
  def _load_json_safe(path: Path, fallback: dict) -> dict:
38
  try:
39
  with open(path, "r", encoding="utf-8") as f:
@@ -41,15 +201,61 @@ def _load_json_safe(path: Path, fallback: dict) -> dict:
41
  except Exception:
42
  return fallback
43
 
44
- def load_prompt_cases():
45
- base = Path("Prompts")
46
- tech = _load_json_safe(base / "prompts_technical.json", {"modes": {}})
47
- social = _load_json_safe(base / "prompts_social.json", {"modes": {}}) # no usado, se deja por compatibilidad
48
- return {"technical": tech, "social": social}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  def polish_spanish(s: str) -> str:
51
  s = unicodedata.normalize("NFC", s).strip()
52
- s = re.sub(r'\s*[\[\(]\s*Mori\s+(?:Social|T[eé]nico|T[eé]cnico)\s*[\]\)]\s*', '', s, flags=re.I)
53
  fixes = [
54
  (r'(?i)(^|\W)T\s+puedes(?P<p>[^\w]|$)', r'\1Tú puedes\g<p>'),
55
  (r'(?i)(^|\W)T\s+(ya|eres|estas|estás|tienes|puedes)\b', r'\1Tú \2'),
@@ -61,34 +267,67 @@ def polish_spanish(s: str) -> str:
61
  (r'(?i)\butiles\b', 'útiles'),
62
  (r'(?i)\butil\b', 'útil'),
63
  (r'(?i)\baqui\b', 'aquí'),
 
64
  (r'(?i)\balgn\b', 'algún'),
 
65
  (r'(?i)\bAnimo\b', 'Ánimo'),
 
66
  (r'(?i)\baprendisaje\b', 'aprendizaje'),
67
  (r'(?i)\bmanana\b', 'mañana'),
 
68
  (r'(?i)\benergia\b', 'energía'),
 
69
  (r'(?i)\bextrano\b', 'extraño'),
70
  (r'(?i)\bextrana\b', 'extraña'),
71
  (r'(?i)\bextranar\b', 'extrañar'),
72
  (r'(?i)\bextranarte\b', 'extrañarte'),
73
  (r'(?i)\bextranas\b', 'extrañas'),
74
  (r'(?i)\bextranos\b', 'extraños'),
 
 
75
  (r'(?i)\bestare\b', 'estaré'),
 
76
  (r'(?i)\bclarin\b', 'clarín'),
77
  (r'(?i)\bclar[íi]n\s+cornetas\b', 'clarín cornetas'),
78
  (r'(?i)(^|\s)s([,.;:!?])', r'\1Sí\2'),
79
  (r'(?i)\bfutbol\b', 'fútbol'),
80
  (r'(?i)(^|\s)as(\s+se\b)', r'\1Así\2'),
 
81
  (r'(?i)\bbuen dia\b', 'buen día'),
82
  (r'(?i)\bgran dia\b', 'gran día'),
83
  (r'(?i)\bdias\b', 'días'),
84
  (r'(?i)\bdia\b', 'día'),
 
85
  (r'(?i)\bacompa?a(r|rte|do|da|dos|das)?\b', r'acompaña\1'),
 
86
  (r'(?i)(^|\s)S lo se\b', r'\1Sí lo sé'),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  (r'(?i)\bcuidate\b', 'cuídate'),
88
  (r'(?i)\bcuidese\b', 'cuídese'),
 
89
  (r'(?i)\bcuidense\b', 'cuídense'),
 
90
  (r'(?i)\bgracias por confiar en m\b', 'gracias por confiar en mí'),
91
  (r'(?i)\bcada dia\b', 'cada día'),
 
92
  (r'(?i)\bsegun\b', 'según'),
93
  (r'(?i)\bcaracteristica(s)?\b', r'característica\1'),
94
  (r'(?i)\bcaracterstica(s)?\b', r'característica\1'),
@@ -97,18 +336,30 @@ def polish_spanish(s: str) -> str:
97
  ]
98
  for pat, rep in fixes:
99
  s = re.sub(pat, rep, s)
 
100
  s = re.sub(r'(?i)^eso es todo!(?P<r>(\s|$).*)', r'¡Eso es todo!\g<r>', s)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  s = re.sub(r'\s+', ' ', s).strip()
102
  if s and s[-1] not in ".!?…":
103
  s += "."
104
  return s
105
 
106
- def normalize_for_route(s: str) -> str:
107
- s = unicodedata.normalize("NFKD", s)
108
- s = "".join(ch for ch in s if not unicodedata.combining(ch))
109
- s = re.sub(r"[^\w\s-]", " ", s, flags=re.UNICODE)
110
- s = re.sub(r"\s+", " ", s).strip().lower()
111
- return s
112
 
113
  def anti_echo(response: str, user_text: str) -> str:
114
  rn = normalize_for_route(response)
@@ -124,192 +375,198 @@ def anti_echo(response: str, user_text: str) -> str:
124
  return _clean_leading(response[len(user_text):])
125
  return response
126
 
127
- # =========================
128
- # Prompting técnico
129
- # =========================
130
- def build_prompt_from_cases(domain: str,
131
- prompt_type: str,
132
- persona: str,
133
- question: str,
134
- context: str | None = None) -> str:
135
- key_map = {
136
- "Zero-shot": "zero_shot",
137
- "One-shot": "one_shot",
138
- "Few-shot (3)": "few_shot_3"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  }
140
- mode_key = key_map.get(prompt_type, "zero_shot")
141
- data = st.session_state.PROMPT_CASES.get(domain, {}).get("modes", {}).get(mode_key, {})
142
-
143
- tone = data.get("tone", "")
144
- out_fmt = data.get("output_format", "")
145
- rules = "\n- ".join(data.get("rules", []))
146
- ctx_line = f"\n- Contexto: {context}" if context else ""
147
-
148
- # ejemplos si hay
149
- examples = data.get("examples", [])
150
- ex_str = ""
151
- if examples:
152
- parts = []
153
- for i, ex in enumerate(examples, 1):
154
- parts.append(f"Ejemplo {i} →\nPregunta: {ex.get('input','')}\nRespuesta: {ex.get('output','')}")
155
- ex_str = "\n\n" + "\n\n".join(parts) + "\n\nAhora responde:"
156
-
157
- # prompt final (siempre técnico)
158
- prompt = (
159
- f"Tarea: {data.get('instruction','Responde como asistente técnico en procesamiento de datos.')}\n"
160
- f"Reglas:\n- {rules}{ctx_line}\n"
161
- f"Estilo: {tone}\n"
162
- f"Formato de salida: {out_fmt}\n"
163
- f"{ex_str}\n"
164
- f"pregunta={question}\n"
165
- )
166
- return prompt.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  def set_seeds(seed: int = 42):
169
  random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
170
- if torch.cuda.is_available():
171
- torch.cuda.manual_seed_all(seed)
172
  torch.backends.cudnn.deterministic = True
173
  torch.backends.cudnn.benchmark = False
174
 
175
- # =========================
176
- # RAG helpers
177
- # =========================
178
- @st.cache_resource
179
- def load_rag_assets(device_str: str = "cpu"):
180
- """
181
- Carga E5 + FAISS + metadatos desde Hugging Face (dataset privado).
182
- """
183
- token = os.getenv("HF_TOKEN")
184
- if not token:
185
- st.warning("⚠️ No se encontró HF_TOKEN; RAG no estará disponible.")
186
- return None, None, None
187
-
188
- try:
189
- faiss_path = hf_hub_download(repo_id=RAG_REPO_ID, filename="mori.faiss", repo_type="dataset", token=token)
190
- ids_path = hf_hub_download(repo_id=RAG_REPO_ID, filename="mori_ids.npy", repo_type="dataset", token=token)
191
- meta_path = hf_hub_download(repo_id=RAG_REPO_ID, filename="mori_metas.json", repo_type="dataset", token=token)
192
-
193
- index = faiss.read_index(faiss_path)
194
- _ = np.load(ids_path, allow_pickle=True) # ids no usados explícitamente, se conserva por consistencia
195
- with open(meta_path, "r", encoding="utf-8") as f:
196
- metas = json.load(f)
197
-
198
- e5 = SentenceTransformer("intfloat/multilingual-e5-base", device=device_str)
199
- st.info(f"✅ RAG cargado con {index.ntotal} vectores.")
200
- return e5, index, metas
201
- except Exception as e:
202
- st.error(f"❌ Error al cargar RAG: {e}")
203
- return None, None, None
204
-
205
- def rag_retrieve(e5, index, metas, user_text: str, k: int = 5):
206
- if e5 is None or index is None or metas is None or index.ntotal == 0:
207
- return []
208
- qv = e5.encode([f"query: {user_text}"], normalize_embeddings=True,
209
- convert_to_numpy=True).astype("float32")
210
- k = max(1, min(int(k), index.ntotal))
211
- scores, idxs = index.search(qv, k)
212
- out = []
213
- for rank, (s, i) in enumerate(zip(scores[0], idxs[0]), 1):
214
- if i == -1:
215
- continue
216
- m = metas[i]
217
- out.append({
218
- "rank": rank, "score": float(s),
219
- "id": m.get("id",""),
220
- "canonical_term": m.get("canonical_term",""),
221
- "context": m.get("context",""),
222
- "input": m.get("input",""),
223
- "output": m.get("output",""),
224
- })
225
- return out
226
-
227
- def build_rag_prompt_technical(base_prompt: str, user_text: str, passages):
228
- ev_lines = []
229
- for p in passages:
230
- ev_lines.append(
231
- f"[{p['rank']}] term='{p.get('canonical_term','')}' ctx='{p.get('context','')}'\n"
232
- f"input: {p.get('input','')}\n"
233
- f"output: {p.get('output','')}"
234
- )
235
- ev_block = "\n".join(ev_lines)
236
- rag_rules = (
237
- "\n\n[ Modo RAG ]\n"
238
- "- Usa EXCLUSIVAMENTE la información relevante de las evidencias.\n"
239
- "- Si algo no aparece en las evidencias, dilo explícitamente.\n"
240
- "- Cita las evidencias con [n] (ej. [1], [3]).\n"
241
- )
242
- return f"{base_prompt.strip()}\n{rag_rules}\nEVIDENCIAS:\n{ev_block}\n"
243
-
244
- def get_bad_words_ids(tok):
245
- bad = []
246
- for sym in ["[", "]"]:
247
- ids = tok.encode(sym, add_special_tokens=False)
248
- if ids and all(isinstance(t, int) and t >= 0 for t in ids):
249
- bad.append(ids)
250
- return bad
251
 
252
- # =========================
253
- # Generación técnica
254
- # =========================
255
  def technical_asnwer(question, context, model, tokenizer, device, gen_params=None):
256
  model = model.to(device).eval()
257
-
258
- persona_name = (gen_params or {}).get("persona", st.session_state.get("persona", "Mori Normal"))
259
- prompt_type = st.session_state.get("prompt_type", "Zero-shot")
260
-
261
- input_text = build_prompt_from_cases(
262
- domain="technical",
263
- prompt_type=prompt_type,
264
- persona=persona_name,
265
- question=question,
266
- context=context
267
- )
268
-
269
- st.session_state["last_prompt"] = input_text
270
  st.session_state["just_generated"] = True
271
-
272
- enc = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
273
 
274
  bad_words = ["["]
275
  bad_ids = [tokenizer(bw, add_special_tokens=False).input_ids for bw in bad_words]
276
 
277
- max_new = int((gen_params or {}).get("max_new_tokens", 128))
278
- min_new = int((gen_params or {}).get("min_tokens", 16))
279
- no_repeat = int((gen_params or {}).get("no_repeat_ngram_size", 3))
280
- rep_pen = float((gen_params or {}).get("repetition_penalty", 1.0))
 
281
  mode = (gen_params or {}).get("mode", "beam")
282
 
283
- eos_id = tokenizer.eos_token_id or tokenizer.convert_tokens_to_ids("</s>")
284
- pad_id = tokenizer.pad_token_id or eos_id
285
-
286
  if mode == "sampling":
287
- temperature = float((gen_params or {}).get("temperature", 0.8))
288
  top_p = float((gen_params or {}).get("top_p", 0.9))
289
  kwargs = dict(
290
- do_sample=True, num_beams=1,
 
291
  temperature=max(0.1, temperature),
292
  top_p=min(1.0, max(0.5, top_p)),
293
  max_new_tokens=max_new,
294
- min_new_tokens=max(0, min_new),
295
  no_repeat_ngram_size=no_repeat,
296
  repetition_penalty=max(1.0, rep_pen),
297
  bad_words_ids=bad_ids,
298
- eos_token_id=eos_id,
299
- pad_token_id=pad_id,
300
  )
301
  else:
302
  num_beams = max(2, int((gen_params or {}).get("num_beams", 4)))
303
  length_penalty = float((gen_params or {}).get("length_penalty", 1.0))
304
  kwargs = dict(
305
- do_sample=False, num_beams=num_beams, length_penalty=length_penalty,
 
 
306
  max_new_tokens=max_new,
307
- min_new_tokens=max(0, min_new),
308
  no_repeat_ngram_size=no_repeat,
309
  repetition_penalty=max(1.0, rep_pen),
310
  bad_words_ids=bad_ids,
311
- eos_token_id=eos_id,
312
- pad_token_id=pad_id,
313
  )
314
 
315
  out_ids = model.generate(
@@ -317,239 +574,248 @@ def technical_asnwer(question, context, model, tokenizer, device, gen_params=Non
317
  )
318
  text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
319
 
320
- if persona_name == "Mori Normal":
321
  text = truncate_sentences(text, max_sentences=1)
322
 
323
  st.session_state["last_response"] = text
 
 
 
324
  return polish_spanish(text)
325
 
326
- def technical_answer_rag(
327
- question, tec_model, tec_tok, device, gen_params,
328
- e5, index, metas, k=5, sim_threshold=0.40
329
- ):
330
- passages = rag_retrieve(e5, index, metas, question, k=k)
331
- if not passages:
332
- return "No encontré evidencias relevantes para responder con certeza. ¿Puedes dar más contexto?"
333
-
334
- persona_name = (gen_params or {}).get("persona", st.session_state.get("persona", "Mori Normal"))
335
- _ = st.session_state.get("prompt_type", "Zero-shot") # guardado por compatibilidad
336
-
337
- base_prompt = build_prompt_from_cases(
338
- domain="technical",
339
- prompt_type="Zero-shot",
340
- persona=persona_name,
341
- question=question,
342
- context="RAG"
343
- )
344
 
345
- prompt = build_rag_prompt_technical(base_prompt, question, passages)
 
 
 
 
 
346
 
347
- max_sim = passages[0]["score"]
348
- if max_sim < sim_threshold:
349
- prompt = "⚠️ Baja similitud con la base; podría faltar contexto.\n\n" + prompt
350
- st.session_state["last_prompt"] = prompt
351
  st.session_state["just_generated"] = True
352
 
353
- enc = tec_tok(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
354
-
355
- bad_ids = get_bad_words_ids(tec_tok)
356
 
357
- max_new = int((gen_params or {}).get("max_new_tokens", 128))
358
- min_new = int((gen_params or {}).get("min_tokens", 16))
359
- no_repeat = int((gen_params or {}).get("no_repeat_ngram_size", 3))
360
- rep_pen = float((gen_params or {}).get("repetition_penalty", 1.0))
361
- mode = (gen_params or {}).get("mode", "beam")
362
 
363
- eos_id = tec_tok.eos_token_id or tec_tok.convert_tokens_to_ids("</s>")
364
- pad_id = tec_tok.pad_token_id or eos_id
 
 
 
 
 
365
 
366
  if mode == "sampling":
367
- temperature = float((gen_params or {}).get("temperature", 0.8))
368
- top_p = float((gen_params or {}).get("top_p", 0.9))
369
  kwargs = dict(
370
  do_sample=True, num_beams=1,
371
  temperature=max(0.1, temperature),
372
  top_p=min(1.0, max(0.5, top_p)),
373
  max_new_tokens=max_new,
374
- min_new_tokens=max(0, min_new),
 
375
  no_repeat_ngram_size=no_repeat,
376
  repetition_penalty=max(1.0, rep_pen),
377
- eos_token_id=eos_id,
378
- pad_token_id=pad_id,
 
379
  )
380
  else:
381
- num_beams = max(2, int((gen_params or {}).get("num_beams", 4)))
382
  length_penalty = float((gen_params or {}).get("length_penalty", 1.0))
383
  kwargs = dict(
384
  do_sample=False, num_beams=num_beams, length_penalty=length_penalty,
385
  max_new_tokens=max_new,
386
- min_new_tokens=max(0, min_new),
 
387
  no_repeat_ngram_size=no_repeat,
388
  repetition_penalty=max(1.0, rep_pen),
389
- eos_token_id=eos_id,
390
- pad_token_id=pad_id,
 
 
391
  )
392
 
393
- if bad_ids:
394
- kwargs["bad_words_ids"] = bad_ids
395
-
396
- out_ids = tec_model.generate(**enc, **kwargs)
397
- text = tec_tok.decode(out_ids[0], skip_special_tokens=True)
398
-
399
- if persona_name == "Mori Normal":
400
- text = truncate_sentences(text, max_sentences=1)
401
  text = polish_spanish(text)
 
402
 
403
  st.session_state["last_response"] = text
 
 
 
404
  return text
405
 
406
- # =========================
407
- # Persistencia simple
408
- # =========================
409
- def saving_interaction(question, response, context, user_id):
410
- timestamp = dt.datetime.now().isoformat()
411
- stats_dir = Path("Statistics")
412
- stats_dir.mkdir(parents=True, exist_ok=True)
413
 
414
- archivo_csv = stats_dir / "conversaciones_log.csv"
415
- existe_csv = archivo_csv.exists()
 
 
 
416
 
417
- with open(archivo_csv, mode="a", encoding="utf-8", newline="") as f_csv:
418
- writer = csv.writer(f_csv)
419
- if not existe_csv:
420
- writer.writerow(["timestamp", "user_id", "contexto", "pregunta", "respuesta"])
421
- writer.writerow([timestamp, user_id, context, question, response])
422
 
423
- archivo_jsonl = stats_dir / "conversaciones_log.jsonl"
424
- with open(archivo_jsonl, mode="a", encoding="utf-8") as f_jsonl:
425
- registro = {
426
- "timestamp": timestamp,
427
- "user_id": user_id,
428
- "context": context,
429
- "pregunta": question,
430
- "respuesta": response
431
- }
432
- f_jsonl.write(json.dumps(registro, ensure_ascii=False) + "\n")
433
 
434
- # =========================
435
- # Enrutador técnico único
436
- # =========================
437
- def answer_technical_only(user_text: str, device, gen_params,
438
- tec_model, tec_tok):
439
- # Intentar RAG si está activado
440
- use_rag = st.session_state.get("use_rag", True)
441
- if use_rag:
442
- e5, index, metas = load_rag_assets("cuda" if torch.cuda.is_available() else "cpu")
443
- if e5 is not None and index is not None and index.ntotal > 0:
444
- return technical_answer_rag(
445
- user_text, tec_model, tec_tok, device, gen_params,
446
- e5=e5, index=index, metas=metas,
447
- k=st.session_state.get("rag_k", 3), sim_threshold=0.40
448
- )
449
- # Fallback sin RAG
450
- return technical_asnwer(
451
- question=user_text,
452
- context="procesamiento de datos",
453
- model=tec_model, tokenizer=tec_tok, device=device,
454
- gen_params=gen_params
455
- )
456
 
457
- # =========================
 
 
 
 
 
 
 
 
458
  # MAIN
459
- # =========================
 
460
  if __name__ == '__main__':
461
- # Estado persistente
 
462
  ss = st.session_state
463
  ss.setdefault("historial", [])
464
  ss.setdefault("last_prompt", "")
465
  ss.setdefault("last_response", "")
466
  ss.setdefault("just_generated", False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
468
- # Prompt cases y presets (sin sidebar)
469
- if "PROMPT_CASES" not in ss:
470
- ss.PROMPT_CASES = load_prompt_cases()
471
-
472
- ss.setdefault("persona", "Mori Normal")
473
- ss.setdefault("prompt_type", "Zero-shot")
474
- ss.setdefault("use_rag", True)
475
- ss.setdefault("rag_k", 3)
476
-
477
- GEN_PARAMS = {
478
- "persona": ss.get("persona", "Mori Normal"),
479
- "mode": "beam", # 'beam' | 'sampling'
480
- "max_new_tokens": 128,
481
- "min_tokens": 16,
482
- "no_repeat_ngram_size": 3,
483
- "num_beams": 4,
484
- "length_penalty": 1.0,
485
- "temperature": 0.8, # usado solo si mode == "sampling"
486
- "top_p": 0.9, # usado solo si mode == "sampling"
487
- "repetition_penalty": 1.0,
488
- "seed": 42,
489
- }
490
 
491
- # ID de sesión
492
- if "user_id" not in ss:
493
- ss["user_id"] = str(uuid.uuid4())[:8]
494
 
495
- # Modelo Técnico
496
- tec_tok = AutoTokenizer.from_pretrained("tecuhtli/mori-tecnico-model", use_auth_token=HF_TOKEN)
497
- tec_model = AutoModelForSeq2SeqLM.from_pretrained("tecuhtli/mori-tecnico-model", use_auth_token=HF_TOKEN)
498
 
499
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
500
 
501
- # Presentación (solo técnico)
502
- st.title("🤖 Mori - Tu Asistente Personal 🎓")
503
- st.caption("🙋🏽‍ Puedes preguntarme conceptos técnicos como visualización, limpieza, BI, etc.")
504
- st.caption("🙇🏽‍ Por el momento, solo puedo contestar preguntas simples como:")
505
- st.caption("➡️ ¿Cómo estás? ¿Qué son?, Explícame algo, Define algo, ¿Para qué sirven?")
506
- st.caption("🦾 Me siguen mejorando, más sobre mí en: [hazutecuhtli.github.io](https://github.com/hazutecuhtli/Mori_Development)")
507
  st.markdown("<br>", unsafe_allow_html=True)
508
- st.caption("✏️ Escribe 'salir' para terminar.")
509
 
510
- # Limpieza previa del textarea si corresponde
511
- if ss.pop("_clear_entrada", False):
512
- if "entrada" in ss:
513
- del ss["entrada"]
 
 
514
 
515
- # Respuesta flash de ciclo anterior
516
- _flash = ss.pop("_flash_response", None)
517
 
518
- # Formulario
519
- with st.form("formulario_mori"):
520
  user_question = st.text_area("📝 Escribe tu pregunta aquí", key="entrada", height=100)
521
  submitted = st.form_submit_button("Responder")
522
 
523
  if submitted:
524
  if not user_question:
525
- st.info("Mori: ¿Podrías repetir eso? No entendí bien 😅")
526
  else:
527
- response = answer_technical_only(user_question, device, GEN_PARAMS, tec_model, tec_tok)
 
 
 
 
528
 
529
- # Historial
530
  hora_actual = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
531
- ss.historial.append(("Tú", user_question, hora_actual))
 
532
  hora_actual = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
533
- ss.historial.append(("Mori", response, hora_actual))
 
 
 
 
 
 
534
 
535
- # Guardado persistente
536
- saving_interaction(user_question, response, "procesamiento de datos", ss["user_id"])
537
 
538
- # Flash y limpieza
539
- ss["_flash_response"] = response
540
- ss["_clear_entrada"] = True
541
  st.rerun()
542
 
543
- # Mostrar respuesta flash
 
 
544
  if _flash:
545
  st.success(_flash)
546
 
547
- # Historial + descarga
548
- if ss.historial:
 
 
 
 
 
 
 
 
549
  st.markdown("---")
550
 
 
551
  lineas = []
552
- for msg in reversed(ss.historial):
553
  if len(msg) == 3:
554
  autor, texto, hora = msg
555
  lineas.append(f"[{hora}] {autor}: {texto}")
@@ -561,12 +827,12 @@ if __name__ == '__main__':
561
  st.download_button(
562
  label="💾 Descargar conversación como .txt",
563
  data=texto_chat,
564
- file_name="conversacion_mori.txt",
565
  mime="text/plain",
566
  use_container_width=True
567
  )
568
 
569
- # Contenedor con estilo
570
  st.markdown(
571
  """
572
  <div id="chat-container" style="
@@ -582,7 +848,7 @@ if __name__ == '__main__':
582
  unsafe_allow_html=True
583
  )
584
 
585
- for msg in reversed(ss.historial):
586
  if len(msg) == 3:
587
  autor, texto, _ = msg
588
  else:
@@ -632,6 +898,7 @@ if __name__ == '__main__':
632
  )
633
 
634
  st.markdown("</div>", unsafe_allow_html=True)
 
635
  #***************************************************************************
636
  # FIN
637
  #***************************************************************************
 
18
  # Configuración general
19
  # =========================
20
  HF_TOKEN = os.environ.get("HF_TOKEN") # Token privado (colócalo en Secrets o variable de entorno)
 
21
 
22
+ #***************************************************************************
23
+ # Sidebar controls for generation params
24
+ #***************************************************************************
25
+
26
def sidebar_params():
    """Render the sidebar generation controls and return the chosen settings.

    Seeds ``st.session_state`` with defaults on the first run, draws the
    personality preset buttons, the "Advanced Settings" toggle and (when the
    toggle is on) the manual sliders, and finally collects the current values.

    Returns:
        dict: always holds ``persona``, ``mode``, ``max_new_tokens``,
        ``min_tokens``, ``no_repeat_ngram_size`` and ``repetition_penalty``;
        additionally ``num_beams`` and ``length_penalty`` when ``mode`` is
        ``"beam"``, or ``temperature`` and ``top_p`` otherwise.
    """
    # NOTE(review): block nesting was rebuilt from a flattened listing; the
    # "generated prompt" expander is assumed to sit in the sidebar outside the
    # advanced-settings section -- confirm against the running app.

    def _clamp(value, low, high):
        """Force a stored value into the range of the slider that displays it."""
        return max(low, min(high, value))

    ss = st.session_state
    gen_docs = "https://huggingface.co/docs/transformers/main_classes/text_generation"

    # First-run defaults. The advanced controls start hidden.
    for key, default in (
        ("show_llm_controls", False),
        ("persona", "Normal"),
        ("mode", "beam"),  # 'beam' | 'sampling'
        ("max_new", 128),
        ("min_tok", 16),
        ("no_repeat", 3),
        ("num_beams", 4),
        ("length_penalty", 1.0),
        ("temperature", 0.7),
        ("top_p", 0.9),
        ("repetition_penalty", 1.0),
    ):
        ss.setdefault(key, default)

    with st.sidebar:
        st.title("🎮 Adjustments (T5-Base)")

        # ----------------------------
        # Personality presets
        # ----------------------------
        st.header("💡 Predefined Personalities")
        c1, c2 = st.columns(2)

        with c1:
            if st.button("Normal 🧐", use_container_width=True):
                ss.update({
                    "persona": "Normal",
                    "mode": "beam",
                    "num_beams": 1,
                    "max_new": 92,
                    "min_tok": 32,
                    "no_repeat": 3,
                    "length_penalty": .3,
                    "temperature": 0.4,
                    "top_p": 0.9,
                    "repetition_penalty": .4,
                })
                st.rerun()

        with c2:
            if st.button("Enthusiastic 😃", use_container_width=True):
                ss.update({
                    "persona": "Enthusiastic",
                    "mode": "sampling",
                    "max_new": 192,
                    "min_tok": 48,
                    "no_repeat": 3,
                    "temperature": .8,
                    "top_p": 0.95,
                    "repetition_penalty": 1.0,
                })
                st.rerun()

        st.caption(f"Selected Personality: **{ss.persona}**")

        # ----------------------------
        # Show/hide toggle for the manual parameters
        # ----------------------------
        if st.button(("🔼 Hide" if ss.show_llm_controls else "🔽 Show") + " Advanced Settings"):
            ss.show_llm_controls = not ss.show_llm_controls
            st.rerun()

        # ----------------------------
        # Manual model controls (strategy, sliders)
        # ----------------------------
        if ss.show_llm_controls:
            st.header("⚙️ Manual Adjustments")
            st.subheader("📝 Text Generation")
            picked = st.radio(
                "Strategy",
                ["Beam search (stable)", "Sampling (creative)"],
                index=0 if ss.mode == "beam" else 1,
                help="https://huggingface.co/docs/transformers/generation_strategies"
            )
            ss.mode = "beam" if picked.startswith("Beam") else "sampling"

            st.subheader("🔧 LLM text generation parameters")
            # Every initial value is clamped to its slider range: a preset (or a
            # previous adjustment) may have stored a value outside it, e.g. the
            # "Normal" preset stores num_beams=1 while the slider starts at 2,
            # and min_tok may exceed a freshly lowered max_new.
            ss.max_new = st.slider(
                "max_new_tokens", 16, 256, _clamp(int(ss.max_new), 16, 256), step=8,
                help=gen_docs
            )
            ss.min_tok = st.slider(
                "min_tokens", 0, int(ss.max_new),
                _clamp(int(ss.min_tok), 0, int(ss.max_new)),
                help=gen_docs
            )
            ss.no_repeat = st.slider(
                "no_repeat_ngram_size", 0, 6, _clamp(int(ss.no_repeat), 0, 6),
                help=gen_docs
            )

            # Mode-specific sub-controls
            if ss.mode == "beam":
                ss.num_beams = st.slider(
                    "num_beams", 2, 8, _clamp(int(ss.num_beams), 2, 8),
                    help=gen_docs
                )
                ss.length_penalty = st.slider(
                    "length_penalty", 0.0, 2.0,
                    _clamp(float(ss.length_penalty), 0.0, 2.0),
                    step=0.1, help=gen_docs
                )
            else:
                ss.temperature = st.slider(
                    "temperature", 0.1, 1.5,
                    _clamp(float(ss.temperature), 0.1, 1.5),
                    step=0.05, help=gen_docs
                )
                ss.top_p = st.slider(
                    "top_p", 0.5, 1.0, _clamp(float(ss.top_p), 0.5, 1.0),
                    step=0.01, help=gen_docs
                )

        # Read-only view of the last prompt stored under "last_prompt".
        if "last_prompt" in st.session_state and st.session_state["last_prompt"]:
            with st.expander("Show generated prompt"):
                st.text_area(
                    "Prompt actual:",
                    st.session_state["last_prompt"],
                    height=200,
                    disabled=True
                )
        else:
            st.caption("👉 No prompt is available yet.")

    # ----------------------------
    # Build the parameter dictionary
    # ----------------------------
    params = {
        "persona": ss.persona,
        "mode": ss.mode,
        "max_new_tokens": int(ss.max_new),
        "min_tokens": int(ss.min_tok),
        "no_repeat_ngram_size": int(ss.no_repeat),
        "repetition_penalty": float(ss.repetition_penalty),
    }
    if ss.mode == "beam":
        params.update({
            "num_beams": int(ss.num_beams),
            "length_penalty": float(ss.length_penalty),
        })
    else:
        params.update({
            "temperature": float(ss.temperature),
            "top_p": float(ss.top_p),
        })

    return params
180
+
181
+
182
+ #***************************************************************************
183
+ # Functions
184
+ #***************************************************************************
185
+
186
+
187
def truncate_sentences(text: str, max_sentences: int = 4) -> str:
    """Keep at most ``max_sentences`` leading sentences of ``text``.

    Sentences are split on whitespace that follows ``.``, ``!``, ``?`` or
    ``…``. A final period is appended when the kept text does not already end
    with one of those marks.

    Args:
        text: Text to shorten; surrounding whitespace is ignored.
        max_sentences: Maximum number of sentences to keep. Zero or a negative
            value keeps nothing (a negative value used to drop only the last
            sentence because it was applied as a slice bound).

    Returns:
        The shortened text, or ``""`` when there is nothing to keep.
    """
    if max_sentences <= 0:
        return ""
    s = text.strip()
    if not s:
        return s
    sentence_split = re.compile(r'(?<=[\.\!\?…])\s+')
    cut = " ".join(sentence_split.split(s)[:max_sentences]).strip()
    if cut and cut[-1] not in ".!?…":
        cut += "."
    return cut
195
 
196
+
197
  def _load_json_safe(path: Path, fallback: dict) -> dict:
198
  try:
199
  with open(path, "r", encoding="utf-8") as f:
 
201
  except Exception:
202
  return fallback
203
 
204
+ # Function to clean the question field
205
def limpiar_input():
    """Blank the ``"entrada"`` session key, which backs the question text area."""
    st.session_state["entrada"] = ""
207
+
208
+ # Corrige la ruta correctamente desde Scripts hacia Models
209
def get_model_path(folder_name):
    """Return the relative path ``Models/<folder_name>``.

    The path is relative, so it resolves against the current working
    directory, not against the location of this script.
    """
    return Path("Models") / folder_name
211
+
212
+ # Function to save user interaction
213
def saving_interaction(question, response, context, user_id):
    """Append one question/answer exchange to the CSV and JSONL logs.

    Both logs live in ``Statistics/`` relative to the current working
    directory; the directory and files are created on demand.

    Args:
        question: User input question.
        response: Assistant response to the question.
        context: Context label associated with the question.
        user_id: Identifier of the current user.
    """
    timestamp = dt.datetime.now().isoformat()
    stats_dir = Path("Statistics")
    stats_dir.mkdir(parents=True, exist_ok=True)

    csv_path = stats_dir / "conversaciones_log.csv"
    # Write the header for a missing file AND for an existing-but-empty one;
    # checking existence alone would leave an empty file headerless forever.
    needs_header = not csv_path.exists() or csv_path.stat().st_size == 0
    with open(csv_path, mode="a", encoding="utf-8", newline="") as f_csv:
        writer = csv.writer(f_csv)
        if needs_header:
            writer.writerow(["timestamp", "user_id", "contexto", "pregunta", "respuesta"])
        writer.writerow([timestamp, user_id, context, question, response])

    record = {
        "timestamp": timestamp,
        "user_id": user_id,
        "context": context,
        "pregunta": question,
        "respuesta": response,
    }
    jsonl_path = stats_dir / "conversaciones_log.jsonl"
    with open(jsonl_path, mode="a", encoding="utf-8") as f_jsonl:
        f_jsonl.write(json.dumps(record, ensure_ascii=False) + "\n")
243
+
244
+ # Function to load models within the huggingface repositories space
245
@st.cache_resource
def load_model(path_str):
    """Load a seq2seq model and its tokenizer from a local directory.

    Both loads pass ``local_files_only=True``. The result is wrapped in
    ``st.cache_resource``.

    Args:
        path_str: Directory holding the saved model; resolved to an absolute
            path before loading.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    model_dir = Path(path_str).resolve()
    tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True)
    return model, tokenizer
251
+
252
+ #-------------------------------------------------------------------------
253
+ # Function to correct Spanish sentences' punctuation and missing characters
254
+ #-------------------------------------------------------------------------
255
 
256
  def polish_spanish(s: str) -> str:
257
  s = unicodedata.normalize("NFC", s).strip()
258
+ s = re.sub(r'\s*[\[\(]\s*Assistant\s+(?:Social|T[eé]nico|T[eé]cnico)\s*[\]\)]\s*', '', s, flags=re.I)
259
  fixes = [
260
  (r'(?i)(^|\W)T\s+puedes(?P<p>[^\w]|$)', r'\1Tú puedes\g<p>'),
261
  (r'(?i)(^|\W)T\s+(ya|eres|estas|estás|tienes|puedes)\b', r'\1Tú \2'),
 
267
  (r'(?i)\butiles\b', 'útiles'),
268
  (r'(?i)\butil\b', 'útil'),
269
  (r'(?i)\baqui\b', 'aquí'),
270
+ (r'(?i)\baqu\b(?=\s+estoy\b)', 'aquí'),
271
  (r'(?i)\balgn\b', 'algún'),
272
+ (r'(?i)\balgun\b', 'algún'),
273
  (r'(?i)\bAnimo\b', 'Ánimo'),
274
+ (r'(?i)\bcario\b', 'cariño'),
275
  (r'(?i)\baprendisaje\b', 'aprendizaje'),
276
  (r'(?i)\bmanana\b', 'mañana'),
277
+ (r'(?i)\bmaana\b', 'mañana'),
278
  (r'(?i)\benergia\b', 'energía'),
279
+ (r'(?i)\benerga\b', 'energía'),
280
  (r'(?i)\bextrano\b', 'extraño'),
281
  (r'(?i)\bextrana\b', 'extraña'),
282
  (r'(?i)\bextranar\b', 'extrañar'),
283
  (r'(?i)\bextranarte\b', 'extrañarte'),
284
  (r'(?i)\bextranas\b', 'extrañas'),
285
  (r'(?i)\bextranos\b', 'extraños'),
286
+ (r'(?i)\baqu\b', 'aquí'),
287
+ (r'(?i)\baqui\b', 'aquí'),
288
  (r'(?i)\bestare\b', 'estaré'),
289
+ (r'(?i)\bclarn\b', 'clarín'),
290
  (r'(?i)\bclarin\b', 'clarín'),
291
  (r'(?i)\bclar[íi]n\s+cornetas\b', 'clarín cornetas'),
292
  (r'(?i)(^|\s)s([,.;:!?])', r'\1Sí\2'),
293
  (r'(?i)\bfutbol\b', 'fútbol'),
294
  (r'(?i)(^|\s)as(\s+se\b)', r'\1Así\2'),
295
+ (r'(?i)(^|\s)s(\s+orientarte\b)', r'\1sí\2'),
296
  (r'(?i)\bbuen dia\b', 'buen día'),
297
  (r'(?i)\bgran dia\b', 'gran día'),
298
  (r'(?i)\bdias\b', 'días'),
299
  (r'(?i)\bdia\b', 'día'),
300
+ (r'(?i)\bgran da\b', 'gran día'),
301
  (r'(?i)\bacompa?a(r|rte|do|da|dos|das)?\b', r'acompaña\1'),
302
+ (r'(?i)(^|\s)as([,.;:!?]|\s|$)', r'\1así\2'),
303
  (r'(?i)(^|\s)S lo se\b', r'\1Sí lo sé'),
304
+ (r'(?i)(^|\s)S lo sé\b', r'\1Sí lo sé'),
305
+ (r'(?i)\bcudese\b', 'cuídese'),
306
+ (r'(?i)\bpequeo\b', 'pequeño'),
307
+ (r'(?i)\bpequea\b', 'pequeña'),
308
+ (r'(?i)\bpequeos\b', 'pequeños'),
309
+ (r'(?i)\bpequeas\b', 'pequeñas'),
310
+ (r'(?i)\bunico\b', 'único'),
311
+ (r'(?i)\bunica\b', 'única'),
312
+ (r'(?i)\bunicos\b', 'únicos'),
313
+ (r'(?i)\bunicas\b', 'únicas'),
314
+ (r'(?i)\bnico\b', 'único'),
315
+ (r'(?i)\bnica\b', 'única'),
316
+ (r'(?i)\bnicos\b', 'únicos'),
317
+ (r'(?i)\bnicas\b', 'únicas'),
318
+ (r'(?i)\bestadstico\b', 'estadístico'),
319
+ (r'(?i)\bestadstica\b', 'estadística'),
320
+ (r'(?i)\bestadsticos\b', 'estadísticos'),
321
+ (r'(?i)\bestadsticas\b', 'estadísticas'),
322
+ (r'(?i)\bcudate\b', 'cuídate'),
323
  (r'(?i)\bcuidate\b', 'cuídate'),
324
  (r'(?i)\bcuidese\b', 'cuídese'),
325
+ (r'(?i)\bcudese\b', 'cuídese'),
326
  (r'(?i)\bcuidense\b', 'cuídense'),
327
+ (r'(?i)\bcudense\b', 'cuídense'),
328
  (r'(?i)\bgracias por confiar en m\b', 'gracias por confiar en mí'),
329
  (r'(?i)\bcada dia\b', 'cada día'),
330
+ (r'(?i)\bcada da\b', 'cada día'),
331
  (r'(?i)\bsegun\b', 'según'),
332
  (r'(?i)\bcaracteristica(s)?\b', r'característica\1'),
333
  (r'(?i)\bcaracterstica(s)?\b', r'característica\1'),
 
336
  ]
337
  for pat, rep in fixes:
338
  s = re.sub(pat, rep, s)
339
+
340
  s = re.sub(r'(?i)^eso es todo!(?P<r>(\s|$).*)', r'¡Eso es todo!\g<r>', s)
341
+
342
+ def add_opening_q(m):
343
+ cuerpo = m.group('qbody')
344
+ if '¿' in cuerpo:
345
+ return m.group(0)
346
+ return f"{m.group('pre')}¿{cuerpo}"
347
+ s = re.sub(r'(?P<pre>(^|[\.!\…]\s+))(?P<qbody>[^?]*\?)', add_opening_q, s)
348
+
349
+ def _open_exclam(m):
350
+ palabra = m.group('w')
351
+ resto = m.group('r') or ''
352
+ return f'¡{palabra}!{resto}'
353
+ s = re.sub(r'(?i)^(?P<w>(hola|gracias|genial|perfecto|claro|por supuesto|con gusto|listo|vaya|wow|tu puedes|tú puedes|clarín|clarin|clarín cornetas))!(?P<r>(\s|$).*)',_open_exclam, s)
354
+
355
  s = re.sub(r'\s+', ' ', s).strip()
356
  if s and s[-1] not in ".!?…":
357
  s += "."
358
  return s
359
 
360
+ #-------------------------------------------------------------------------
361
+ # Function to remove repeated input in the Model answer
362
+ #-------------------------------------------------------------------------
 
 
 
363
 
364
  def anti_echo(response: str, user_text: str) -> str:
365
  rn = normalize_for_route(response)
 
375
  return _clean_leading(response[len(user_text):])
376
  return response
377
 
378
+ #-------------------------------------------------------------------------
379
+ # Normalization helpers
380
+ #-------------------------------------------------------------------------
381
+
382
def normalize_for_route(s: str) -> str:
    """Reduce text to a lowercase, accent-free, punctuation-free form.

    Steps: NFKD-decompose and drop combining marks (so ``é`` becomes ``e`` and
    ``ñ`` becomes ``n``), replace every character that is not a word
    character, whitespace or ``-`` with a space, collapse whitespace runs,
    strip, and lowercase.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    s = re.sub(r"[^\w\s-]", " ", s, flags=re.UNICODE)
    return re.sub(r"\s+", " ", s).strip().lower()
388
+
389
# Lookup tables for the autopunctuation helpers. The visible callers compare
# them against text produced by normalize_for_route (lowercase, accents
# removed), so the accented variants kept below cannot match on that path.

# Interrogative openers that mark a short message as a question.
_Q_STARTERS = {
    "como", "que", "quien", "quienes", "cuando", "donde", "por que", "para que",
    "cual", "cuales", "cuanto", "cuantos", "cuanta", "cuantas",
}
# Fragments that mark a short message as an exclamation.
_EXC_TRIGGERS = {
    "motiva", "motivame", "animate", "animame", "animo", "ayudame",
    "ayudame porfa", "clarin", "clarín", "clarinete", "clarin cornetas",
}
# Phrases that are shown without any question/exclamation marks.
SPECIAL_NOPUNCT = {"kiubo", "quiubo", "que chido", "qué chido", "que buena onda"}
# Verbs that, as the first word, mark a short message as a yes/no question.
_Q_VERB_STARTERS = {
    "eres", "estas", "estás", "puedes", "sabes", "tienes", "quieres", "conoces",
    "crees", "piensas", "dirias", "dirías", "podrias", "podrías", "podras", "podrás",
}
397
+
398
+ #-------------------------------------------------------------------------
399
+ # Punctuation helpers
400
+ #-------------------------------------------------------------------------
401
+
402
def needs_question_marks(norm: str) -> bool:
    """Tell whether text lacks ``?`` and opens with an interrogative starter.

    Args:
        norm: Text to inspect; it is compared against the entries of
            ``_Q_STARTERS`` as-is.

    Returns:
        ``True`` when ``norm`` has no ``?`` and either equals a starter or
        begins with a starter followed by a space.
    """
    if "?" in norm:
        return False
    return any(norm == w or norm.startswith(w + " ") for w in _Q_STARTERS)
408
+
409
def needs_exclam(norm: str) -> bool:
    """Tell whether text lacks ``!`` and contains an exclamation trigger.

    NOTE(review): triggers are matched as substrings, so "motiva" also fires
    inside "motivacion" -- confirm whether whole-word matching is wanted.
    """
    if "!" in norm:
        return False
    return any(t in norm for t in _EXC_TRIGGERS)
412
+
413
+ #-------------------------------------------------------------------------
414
+ # Greetings detection
415
+ #-------------------------------------------------------------------------
416
+
417
def is_slang_greeting(norm: str) -> bool:
    """Tell whether text is, or starts with, a known slang greeting.

    Matches an exact entry of the short list, a "q/k/ke/que" opener followed
    by "pex/onda/rollo/show/tranza" as a whole word, or a leading "kiubo"
    spelling variant.
    """
    short = {
        "que pex", "que onda", "ke pex", "k pex", "q onda",
        "kiubo", "quiubo", "quiubole", "quiúbole", "kionda", "k onda",
        "que rollo", "ke onda", "que show", "que tranza",
    }
    if norm in short:
        return True
    if re.match(r"^(q|k|ke|que)\s+(pex|onda|rollo|show|tranza)\b", norm):
        return True
    return bool(re.match(r"^(kiubo|quiubo|quiubole|quiúbole|quiubol[e]?)\b", norm))
427
+
428
+ #-------------------------------------------------------------------------
429
+ # Capitalization & autopunct
430
+ #-------------------------------------------------------------------------
431
+
432
def capitalize_spanish(s: str) -> str:
    """Strip ``s`` and uppercase its first alphabetic character.

    Leading non-letters (such as ``¿`` or ``¡``) are kept in place; text
    without any letter is returned stripped but otherwise unchanged.
    """
    s = s.strip()
    for i, ch in enumerate(s):
        if ch.isalpha():
            return s[:i] + ch.upper() + s[i + 1:]
    return s
440
+
441
def smart_autopunct(user_text: str) -> str:
    """Add Spanish question/exclamation marks to a short message and capitalize it.

    Messages longer than 20 characters are only capitalized. Shorter ones are
    checked in order: no-punctuation phrases (marks removed), "y si ..."
    (question), existing ``?`` / ``!`` (missing opening mark added), slang
    greetings (exclamation), interrogative/verb/request openers (question) and
    exclamation triggers (exclamation).

    A mark is only added when the text does not already contain it, so input
    that is already punctuated (e.g. "¿como estas?") is not wrapped a second
    time.
    """

    def _wrap(text: str, opener: str, closer: str) -> str:
        """Surround ``text`` with the marks it does not already contain."""
        if opener not in text:
            text = opener + text
        if closer not in text:
            text = text + closer
        return text

    s = user_text.strip()
    if len(s) > 20:
        return capitalize_spanish(s)

    norm = normalize_for_route(s)
    if norm in SPECIAL_NOPUNCT:
        s = re.sub(r'[¿?!¡]+', '', s).strip()
        return capitalize_spanish(s)
    if norm.startswith("y si "):
        return capitalize_spanish(_wrap(s, "¿", "?"))

    # The user already typed a closing mark: only supply the opening one.
    if "?" in s:
        return capitalize_spanish(_wrap(s, "¿", "?"))
    if "!" in s:
        return capitalize_spanish(_wrap(s, "¡", "!"))

    if is_slang_greeting(norm):
        return capitalize_spanish(_wrap(s, "¡", "!"))

    toks = norm.split()
    looks_like_question = (
        needs_question_marks(norm)
        or (bool(toks) and toks[0] in _Q_VERB_STARTERS)
        or re.match(r"^(me\s+ayudas?|me\s+puedes|podrias?|podras?)\b", norm) is not None
    )
    if looks_like_question:
        return capitalize_spanish(_wrap(s, "¿", "?"))
    if needs_exclam(norm):
        return capitalize_spanish(_wrap(s, "¡", "!"))
    return capitalize_spanish(s)
475
+
476
+
477
+ #-------------------------------------------------------------------------
478
+ # Seeds & helpers
479
+ #-------------------------------------------------------------------------
480
 
481
def set_seeds(seed: int = 42):
    """Seed ``random``, NumPy and PyTorch and request deterministic cuDNN.

    CUDA generators are seeded only when CUDA is available.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
486
 
487
+ # --- Personalidades (solo estilo en prompt; parámetros ya vienen del sidebar) ---
488
+
489
def persona_style_prompt(persona: str, domain: str) -> str:
    """Return the style instruction that is prepended to the prompt.

    Args:
        persona: Personality name; only ``"Enthusiastic"`` yields an
            instruction.
        domain: Accepted for interface compatibility; not used.

    Returns:
        The instruction text (with a trailing space) for ``"Enthusiastic"``,
        otherwise an empty string.
    """
    if persona == "Enthusiastic":
        return "Responde de forma creativa, usa al menos 232 palabras. "
    return ""
497
+
498
+ #-------------------------------------------------------------------------
499
+ # Classifier
500
+ #-------------------------------------------------------------------------
501
+
502
def classify_context(question, label_classes, model, tokenizer, device):
    """Predict the context label of a question with the classifier model.

    Args:
        question: Text to classify.
        label_classes: Sequence indexed by class id that yields the label.
        model: Classifier whose output exposes ``logits``.
        tokenizer: Tokenizer matching ``model``; input is truncated to 128
            tokens.
        device: Device the model and inputs are moved to.

    Returns:
        The entry of ``label_classes`` at the arg-max logit.
    """
    # eval() keeps this in line with the answer functions, which also switch
    # the model to inference mode before using it.
    model = model.to(device).eval()
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_intent = torch.argmax(logits, dim=1).item()
    return label_classes[pred_intent]
512
+
513
+ #-------------------------------------------------------------------------
514
+ # Chatbot response for technical contexts using a Hugging Face model
515
+ #-------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
 
 
 
 
517
  def technical_asnwer(question, context, model, tokenizer, device, gen_params=None):
518
  model = model.to(device).eval()
519
+ persona_name = (gen_params or {}).get("persona", st.session_state.get("persona", "Normal"))
520
+ style = persona_style_prompt(persona_name, "technical")
521
+
522
+ # Promp Engineering para ayudar al asistente a encontrar la mejor respuesta
523
+ input_text = f"{style}Context: {context} [SEP] Question: {question}."
524
+
525
+ st.session_state["last_prompt"] = input_text # o prompt
 
 
 
 
 
 
526
  st.session_state["just_generated"] = True
527
+ #st.rerun()
528
+ enc = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
529
 
530
  bad_words = ["["]
531
  bad_ids = [tokenizer(bw, add_special_tokens=False).input_ids for bw in bad_words]
532
 
533
+ # --- construir kwargs de generación, SIN tocar nada por personalidad ---
534
+ max_new = int((gen_params).get("max_new_tokens"))
535
+ min_new = int((gen_params).get("min_tokens")) # <- ahora SIEMPRE min_new_tokens
536
+ no_repeat = int((gen_params).get("no_repeat_ngram_size"))
537
+ rep_pen = float((gen_params).get("repetition_penalty"))
538
  mode = (gen_params or {}).get("mode", "beam")
539
 
 
 
 
540
  if mode == "sampling":
541
+ temperature = float((gen_params or {}).get("temperature", 0.7))
542
  top_p = float((gen_params or {}).get("top_p", 0.9))
543
  kwargs = dict(
544
+ do_sample=True,
545
+ num_beams=1,
546
  temperature=max(0.1, temperature),
547
  top_p=min(1.0, max(0.5, top_p)),
548
  max_new_tokens=max_new,
549
+ min_new_tokens=max(0, min_new), # 👈 consistente
550
  no_repeat_ngram_size=no_repeat,
551
  repetition_penalty=max(1.0, rep_pen),
552
  bad_words_ids=bad_ids,
553
+ eos_token_id=tokenizer.eos_token_id,
554
+ pad_token_id=tokenizer.pad_token_id,
555
  )
556
  else:
557
  num_beams = max(2, int((gen_params or {}).get("num_beams", 4)))
558
  length_penalty = float((gen_params or {}).get("length_penalty", 1.0))
559
  kwargs = dict(
560
+ do_sample=False,
561
+ num_beams=num_beams,
562
+ length_penalty=length_penalty,
563
  max_new_tokens=max_new,
564
+ min_new_tokens=max(0, min_new), # 👈 también aquí (no min_length)
565
  no_repeat_ngram_size=no_repeat,
566
  repetition_penalty=max(1.0, rep_pen),
567
  bad_words_ids=bad_ids,
568
+ eos_token_id=tokenizer.eos_token_id,
569
+ pad_token_id=tokenizer.pad_token_id,
570
  )
571
 
572
  out_ids = model.generate(
 
574
  )
575
  text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
576
 
577
+ if persona_name == "Normal":
578
  text = truncate_sentences(text, max_sentences=1)
579
 
580
  st.session_state["last_response"] = text
581
+ #st.rerun()
582
+
583
+
584
  return polish_spanish(text)
585
 
586
+ #-------------------------------------------------------------------------
587
+ # Chatbot response for social contexts using a Hugging Face model
588
+ #-------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
def social_asnwer(question, model, tokenizer, device, gen_params=None, block_web=True):
    """Generate the assistant's reply for a social (small-talk) message.

    The question itself is used as the prompt. The prompt and the final
    response are stored in ``st.session_state`` under ``"last_prompt"`` and
    ``"last_response"``, and ``"just_generated"`` is set to ``True``.

    Args:
        question: User message.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; input is truncated to 192
            tokens.
        device: Device the model and inputs are moved to.
        gen_params: Optional generation settings (``persona``, ``mode``,
            ``max_new_tokens``, ``min_tokens``, ``no_repeat_ngram_size``,
            ``repetition_penalty`` and the mode-specific values). Missing
            entries, or ``None``, fall back to defaults.
        block_web: When true, web-related fragments are added to the banned
            word list.

    Returns:
        The polished, capitalized response text.
    """
    # gen_params defaults to None in the signature, so never dereference it raw.
    params = gen_params or {}

    model = model.to(device).eval()
    persona_name = params.get("persona", st.session_state.get("persona", "Normal"))
    prompt = question

    st.session_state["last_prompt"] = prompt
    st.session_state["just_generated"] = True

    enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=192).to(device)

    bad_words = ["[", "Thanks", "thank you"]
    if block_web:
        bad_words += ["website", "http", "www", ".com"]
    bad_ids = [tokenizer(bw, add_special_tokens=False).input_ids for bw in bad_words]

    # Settings shared by both decoding strategies.
    kwargs = dict(
        max_new_tokens=int(params.get("max_new_tokens", 128)),
        min_new_tokens=max(0, int(params.get("min_tokens", 16))),
        no_repeat_ngram_size=int(params.get("no_repeat_ngram_size", 3)),
        repetition_penalty=max(1.0, float(params.get("repetition_penalty", 1.0))),
        bad_words_ids=bad_ids,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    if params.get("mode", "beam") == "sampling":
        temperature = float(params.get("temperature", 0.7))
        top_p = float(params.get("top_p", 0.9))
        kwargs.update(
            do_sample=True,
            num_beams=1,
            temperature=max(0.1, temperature),
            top_p=min(1.0, max(0.5, top_p)),
        )
    else:
        kwargs.update(
            do_sample=False,
            num_beams=max(2, int(params.get("num_beams", 4))),
            length_penalty=float(params.get("length_penalty", 1.0)),
        )

    out_ids = model.generate(
        input_ids=enc["input_ids"], attention_mask=enc["attention_mask"], **kwargs
    )
    text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    if persona_name == "Normal":
        text = truncate_sentences(text, max_sentences=2)

    text = polish_spanish(text)
    text = capitalize_spanish(text)

    st.session_state["last_response"] = text
    return text
663
 
664
+ #-------------------------------------------------------------------------
665
+ # Rule overrides
666
+ #-------------------------------------------------------------------------
 
 
 
 
667
 
668
def rule_intent_override(user_text: str, predicted_label: str) -> str:
    """Force the ``"social"`` label for a few fixed short messages.

    The message is normalized first; when the whole normalized text is one of
    the listed phrases the result is ``"social"``, otherwise the classifier's
    ``predicted_label`` is returned unchanged.
    """
    normalized = normalize_for_route(user_text)
    if re.fullmatch(r"(motivame|motiva|animame|animo|ayudame|que tranza|qué tranza|que tranza)", normalized):
        return "social"
    return predicted_label
673
 
674
+ #-------------------------------------------------------------------------
675
+ # Router
676
+ #-------------------------------------------------------------------------
 
 
677
 
678
def contextual_asnwer(question, label_classes, context_model, cont_tok,
                      tec_model, tec_tok, soc_model, soc_tok, device, gen_params=None, block_web=True):
    """Classify a question and route it to the social or technical responder.

    The classifier's label may be overridden by ``rule_intent_override``. When
    ``gen_params`` carries a ``"seed"`` entry the random generators are seeded
    before generating.

    Args:
        question: User message.
        label_classes: Class-id to label lookup for the classifier.
        context_model, cont_tok: Context classifier and its tokenizer.
        tec_model, tec_tok: Technical answer model and its tokenizer.
        soc_model, soc_tok: Social answer model and its tokenizer.
        device: Device used for inference.
        gen_params: Optional generation settings forwarded to the responder.
        block_web: Forwarded to the social responder.

    Returns:
        tuple: ``(response_text, context_label)``.
    """
    context = classify_context(question, label_classes, context_model, cont_tok, device)
    context = rule_intent_override(question, context)

    if gen_params and "seed" in gen_params:
        set_seeds(gen_params["seed"])

    if context == "social":
        answer = social_asnwer(question, soc_model, soc_tok, device,
                               gen_params=gen_params, block_web=block_web)
    else:
        answer = technical_asnwer(question, context, tec_model, tec_tok, device,
                                  gen_params=gen_params)
    return answer, context
696
+
697
+ #***************************************************************************
698
  # MAIN
699
+ #***************************************************************************
700
+
701
  if __name__ == '__main__':
702
+
703
+ # --- Estado que debe persistir en todos los reruns ---
704
  ss = st.session_state
705
  ss.setdefault("historial", [])
706
  ss.setdefault("last_prompt", "")
707
  ss.setdefault("last_response", "")
708
  ss.setdefault("just_generated", False)
709
+
710
+ # Sidebar (control total)
711
+ GEN_PARAMS = sidebar_params()
712
+ GEN_PARAMS["persona"] = st.session_state.persona # por si acaso
713
+
714
+ # Setting historial for the current user
715
+ #if "historial" not in st.session_state:
716
+ # st.session_state.historial = []
717
+
718
+ # Assigning a new ID to the current user
719
+ if "user_id" not in st.session_state:
720
+ st.session_state["user_id"] = str(uuid.uuid4())[:8]
721
+
722
+ # Loading classifier encoder classes:
723
+ labels_path = hf_hub_download(repo_id="tecuhtli/assistant-classifier-bert", filename="context_labels.pkl", use_auth_token=HF_TOKEN)
724
+ label_classes = joblib.load(labels_path)
725
+
726
+ # Loading Saved Models
727
+ # Modelo Contexto
728
+ context_model = AutoModelForSequenceClassification.from_pretrained("tecuhtli/assistant-classifier-bert", use_auth_token=HF_TOKEN)
729
+ cont_tok = AutoTokenizer.from_pretrained("tecuhtli/assistant-classifier-bert", use_auth_token=HF_TOKEN)
730
+
731
+ # Modelo Técnico
732
+ tec_tok = AutoTokenizer.from_pretrained("tecuhtli/assistant-technical-t5", use_auth_token=HF_TOKEN)
733
+ tec_model = AutoModelForSeq2SeqLM.from_pretrained("tecuhtli/assistant-technical-t5", use_auth_token=HF_TOKEN)
734
 
735
+ # Modelo Social
736
+ soc_tok = AutoTokenizer.from_pretrained("tecuhtli/assistant-social-t5", use_auth_token=HF_TOKEN)
737
+ soc_model = AutoModelForSeq2SeqLM.from_pretrained("tecuhtli/assistant-social-t5", use_auth_token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
 
739
+ # Available Device
740
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
741
 
742
+ # Defining Assistant Presentation
743
+ st.title("🤖 Your Personal Assistant 🎓")
 
744
 
745
+ st.caption("🙋🏽‍ You can ask me about technical concepts such as visualization, data cleaning, BI, and more.")
746
+ st.caption("🙇🏽 I can *only* understand and answer in Spanish (🦅🇲🇽🌵).")
747
+ st.caption("➡️ At this stage, I can respond to simple questions such as:")
748
+ st.caption(" • ¿Cómo estás? • ¿Qué es...? • Explícame algo • Define algo • ¿Para qué sirve...?")
749
+
750
+ st.caption("😊 If you want to know me better, visit: [hazutecuhtli.github.io](https://hazutecuhtli.github.io)")
751
 
 
 
 
 
 
 
752
  st.markdown("<br>", unsafe_allow_html=True)
 
753
 
754
+ st.caption("✏️ Type **'salir'** to exit.")
755
+
756
+ # 🔁 Limpieza segura antes del formulario
757
+ if st.session_state.pop("_clear_entrada", False):
758
+ if "entrada" in st.session_state:
759
+ del st.session_state["entrada"]
760
 
761
+ # 🧠 Flash de respuesta (la guardamos, pero la mostraremos después del form)
762
+ _flash = st.session_state.pop("_flash_response", None)
763
 
764
+
765
+ with st.form("formulario_assistant"):
766
  user_question = st.text_area("📝 Escribe tu pregunta aquí", key="entrada", height=100)
767
  submitted = st.form_submit_button("Responder")
768
 
769
  if submitted:
770
  if not user_question:
771
+ st.info("Chatbot: ¿Podrías repetir eso? No entendí bien 😅")
772
  else:
773
+ response, context = contextual_asnwer(
774
+ user_question, label_classes, context_model, cont_tok,
775
+ tec_model, tec_tok, soc_model, soc_tok, device,
776
+ gen_params=GEN_PARAMS, block_web=True,
777
+ )
778
 
779
+ # 🧠 Guarda historial
780
  hora_actual = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
781
+ st.session_state.historial.append(("Tú", user_question, hora_actual))
782
+
783
  hora_actual = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
784
+ st.session_state.historial.append(("Assistant", response, hora_actual))
785
+
786
+ # 💾 Guarda conversación
787
+ saving_interaction(user_question, response, context, st.session_state["user_id"])
788
+
789
+ # 🟩 Guarda respuesta para mostrar después del rerun
790
+ st.session_state["_flash_response"] = response
791
 
792
+ # 🧼 Limpieza del textarea en el próximo ciclo
793
+ st.session_state["_clear_entrada"] = True
794
 
795
+ # ♻️ Forzar refresh (sidebar verá el nuevo prompt)
 
 
796
  st.rerun()
797
 
798
+ # -----------------------------------------------------------
799
+ # 💬 Mostrar la respuesta actual (flash) justo aquí ↓↓↓
800
+ # -----------------------------------------------------------
801
  if _flash:
802
  st.success(_flash)
803
 
804
+ # Mostrar último mensaje (opcional, arriba de todo)
805
+ #if st.session_state.get("just_generated"):
806
+ # if st.session_state["last_response"]:
807
+ # st.success(st.session_state["last_response"])
808
+ # st.session_state["just_generated"] = False
809
+
810
+ # ... formulario y lógica de respuesta ...
811
+
812
+ # 🔁 Historial con estilo chat y contenedor con scroll
813
+ if st.session_state.historial:
814
  st.markdown("---")
815
 
816
+ # 💾 Botón de descarga arriba del historial
817
  lineas = []
818
+ for msg in reversed(st.session_state.historial):
819
  if len(msg) == 3:
820
  autor, texto, hora = msg
821
  lineas.append(f"[{hora}] {autor}: {texto}")
 
827
  st.download_button(
828
  label="💾 Descargar conversación como .txt",
829
  data=texto_chat,
830
+ file_name="conversacion_assistant.txt",
831
  mime="text/plain",
832
  use_container_width=True
833
  )
834
 
835
+ # 🪟 Contenedor con scroll y burbujas
836
  st.markdown(
837
  """
838
  <div id="chat-container" style="
 
848
  unsafe_allow_html=True
849
  )
850
 
851
+ for msg in reversed(st.session_state.historial):
852
  if len(msg) == 3:
853
  autor, texto, _ = msg
854
  else:
 
898
  )
899
 
900
  st.markdown("</div>", unsafe_allow_html=True)
901
+
902
  #***************************************************************************
903
  # FIN
904
  #***************************************************************************