geronimo-pericoli commited on
Commit
27e76cc
·
verified ·
1 Parent(s): 946afe9

Create logging_manager.py

Browse files
Files changed (1) hide show
  1. logging_manager.py +485 -0
logging_manager.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # logging_manager.py
2
+ import pandas as pd
3
+ import json
4
+ from datetime import datetime
5
+ from typing import Optional, Union, List, Dict
6
+ from huggingface_hub import HfApi, file_exists, hf_hub_download, list_repo_files
7
+ import gradio as gr # Solo para el tipo LikeData
8
+
9
+ class LoggingManager:
10
+ def __init__(self, repo_name: str = "pharma-IA", project_id: str = "logs-engineering", hf_token: Optional[str] = None):
11
+ """
12
+ Inicializa el gestor de logs con configuración básica.
13
+
14
+ Args:
15
+ repo_name: Nombre del repositorio en Hugging Face
16
+ project_id: ID del proyecto dentro del repositorio
17
+ hf_token: Token de autenticación para Hugging Face Hub
18
+ """
19
+ self.repo_name = repo_name
20
+ self.project_id = project_id
21
+ self.hf_token = hf_token
22
+ self.repo_id = f"{repo_name}/{project_id}"
23
+ self.api = HfApi()
24
+
25
+ # --------------------------
26
+ # Funciones básicas de utilidad
27
+ # --------------------------
28
+
29
+ def _get_current_filename(self) -> str:
30
+ """Genera el nombre del archivo de logs para el mes actual."""
31
+ return f"logs_{datetime.now().strftime('%Y-%m')}.csv"
32
+
33
+ @staticmethod
34
+ def _normalize_text(text: str) -> str:
35
+ """Normaliza texto para comparaciones."""
36
+ return text.strip().lower()
37
+
38
+ def _file_exists(self, filename: str) -> bool:
39
+ """Verifica si un archivo existe en el repositorio."""
40
+ try:
41
+ files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
42
+ return filename in files
43
+ except Exception as e:
44
+ print(f"Error checking file existence: {e}")
45
+ return False
46
+
47
+ # --------------------------
48
+ # Funciones principales
49
+ # --------------------------
50
+
51
+ def save_interaction(self, user_message: Union[str, dict], response_text: str, user: str) -> bool:
52
+ """
53
+ Guarda una interacción usuario-sistema en los logs.
54
+
55
+ Args:
56
+ user_message: Mensaje del usuario (puede ser string o dict con texto/archivos)
57
+ response_text: Respuesta generada por el sistema
58
+ user: Identificador del usuario
59
+
60
+ Returns:
61
+ bool: True si se guardó correctamente, False si hubo error
62
+ """
63
+ try:
64
+ filename = self._get_current_filename()
65
+
66
+ # Cargar dataframe existente o crear uno nuevo
67
+ if self._file_exists(filename):
68
+ local_path = hf_hub_download(
69
+ repo_id=self.repo_id,
70
+ filename=filename,
71
+ repo_type="dataset",
72
+ token=self.hf_token
73
+ )
74
+ df = pd.read_csv(local_path)
75
+ else:
76
+ df = pd.DataFrame(columns=["timestamp", "user_message", "response_text", "flag", "user"])
77
+
78
+ # Preparar nuevo registro
79
+ new_entry = {
80
+ "timestamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
81
+ "user_message": json.dumps(user_message) if isinstance(user_message, dict) else user_message,
82
+ "response_text": response_text,
83
+ "flag": "",
84
+ "user": user
85
+ }
86
+
87
+ # Añadir y guardar
88
+ df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
89
+ df.to_csv(filename, index=False)
90
+
91
+ # Subir al repositorio
92
+ self.api.upload_file(
93
+ path_or_fileobj=filename,
94
+ path_in_repo=filename,
95
+ repo_id=self.repo_id,
96
+ token=self.hf_token,
97
+ repo_type="dataset"
98
+ )
99
+ return True
100
+
101
+ except Exception as e:
102
+ print(f"Error saving interaction: {e}")
103
+ return False
104
+
105
+ def record_feedback(self, feedback_data: gr.LikeData) -> bool:
106
+ """
107
+ Registra feedback de usuario (like/dislike) en los logs.
108
+
109
+ Args:
110
+ feedback_data: Datos del feedback desde Gradio
111
+
112
+ Returns:
113
+ bool: True si se registró correctamente
114
+ """
115
+ if not feedback_data:
116
+ print("No feedback data provided")
117
+ return False
118
+
119
+ try:
120
+ text_value = feedback_data.value if isinstance(feedback_data.value, str) else feedback_data.value.get('value', '')
121
+ filename = self._get_current_filename()
122
+
123
+ if not self._file_exists(filename):
124
+ print(f"Log file {filename} doesn't exist")
125
+ return False
126
+
127
+ # Descargar y cargar logs
128
+ local_path = hf_hub_download(
129
+ repo_id=self.repo_id,
130
+ filename=filename,
131
+ repo_type="dataset",
132
+ token=self.hf_token
133
+ )
134
+ df = pd.read_csv(local_path)
135
+
136
+ # Buscar la interacción correspondiente
137
+ normalized_value = self._normalize_text(text_value)
138
+ df['normalized_response'] = df['response_text'].apply(self._normalize_text)
139
+
140
+ matching_indices = df.index[
141
+ df['normalized_response'].str.contains(normalized_value, na=False, regex=False)
142
+ ].tolist()
143
+
144
+ if matching_indices:
145
+ last_match = matching_indices[-1]
146
+ df.at[last_match, 'flag'] = str(feedback_data.liked)
147
+ df = df.drop(columns=['normalized_response'])
148
+
149
+ # Guardar cambios
150
+ df.to_csv(filename, index=False)
151
+ self.api.upload_file(
152
+ path_or_fileobj=filename,
153
+ path_in_repo=filename,
154
+ repo_id=self.repo_id,
155
+ token=self.hf_token,
156
+ repo_type="dataset"
157
+ )
158
+ return True
159
+ else:
160
+ print("No matching interaction found for feedback")
161
+ return False
162
+
163
+ except Exception as e:
164
+ print(f"Error recording feedback: {e}")
165
+ return False
166
+
167
+ def save_evaluation_metrics(
168
+ self,
169
+ query: str,
170
+ faithfulness_score: float,
171
+ answer_relevancy_score: float,
172
+ context_relevancy_score: float
173
+ ) -> bool:
174
+ """
175
+ Guarda métricas de evaluación para una interacción específica.
176
+
177
+ Args:
178
+ query: Texto de la consulta original (puede ser string JSON o texto plano)
179
+ faithfulness_score: Puntaje de groundedness/faithfulness
180
+ answer_relevancy_score: Puntaje de relevancia de la respuesta
181
+ context_relevancy_score: Puntaje de relevancia del contexto
182
+
183
+ Returns:
184
+ bool: True si se guardó correctamente
185
+ """
186
+ try:
187
+ filename = self._get_current_filename()
188
+ if not self._file_exists(filename):
189
+ print(f"Log file {filename} doesn't exist")
190
+ return False
191
+
192
+ # Cargar logs
193
+ local_path = hf_hub_download(
194
+ repo_id=self.repo_id,
195
+ filename=filename,
196
+ repo_type="dataset",
197
+ token=self.hf_token
198
+ )
199
+ df = pd.read_csv(local_path)
200
+
201
+ # Extraer el texto real de la query (maneja ambos formatos)
202
+ try:
203
+ # Si la query es un string JSON, extraemos el campo 'text'
204
+ import json
205
+ query_dict = json.loads(query.replace("'", '"')) # Normalizamos comillas
206
+ query_text = query_dict['text']
207
+ except:
208
+ # Si no es JSON, usamos la query directamente
209
+ query_text = query
210
+
211
+ # Buscar la interacción más reciente que coincida con la consulta
212
+ norm_query = self._normalize_text(query_text)
213
+
214
+ def extract_query_text(cell):
215
+ try:
216
+ if pd.isna(cell):
217
+ return ""
218
+ cell_dict = json.loads(cell.replace("'", '"'))
219
+ return self._normalize_text(cell_dict['text'])
220
+ except:
221
+ return self._normalize_text(str(cell))
222
+
223
+ matches = df.index[
224
+ df['user_message'].apply(extract_query_text) == norm_query
225
+ ].tolist()
226
+
227
+ if matches:
228
+ last_match = matches[-1]
229
+ df.at[last_match, 'groundedness'] = faithfulness_score
230
+ df.at[last_match, 'answer_relevancy'] = answer_relevancy_score
231
+ df.at[last_match, 'context_relevancy'] = context_relevancy_score
232
+
233
+ # Guardar cambios
234
+ df.to_csv(filename, index=False)
235
+ self.api.upload_file(
236
+ path_or_fileobj=filename,
237
+ path_in_repo=filename,
238
+ repo_id=self.repo_id,
239
+ token=self.hf_token,
240
+ repo_type="dataset"
241
+ )
242
+ return True
243
+ else:
244
+ print("No matching query found in logs")
245
+ print(f"Buscando: '{norm_query}'")
246
+ print("Consultas existentes:", df['user_message'].apply(extract_query_text).unique())
247
+ return False
248
+
249
+ except Exception as e:
250
+ print(f"Error saving evaluation metrics: {e}")
251
+ return False
252
+
253
+ def save_node_references(
254
+ self,
255
+ query: dict,
256
+ source_nodes: list,
257
+ kg_nodes: list
258
+ ) -> bool:
259
+ """
260
+ Guarda referencias a nodos utilizados en una respuesta.
261
+
262
+ Args:
263
+ query: Consulta original (dict con al menos campo 'text')
264
+ source_nodes: Nodos de documentos usados
265
+ kg_nodes: Nodos de knowledge graph usados
266
+
267
+ Returns:
268
+ bool: True si se guardó correctamente
269
+ """
270
+ try:
271
+ filename = self._get_current_filename()
272
+ if not self._file_exists(filename):
273
+ print(f"Log file {filename} doesn't exist")
274
+ return False
275
+
276
+ # Cargar logs
277
+ local_path = hf_hub_download(
278
+ repo_id=self.repo_id,
279
+ filename=filename,
280
+ repo_type="dataset",
281
+ token=self.hf_token
282
+ )
283
+ df = pd.read_csv(local_path)
284
+
285
+ # Buscar interacción correspondiente
286
+ query_text = query.get('text', '')
287
+ norm_query = self._normalize_text(query_text)
288
+ matches = df.index[
289
+ df['user_message'].apply(self._normalize_text).str.contains(norm_query, na=False, regex=False)
290
+ ].tolist()
291
+
292
+ if matches:
293
+ last_match = matches[-1]
294
+ df.at[last_match, 'response_node_ids'] = ", ".join([n.node.id_ for n in source_nodes])
295
+ df.at[last_match, 'kg_node_ids'] = ", ".join([n.node.id_ for n in kg_nodes])
296
+
297
+ # Guardar cambios
298
+ df.to_csv(filename, index=False)
299
+ self.api.upload_file(
300
+ path_or_fileobj=filename,
301
+ path_in_repo=filename,
302
+ repo_id=self.repo_id,
303
+ token=self.hf_token,
304
+ repo_type="dataset"
305
+ )
306
+ return True
307
+ else:
308
+ print("No matching query found for node references")
309
+ return False
310
+
311
+ except Exception as e:
312
+ print(f"Error saving node references: {e}")
313
+ return False
314
+
315
+ # --------------------------
316
+ # Funciones de consulta/auditoría
317
+ # --------------------------
318
+
319
+ def get_available_log_months(self) -> List[str]:
320
+ """Obtiene los meses con logs disponibles."""
321
+ try:
322
+ files = list_repo_files(repo_id=self.repo_id, repo_type="dataset", token=self.hf_token)
323
+ return sorted([f.split('_')[1].replace('.csv', '') for f in files if f.startswith('logs_')])
324
+ except Exception as e:
325
+ print(f"Error getting available months: {e}")
326
+ return []
327
+
328
+ def get_audit_trail(self, month: str) -> pd.DataFrame:
329
+ """
330
+ Obtiene los logs de un mes específico para auditoría.
331
+
332
+ Args:
333
+ month: Mes en formato YYYY-MM
334
+
335
+ Returns:
336
+ pd.DataFrame: DataFrame con los logs o vacío si hay error
337
+ """
338
+ try:
339
+ filename = f"logs_{month}.csv"
340
+ if not self._file_exists(filename):
341
+ print(f"No logs found for {month}")
342
+ return pd.DataFrame()
343
+
344
+ local_path = hf_hub_download(
345
+ repo_id=self.repo_id,
346
+ filename=filename,
347
+ repo_type="dataset",
348
+ token=self.hf_token
349
+ )
350
+ df = pd.read_csv(local_path)
351
+
352
+ # Formatear y ordenar
353
+ df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.strftime('%Y-%m-%d %H:%M:%S UTC-0')
354
+ df = df.sort_values("timestamp", ascending=False)
355
+
356
+ # Renombrar columnas para visualización
357
+ return df.rename(columns={
358
+ "timestamp": "Timestamp",
359
+ "user_message": "User Message",
360
+ "response_text": "Response",
361
+ "flag": "Feedback",
362
+ "user": "User",
363
+ "groundedness": "Groundedness",
364
+ "answer_relevancy": "Answer Relevancy",
365
+ "context_relevancy": "Context Relevancy",
366
+ "response_node_ids": "Document Nodes",
367
+ "kg_node_ids": "KG Nodes"
368
+ })
369
+
370
+ except Exception as e:
371
+ print(f"Error loading audit trail: {e}")
372
+ return pd.DataFrame()
373
+
374
+ def get_user_history(self, user: str, limit: int = 5) -> str:
375
+ """
376
+ Obtiene el historial de un usuario formateado en Markdown.
377
+
378
+ Args:
379
+ user: Identificador del usuario
380
+ limit: Número máximo de interacciones a devolver
381
+
382
+ Returns:
383
+ str: Historial formateado o mensaje de error
384
+ """
385
+ try:
386
+ history = self._get_raw_history(user, limit)
387
+ if not history:
388
+ return "⚠️ No history found for this user"
389
+
390
+ markdown = [
391
+ f"## Chat History for {user} (last {len(history)} interactions)",
392
+ "*Ordered from oldest to newest*\n"
393
+ ]
394
+
395
+ for i, interaction in enumerate(history, 1):
396
+ question = self._format_question(interaction['user_message'])
397
+ response = interaction['response_text'].strip() if interaction['response_text'] else "(No response)"
398
+
399
+ markdown.extend([
400
+ f"\n### Interaction {i}",
401
+ f"**📅 {interaction['timestamp']}**",
402
+ "",
403
+ "**❓ Question:**",
404
+ f"> {question}",
405
+ "",
406
+ "**💡 Response:**",
407
+ f"> {response}",
408
+ "",
409
+ "---"
410
+ ])
411
+
412
+ return "\n".join(markdown[:-1]) + "\n\n*End of history*"
413
+
414
+ except Exception as e:
415
+ print(f"Error generating user history: {e}")
416
+ return "⚠️ Error retrieving history"
417
+
418
+ # --------------------------
419
+ # Funciones auxiliares privadas
420
+ # --------------------------
421
+
422
+ def _format_question(self, question_data: Union[str, dict]) -> str:
423
+ """Formatea el texto de pregunta que puede ser string o dict."""
424
+ if not question_data:
425
+ return "(No text)"
426
+
427
+ if isinstance(question_data, str):
428
+ try:
429
+ data = json.loads(question_data)
430
+ if isinstance(data, dict):
431
+ question_data = data
432
+ except json.JSONDecodeError:
433
+ pass
434
+
435
+ if isinstance(question_data, dict):
436
+ question_text = question_data.get('text', '(No text)')
437
+ if files := question_data.get('files', []):
438
+ attachments = "\n📎 Attachments: " + ", ".join([f"`{f.get('name', 'file')}`" for f in files])
439
+ return f"{question_text}{attachments}"
440
+ return question_text
441
+ return str(question_data)
442
+
443
+ def _get_raw_history(self, user: str, limit: int) -> List[Dict]:
444
+ """Obtiene el historial crudo de un usuario."""
445
+ try:
446
+ all_data = []
447
+ for month in self.get_available_log_months():
448
+ filename = f"logs_{month}.csv"
449
+ try:
450
+ if self._file_exists(filename):
451
+ local_path = hf_hub_download(
452
+ repo_id=self.repo_id,
453
+ filename=filename,
454
+ repo_type="dataset",
455
+ token=self.hf_token
456
+ )
457
+ df = pd.read_csv(local_path)
458
+ if all(col in df.columns for col in ['user', 'user_message', 'response_text', 'timestamp']):
459
+ all_data.append(df)
460
+ except Exception as e:
461
+ print(f"Error processing {filename}: {e}")
462
+ continue
463
+
464
+ if not all_data:
465
+ return []
466
+
467
+ # Combinar y filtrar
468
+ combined = pd.concat(all_data, ignore_index=True)
469
+ user_data = combined[combined['user'] == user].copy()
470
+
471
+ if user_data.empty:
472
+ return []
473
+
474
+ # Ordenar y limitar
475
+ try:
476
+ user_data['timestamp'] = pd.to_datetime(user_data['timestamp'])
477
+ user_data = user_data.sort_values('timestamp', ascending=True).tail(limit)
478
+ except:
479
+ user_data = user_data.tail(limit)
480
+
481
+ return user_data[['user_message', 'response_text', 'timestamp']].to_dict('records')
482
+
483
+ except Exception as e:
484
+ print(f"Error getting raw history: {e}")
485
+ return []