GaetanoParente committed on
Commit
6a2dd05
·
1 Parent(s): 4634051

fix logger timestamp e aggiunto testo request

Browse files
app.py CHANGED
@@ -1,7 +1,6 @@
1
  from modules.utilities import logger
2
  import time
3
  import gradio as gr
4
- import cv2
5
  import os
6
  import modules.utilities.utils as utils
7
  from modules.binary_classification import binary_classification as binary
 
1
  from modules.utilities import logger
2
  import time
3
  import gradio as gr
 
4
  import os
5
  import modules.utilities.utils as utils
6
  from modules.binary_classification import binary_classification as binary
modules/bpo_dispatcher.py CHANGED
@@ -2,7 +2,6 @@ import torch
2
  from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
3
  import spacy
4
  import re
5
- import os
6
  import torch.nn.functional as F
7
 
8
  try:
@@ -24,12 +23,10 @@ class BPODispatcher:
24
  self.model = None
25
  self.tokenizer = None
26
  self.nlp = None
27
- self.device = "cpu" # In uno Space CPU basic, usa "cuda" solo se hai GPU
28
 
29
- # 1. BERT (Caricamento da Hugging Face Hub)
30
  print(f"🔄 Tentativo di caricamento modello da: {model_id}...")
31
  try:
32
- # token=True usa automaticamente il Secret 'HF_TOKEN' dello Space
33
  self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_id, token=True)
34
  self.model = DistilBertForSequenceClassification.from_pretrained(model_id, token=True)
35
  self.model.to(self.device)
@@ -41,7 +38,6 @@ class BPODispatcher:
41
  except Exception as e:
42
  print(f"❌ Errore generico BERT: {e}")
43
 
44
- # 2. spaCy
45
  try:
46
  self.nlp = spacy.load("it_core_news_lg")
47
  print("✅ spaCy caricato.")
@@ -50,7 +46,7 @@ class BPODispatcher:
50
 
51
  def _extract_smart_entities(self, text):
52
  entities = []
53
- occupied_spans = [] # Tiene traccia delle zone di testo già etichettate
54
 
55
  def is_overlapping(start, end):
56
  """Controlla se la posizione è già occupata"""
@@ -65,19 +61,18 @@ class BPODispatcher:
65
  entities.append((text_val, label))
66
  occupied_spans.append((start, end))
67
 
68
- # --- FASE 1: REGEX ALTA PRIORITÀ (Dati Strutturati) ---
69
 
70
- # A. EMAIL
71
  for m in re.finditer(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text):
72
  add_entity(m.group(), "EMAIL", m.start(), m.end())
73
 
74
- # B. TELEFONO (Mobile e Fisso Italiano)
75
- # Cerca pattern tipo 3xx... o 0x... con spazi opzionali
76
  for m in re.finditer(r'\b(?:3\d{2}|0\d{1,4})[\s.-]?\d{6,10}\b', text):
77
  add_entity(m.group(), "TELEFONO", m.start(), m.end())
78
 
79
- # C. NUMERI CONTESTUALI (Fatture, Clienti, Forniture)
80
- # Regex migliorata: Accetta anche alfanumerici per codici cliente
81
  # Pattern: Parola che inizia o finisce con cifra, lunga 4-15 chars
82
  candidates = re.finditer(r'\b(?=[A-Za-z0-9]*\d)[A-Za-z0-9]{4,15}\b', text)
83
 
@@ -92,34 +87,33 @@ class BPODispatcher:
92
 
93
  context = text[max(0, start - window_size):start].lower()
94
 
95
- # 1. Fatture
96
  if any(w in context for w in ["fattura", "bolletta", "nota", "nr.", "n."]):
97
- # Verifica extra: le fatture solitamente sono solo numeri o hanno /
98
  if val.isdigit() or '/' in val:
99
  add_entity(val, "N. FATTURA", start, end)
100
  continue
101
 
102
- # 2. Forniture (POD/PDR/Luce/Gas)
103
  if any(w in context for w in ["luce", "gas", "fornitura", "pod", "pdr", "contatore"]):
104
  add_entity(val, "COD. FORNITURA", start, end)
105
  continue
106
 
107
- # 3. Codici Cliente (più generico, accetta alfanumerici)
108
  if any(w in context for w in ["cliente", "codice", "utenza", "pratica", "id"]):
109
  add_entity(val, "CODICE CLIENTE", start, end)
110
  continue
111
 
112
- # --- FASE 2: SPACY BASSA PRIORITÀ (Entità Semantiche) ---
113
  if self.nlp:
114
  doc = self.nlp(text)
115
  for ent in doc.ents:
116
- # VALIDAZIONE ANTI-ALLUCINAZIONE
117
  # Regola: Una PERSONA non può contenere cifre
118
  if ent.label_ == "PER":
119
  if any(char.isdigit() for char in ent.text):
120
- continue # Scarta "25458958" classificato come Persona
121
  if len(ent.text) < 3:
122
- continue # Scarta nomi troppo corti
123
 
124
  add_entity(ent.text, "PERSONA", ent.start_char, ent.end_char)
125
 
@@ -135,7 +129,7 @@ class BPODispatcher:
135
  urgency = "Bassa"
136
  text_lower = text.lower()
137
 
138
- # 1. Analisi Sentiment
139
  sentiment_score_neg = 0.0
140
  sentiment_score_pos = 0.0
141
 
@@ -147,14 +141,12 @@ class BPODispatcher:
147
  except Exception:
148
  sentiment_score_neg = 0.5 # Fallback neutro
149
 
150
- # --- LOGICA DECISIONALE ---
151
-
152
- # CASO A: CHURN (Disdetta) -> Sempre Critico
153
  # Indipendentemente dal tono, se uno vuole andare via è priorità assoluta.
154
  if intent_label == "Retention / Churn Risk":
155
  return "CRITICA (Rischio Abbandono)"
156
 
157
- # CASO B: SUPPORTO TECNICO
158
  elif intent_label == "Supporto Tecnico":
159
  # Se il cliente è FURIOSO
160
  if sentiment_score_neg > 0.9:
@@ -171,7 +163,7 @@ class BPODispatcher:
171
  # Caso standard: "Ho un problema col wifi" (Neutro/Lievemente negativo)
172
  return "MEDIA (Guasto Standard)"
173
 
174
- # CASO C: AMMINISTRAZIONE / BILLING
175
  elif intent_label == "Amministrazione / Billing":
176
  # Le questioni di soldi scaldano gli animi.
177
  if sentiment_score_neg > 0.9:
@@ -198,7 +190,7 @@ class BPODispatcher:
198
  if self.model is None: return None, "Errore", []
199
  if not text.strip(): return None, "Vuoto", []
200
 
201
- # 1. Intent Classification (BERT)
202
  inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
203
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
204
  with torch.no_grad():
@@ -206,14 +198,14 @@ class BPODispatcher:
206
  probs = F.softmax(outputs.logits, dim=-1)
207
  label_output = {LABELS_MAP[i]: float(probs[0][i]) for i in range(len(LABELS_MAP))}
208
 
209
- # Prendi l'intento vincente
210
  top_idx = torch.max(probs, dim=-1)[1].item()
211
  predicted_label = LABELS_MAP[top_idx]
212
 
213
- # 2. Urgenza Intelligente (AI + Sentiment + Rules)
214
  urgency = self._calculate_smart_urgency(text, predicted_label)
215
 
216
- # 3. NER Extraction
217
  entities = self._extract_smart_entities(text)
218
 
219
  return label_output, urgency, entities
 
2
  from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
3
  import spacy
4
  import re
 
5
  import torch.nn.functional as F
6
 
7
  try:
 
23
  self.model = None
24
  self.tokenizer = None
25
  self.nlp = None
26
+ self.device = "cpu"
27
 
 
28
  print(f"🔄 Tentativo di caricamento modello da: {model_id}...")
29
  try:
 
30
  self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_id, token=True)
31
  self.model = DistilBertForSequenceClassification.from_pretrained(model_id, token=True)
32
  self.model.to(self.device)
 
38
  except Exception as e:
39
  print(f"❌ Errore generico BERT: {e}")
40
 
 
41
  try:
42
  self.nlp = spacy.load("it_core_news_lg")
43
  print("✅ spaCy caricato.")
 
46
 
47
  def _extract_smart_entities(self, text):
48
  entities = []
49
+ occupied_spans = []
50
 
51
  def is_overlapping(start, end):
52
  """Controlla se la posizione è già occupata"""
 
61
  entities.append((text_val, label))
62
  occupied_spans.append((start, end))
63
 
64
+ # --- REGEX (Dati Strutturati) ---
65
 
66
+ # EMAIL
67
  for m in re.finditer(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text):
68
  add_entity(m.group(), "EMAIL", m.start(), m.end())
69
 
70
+ # TELEFONO (Mobile e Fisso Italiano)
71
+ # pattern tipo 3xx... o 0x... con spazi opzionali
72
  for m in re.finditer(r'\b(?:3\d{2}|0\d{1,4})[\s.-]?\d{6,10}\b', text):
73
  add_entity(m.group(), "TELEFONO", m.start(), m.end())
74
 
75
+ # NUMERI CONTESTUALI (Fatture, Clienti, Forniture)
 
76
  # Pattern: Parola che inizia o finisce con cifra, lunga 4-15 chars
77
  candidates = re.finditer(r'\b(?=[A-Za-z0-9]*\d)[A-Za-z0-9]{4,15}\b', text)
78
 
 
87
 
88
  context = text[max(0, start - window_size):start].lower()
89
 
90
+ # Fatture
91
  if any(w in context for w in ["fattura", "bolletta", "nota", "nr.", "n."]):
92
+ # le fatture solitamente sono solo numeri o hanno /
93
  if val.isdigit() or '/' in val:
94
  add_entity(val, "N. FATTURA", start, end)
95
  continue
96
 
97
+ # Forniture (POD/PDR/Luce/Gas)
98
  if any(w in context for w in ["luce", "gas", "fornitura", "pod", "pdr", "contatore"]):
99
  add_entity(val, "COD. FORNITURA", start, end)
100
  continue
101
 
102
+ # Codici Cliente
103
  if any(w in context for w in ["cliente", "codice", "utenza", "pratica", "id"]):
104
  add_entity(val, "CODICE CLIENTE", start, end)
105
  continue
106
 
 
107
  if self.nlp:
108
  doc = self.nlp(text)
109
  for ent in doc.ents:
110
+ # VALIDAZIONE
111
  # Regola: Una PERSONA non può contenere cifre
112
  if ent.label_ == "PER":
113
  if any(char.isdigit() for char in ent.text):
114
+ continue
115
  if len(ent.text) < 3:
116
+ continue
117
 
118
  add_entity(ent.text, "PERSONA", ent.start_char, ent.end_char)
119
 
 
129
  urgency = "Bassa"
130
  text_lower = text.lower()
131
 
132
+ # Analisi Sentiment
133
  sentiment_score_neg = 0.0
134
  sentiment_score_pos = 0.0
135
 
 
141
  except Exception:
142
  sentiment_score_neg = 0.5 # Fallback neutro
143
 
144
+ # CASO CHURN (Disdetta) -> Sempre Critico
 
 
145
  # Indipendentemente dal tono, se uno vuole andare via è priorità assoluta.
146
  if intent_label == "Retention / Churn Risk":
147
  return "CRITICA (Rischio Abbandono)"
148
 
149
+ # CASO SUPPORTO TECNICO
150
  elif intent_label == "Supporto Tecnico":
151
  # Se il cliente è FURIOSO
152
  if sentiment_score_neg > 0.9:
 
163
  # Caso standard: "Ho un problema col wifi" (Neutro/Lievemente negativo)
164
  return "MEDIA (Guasto Standard)"
165
 
166
+ # CASO AMMINISTRAZIONE / BILLING
167
  elif intent_label == "Amministrazione / Billing":
168
  # Le questioni di soldi scaldano gli animi.
169
  if sentiment_score_neg > 0.9:
 
190
  if self.model is None: return None, "Errore", []
191
  if not text.strip(): return None, "Vuoto", []
192
 
193
+ # Intent Classification (BERT)
194
  inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
195
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
196
  with torch.no_grad():
 
198
  probs = F.softmax(outputs.logits, dim=-1)
199
  label_output = {LABELS_MAP[i]: float(probs[0][i]) for i in range(len(LABELS_MAP))}
200
 
201
+ # Prendo l'intento vincente
202
  top_idx = torch.max(probs, dim=-1)[1].item()
203
  predicted_label = LABELS_MAP[top_idx]
204
 
205
+ # Urgenza (AI + Sentiment + Rules)
206
  urgency = self._calculate_smart_urgency(text, predicted_label)
207
 
208
+ # NER Extraction
209
  entities = self._extract_smart_entities(text)
210
 
211
  return label_output, urgency, entities
modules/utilities/logger.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import uuid
3
  import csv
4
- import threading
5
  from datetime import datetime
6
  from pathlib import Path
7
  from huggingface_hub import CommitScheduler, HfApi
@@ -11,7 +11,8 @@ DATASET_REPO_ID = "NextGenTech/ngt-ai-platform-logs"
11
  LOG_DIR = Path("data/logs")
12
  LOG_FILE = LOG_DIR / "access_logs.csv"
13
  HF_TOKEN = os.environ.get("HF_TOKEN")
14
- TIME = 2 # Minuti che intercorrono tra gli aggiornamenti del dataset
 
15
 
16
  LOG_DIR.mkdir(parents=True, exist_ok=True)
17
 
@@ -22,7 +23,7 @@ if not LOG_FILE.exists() or LOG_FILE.stat().st_size == 0:
22
  writer = csv.writer(f, lineterminator='\n')
23
  writer.writerow([
24
  "timestamp", "session_id", "module", "action",
25
- "ip_address", "user_agent", "language", "input_size", "processing_time"
26
  ])
27
 
28
  if HF_TOKEN:
@@ -65,18 +66,25 @@ def log_interaction(request, module_name, action, input_data=None, execution_tim
65
  session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_raw))[:8]
66
 
67
  input_meta = "0"
 
 
 
 
68
  if isinstance(input_data, str):
69
  input_meta = f"{len(input_data)} chars"
 
 
70
  elif hasattr(input_data, 'shape'):
71
  input_meta = f"{input_data.shape}"
 
72
  elif input_data is not None:
73
  input_meta = "Binary/File"
74
-
75
  with scheduler.lock:
76
  with open(LOG_FILE, "a", newline="", encoding="utf-8") as f:
77
  writer = csv.writer(f, lineterminator='\n')
78
  writer.writerow([
79
- datetime.now().isoformat(),
80
  session_id,
81
  module_name,
82
  action,
@@ -84,6 +92,7 @@ def log_interaction(request, module_name, action, input_data=None, execution_tim
84
  user_agent,
85
  language,
86
  input_meta,
 
87
  f"{execution_time:.4f}s"
88
  ])
89
 
 
1
  import os
2
  import uuid
3
  import csv
4
+ import pytz
5
  from datetime import datetime
6
  from pathlib import Path
7
  from huggingface_hub import CommitScheduler, HfApi
 
11
  LOG_DIR = Path("data/logs")
12
  LOG_FILE = LOG_DIR / "access_logs.csv"
13
  HF_TOKEN = os.environ.get("HF_TOKEN")
14
+ ITALY_TZ = pytz.timezone("Europe/Rome")
15
+ TIME = 5 # Minuti che intercorrono tra gli aggiornamenti del dataset
16
 
17
  LOG_DIR.mkdir(parents=True, exist_ok=True)
18
 
 
23
  writer = csv.writer(f, lineterminator='\n')
24
  writer.writerow([
25
  "timestamp", "session_id", "module", "action",
26
+ "ip_address", "user_agent", "language", "input_size", "input_text" ,"processing_time"
27
  ])
28
 
29
  if HF_TOKEN:
 
66
  session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_raw))[:8]
67
 
68
  input_meta = "0"
69
+ input_text_content = ""
70
+
71
+ now_italy = datetime.now(ITALY_TZ)
72
+
73
  if isinstance(input_data, str):
74
  input_meta = f"{len(input_data)} chars"
75
+ clean_text = input_data.replace('\n', ' ').replace('\r', '')
76
+ input_text_content = (clean_text[:1000] + '..') if len(clean_text) > 1000 else clean_text
77
  elif hasattr(input_data, 'shape'):
78
  input_meta = f"{input_data.shape}"
79
+ input_text_content = "[IMAGE/BINARY DATA]"
80
  elif input_data is not None:
81
  input_meta = "Binary/File"
82
+ input_text_content = "[FILE]"
83
  with scheduler.lock:
84
  with open(LOG_FILE, "a", newline="", encoding="utf-8") as f:
85
  writer = csv.writer(f, lineterminator='\n')
86
  writer.writerow([
87
+ now_italy.isoformat(),
88
  session_id,
89
  module_name,
90
  action,
 
92
  user_agent,
93
  language,
94
  input_meta,
95
+ input_text_content,
96
  f"{execution_time:.4f}s"
97
  ])
98
 
requirements.txt CHANGED
@@ -26,6 +26,7 @@ spacy==3.8.2
26
  nltk>=3.8.1
27
  scikit-learn>=1.3.0
28
  plotly>=5.0.0
 
29
 
30
  # --- IMAGE PROCESSING ---
31
  opencv-python-headless
 
26
  nltk>=3.8.1
27
  scikit-learn>=1.3.0
28
  plotly>=5.0.0
29
+ pytz # per gestire il fuso orario
30
 
31
  # --- IMAGE PROCESSING ---
32
  opencv-python-headless