Spaces:
Running
Running
Commit ·
6a2dd05
1
Parent(s): 4634051
fix logger timestamp e aggiunto testo request
Browse files
- app.py +0 -1
- modules/bpo_dispatcher.py +22 -30
- modules/utilities/logger.py +14 -5
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from modules.utilities import logger
|
| 2 |
import time
|
| 3 |
import gradio as gr
|
| 4 |
-
import cv2
|
| 5 |
import os
|
| 6 |
import modules.utilities.utils as utils
|
| 7 |
from modules.binary_classification import binary_classification as binary
|
|
|
|
| 1 |
from modules.utilities import logger
|
| 2 |
import time
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
import os
|
| 5 |
import modules.utilities.utils as utils
|
| 6 |
from modules.binary_classification import binary_classification as binary
|
modules/bpo_dispatcher.py
CHANGED
|
@@ -2,7 +2,6 @@ import torch
|
|
| 2 |
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
|
| 3 |
import spacy
|
| 4 |
import re
|
| 5 |
-
import os
|
| 6 |
import torch.nn.functional as F
|
| 7 |
|
| 8 |
try:
|
|
@@ -24,12 +23,10 @@ class BPODispatcher:
|
|
| 24 |
self.model = None
|
| 25 |
self.tokenizer = None
|
| 26 |
self.nlp = None
|
| 27 |
-
self.device = "cpu"
|
| 28 |
|
| 29 |
-
# 1. BERT (Caricamento da Hugging Face Hub)
|
| 30 |
print(f"🔄 Tentativo di caricamento modello da: {model_id}...")
|
| 31 |
try:
|
| 32 |
-
# token=True usa automaticamente il Secret 'HF_TOKEN' dello Space
|
| 33 |
self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_id, token=True)
|
| 34 |
self.model = DistilBertForSequenceClassification.from_pretrained(model_id, token=True)
|
| 35 |
self.model.to(self.device)
|
|
@@ -41,7 +38,6 @@ class BPODispatcher:
|
|
| 41 |
except Exception as e:
|
| 42 |
print(f"❌ Errore generico BERT: {e}")
|
| 43 |
|
| 44 |
-
# 2. spaCy
|
| 45 |
try:
|
| 46 |
self.nlp = spacy.load("it_core_news_lg")
|
| 47 |
print("✅ spaCy caricato.")
|
|
@@ -50,7 +46,7 @@ class BPODispatcher:
|
|
| 50 |
|
| 51 |
def _extract_smart_entities(self, text):
|
| 52 |
entities = []
|
| 53 |
-
occupied_spans = []
|
| 54 |
|
| 55 |
def is_overlapping(start, end):
|
| 56 |
"""Controlla se la posizione è già occupata"""
|
|
@@ -65,19 +61,18 @@ class BPODispatcher:
|
|
| 65 |
entities.append((text_val, label))
|
| 66 |
occupied_spans.append((start, end))
|
| 67 |
|
| 68 |
-
# ---
|
| 69 |
|
| 70 |
-
#
|
| 71 |
for m in re.finditer(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text):
|
| 72 |
add_entity(m.group(), "EMAIL", m.start(), m.end())
|
| 73 |
|
| 74 |
-
#
|
| 75 |
-
#
|
| 76 |
for m in re.finditer(r'\b(?:3\d{2}|0\d{1,4})[\s.-]?\d{6,10}\b', text):
|
| 77 |
add_entity(m.group(), "TELEFONO", m.start(), m.end())
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
# Regex migliorata: Accetta anche alfanumerici per codici cliente
|
| 81 |
# Pattern: alphanumeric token, 4-15 chars long, containing at least one digit (anywhere in the token)
|
| 82 |
candidates = re.finditer(r'\b(?=[A-Za-z0-9]*\d)[A-Za-z0-9]{4,15}\b', text)
|
| 83 |
|
|
@@ -92,34 +87,33 @@ class BPODispatcher:
|
|
| 92 |
|
| 93 |
context = text[max(0, start - window_size):start].lower()
|
| 94 |
|
| 95 |
-
#
|
| 96 |
if any(w in context for w in ["fattura", "bolletta", "nota", "nr.", "n."]):
|
| 97 |
-
#
|
| 98 |
if val.isdigit() or '/' in val:
|
| 99 |
add_entity(val, "N. FATTURA", start, end)
|
| 100 |
continue
|
| 101 |
|
| 102 |
-
#
|
| 103 |
if any(w in context for w in ["luce", "gas", "fornitura", "pod", "pdr", "contatore"]):
|
| 104 |
add_entity(val, "COD. FORNITURA", start, end)
|
| 105 |
continue
|
| 106 |
|
| 107 |
-
#
|
| 108 |
if any(w in context for w in ["cliente", "codice", "utenza", "pratica", "id"]):
|
| 109 |
add_entity(val, "CODICE CLIENTE", start, end)
|
| 110 |
continue
|
| 111 |
|
| 112 |
-
# --- FASE 2: SPACY BASSA PRIORITÀ (Entità Semantiche) ---
|
| 113 |
if self.nlp:
|
| 114 |
doc = self.nlp(text)
|
| 115 |
for ent in doc.ents:
|
| 116 |
-
# VALIDAZIONE
|
| 117 |
# Regola: Una PERSONA non può contenere cifre
|
| 118 |
if ent.label_ == "PER":
|
| 119 |
if any(char.isdigit() for char in ent.text):
|
| 120 |
-
continue
|
| 121 |
if len(ent.text) < 3:
|
| 122 |
-
continue
|
| 123 |
|
| 124 |
add_entity(ent.text, "PERSONA", ent.start_char, ent.end_char)
|
| 125 |
|
|
@@ -135,7 +129,7 @@ class BPODispatcher:
|
|
| 135 |
urgency = "Bassa"
|
| 136 |
text_lower = text.lower()
|
| 137 |
|
| 138 |
-
#
|
| 139 |
sentiment_score_neg = 0.0
|
| 140 |
sentiment_score_pos = 0.0
|
| 141 |
|
|
@@ -147,14 +141,12 @@ class BPODispatcher:
|
|
| 147 |
except Exception:
|
| 148 |
sentiment_score_neg = 0.5 # Fallback neutro
|
| 149 |
|
| 150 |
-
#
|
| 151 |
-
|
| 152 |
-
# CASO A: CHURN (Disdetta) -> Sempre Critico
|
| 153 |
# Indipendentemente dal tono, se uno vuole andare via è priorità assoluta.
|
| 154 |
if intent_label == "Retention / Churn Risk":
|
| 155 |
return "CRITICA (Rischio Abbandono)"
|
| 156 |
|
| 157 |
-
# CASO
|
| 158 |
elif intent_label == "Supporto Tecnico":
|
| 159 |
# Se il cliente è FURIOSO
|
| 160 |
if sentiment_score_neg > 0.9:
|
|
@@ -171,7 +163,7 @@ class BPODispatcher:
|
|
| 171 |
# Caso standard: "Ho un problema col wifi" (Neutro/Lievemente negativo)
|
| 172 |
return "MEDIA (Guasto Standard)"
|
| 173 |
|
| 174 |
-
# CASO
|
| 175 |
elif intent_label == "Amministrazione / Billing":
|
| 176 |
# Le questioni di soldi scaldano gli animi.
|
| 177 |
if sentiment_score_neg > 0.9:
|
|
@@ -198,7 +190,7 @@ class BPODispatcher:
|
|
| 198 |
if self.model is None: return None, "Errore", []
|
| 199 |
if not text.strip(): return None, "Vuoto", []
|
| 200 |
|
| 201 |
-
#
|
| 202 |
inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
|
| 203 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 204 |
with torch.no_grad():
|
|
@@ -206,14 +198,14 @@ class BPODispatcher:
|
|
| 206 |
probs = F.softmax(outputs.logits, dim=-1)
|
| 207 |
label_output = {LABELS_MAP[i]: float(probs[0][i]) for i in range(len(LABELS_MAP))}
|
| 208 |
|
| 209 |
-
#
|
| 210 |
top_idx = torch.max(probs, dim=-1)[1].item()
|
| 211 |
predicted_label = LABELS_MAP[top_idx]
|
| 212 |
|
| 213 |
-
#
|
| 214 |
urgency = self._calculate_smart_urgency(text, predicted_label)
|
| 215 |
|
| 216 |
-
#
|
| 217 |
entities = self._extract_smart_entities(text)
|
| 218 |
|
| 219 |
return label_output, urgency, entities
|
|
|
|
| 2 |
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
|
| 3 |
import spacy
|
| 4 |
import re
|
|
|
|
| 5 |
import torch.nn.functional as F
|
| 6 |
|
| 7 |
try:
|
|
|
|
| 23 |
self.model = None
|
| 24 |
self.tokenizer = None
|
| 25 |
self.nlp = None
|
| 26 |
+
self.device = "cpu"
|
| 27 |
|
|
|
|
| 28 |
print(f"🔄 Tentativo di caricamento modello da: {model_id}...")
|
| 29 |
try:
|
|
|
|
| 30 |
self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_id, token=True)
|
| 31 |
self.model = DistilBertForSequenceClassification.from_pretrained(model_id, token=True)
|
| 32 |
self.model.to(self.device)
|
|
|
|
| 38 |
except Exception as e:
|
| 39 |
print(f"❌ Errore generico BERT: {e}")
|
| 40 |
|
|
|
|
| 41 |
try:
|
| 42 |
self.nlp = spacy.load("it_core_news_lg")
|
| 43 |
print("✅ spaCy caricato.")
|
|
|
|
| 46 |
|
| 47 |
def _extract_smart_entities(self, text):
|
| 48 |
entities = []
|
| 49 |
+
occupied_spans = []
|
| 50 |
|
| 51 |
def is_overlapping(start, end):
|
| 52 |
"""Controlla se la posizione è già occupata"""
|
|
|
|
| 61 |
entities.append((text_val, label))
|
| 62 |
occupied_spans.append((start, end))
|
| 63 |
|
| 64 |
+
# --- REGEX (Dati Strutturati) ---
|
| 65 |
|
| 66 |
+
# EMAIL
|
| 67 |
for m in re.finditer(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text):
|
| 68 |
add_entity(m.group(), "EMAIL", m.start(), m.end())
|
| 69 |
|
| 70 |
+
# TELEFONO (Mobile e Fisso Italiano)
|
| 71 |
+
# pattern tipo 3xx... o 0x... con spazi opzionali
|
| 72 |
for m in re.finditer(r'\b(?:3\d{2}|0\d{1,4})[\s.-]?\d{6,10}\b', text):
|
| 73 |
add_entity(m.group(), "TELEFONO", m.start(), m.end())
|
| 74 |
|
| 75 |
+
# NUMERI CONTESTUALI (Fatture, Clienti, Forniture)
|
|
|
|
| 76 |
# Pattern: alphanumeric token, 4-15 chars long, containing at least one digit (anywhere in the token)
|
| 77 |
candidates = re.finditer(r'\b(?=[A-Za-z0-9]*\d)[A-Za-z0-9]{4,15}\b', text)
|
| 78 |
|
|
|
|
| 87 |
|
| 88 |
context = text[max(0, start - window_size):start].lower()
|
| 89 |
|
| 90 |
+
# Fatture
|
| 91 |
if any(w in context for w in ["fattura", "bolletta", "nota", "nr.", "n."]):
|
| 92 |
+
# le fatture solitamente sono solo numeri o hanno /
|
| 93 |
if val.isdigit() or '/' in val:
|
| 94 |
add_entity(val, "N. FATTURA", start, end)
|
| 95 |
continue
|
| 96 |
|
| 97 |
+
# Forniture (POD/PDR/Luce/Gas)
|
| 98 |
if any(w in context for w in ["luce", "gas", "fornitura", "pod", "pdr", "contatore"]):
|
| 99 |
add_entity(val, "COD. FORNITURA", start, end)
|
| 100 |
continue
|
| 101 |
|
| 102 |
+
# Codici Cliente
|
| 103 |
if any(w in context for w in ["cliente", "codice", "utenza", "pratica", "id"]):
|
| 104 |
add_entity(val, "CODICE CLIENTE", start, end)
|
| 105 |
continue
|
| 106 |
|
|
|
|
| 107 |
if self.nlp:
|
| 108 |
doc = self.nlp(text)
|
| 109 |
for ent in doc.ents:
|
| 110 |
+
# VALIDAZIONE
|
| 111 |
# Regola: Una PERSONA non può contenere cifre
|
| 112 |
if ent.label_ == "PER":
|
| 113 |
if any(char.isdigit() for char in ent.text):
|
| 114 |
+
continue
|
| 115 |
if len(ent.text) < 3:
|
| 116 |
+
continue
|
| 117 |
|
| 118 |
add_entity(ent.text, "PERSONA", ent.start_char, ent.end_char)
|
| 119 |
|
|
|
|
| 129 |
urgency = "Bassa"
|
| 130 |
text_lower = text.lower()
|
| 131 |
|
| 132 |
+
# Analisi Sentiment
|
| 133 |
sentiment_score_neg = 0.0
|
| 134 |
sentiment_score_pos = 0.0
|
| 135 |
|
|
|
|
| 141 |
except Exception:
|
| 142 |
sentiment_score_neg = 0.5 # Fallback neutro
|
| 143 |
|
| 144 |
+
# CASO CHURN (Disdetta) -> Sempre Critico
|
|
|
|
|
|
|
| 145 |
# Indipendentemente dal tono, se uno vuole andare via è priorità assoluta.
|
| 146 |
if intent_label == "Retention / Churn Risk":
|
| 147 |
return "CRITICA (Rischio Abbandono)"
|
| 148 |
|
| 149 |
+
# CASO SUPPORTO TECNICO
|
| 150 |
elif intent_label == "Supporto Tecnico":
|
| 151 |
# Se il cliente è FURIOSO
|
| 152 |
if sentiment_score_neg > 0.9:
|
|
|
|
| 163 |
# Caso standard: "Ho un problema col wifi" (Neutro/Lievemente negativo)
|
| 164 |
return "MEDIA (Guasto Standard)"
|
| 165 |
|
| 166 |
+
# CASO AMMINISTRAZIONE / BILLING
|
| 167 |
elif intent_label == "Amministrazione / Billing":
|
| 168 |
# Le questioni di soldi scaldano gli animi.
|
| 169 |
if sentiment_score_neg > 0.9:
|
|
|
|
| 190 |
if self.model is None: return None, "Errore", []
|
| 191 |
if not text.strip(): return None, "Vuoto", []
|
| 192 |
|
| 193 |
+
# Intent Classification (BERT)
|
| 194 |
inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
|
| 195 |
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 196 |
with torch.no_grad():
|
|
|
|
| 198 |
probs = F.softmax(outputs.logits, dim=-1)
|
| 199 |
label_output = {LABELS_MAP[i]: float(probs[0][i]) for i in range(len(LABELS_MAP))}
|
| 200 |
|
| 201 |
+
# Prendo l'intento vincente
|
| 202 |
top_idx = torch.max(probs, dim=-1)[1].item()
|
| 203 |
predicted_label = LABELS_MAP[top_idx]
|
| 204 |
|
| 205 |
+
# Urgenza (AI + Sentiment + Rules)
|
| 206 |
urgency = self._calculate_smart_urgency(text, predicted_label)
|
| 207 |
|
| 208 |
+
# NER Extraction
|
| 209 |
entities = self._extract_smart_entities(text)
|
| 210 |
|
| 211 |
return label_output, urgency, entities
|
modules/utilities/logger.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import uuid
|
| 3 |
import csv
|
| 4 |
-
import
|
| 5 |
from datetime import datetime
|
| 6 |
from pathlib import Path
|
| 7 |
from huggingface_hub import CommitScheduler, HfApi
|
|
@@ -11,7 +11,8 @@ DATASET_REPO_ID = "NextGenTech/ngt-ai-platform-logs"
|
|
| 11 |
LOG_DIR = Path("data/logs")
|
| 12 |
LOG_FILE = LOG_DIR / "access_logs.csv"
|
| 13 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
| 17 |
|
|
@@ -22,7 +23,7 @@ if not LOG_FILE.exists() or LOG_FILE.stat().st_size == 0:
|
|
| 22 |
writer = csv.writer(f, lineterminator='\n')
|
| 23 |
writer.writerow([
|
| 24 |
"timestamp", "session_id", "module", "action",
|
| 25 |
-
"ip_address", "user_agent", "language", "input_size", "processing_time"
|
| 26 |
])
|
| 27 |
|
| 28 |
if HF_TOKEN:
|
|
@@ -65,18 +66,25 @@ def log_interaction(request, module_name, action, input_data=None, execution_tim
|
|
| 65 |
session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_raw))[:8]
|
| 66 |
|
| 67 |
input_meta = "0"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
if isinstance(input_data, str):
|
| 69 |
input_meta = f"{len(input_data)} chars"
|
|
|
|
|
|
|
| 70 |
elif hasattr(input_data, 'shape'):
|
| 71 |
input_meta = f"{input_data.shape}"
|
|
|
|
| 72 |
elif input_data is not None:
|
| 73 |
input_meta = "Binary/File"
|
| 74 |
-
|
| 75 |
with scheduler.lock:
|
| 76 |
with open(LOG_FILE, "a", newline="", encoding="utf-8") as f:
|
| 77 |
writer = csv.writer(f, lineterminator='\n')
|
| 78 |
writer.writerow([
|
| 79 |
-
|
| 80 |
session_id,
|
| 81 |
module_name,
|
| 82 |
action,
|
|
@@ -84,6 +92,7 @@ def log_interaction(request, module_name, action, input_data=None, execution_tim
|
|
| 84 |
user_agent,
|
| 85 |
language,
|
| 86 |
input_meta,
|
|
|
|
| 87 |
f"{execution_time:.4f}s"
|
| 88 |
])
|
| 89 |
|
|
|
|
| 1 |
import os
|
| 2 |
import uuid
|
| 3 |
import csv
|
| 4 |
+
import pytz
|
| 5 |
from datetime import datetime
|
| 6 |
from pathlib import Path
|
| 7 |
from huggingface_hub import CommitScheduler, HfApi
|
|
|
|
| 11 |
LOG_DIR = Path("data/logs")
|
| 12 |
LOG_FILE = LOG_DIR / "access_logs.csv"
|
| 13 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 14 |
+
ITALY_TZ = pytz.timezone("Europe/Rome")
|
| 15 |
+
TIME = 5 # Minuti che intercorrono tra gli aggiornamenti del dataset
|
| 16 |
|
| 17 |
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
| 18 |
|
|
|
|
| 23 |
writer = csv.writer(f, lineterminator='\n')
|
| 24 |
writer.writerow([
|
| 25 |
"timestamp", "session_id", "module", "action",
|
| 26 |
+
"ip_address", "user_agent", "language", "input_size", "input_text" ,"processing_time"
|
| 27 |
])
|
| 28 |
|
| 29 |
if HF_TOKEN:
|
|
|
|
| 66 |
session_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, session_raw))[:8]
|
| 67 |
|
| 68 |
input_meta = "0"
|
| 69 |
+
input_text_content = ""
|
| 70 |
+
|
| 71 |
+
now_italy = datetime.now(ITALY_TZ)
|
| 72 |
+
|
| 73 |
if isinstance(input_data, str):
|
| 74 |
input_meta = f"{len(input_data)} chars"
|
| 75 |
+
clean_text = input_data.replace('\n', ' ').replace('\r', '')
|
| 76 |
+
input_text_content = (clean_text[:1000] + '..') if len(clean_text) > 1000 else clean_text
|
| 77 |
elif hasattr(input_data, 'shape'):
|
| 78 |
input_meta = f"{input_data.shape}"
|
| 79 |
+
input_text_content = "[IMAGE/BINARY DATA]"
|
| 80 |
elif input_data is not None:
|
| 81 |
input_meta = "Binary/File"
|
| 82 |
+
input_text_content = "[FILE]"
|
| 83 |
with scheduler.lock:
|
| 84 |
with open(LOG_FILE, "a", newline="", encoding="utf-8") as f:
|
| 85 |
writer = csv.writer(f, lineterminator='\n')
|
| 86 |
writer.writerow([
|
| 87 |
+
now_italy.isoformat(),
|
| 88 |
session_id,
|
| 89 |
module_name,
|
| 90 |
action,
|
|
|
|
| 92 |
user_agent,
|
| 93 |
language,
|
| 94 |
input_meta,
|
| 95 |
+
input_text_content,
|
| 96 |
f"{execution_time:.4f}s"
|
| 97 |
])
|
| 98 |
|
requirements.txt
CHANGED
|
@@ -26,6 +26,7 @@ spacy==3.8.2
|
|
| 26 |
nltk>=3.8.1
|
| 27 |
scikit-learn>=1.3.0
|
| 28 |
plotly>=5.0.0
|
|
|
|
| 29 |
|
| 30 |
# --- IMAGE PROCESSING ---
|
| 31 |
opencv-python-headless
|
|
|
|
| 26 |
nltk>=3.8.1
|
| 27 |
scikit-learn>=1.3.0
|
| 28 |
plotly>=5.0.0
|
| 29 |
+
pytz # per gestire il fuso orario
|
| 30 |
|
| 31 |
# --- IMAGE PROCESSING ---
|
| 32 |
opencv-python-headless
|