py scripts
Browse files- anomaly_intelligence.py +1260 -0
- descritption_v2.py +942 -0
- equipment_analysis.py +319 -0
- training.py +1069 -0
anomaly_intelligence.py
ADDED
|
@@ -0,0 +1,1260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# enhanced_anomaly_intelligence_v2.py
|
| 2 |
+
# TAQATHON 2025 - Production Anomaly Intelligence with Equipment Intelligence
|
| 3 |
+
# Enhanced for single and batch processing with safety override rules
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
import joblib
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
+
import warnings
|
| 13 |
+
from typing import Union, List, Dict, Any
|
| 14 |
+
import time
|
| 15 |
+
|
| 16 |
+
warnings.filterwarnings('ignore')
|
| 17 |
+
|
| 18 |
+
class EnhancedAnomalyIntelligence:
|
| 19 |
+
"""
|
| 20 |
+
Enhanced Production-ready Anomaly Intelligence System v2.0
|
| 21 |
+
Features: Equipment Intelligence + Safety Override Rules + Conservative Prediction
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self):
    """Create an engine with no artifacts loaded yet (lazy loading)."""
    # Per-target trained classifiers, keyed by target column name.
    self.models = {}
    # Artifacts populated by _load_models() on first prediction; None until then.
    for attr in ('model_metadata', 'safety_rules', 'embeddings',
                 'embedding_metadata', 'sentence_model'):
        setattr(self, attr, None)
    # Guard flag so _load_models() performs its work only once.
    self._models_loaded = False

    # Equipment intelligence configuration
    self.equipment_type_scores = {}
    self.section_risk_multipliers = {}
| 37 |
+
def _load_models(self):
    """Load all enhanced models and metadata (called once).

    Idempotent via the ``_models_loaded`` flag. Required artifacts
    (model metadata and one classifier per target column) raise
    RuntimeError on failure; optional artifacts (safety rules,
    similarity embeddings, sentence transformer) degrade gracefully to
    a disabled feature with a printed warning.
    """
    if self._models_loaded:
        return

    print("Loading enhanced models and metadata...")

    try:
        # Load enhanced model metadata (required).
        self.model_metadata = joblib.load('enhanced_model_metadata_v2.joblib')
        target_columns = self.model_metadata['target_columns']

        # Load one enhanced trained model per target column (required).
        for target in target_columns:
            model_filename = f"enhanced_model_{target.replace(' ', '_').replace('Γ©', 'e')}_v2.joblib"
            self.models[target] = joblib.load(model_filename)
            print(f"β Loaded {target} model")

        # Load safety override rules (optional).
        try:
            with open('safety_override_rules_v2.json', 'r') as f:
                self.safety_rules = json.load(f)
            print("β Loaded safety override rules")
        except FileNotFoundError:
            print("β οΈ Warning: safety_override_rules_v2.json not found - safety rules disabled")
            self.safety_rules = {}

        # Load embeddings and metadata for similarity search (optional).
        try:
            self.embeddings = np.load('anomaly_embeddings.npy')
            self.embedding_metadata = joblib.load('embedding_metadata.joblib')
            print("β Loaded similarity search embeddings")
        except FileNotFoundError:
            print("β οΈ Warning: Embedding files not found - similarity search disabled")
            self.embeddings = None
            self.embedding_metadata = None

        # Load a sentence transformer, preferring French-specific models.
        # FIX: bare `except:` replaced with `except Exception:` so
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        try:
            from sentence_transformers import SentenceTransformer
            try:
                self.sentence_model = SentenceTransformer('dangvantuan/sentence-camembert-large')
                print("β Loaded French CamemBERT model")
            except Exception:
                try:
                    self.sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
                    print("β Loaded multilingual model")
                except Exception:
                    self.sentence_model = SentenceTransformer('distiluse-base-multilingual-cased')
                    print("β Loaded basic multilingual model")
        except Exception as e:
            print(f"β οΈ Warning: Could not load sentence transformer: {e}")
            self.sentence_model = None

        # Confirm presence of the training configuration.
        # (FIX: dropped the dead `training_config = ...` local — the value
        # was never used, only its presence was reported.)
        if 'training_config' in self.model_metadata:
            print("β Loaded training configuration")

        self._models_loaded = True
        print("β All enhanced models loaded successfully")

    except Exception as e:
        # FIX: use a specific exception type and chain the cause so the
        # original traceback is preserved for debugging.
        raise RuntimeError(f"Failed to load enhanced models: {str(e)}") from e
| 102 |
+
def predict_single(self, anomaly_data: Dict,
                   confidence_threshold: float = 0.7,
                   include_similar: bool = True,
                   format_type: str = 'rich',
                   apply_safety_rules: bool = True) -> Dict:
    """
    Enhanced single anomaly prediction with equipment intelligence and safety rules

    Args:
        anomaly_data: Dictionary with anomaly information
        confidence_threshold: Threshold for flagging manual review
        include_similar: Whether to include similar anomalies
        format_type: 'rich' for UI, 'simple' for database
        apply_safety_rules: Whether to apply safety override rules
    """
    # Lazy-load models on first call.
    self._load_models()

    try:
        # Engineer the full feature set for this one anomaly.
        features = self._extract_enhanced_features_single(anomaly_data)

        # Base per-target predictions with confidences.
        preds, confs, _probas = self._predict_criticality(features)

        # Optionally escalate predictions via the safety override rules.
        if apply_safety_rules and self.safety_rules:
            preds = self._apply_safety_override_rules(features, preds)

        # Aggregate metrics across targets.
        criticality_total = sum(preds.values())
        mean_confidence = np.mean(list(confs.values()))

        # Business rule: should a human double-check this prediction?
        review_flag = self._determine_manual_review_need(
            features, preds, mean_confidence, confidence_threshold
        )

        # Equipment-specific risk assessment.
        risk_info = self._assess_equipment_risk(features, preds)

        # Nearest historical anomalies (only when the text model is available).
        neighbors = []
        if include_similar and self.sentence_model is not None:
            neighbors = self._find_similar_anomalies(
                anomaly_data.get('Description', ''), top_k=3
            )

        # Compact payload for database storage...
        if format_type == 'simple':
            return self._format_simple_response(
                anomaly_data, preds, criticality_total,
                mean_confidence, review_flag, risk_info
            )
        # ...or the full payload for UI consumption.
        return self._format_rich_response(
            anomaly_data, preds, confs,
            criticality_total, mean_confidence,
            neighbors, review_flag, confidence_threshold,
            risk_info, features
        )

    except Exception as e:
        # Production behavior: never raise to the caller — report the
        # failure as a structured error payload instead.
        return {
            'error': f'Enhanced prediction failed: {str(e)}',
            'timestamp': datetime.now().isoformat(),
            'input_description': anomaly_data.get('Description', 'N/A')
        }
| 170 |
+
def predict_batch(self, anomaly_list: List[Dict],
                  confidence_threshold: float = 0.7,
                  include_similar: bool = False,
                  format_type: str = 'simple',
                  apply_safety_rules: bool = True) -> List[Dict]:
    """
    Enhanced batch prediction with equipment intelligence

    Args:
        anomaly_list: List of anomaly dictionaries
        confidence_threshold: Threshold for flagging manual review
        include_similar: Whether to include similar anomalies (slower for batch)
        format_type: 'rich' for UI, 'simple' for database
        apply_safety_rules: Whether to apply safety override rules

    Returns:
        One result dict per input anomaly, in input order. On a
        batch-level failure, every entry is an independent error dict.
    """
    self._load_models()

    # FIX: guard empty input — the timing/percentage reporting below
    # divides by len(anomaly_list) and would raise ZeroDivisionError.
    if not anomaly_list:
        return []

    print(f"Processing enhanced batch of {len(anomaly_list)} anomalies...")
    start_time = time.time()

    results = []

    try:
        # Extract enhanced features for all anomalies.
        all_features = [self._extract_enhanced_features_single(a) for a in anomaly_list]

        # One DataFrame so each model scores the whole batch in a single call.
        batch_df = pd.DataFrame(all_features)

        batch_predictions = {}
        batch_confidences = {}

        target_columns = self.model_metadata['target_columns']
        for target in target_columns:
            model = self.models[target]
            batch_predictions[target] = model.predict(batch_df)
            # Confidence = probability of the winning class per row.
            probas = model.predict_proba(batch_df)
            batch_confidences[target] = np.max(probas, axis=1)

        # Post-process each row with the enhanced business logic.
        for i, anomaly_data in enumerate(anomaly_list):
            predictions = {target: int(batch_predictions[target][i])
                           for target in target_columns}
            confidences = {target: float(batch_confidences[target][i])
                           for target in target_columns}

            enhanced_features = all_features[i]

            # Apply safety override rules if enabled.
            if apply_safety_rules and self.safety_rules:
                predictions = self._apply_safety_override_rules(enhanced_features, predictions)

            total_criticality = sum(predictions.values())
            overall_confidence = np.mean(list(confidences.values()))

            needs_review = self._determine_manual_review_need(
                enhanced_features, predictions, overall_confidence, confidence_threshold
            )

            equipment_risk_assessment = self._assess_equipment_risk(enhanced_features, predictions)

            # Similarity search is optional in batch mode (it is slow).
            similar_anomalies = []
            if include_similar and self.sentence_model is not None:
                similar_anomalies = self._find_similar_anomalies(
                    anomaly_data.get('Description', ''), top_k=2
                )

            if format_type == 'simple':
                result = self._format_simple_response(
                    anomaly_data, predictions, total_criticality,
                    overall_confidence, needs_review, equipment_risk_assessment
                )
            else:
                result = self._format_rich_response(
                    anomaly_data, predictions, confidences,
                    total_criticality, overall_confidence,
                    similar_anomalies, needs_review, confidence_threshold,
                    equipment_risk_assessment, enhanced_features
                )

            results.append(result)

        processing_time = time.time() - start_time
        print(f"β Enhanced batch processing completed in {processing_time:.2f}s")
        print(f" Average time per anomaly: {processing_time/len(anomaly_list):.3f}s")

        flagged_count = sum(1 for r in results if r.get('needs_manual_review', False))
        safety_overrides = sum(1 for r in results if r.get('safety_override_applied', False))

        print(f" Flagged for manual review: {flagged_count}/{len(anomaly_list)} ({flagged_count/len(anomaly_list)*100:.1f}%)")
        print(f" Safety overrides applied: {safety_overrides}/{len(anomaly_list)} ({safety_overrides/len(anomaly_list)*100:.1f}%)")

        return results

    except Exception as e:
        # FIX: build an independent dict per item. The original returned
        # `[error_result] * len(anomaly_list)` — the same mutable dict
        # aliased N times, so mutating one entry changed them all.
        return [
            {
                'error': f'Enhanced batch prediction failed: {str(e)}',
                'timestamp': datetime.now().isoformat()
            }
            for _ in anomaly_list
        ]
| 283 |
+
def _extract_enhanced_features_single(self, anomaly_data: Dict) -> Dict:
    """Extract enhanced features including equipment intelligence.

    Builds the flat feature dict the trained pipelines expect: the raw
    description text, the engineered numerical features (missing or NaN
    values default to 0.0), and the raw categorical fields (missing
    values default to 'Unknown').
    """
    # Run the shared feature-engineering pipeline on a one-row frame so
    # the single and batch paths produce identical features.
    temp_df = pd.DataFrame([anomaly_data])
    enhanced_features = self._extract_enhanced_features(temp_df)

    # (FIX: dropped the dead `feature_columns` local — it was fetched
    # from metadata but never used.)
    input_data = {}

    # Text feature
    input_data['Description'] = anomaly_data.get('Description', '')

    # Enhanced numerical features: NaN -> 0.0, everything else coerced
    # to float. (FIX: removed the redundant bool/np.bool_ branch — it
    # was identical to the generic `float(value)` path.)
    for feat in self.model_metadata.get('numerical_features', []):
        if feat in enhanced_features.columns:
            value = enhanced_features[feat].iloc[0]
            input_data[feat] = 0.0 if pd.isna(value) else float(value)
        else:
            input_data[feat] = 0.0

    # Categorical features pass through raw, defaulting to 'Unknown'.
    for feat in self.model_metadata.get('categorical_features', []):
        input_data[feat] = anomaly_data.get(feat, 'Unknown')

    return input_data
|
| 322 |
+
def _extract_enhanced_features(self, df):
|
| 323 |
+
"""Extract enhanced features (matching training pipeline logic)"""
|
| 324 |
+
import re
|
| 325 |
+
|
| 326 |
+
features_df = df.copy()
|
| 327 |
+
|
| 328 |
+
# Create combined text field
|
| 329 |
+
features_df['combined_text'] = features_df['Description'].fillna('') + ' ' + features_df.get('Description de l\'Γ©quipement', '').fillna('')
|
| 330 |
+
features_df['combined_text_lower'] = features_df['combined_text'].str.lower()
|
| 331 |
+
|
| 332 |
+
# Basic text features
|
| 333 |
+
features_df['description_length'] = features_df['Description'].str.len()
|
| 334 |
+
features_df['description_word_count'] = features_df['Description'].str.split().str.len()
|
| 335 |
+
features_df['equipment_desc_length'] = features_df.get('Description de l\'Γ©quipement', '').str.len()
|
| 336 |
+
features_df['equipment_desc_word_count'] = features_df.get('Description de l\'Γ©quipement', '').str.split().str.len()
|
| 337 |
+
features_df['combined_length'] = features_df['combined_text'].str.len()
|
| 338 |
+
features_df['combined_word_count'] = features_df['combined_text'].str.split().str.len()
|
| 339 |
+
|
| 340 |
+
# Equipment intelligence classification
|
| 341 |
+
def classify_equipment_type(equipment_desc):
|
| 342 |
+
"""Classify equipment based on training analysis"""
|
| 343 |
+
equipment_upper = str(equipment_desc).upper()
|
| 344 |
+
|
| 345 |
+
# Equipment type scoring (from training pipeline)
|
| 346 |
+
if any(keyword in equipment_upper for keyword in ['ALTERNATEUR', 'TRANSFO PRINCIPAL', 'PROTECTION ALTERNATEUR']):
|
| 347 |
+
return 'ELECTRICAL_CRITICAL', 8.0
|
| 348 |
+
elif any(keyword in equipment_upper for keyword in ['VENTILATEUR DE REFROIDISSEMENT', 'REFROIDISSEMENT TP', 'MOTEUR VENTILATEUR DE REFROIDISSEMENT']):
|
| 349 |
+
return 'COOLING_CRITICAL', 7.5
|
| 350 |
+
elif any(keyword in equipment_upper for keyword in ['TURBINE', 'SOUPAPE REGULATRICE', 'REFRIGERANT HUILE', 'POMPE DE SOULΓVEMENT']):
|
| 351 |
+
return 'TURBINE_SYSTEMS', 7.5
|
| 352 |
+
elif any(keyword in equipment_upper for keyword in ['DISJONCTEUR', 'TRANSFORMATEUR', 'MOTEUR', 'ARMOIRE', 'GROUPE']):
|
| 353 |
+
return 'ELECTRICAL_STANDARD', 6.5
|
| 354 |
+
elif any(keyword in equipment_upper for keyword in ['RECHAUFFEUR', 'RΓCHAUFFEUR', 'CHAUDIERE', 'CHAUDIΓRE']):
|
| 355 |
+
return 'HEATING_SYSTEMS', 6.5
|
| 356 |
+
elif any(keyword in equipment_upper for keyword in ['VENTILATEUR', 'TIRAGE', 'SOUFFLAGE', 'AIR PRIMAIRE', 'AIR SECONDAIRE']):
|
| 357 |
+
return 'VENTILATION_SYSTEMS', 6.0
|
| 358 |
+
elif any(keyword in equipment_upper for keyword in ['POMPE', 'SOUPAPE', 'VANNE', 'CONVOYEUR', 'BROYEUR', 'COAL FEEDER']):
|
| 359 |
+
return 'PROCESS_SYSTEMS', 5.5
|
| 360 |
+
elif any(keyword in equipment_upper for keyword in ['DECRASSEUR', 'DΓGRILLEUR', 'FILTRE', 'CAPTEUR', 'TRANSMETTEUR']):
|
| 361 |
+
return 'AUXILIARY_SYSTEMS', 5.0
|
| 362 |
+
else:
|
| 363 |
+
return 'UNKNOWN', 4.5
|
| 364 |
+
|
| 365 |
+
def detect_equipment_redundancy(equipment_desc):
|
| 366 |
+
"""Detect equipment redundancy based on naming patterns"""
|
| 367 |
+
equipment_upper = str(equipment_desc).upper()
|
| 368 |
+
|
| 369 |
+
if any(pattern in equipment_upper for pattern in ['PRINCIPAL', 'UNIQUE']):
|
| 370 |
+
return 'SINGLE_CRITICAL', 1.3
|
| 371 |
+
elif any(re.search(pattern, equipment_upper) for pattern in [r'\b[AB]$', r'NΒ°[12]$', r'PRIMAIRE$', r'SECONDAIRE$']):
|
| 372 |
+
return 'DUAL_SYSTEM', 1.0
|
| 373 |
+
elif any(re.search(pattern, equipment_upper) for pattern in [r'NΒ°[3-9]$', r'NΒ°[0-9][0-9]$']):
|
| 374 |
+
return 'MULTIPLE_SYSTEM', 0.8
|
| 375 |
+
else:
|
| 376 |
+
return 'UNKNOWN_REDUNDANCY', 1.0
|
| 377 |
+
|
| 378 |
+
# Apply equipment intelligence
|
| 379 |
+
if 'Description de l\'Γ©quipement' in features_df.columns:
|
| 380 |
+
equipment_classifications = features_df['Description de l\'Γ©quipement'].apply(classify_equipment_type)
|
| 381 |
+
features_df['equipment_type_class'] = [x[0] for x in equipment_classifications]
|
| 382 |
+
features_df['equipment_base_criticality'] = [x[1] for x in equipment_classifications]
|
| 383 |
+
|
| 384 |
+
redundancy_classifications = features_df['Description de l\'Γ©quipement'].apply(detect_equipment_redundancy)
|
| 385 |
+
features_df['equipment_redundancy_class'] = [x[0] for x in redundancy_classifications]
|
| 386 |
+
features_df['equipment_redundancy_multiplier'] = [x[1] for x in redundancy_classifications]
|
| 387 |
+
else:
|
| 388 |
+
features_df['equipment_type_class'] = 'UNKNOWN'
|
| 389 |
+
features_df['equipment_base_criticality'] = 4.5
|
| 390 |
+
features_df['equipment_redundancy_class'] = 'UNKNOWN_REDUNDANCY'
|
| 391 |
+
features_df['equipment_redundancy_multiplier'] = 1.0
|
| 392 |
+
|
| 393 |
+
# Section risk multiplier
|
| 394 |
+
section_risk_multipliers = {'34EL': 1.2, '34MM': 1.1, '34MD': 1.1, '34MC': 1.0, '34CT': 1.0}
|
| 395 |
+
features_df['section_risk_multiplier'] = features_df.get('Section propriΓ©taire', '').map(section_risk_multipliers).fillna(1.0)
|
| 396 |
+
|
| 397 |
+
# Combined equipment risk score
|
| 398 |
+
features_df['equipment_risk_score'] = (features_df['equipment_base_criticality'] *
|
| 399 |
+
features_df['equipment_redundancy_multiplier'] *
|
| 400 |
+
features_df['section_risk_multiplier'])
|
| 401 |
+
|
| 402 |
+
# Enhanced keyword extraction
|
| 403 |
+
def extract_keywords_dual_field(description, equipment_desc, keyword_dict):
|
| 404 |
+
"""Extract keywords from both description and equipment description"""
|
| 405 |
+
combined_text = (str(description) + ' ' + str(equipment_desc)).lower()
|
| 406 |
+
found_keywords = []
|
| 407 |
+
|
| 408 |
+
for category, keywords in keyword_dict.items():
|
| 409 |
+
for keyword in keywords:
|
| 410 |
+
if keyword in combined_text:
|
| 411 |
+
found_keywords.append(category)
|
| 412 |
+
break
|
| 413 |
+
|
| 414 |
+
return found_keywords
|
| 415 |
+
|
| 416 |
+
# Keyword dictionaries (from training pipeline)
|
| 417 |
+
equipment_keywords = {
|
| 418 |
+
'pompe': ['pompe', 'pompes'],
|
| 419 |
+
'vanne': ['vanne', 'vannes'],
|
| 420 |
+
'ventilateur': ['ventilateur', 'ventilateurs', 'ventilo'],
|
| 421 |
+
'moteur': ['moteur', 'moteurs', 'moto'],
|
| 422 |
+
'alternateur': ['alternateur', 'alternateurs'],
|
| 423 |
+
'transformateur': ['transformateur', 'transformateurs', 'transfo'],
|
| 424 |
+
'turbine': ['turbine', 'turbines'],
|
| 425 |
+
'principal': ['principal', 'principale'],
|
| 426 |
+
'groupe': ['groupe', 'groupes']
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
problem_keywords = {
|
| 430 |
+
'fuite': ['fuite', 'fuites', 'fuit', 'fuyant'],
|
| 431 |
+
'vibration': ['vibration', 'vibrations', 'vibre'],
|
| 432 |
+
'bruit_anormal': ['bruit anormal', 'bruit anormale'],
|
| 433 |
+
'percement': ['percement', 'percΓ©', 'percΓ©e'],
|
| 434 |
+
'Γ©clatement': ['Γ©clatement', 'eclatement'],
|
| 435 |
+
'fissure': ['fissure', 'fissurΓ©', 'fissures'],
|
| 436 |
+
'aggravation': ['aggravation'],
|
| 437 |
+
'sifflement': ['sifflement', 'siffler'],
|
| 438 |
+
'dΓ©faillance': ['dΓ©faillance', 'dΓ©faillant'],
|
| 439 |
+
'dysfonctionnement': ['dysfonctionnement', 'dysfonctionnel'],
|
| 440 |
+
'sens_inverse': ['sens inverse', 'sens contraire'],
|
| 441 |
+
'surchauffe': ['surchauffe', 'surchauffΓ©', 'tempΓ©rature Γ©levΓ©e', 'temp elevee']
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
action_keywords = {
|
| 445 |
+
'maintenance': ['maintenance', 'entretien'],
|
| 446 |
+
'prΓ©vision': ['prΓ©voir', 'prΓ©voire', 'prevoir'],
|
| 447 |
+
'remplacement': ['remplacement', 'remplacer', 'remplacΓ©']
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
urgency_keywords = {
|
| 451 |
+
'safety': ['safety', 'sΓ©curitΓ©'],
|
| 452 |
+
'urgent': ['urgent', 'urgence'],
|
| 453 |
+
'critique': ['critique', 'critiques'],
|
| 454 |
+
'important': ['important', 'importante']
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
# Apply keyword extraction
|
| 458 |
+
description_col = features_df['Description']
|
| 459 |
+
equipment_col = features_df.get('Description de l\'Γ©quipement', '')
|
| 460 |
+
|
| 461 |
+
features_df['equipment_mentioned'] = features_df.apply(
|
| 462 |
+
lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), equipment_keywords),
|
| 463 |
+
axis=1
|
| 464 |
+
)
|
| 465 |
+
features_df['equipment_count'] = features_df['equipment_mentioned'].str.len()
|
| 466 |
+
|
| 467 |
+
features_df['problem_types'] = features_df.apply(
|
| 468 |
+
lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), problem_keywords),
|
| 469 |
+
axis=1
|
| 470 |
+
)
|
| 471 |
+
features_df['problem_count'] = features_df['problem_types'].str.len()
|
| 472 |
+
|
| 473 |
+
features_df['actions_mentioned'] = features_df.apply(
|
| 474 |
+
lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), action_keywords),
|
| 475 |
+
axis=1
|
| 476 |
+
)
|
| 477 |
+
features_df['action_count'] = features_df['actions_mentioned'].str.len()
|
| 478 |
+
|
| 479 |
+
features_df['urgency_indicators'] = features_df.apply(
|
| 480 |
+
lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), urgency_keywords),
|
| 481 |
+
axis=1
|
| 482 |
+
)
|
| 483 |
+
features_df['has_urgency'] = (features_df['urgency_indicators'].str.len() > 0).astype(int)
|
| 484 |
+
|
| 485 |
+
# Critical failure pattern detection
|
| 486 |
+
features_df['has_structural_failure'] = features_df['combined_text_lower'].str.contains(
|
| 487 |
+
'percement|Γ©clatement|eclatement|fissure|rupture', regex=True, na=False
|
| 488 |
+
).astype(int)
|
| 489 |
+
|
| 490 |
+
features_df['has_equipment_malfunction'] = features_df['combined_text_lower'].str.contains(
|
| 491 |
+
'sens inverse|dysfonctionnement|dΓ©faillance|dΓ©faut|panne', regex=True, na=False
|
| 492 |
+
).astype(int)
|
| 493 |
+
|
| 494 |
+
features_df['has_escalation'] = features_df['combined_text_lower'].str.contains(
|
| 495 |
+
'aggravation|empirΓ©|empire', regex=True, na=False
|
| 496 |
+
).astype(int)
|
| 497 |
+
|
| 498 |
+
features_df['has_safety_mention'] = features_df['Description'].str.contains('SAFETY', case=False, na=False).astype(int)
|
| 499 |
+
|
| 500 |
+
# Specific high-risk combinations
|
| 501 |
+
features_df['electrical_cooling_issue'] = (
|
| 502 |
+
(features_df['equipment_type_class'].isin(['ELECTRICAL_CRITICAL', 'ELECTRICAL_STANDARD'])) &
|
| 503 |
+
(features_df['combined_text_lower'].str.contains('refroidissement|ventilateur|tempΓ©rature', regex=True, na=False))
|
| 504 |
+
).astype(int)
|
| 505 |
+
|
| 506 |
+
features_df['turbine_oil_issue'] = (
|
| 507 |
+
(features_df['equipment_type_class'] == 'TURBINE_SYSTEMS') &
|
| 508 |
+
(features_df['combined_text_lower'].str.contains('huile|fuite|graissage', regex=True, na=False))
|
| 509 |
+
).astype(int)
|
| 510 |
+
|
| 511 |
+
features_df['main_equipment_failure'] = (
|
| 512 |
+
(features_df['equipment_redundancy_class'] == 'SINGLE_CRITICAL') &
|
| 513 |
+
(features_df['has_structural_failure'] == 1)
|
| 514 |
+
).astype(int)
|
| 515 |
+
|
| 516 |
+
# Enhanced compound features
|
| 517 |
+
features_df['fuite_vapeur'] = features_df['combined_text_lower'].str.contains('fuite.*vapeur|vapeur.*fuite', regex=True, na=False).astype(int)
|
| 518 |
+
features_df['fuite_huile'] = features_df['combined_text_lower'].str.contains('fuite.*huile|huile.*fuite', regex=True, na=False).astype(int)
|
| 519 |
+
features_df['fuite_eau'] = features_df['combined_text_lower'].str.contains('fuite.*eau|eau.*fuite', regex=True, na=False).astype(int)
|
| 520 |
+
features_df['bruit_anormal'] = features_df['combined_text_lower'].str.contains('bruit anormal', regex=True, na=False).astype(int)
|
| 521 |
+
features_df['vibration_excessive'] = features_df['combined_text_lower'].str.contains('vibration.*excessive|vibration.*Γ©levΓ©e', regex=True, na=False).astype(int)
|
| 522 |
+
features_df['temperature_elevee'] = features_df['combined_text_lower'].str.contains('tempΓ©rature Γ©levΓ©e|temp Γ©levΓ©e|temp elevee', regex=True, na=False).astype(int)
|
| 523 |
+
features_df['maintenance_planning'] = features_df['combined_text_lower'].str.contains('prΓ©voir|prΓ©voire|planifier', regex=True, na=False).astype(int)
|
| 524 |
+
features_df['is_recurring'] = features_df['combined_text_lower'].str.contains('frΓ©quent|rΓ©pΓ©titif|souvent', regex=True, na=False).astype(int)
|
| 525 |
+
|
| 526 |
+
# Technical features
|
| 527 |
+
features_df['has_measurements'] = features_df['combined_text_lower'].str.contains(r'\d+\s*Β°c|\d+\s*bar|\d+\s*%', regex=True, na=False).astype(int)
|
| 528 |
+
features_df['has_equipment_codes'] = features_df['combined_text_lower'].str.contains(r'[A-Z0-9]{5,}', regex=True, na=False).astype(int)
|
| 529 |
+
features_df['has_location_details'] = features_df['combined_text_lower'].str.contains('niveau|angle|cΓ΄tΓ©|palier', regex=True, na=False).astype(int)
|
| 530 |
+
|
| 531 |
+
# Enhanced severity scoring
|
| 532 |
+
severity_words = {
|
| 533 |
+
'critique': 4, 'grave': 4, 'majeur': 4, 'important': 3,
|
| 534 |
+
'total': 5, 'complet': 5, 'rupture': 5, 'Γ©clatement': 5,
|
| 535 |
+
'percement': 5, 'fissure': 4, 'aggravation': 4, 'urgent': 3
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
def calculate_enhanced_severity_score(text):
|
| 539 |
+
text = str(text).lower()
|
| 540 |
+
max_score = 0
|
| 541 |
+
for word, weight in severity_words.items():
|
| 542 |
+
if word in text:
|
| 543 |
+
max_score = max(max_score, weight)
|
| 544 |
+
return max_score
|
| 545 |
+
|
| 546 |
+
features_df['enhanced_severity_score'] = features_df['combined_text_lower'].apply(calculate_enhanced_severity_score)
|
| 547 |
+
|
| 548 |
+
# Equipment-Problem Risk Matrix
|
| 549 |
+
def calculate_equipment_problem_risk(equipment_type, problem_types, has_structural):
|
| 550 |
+
base_risk = 1.0
|
| 551 |
+
|
| 552 |
+
if equipment_type in ['ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS', 'COOLING_CRITICAL']:
|
| 553 |
+
base_risk = 1.5
|
| 554 |
+
elif equipment_type in ['ELECTRICAL_STANDARD', 'HEATING_SYSTEMS']:
|
| 555 |
+
base_risk = 1.2
|
| 556 |
+
|
| 557 |
+
if has_structural:
|
| 558 |
+
base_risk *= 2.0
|
| 559 |
+
|
| 560 |
+
if 'vibration' in problem_types:
|
| 561 |
+
base_risk *= 1.3
|
| 562 |
+
if 'fuite' in problem_types:
|
| 563 |
+
base_risk *= 1.2
|
| 564 |
+
|
| 565 |
+
return min(base_risk, 3.0)
|
| 566 |
+
|
| 567 |
+
features_df['equipment_problem_risk'] = features_df.apply(
|
| 568 |
+
lambda row: calculate_equipment_problem_risk(
|
| 569 |
+
row['equipment_type_class'],
|
| 570 |
+
row['problem_types'],
|
| 571 |
+
row['has_structural_failure']
|
| 572 |
+
), axis=1
|
| 573 |
+
)
|
| 574 |
+
|
| 575 |
+
# Technical complexity
|
| 576 |
+
features_df['technical_complexity'] = (
|
| 577 |
+
features_df['combined_word_count'] / 15 +
|
| 578 |
+
features_df['equipment_count'] +
|
| 579 |
+
features_df['problem_count'] +
|
| 580 |
+
features_df['has_measurements'] +
|
| 581 |
+
features_df['has_equipment_codes'] +
|
| 582 |
+
features_df['has_location_details']
|
| 583 |
+
)
|
| 584 |
+
|
| 585 |
+
# Fill missing values and ensure proper types
|
| 586 |
+
numeric_columns = features_df.select_dtypes(include=[np.number]).columns
|
| 587 |
+
features_df[numeric_columns] = features_df[numeric_columns].fillna(0)
|
| 588 |
+
|
| 589 |
+
for col in features_df.select_dtypes(include=[np.integer, np.floating, bool]).columns:
|
| 590 |
+
features_df[col] = pd.to_numeric(features_df[col], errors='coerce').fillna(0)
|
| 591 |
+
|
| 592 |
+
return features_df
|
| 593 |
+
|
| 594 |
+
def _predict_criticality(self, input_data: Dict) -> tuple:
|
| 595 |
+
"""Make criticality predictions using enhanced models"""
|
| 596 |
+
|
| 597 |
+
# Convert to DataFrame
|
| 598 |
+
input_df = pd.DataFrame([input_data])
|
| 599 |
+
|
| 600 |
+
target_columns = self.model_metadata['target_columns']
|
| 601 |
+
predictions = {}
|
| 602 |
+
confidences = {}
|
| 603 |
+
probabilities = {}
|
| 604 |
+
|
| 605 |
+
for target in target_columns:
|
| 606 |
+
model = self.models[target]
|
| 607 |
+
pred = model.predict(input_df)[0]
|
| 608 |
+
pred_proba = model.predict_proba(input_df)[0]
|
| 609 |
+
confidence = np.max(pred_proba)
|
| 610 |
+
|
| 611 |
+
predictions[target] = int(pred)
|
| 612 |
+
confidences[target] = float(confidence)
|
| 613 |
+
probabilities[target] = [float(x) for x in pred_proba]
|
| 614 |
+
|
| 615 |
+
return predictions, confidences, probabilities
|
| 616 |
+
|
| 617 |
+
def _apply_safety_override_rules(self, enhanced_features: Dict, predictions: Dict) -> Dict:
|
| 618 |
+
"""Apply safety override rules to predictions"""
|
| 619 |
+
|
| 620 |
+
def _apply_safety_override_rules(self, enhanced_features: Dict, predictions: Dict) -> Dict:
|
| 621 |
+
"""Apply safety override rules to predictions"""
|
| 622 |
+
|
| 623 |
+
if not self.safety_rules:
|
| 624 |
+
return predictions
|
| 625 |
+
|
| 626 |
+
modified_predictions = predictions.copy()
|
| 627 |
+
safety_override_applied = False
|
| 628 |
+
|
| 629 |
+
# Rule 1: Structural failure override
|
| 630 |
+
if enhanced_features.get('has_structural_failure', 0) == 1:
|
| 631 |
+
# Ensure minimum criticality of 9 for structural failures
|
| 632 |
+
total_current = sum(modified_predictions.values())
|
| 633 |
+
if total_current < 9:
|
| 634 |
+
# Boost Process Safety to 5 first (most critical for structural failures)
|
| 635 |
+
if modified_predictions['Process Safety'] < 5:
|
| 636 |
+
modified_predictions['Process Safety'] = 5
|
| 637 |
+
safety_override_applied = True
|
| 638 |
+
|
| 639 |
+
# Then boost FiabilitΓ© if still needed
|
| 640 |
+
total_after_safety = sum(modified_predictions.values())
|
| 641 |
+
if total_after_safety < 9:
|
| 642 |
+
needed_boost = 9 - total_after_safety
|
| 643 |
+
new_fiabilite = min(5, modified_predictions['FiabilitΓ© IntΓ©gritΓ©'] + needed_boost)
|
| 644 |
+
modified_predictions['FiabilitΓ© IntΓ©gritΓ©'] = new_fiabilite
|
| 645 |
+
safety_override_applied = True
|
| 646 |
+
|
| 647 |
+
# Rule 2: Cooling critical equipment override
|
| 648 |
+
if enhanced_features.get('equipment_type_class', '') == 'COOLING_CRITICAL':
|
| 649 |
+
# Ensure minimum criticality of 10 for cooling critical equipment
|
| 650 |
+
total_current = sum(modified_predictions.values())
|
| 651 |
+
if total_current < 10:
|
| 652 |
+
# Boost all components proportionally
|
| 653 |
+
needed_boost = 10 - total_current
|
| 654 |
+
for component in modified_predictions:
|
| 655 |
+
if modified_predictions[component] < 5:
|
| 656 |
+
boost = min(2, needed_boost // 3 + 1)
|
| 657 |
+
modified_predictions[component] = min(5, modified_predictions[component] + boost)
|
| 658 |
+
needed_boost -= boost
|
| 659 |
+
safety_override_applied = True
|
| 660 |
+
if needed_boost <= 0:
|
| 661 |
+
break
|
| 662 |
+
|
| 663 |
+
# Rule 3: Safety mention boost
|
| 664 |
+
if enhanced_features.get('has_safety_mention', 0) == 1:
|
| 665 |
+
# Add +2 to Process Safety for safety mentions
|
| 666 |
+
if modified_predictions['Process Safety'] < 5:
|
| 667 |
+
boost = min(2, 5 - modified_predictions['Process Safety'])
|
| 668 |
+
modified_predictions['Process Safety'] += boost
|
| 669 |
+
safety_override_applied = True
|
| 670 |
+
|
| 671 |
+
# Rule 4: Turbine oil issue override
|
| 672 |
+
if enhanced_features.get('turbine_oil_issue', 0) == 1:
|
| 673 |
+
# Ensure minimum criticality of 8 for turbine oil issues
|
| 674 |
+
total_current = sum(modified_predictions.values())
|
| 675 |
+
if total_current < 8:
|
| 676 |
+
# Boost FiabilitΓ© and DisponibilitΓ© (oil issues affect both)
|
| 677 |
+
needed_boost = 8 - total_current
|
| 678 |
+
for component in ['FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©']:
|
| 679 |
+
if needed_boost > 0 and modified_predictions[component] < 4:
|
| 680 |
+
boost = min(2, needed_boost)
|
| 681 |
+
modified_predictions[component] = min(5, modified_predictions[component] + boost)
|
| 682 |
+
needed_boost -= boost
|
| 683 |
+
safety_override_applied = True
|
| 684 |
+
|
| 685 |
+
# Rule 5: Electrical critical equipment override
|
| 686 |
+
if enhanced_features.get('equipment_type_class', '') == 'ELECTRICAL_CRITICAL':
|
| 687 |
+
# Conservative boost for electrical critical equipment
|
| 688 |
+
for component in modified_predictions:
|
| 689 |
+
if modified_predictions[component] >= 3: # Only boost already elevated predictions
|
| 690 |
+
boost = min(1, 5 - modified_predictions[component])
|
| 691 |
+
if boost > 0:
|
| 692 |
+
modified_predictions[component] += boost
|
| 693 |
+
safety_override_applied = True
|
| 694 |
+
|
| 695 |
+
return modified_predictions
|
| 696 |
+
|
| 697 |
+
def _determine_manual_review_need(self, enhanced_features: Dict, predictions: Dict,
|
| 698 |
+
overall_confidence: float, confidence_threshold: float) -> bool:
|
| 699 |
+
"""Enhanced logic to determine if manual review is needed"""
|
| 700 |
+
|
| 701 |
+
# Base confidence check
|
| 702 |
+
if overall_confidence < confidence_threshold:
|
| 703 |
+
return True
|
| 704 |
+
|
| 705 |
+
# Critical equipment always needs review for high predictions
|
| 706 |
+
if enhanced_features.get('equipment_type_class', '') in ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL', 'TURBINE_SYSTEMS']:
|
| 707 |
+
if sum(predictions.values()) >= 8:
|
| 708 |
+
return True
|
| 709 |
+
|
| 710 |
+
# Structural failures always need review
|
| 711 |
+
if enhanced_features.get('has_structural_failure', 0) == 1:
|
| 712 |
+
return True
|
| 713 |
+
|
| 714 |
+
# Safety mentions need review
|
| 715 |
+
if enhanced_features.get('has_safety_mention', 0) == 1:
|
| 716 |
+
return True
|
| 717 |
+
|
| 718 |
+
# High criticality cases need review
|
| 719 |
+
if sum(predictions.values()) >= 10:
|
| 720 |
+
return True
|
| 721 |
+
|
| 722 |
+
# Equipment malfunction with high-risk equipment
|
| 723 |
+
if (enhanced_features.get('has_equipment_malfunction', 0) == 1 and
|
| 724 |
+
enhanced_features.get('equipment_type_class', '') in ['ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS']):
|
| 725 |
+
return True
|
| 726 |
+
|
| 727 |
+
return False
|
| 728 |
+
|
| 729 |
+
def _assess_equipment_risk(self, enhanced_features: Dict, predictions: Dict) -> Dict:
|
| 730 |
+
"""Assess equipment-specific risk factors"""
|
| 731 |
+
|
| 732 |
+
equipment_type = enhanced_features.get('equipment_type_class', 'UNKNOWN')
|
| 733 |
+
total_criticality = sum(predictions.values())
|
| 734 |
+
|
| 735 |
+
risk_assessment = {
|
| 736 |
+
'equipment_type': equipment_type,
|
| 737 |
+
'redundancy_class': enhanced_features.get('equipment_redundancy_class', 'UNKNOWN'),
|
| 738 |
+
'base_risk_score': enhanced_features.get('equipment_risk_score', 4.5),
|
| 739 |
+
'risk_level': 'LOW',
|
| 740 |
+
'risk_factors': [],
|
| 741 |
+
'business_impact': 'MINOR'
|
| 742 |
+
}
|
| 743 |
+
|
| 744 |
+
# Determine risk level based on equipment type and criticality
|
| 745 |
+
if equipment_type == 'COOLING_CRITICAL':
|
| 746 |
+
risk_assessment['risk_level'] = 'CRITICAL'
|
| 747 |
+
risk_assessment['business_impact'] = 'SEVERE'
|
| 748 |
+
risk_assessment['risk_factors'].append('Critical cooling system failure')
|
| 749 |
+
elif equipment_type == 'ELECTRICAL_CRITICAL':
|
| 750 |
+
if total_criticality >= 8:
|
| 751 |
+
risk_assessment['risk_level'] = 'HIGH'
|
| 752 |
+
risk_assessment['business_impact'] = 'MAJOR'
|
| 753 |
+
else:
|
| 754 |
+
risk_assessment['risk_level'] = 'MEDIUM'
|
| 755 |
+
risk_assessment['business_impact'] = 'MODERATE'
|
| 756 |
+
risk_assessment['risk_factors'].append('Electrical critical infrastructure')
|
| 757 |
+
elif equipment_type == 'TURBINE_SYSTEMS':
|
| 758 |
+
if total_criticality >= 8:
|
| 759 |
+
risk_assessment['risk_level'] = 'HIGH'
|
| 760 |
+
risk_assessment['business_impact'] = 'MAJOR'
|
| 761 |
+
else:
|
| 762 |
+
risk_assessment['risk_level'] = 'MEDIUM'
|
| 763 |
+
risk_assessment['business_impact'] = 'MODERATE'
|
| 764 |
+
risk_assessment['risk_factors'].append('Turbine system component')
|
| 765 |
+
|
| 766 |
+
# Add specific risk factors
|
| 767 |
+
if enhanced_features.get('has_structural_failure', 0) == 1:
|
| 768 |
+
risk_assessment['risk_factors'].append('Structural integrity compromise')
|
| 769 |
+
risk_assessment['risk_level'] = 'HIGH'
|
| 770 |
+
|
| 771 |
+
if enhanced_features.get('has_safety_mention', 0) == 1:
|
| 772 |
+
risk_assessment['risk_factors'].append('Safety concern flagged')
|
| 773 |
+
|
| 774 |
+
if enhanced_features.get('equipment_redundancy_class', '') == 'SINGLE_CRITICAL':
|
| 775 |
+
risk_assessment['risk_factors'].append('Single point of failure')
|
| 776 |
+
|
| 777 |
+
if enhanced_features.get('turbine_oil_issue', 0) == 1:
|
| 778 |
+
risk_assessment['risk_factors'].append('Turbine lubrication system issue')
|
| 779 |
+
|
| 780 |
+
if enhanced_features.get('electrical_cooling_issue', 0) == 1:
|
| 781 |
+
risk_assessment['risk_factors'].append('Electrical equipment cooling problem')
|
| 782 |
+
|
| 783 |
+
# Determine business impact based on total criticality and equipment type
|
| 784 |
+
if total_criticality >= 12:
|
| 785 |
+
risk_assessment['business_impact'] = 'SEVERE'
|
| 786 |
+
elif total_criticality >= 10:
|
| 787 |
+
risk_assessment['business_impact'] = 'MAJOR'
|
| 788 |
+
elif total_criticality >= 8:
|
| 789 |
+
risk_assessment['business_impact'] = 'MODERATE'
|
| 790 |
+
|
| 791 |
+
return risk_assessment
|
| 792 |
+
|
| 793 |
+
def _find_similar_anomalies(self, description: str, top_k: int = 3) -> List[Dict]:
|
| 794 |
+
"""Find similar historical anomalies"""
|
| 795 |
+
|
| 796 |
+
if not description or self.sentence_model is None or self.embeddings is None:
|
| 797 |
+
return []
|
| 798 |
+
|
| 799 |
+
try:
|
| 800 |
+
# Encode new description
|
| 801 |
+
new_embedding = self.sentence_model.encode([description])
|
| 802 |
+
|
| 803 |
+
# Calculate similarities
|
| 804 |
+
similarities = cosine_similarity(new_embedding, self.embeddings)[0]
|
| 805 |
+
|
| 806 |
+
# Get top k most similar
|
| 807 |
+
top_indices = np.argsort(similarities)[::-1]
|
| 808 |
+
|
| 809 |
+
similar_anomalies = []
|
| 810 |
+
for idx in top_indices[:top_k*2]: # Check more to filter
|
| 811 |
+
similarity_score = float(similarities[idx])
|
| 812 |
+
|
| 813 |
+
# Skip if too similar (likely duplicate) or too dissimilar
|
| 814 |
+
if similarity_score > 0.99 or similarity_score < 0.15:
|
| 815 |
+
continue
|
| 816 |
+
|
| 817 |
+
if len(similar_anomalies) >= top_k:
|
| 818 |
+
break
|
| 819 |
+
|
| 820 |
+
similar_anomalies.append({
|
| 821 |
+
'description': self.embedding_metadata['descriptions'][idx],
|
| 822 |
+
'criticality': int(self.embedding_metadata['criticality_scores'][idx]),
|
| 823 |
+
'similarity_score': round(similarity_score, 3),
|
| 824 |
+
'section': self.embedding_metadata.get('sections', ['Unknown'])[idx],
|
| 825 |
+
'equipment_mentioned': self.embedding_metadata.get('equipment_mentioned', [[]])[idx]
|
| 826 |
+
})
|
| 827 |
+
|
| 828 |
+
return similar_anomalies
|
| 829 |
+
|
| 830 |
+
except Exception as e:
|
| 831 |
+
print(f"Warning: Similarity search failed: {e}")
|
| 832 |
+
return []
|
| 833 |
+
|
| 834 |
+
def _format_simple_response(self, anomaly_data: Dict, predictions: Dict,
|
| 835 |
+
total_criticality: int, overall_confidence: float,
|
| 836 |
+
needs_review: bool, equipment_risk_assessment: Dict) -> Dict:
|
| 837 |
+
"""Format simple response for database insertion"""
|
| 838 |
+
|
| 839 |
+
return {
|
| 840 |
+
'timestamp': datetime.now().isoformat(),
|
| 841 |
+
'input_description': anomaly_data.get('Description', ''),
|
| 842 |
+
'input_section': anomaly_data.get('Section propriΓ©taire', ''),
|
| 843 |
+
'input_equipment': anomaly_data.get('Description de l\'Γ©quipement', ''),
|
| 844 |
+
|
| 845 |
+
# Predictions
|
| 846 |
+
'predicted_criticite': total_criticality,
|
| 847 |
+
'predicted_fiabilite': predictions['FiabilitΓ© IntΓ©gritΓ©'],
|
| 848 |
+
'predicted_disponibilite': predictions['DisponibiltΓ©'],
|
| 849 |
+
'predicted_safety': predictions['Process Safety'],
|
| 850 |
+
|
| 851 |
+
# AI Metrics
|
| 852 |
+
'ai_confidence': round(overall_confidence, 3),
|
| 853 |
+
'needs_manual_review': bool(needs_review),
|
| 854 |
+
|
| 855 |
+
# Equipment Intelligence
|
| 856 |
+
'equipment_type': equipment_risk_assessment['equipment_type'],
|
| 857 |
+
'equipment_risk_level': equipment_risk_assessment['risk_level'],
|
| 858 |
+
'business_impact': equipment_risk_assessment['business_impact'],
|
| 859 |
+
'safety_override_applied': any(pred > 3 for pred in predictions.values()),
|
| 860 |
+
|
| 861 |
+
# Metadata
|
| 862 |
+
'model_version': '2.0_enhanced',
|
| 863 |
+
'processing_timestamp': datetime.now().isoformat()
|
| 864 |
+
}
|
| 865 |
+
|
| 866 |
+
def _format_rich_response(self, anomaly_data: Dict, predictions: Dict,
|
| 867 |
+
confidences: Dict, total_criticality: int,
|
| 868 |
+
overall_confidence: float, similar_anomalies: List,
|
| 869 |
+
needs_review: bool, confidence_threshold: float,
|
| 870 |
+
equipment_risk_assessment: Dict, enhanced_features: Dict) -> Dict:
|
| 871 |
+
"""Format rich response for UI display"""
|
| 872 |
+
|
| 873 |
+
# Calculate additional metrics
|
| 874 |
+
reliability_score = self._calculate_reliability_score(
|
| 875 |
+
confidences, enhanced_features, equipment_risk_assessment
|
| 876 |
+
)
|
| 877 |
+
|
| 878 |
+
return {
|
| 879 |
+
'timestamp': datetime.now().isoformat(),
|
| 880 |
+
'input_description': anomaly_data.get('Description', ''),
|
| 881 |
+
'input_section': anomaly_data.get('Section propriΓ©taire', ''),
|
| 882 |
+
'input_equipment': anomaly_data.get('Description de l\'Γ©quipement', ''),
|
| 883 |
+
|
| 884 |
+
'predictions': {
|
| 885 |
+
'criticite_totale': total_criticality,
|
| 886 |
+
'components': {
|
| 887 |
+
'fiabilite_integrite': predictions['FiabilitΓ© IntΓ©gritΓ©'],
|
| 888 |
+
'disponibilite': predictions['DisponibiltΓ©'],
|
| 889 |
+
'process_safety': predictions['Process Safety']
|
| 890 |
+
}
|
| 891 |
+
},
|
| 892 |
+
|
| 893 |
+
'confidence': {
|
| 894 |
+
'overall_confidence': round(overall_confidence, 3),
|
| 895 |
+
'reliability_score': round(reliability_score, 3),
|
| 896 |
+
'component_confidence': {
|
| 897 |
+
'fiabilite_integrite': round(confidences['FiabilitΓ© IntΓ©gritΓ©'], 3),
|
| 898 |
+
'disponibilite': round(confidences['DisponibiltΓ©'], 3),
|
| 899 |
+
'process_safety': round(confidences['Process Safety'], 3)
|
| 900 |
+
},
|
| 901 |
+
'needs_manual_review': bool(needs_review),
|
| 902 |
+
'confidence_threshold': confidence_threshold,
|
| 903 |
+
'recommendation': self._get_confidence_recommendation(reliability_score)
|
| 904 |
+
},
|
| 905 |
+
|
| 906 |
+
'equipment_intelligence': {
|
| 907 |
+
'equipment_type': equipment_risk_assessment['equipment_type'],
|
| 908 |
+
'redundancy_class': equipment_risk_assessment['redundancy_class'],
|
| 909 |
+
'risk_level': equipment_risk_assessment['risk_level'],
|
| 910 |
+
'business_impact': equipment_risk_assessment['business_impact'],
|
| 911 |
+
'risk_factors': equipment_risk_assessment['risk_factors'],
|
| 912 |
+
'base_risk_score': round(equipment_risk_assessment['base_risk_score'], 2)
|
| 913 |
+
},
|
| 914 |
+
|
| 915 |
+
'safety_analysis': {
|
| 916 |
+
'structural_failure_detected': bool(enhanced_features.get('has_structural_failure', 0)),
|
| 917 |
+
'safety_mention_present': bool(enhanced_features.get('has_safety_mention', 0)),
|
| 918 |
+
'equipment_malfunction_detected': bool(enhanced_features.get('has_equipment_malfunction', 0)),
|
| 919 |
+
'escalation_detected': bool(enhanced_features.get('has_escalation', 0)),
|
| 920 |
+
'safety_override_applied': any(pred > 3 for pred in predictions.values()),
|
| 921 |
+
'urgency_level': self._determine_urgency_level(total_criticality, reliability_score, equipment_risk_assessment)
|
| 922 |
+
},
|
| 923 |
+
|
| 924 |
+
'similar_anomalies': similar_anomalies,
|
| 925 |
+
|
| 926 |
+
'analysis': {
|
| 927 |
+
'problem_types_detected': enhanced_features.get('problem_types', []),
|
| 928 |
+
'equipment_mentioned': enhanced_features.get('equipment_mentioned', []),
|
| 929 |
+
'severity_score': enhanced_features.get('enhanced_severity_score', 0),
|
| 930 |
+
'technical_complexity': round(enhanced_features.get('technical_complexity', 0), 2),
|
| 931 |
+
'pattern_indicators': self._identify_critical_patterns(enhanced_features)
|
| 932 |
+
},
|
| 933 |
+
|
| 934 |
+
'model_metadata': {
|
| 935 |
+
'version': '2.0_enhanced',
|
| 936 |
+
'features_used': len([k for k in enhanced_features.keys() if k != 'Description']),
|
| 937 |
+
'equipment_intelligence_enabled': True,
|
| 938 |
+
'safety_rules_enabled': bool(self.safety_rules)
|
| 939 |
+
}
|
| 940 |
+
}
|
| 941 |
+
|
| 942 |
+
def _calculate_reliability_score(self, confidences: Dict, enhanced_features: Dict,
|
| 943 |
+
equipment_risk_assessment: Dict) -> float:
|
| 944 |
+
"""Calculate enhanced reliability score"""
|
| 945 |
+
|
| 946 |
+
# Base prediction confidence
|
| 947 |
+
prediction_confidence = np.mean(list(confidences.values()))
|
| 948 |
+
|
| 949 |
+
# Model agreement (lower std = higher agreement)
|
| 950 |
+
model_agreement = 1.0 - (np.std(list(confidences.values())) / max(np.mean(list(confidences.values())), 0.1))
|
| 951 |
+
|
| 952 |
+
# Feature completeness
|
| 953 |
+
has_description = len(enhanced_features.get('Description', '')) > 10
|
| 954 |
+
has_equipment = enhanced_features.get('equipment_type_class', 'UNKNOWN') != 'UNKNOWN'
|
| 955 |
+
has_section = enhanced_features.get('Section propriΓ©taire', 'Unknown') != 'Unknown'
|
| 956 |
+
feature_completeness = (has_description + has_equipment + has_section) / 3
|
| 957 |
+
|
| 958 |
+
# Equipment intelligence confidence boost
|
| 959 |
+
equipment_confidence_boost = 0.0
|
| 960 |
+
if equipment_risk_assessment['equipment_type'] != 'UNKNOWN':
|
| 961 |
+
equipment_confidence_boost = 0.1
|
| 962 |
+
|
| 963 |
+
# Pattern detection confidence
|
| 964 |
+
pattern_confidence = 0.0
|
| 965 |
+
if enhanced_features.get('has_safety_mention', 0) == 1:
|
| 966 |
+
pattern_confidence += 0.1
|
| 967 |
+
if enhanced_features.get('has_structural_failure', 0) == 1:
|
| 968 |
+
pattern_confidence += 0.15
|
| 969 |
+
if enhanced_features.get('equipment_problem_risk', 0) > 1.5:
|
| 970 |
+
pattern_confidence += 0.1
|
| 971 |
+
|
| 972 |
+
# Combine all factors
|
| 973 |
+
reliability_score = (
|
| 974 |
+
prediction_confidence * 0.4 +
|
| 975 |
+
model_agreement * 0.25 +
|
| 976 |
+
feature_completeness * 0.2 +
|
| 977 |
+
equipment_confidence_boost +
|
| 978 |
+
pattern_confidence
|
| 979 |
+
)
|
| 980 |
+
|
| 981 |
+
return min(reliability_score, 1.0)
|
| 982 |
+
|
| 983 |
+
def _get_confidence_recommendation(self, reliability_score: float) -> str:
|
| 984 |
+
"""Get confidence-based recommendation"""
|
| 985 |
+
if reliability_score >= 0.85:
|
| 986 |
+
return "Very high confidence - Prediction highly reliable"
|
| 987 |
+
elif reliability_score >= 0.75:
|
| 988 |
+
return "High confidence - Prediction can be trusted"
|
| 989 |
+
elif reliability_score >= 0.65:
|
| 990 |
+
return "Medium confidence - Consider expert review for critical decisions"
|
| 991 |
+
elif reliability_score >= 0.5:
|
| 992 |
+
return "Low confidence - Manual review recommended"
|
| 993 |
+
else:
|
| 994 |
+
return "Very low confidence - Expert assessment required"
|
| 995 |
+
|
| 996 |
+
def _determine_urgency_level(self, total_criticality: int, reliability_score: float,
|
| 997 |
+
equipment_risk_assessment: Dict) -> str:
|
| 998 |
+
"""Determine enhanced urgency level"""
|
| 999 |
+
|
| 1000 |
+
# Adjust criticality by reliability and equipment risk
|
| 1001 |
+
adjusted_criticality = total_criticality * reliability_score
|
| 1002 |
+
|
| 1003 |
+
# Equipment type urgency multiplier
|
| 1004 |
+
equipment_urgency_multiplier = 1.0
|
| 1005 |
+
if equipment_risk_assessment['equipment_type'] in ['COOLING_CRITICAL', 'ELECTRICAL_CRITICAL']:
|
| 1006 |
+
equipment_urgency_multiplier = 1.3
|
| 1007 |
+
elif equipment_risk_assessment['equipment_type'] in ['TURBINE_SYSTEMS']:
|
| 1008 |
+
equipment_urgency_multiplier = 1.2
|
| 1009 |
+
|
| 1010 |
+
final_urgency_score = adjusted_criticality * equipment_urgency_multiplier
|
| 1011 |
+
|
| 1012 |
+
if final_urgency_score >= 14:
|
| 1013 |
+
return "EMERGENCY - Immediate shutdown may be required"
|
| 1014 |
+
elif final_urgency_score >= 12:
|
| 1015 |
+
return "CRITICAL - Immediate action required (within 1 hour)"
|
| 1016 |
+
elif final_urgency_score >= 9:
|
| 1017 |
+
return "HIGH - Action required within 24 hours"
|
| 1018 |
+
elif final_urgency_score >= 6:
|
| 1019 |
+
return "MEDIUM - Action required within 1 week"
|
| 1020 |
+
else:
|
| 1021 |
+
return "LOW - Routine maintenance scheduling"
|
| 1022 |
+
|
| 1023 |
+
def _identify_critical_patterns(self, enhanced_features: Dict) -> List[str]:
|
| 1024 |
+
"""Identify critical patterns in the anomaly"""
|
| 1025 |
+
|
| 1026 |
+
patterns = []
|
| 1027 |
+
|
| 1028 |
+
if enhanced_features.get('has_structural_failure', 0) == 1:
|
| 1029 |
+
patterns.append('Structural failure detected')
|
| 1030 |
+
|
| 1031 |
+
if enhanced_features.get('has_safety_mention', 0) == 1:
|
| 1032 |
+
patterns.append('Safety concern explicitly mentioned')
|
| 1033 |
+
|
| 1034 |
+
if enhanced_features.get('electrical_cooling_issue', 0) == 1:
|
| 1035 |
+
patterns.append('Electrical equipment cooling issue')
|
| 1036 |
+
|
| 1037 |
+
if enhanced_features.get('turbine_oil_issue', 0) == 1:
|
| 1038 |
+
patterns.append('Turbine lubrication system problem')
|
| 1039 |
+
|
| 1040 |
+
if enhanced_features.get('main_equipment_failure', 0) == 1:
|
| 1041 |
+
patterns.append('Critical single-point equipment failure')
|
| 1042 |
+
|
| 1043 |
+
if enhanced_features.get('has_escalation', 0) == 1:
|
| 1044 |
+
patterns.append('Problem escalation indicated')
|
| 1045 |
+
|
| 1046 |
+
if enhanced_features.get('vibration_excessive', 0) == 1:
|
| 1047 |
+
patterns.append('Excessive vibration detected')
|
| 1048 |
+
|
| 1049 |
+
if enhanced_features.get('temperature_elevee', 0) == 1:
|
| 1050 |
+
patterns.append('High temperature condition')
|
| 1051 |
+
|
| 1052 |
+
if enhanced_features.get('enhanced_severity_score', 0) >= 4:
|
| 1053 |
+
patterns.append('High severity language detected')
|
| 1054 |
+
|
| 1055 |
+
return patterns
|
| 1056 |
+
|
| 1057 |
+
|
| 1058 |
+
# ============== CONVENIENCE FUNCTIONS ==============
|
| 1059 |
+
|
| 1060 |
+
# Global instance for easy use
|
| 1061 |
+
# Lazily-created module-level singleton of the enhanced AI engine.
_enhanced_ai_instance = None

def get_enhanced_ai_instance():
    """Return the process-wide EnhancedAnomalyIntelligence instance, creating it on first use."""
    global _enhanced_ai_instance
    if _enhanced_ai_instance is None:
        _enhanced_ai_instance = EnhancedAnomalyIntelligence()
    return _enhanced_ai_instance
|
| 1069 |
+
|
| 1070 |
+
def predict_anomaly_single_enhanced(anomaly_data: Dict, **kwargs) -> Dict:
    """Predict criticality for one anomaly via the shared singleton engine."""
    engine = get_enhanced_ai_instance()
    return engine.predict_single(anomaly_data, **kwargs)
|
| 1074 |
+
|
| 1075 |
+
def predict_anomaly_batch_enhanced(anomaly_list: List[Dict], **kwargs) -> List[Dict]:
    """Predict criticality for a batch of anomalies via the shared singleton engine."""
    engine = get_enhanced_ai_instance()
    return engine.predict_batch(anomaly_list, **kwargs)
|
| 1079 |
+
|
| 1080 |
+
def process_excel_upload_enhanced(excel_data: pd.DataFrame,
                                  confidence_threshold: float = 0.7) -> pd.DataFrame:
    """Annotate an uploaded Excel sheet with enhanced AI prediction columns.

    Args:
        excel_data: DataFrame parsed from the uploaded Excel file.
        confidence_threshold: Confidence level below which rows are
            flagged for manual review.

    Returns:
        Copy of the input DataFrame extended with AI prediction columns
        and empty human-verification columns.
    """
    records = excel_data.to_dict('records')

    # Batch predictions in 'simple' format; similarity search is skipped
    # to keep batch processing fast.
    predictions = predict_anomaly_batch_enhanced(
        records,
        confidence_threshold=confidence_threshold,
        include_similar=False,
        format_type='simple',
        apply_safety_rules=True
    )

    result_df = excel_data.copy()

    # Numeric prediction columns pulled straight from each result record.
    for column, key, default in (
        ('AI_Predicted_Criticite', 'predicted_criticite', 0),
        ('AI_Predicted_Fiabilite', 'predicted_fiabilite', 0),
        ('AI_Predicted_Disponibilite', 'predicted_disponibilite', 0),
        ('AI_Predicted_Safety', 'predicted_safety', 0),
        ('AI_Confidence', 'ai_confidence', 0.0),
    ):
        result_df[column] = [p.get(key, default) for p in predictions]

    result_df['AI_Needs_Review'] = [bool(p.get('needs_manual_review', True)) for p in predictions]

    # Equipment-intelligence columns.
    for column, key, default in (
        ('AI_Equipment_Type', 'equipment_type', 'UNKNOWN'),
        ('AI_Risk_Level', 'equipment_risk_level', 'LOW'),
        ('AI_Business_Impact', 'business_impact', 'MINOR'),
    ):
        result_df[column] = [p.get(key, default) for p in predictions]

    result_df['AI_Safety_Override'] = [bool(p.get('safety_override_applied', False)) for p in predictions]

    # Empty human-verification columns to be filled by reviewers.
    for column, default in (
        ('Human_Verified', False),
        ('Human_Criticite', None),
        ('Human_Fiabilite', None),
        ('Human_Disponibilite', None),
        ('Human_Safety', None),
        ('Correction_Reason', ''),
        ('Verified_At', None),
        ('Verified_By', ''),
        ('Expert_Notes', ''),
    ):
        result_df[column] = default

    return result_df
|
| 1134 |
+
|
| 1135 |
+
|
| 1136 |
+
# ============== ENHANCED EXAMPLE USAGE ==============
|
| 1137 |
+
|
| 1138 |
+
if __name__ == "__main__":
    # Smoke-test driver: exercises the enhanced prediction API end to end.
    # Requires the trained models/embeddings to be loadable by the engine.

    # Example 1: Enhanced single anomaly prediction (rich format, with
    # safety rules and similarity lookup enabled).
    print("="*70)
    print("TESTING ENHANCED SINGLE ANOMALY PREDICTION")
    print("="*70)

    single_anomaly = {
        'Description': 'SAFETY : fuite vapeur importante sur TRANSFO PRINCIPAL, température élevée detectée, vibration excessive',
        'Section propriétaire': '34EL',
        'Description de l\'équipement': 'TRANSFO PRINCIPAL'
    }

    result = predict_anomaly_single_enhanced(
        single_anomaly,
        format_type='rich',
        apply_safety_rules=True,
        include_similar=True
    )

    print("Enhanced rich format result:")
    print(f"Predicted Criticality: {result['predictions']['criticite_totale']}")
    print(f"Equipment Type: {result['equipment_intelligence']['equipment_type']}")
    print(f"Risk Level: {result['equipment_intelligence']['risk_level']}")
    print(f"Business Impact: {result['equipment_intelligence']['business_impact']}")
    print(f"Safety Override Applied: {result['safety_analysis']['safety_override_applied']}")
    print(f"Urgency Level: {result['safety_analysis']['urgency_level']}")
    print(f"Risk Factors: {result['equipment_intelligence']['risk_factors']}")

    # Example 2: Enhanced batch processing (simple format, no similarity
    # lookup, suitable for bulk ingestion).
    print("\n" + "="*70)
    print("TESTING ENHANCED BATCH PREDICTION")
    print("="*70)

    batch_anomalies = [
        {
            'Description': 'vibration excessive ALTERNATEUR, bruit anormal détecté',
            'Section propriétaire': '34EL',
            'Description de l\'équipement': 'ALTERNATEUR'
        },
        {
            'Description': 'fuite huile système hydraulique TURBINE, pression basse',
            'Section propriétaire': '34MM',
            'Description de l\'équipement': 'TURBINE'
        },
        {
            'Description': 'maintenance préventive DECRASSEUR à prévoir',
            'Section propriétaire': '34MC',
            'Description de l\'équipement': 'DECRASSEUR'
        },
        {
            'Description': 'percement conduite vapeur VENTILATEUR DE REFROIDISSEMENT TP',
            'Section propriétaire': '34EL',
            'Description de l\'équipement': 'VENTILATEUR DE REFROIDISSEMENT TP'
        }
    ]

    batch_results = predict_anomaly_batch_enhanced(
        batch_anomalies,
        confidence_threshold=0.7,
        format_type='simple',
        apply_safety_rules=True
    )

    print("Enhanced batch results:")
    for i, result in enumerate(batch_results):
        print(f"\nAnomaly {i+1}:")
        print(f"  Equipment Type: {result.get('equipment_type', 'N/A')}")
        print(f"  Criticité: {result.get('predicted_criticite', 'N/A')}")
        print(f"  Risk Level: {result.get('equipment_risk_level', 'N/A')}")
        print(f"  Business Impact: {result.get('business_impact', 'N/A')}")
        print(f"  Confidence: {result.get('ai_confidence', 'N/A')}")
        print(f"  Safety Override: {result.get('safety_override_applied', 'N/A')}")
        print(f"  Needs Review: {result.get('needs_manual_review', 'N/A')}")

    # Example 3: Enhanced Excel processing simulation — verifies that the
    # upload pipeline adds the AI_* and human-verification columns.
    print("\n" + "="*70)
    print("TESTING ENHANCED EXCEL PROCESSING")
    print("="*70)

    # Simulate Excel data with various equipment types
    excel_df = pd.DataFrame([
        {
            'Description': 'problème refroidissement TRANSFO PRINCIPAL',
            'Section propriétaire': '34EL',
            'Description de l\'équipement': 'TRANSFO PRINCIPAL',
            'Date de détéction de l\'anomalie': '2025-01-15'
        },
        {
            'Description': 'SAFETY : éclatement tube chaudière, fissure détectée',
            'Section propriétaire': '34MD',
            'Description de l\'équipement': 'CHAUDIERE',
            'Date de détéction de l\'anomalie': '2025-01-16'
        },
        {
            'Description': 'maintenance POMPE à prévoir',
            'Section propriétaire': '34MC',
            'Description de l\'équipement': 'POMPE',
            'Date de détéction de l\'anomalie': '2025-01-17'
        }
    ])

    processed_df = process_excel_upload_enhanced(excel_df, confidence_threshold=0.7)

    print("Enhanced processed Excel columns:")
    enhanced_columns = [col for col in processed_df.columns if col.startswith('AI_')]
    print(enhanced_columns)

    print("\nSample of enhanced processed data:")
    display_cols = ['Description', 'AI_Predicted_Criticite', 'AI_Equipment_Type',
                    'AI_Risk_Level', 'AI_Business_Impact', 'AI_Safety_Override', 'AI_Needs_Review']
    print(processed_df[display_cols].to_string(index=False))

    print("\n" + "π―" + "="*68)
    print("ENHANCED ANOMALY INTELLIGENCE v2.0 TESTS COMPLETED SUCCESSFULLY!")
    print("="*70)
    print("β Equipment Intelligence Integration")
    print("β Safety Override Rules")
    print("β Enhanced Risk Assessment")
    print("β Conservative Prediction Bias")
    print("β Business Impact Analysis")
    print("β Production-Ready Performance")
    print("="*70)
|
descritption_v2.py
ADDED
|
@@ -0,0 +1,942 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# enhanced_data_processing_v2.py
|
| 2 |
+
# TAQATHON 2025 - Enhanced Data Processing with Equipment Intelligence
|
| 3 |
+
# Incorporates dual-field analysis + equipment criticality patterns from analysis
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
import re
|
| 8 |
+
from collections import Counter, defaultdict
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
import seaborn as sns
|
| 11 |
+
from wordcloud import WordCloud
|
| 12 |
+
import warnings
|
| 13 |
+
warnings.filterwarnings('ignore')
|
| 14 |
+
|
| 15 |
+
print("="*70)
print("TAQATHON 2025 - ENHANCED DATA PROCESSING v2.0")
print("Equipment Intelligence + Dual-Field Analysis + Noise-Robust Features")
print("="*70)

# ============== STEP 1: LOAD DATA AND BASIC SETUP ==============
print("\n" + "="*50)
print("STEP 1: LOADING DATA AND BASIC SETUP")
print("="*50)

# Load the data from the 'Oracle' sheet of the hackathon workbook.
try:
    df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle')
    print(f"✓ Successfully loaded dataset: {df.shape}")
except FileNotFoundError:
    print("❌ Error: Taqathon_data.xlsx not found!")
    # BUG FIX: exit() is an interactive convenience injected by site.py and is
    # not guaranteed to exist when running as a script (e.g. python -S).
    # Raising SystemExit(1) is the reliable equivalent with the same exit code.
    raise SystemExit(1)

print("Columns:", df.columns.tolist())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Clean data: both free-text fields are required by every downstream step,
# so rows missing either one are unusable.
df = df.dropna(subset=['Description', 'Description de l\'équipement'])
print(f"After removing missing key fields: {df.shape}")

# Convert date column to datetime.
# NOTE: the column name spelling "détéction" (sic) matches the source
# spreadsheet header — do not "correct" it.
df['Date de détéction de l\'anomalie'] = pd.to_datetime(df['Date de détéction de l\'anomalie'])

# Remove exact duplicate rows
df = df.drop_duplicates()
print(f"After removing duplicates: {df.shape}")
|
| 49 |
+
|
| 50 |
+
# ============== STEP 2: EQUIPMENT INTELLIGENCE SETUP ==============
|
| 51 |
+
print("\n" + "="*50)
|
| 52 |
+
print("STEP 2: EQUIPMENT INTELLIGENCE CLASSIFICATION")
|
| 53 |
+
print("="*50)
|
| 54 |
+
|
| 55 |
+
# Based on our analysis - Equipment Type Criticality Scores.
# Each category maps a list of upper-case keywords (matched as substrings of
# the equipment description) to the average criticality observed for that
# equipment family. Declaration order matters: classify_equipment_type
# returns the FIRST category whose keywords match.
EQUIPMENT_TYPE_SCORES = {
    # High-risk electrical equipment (8.0+ avg criticality)
    'ELECTRICAL_CRITICAL': {
        'keywords': ['ALTERNATEUR', 'TRANSFO PRINCIPAL', 'PROTECTION ALTERNATEUR'],
        'score': 8.0
    },
    # Turbine and power generation systems (7.0+ avg)
    'TURBINE_SYSTEMS': {
        'keywords': ['TURBINE', 'SOUPAPE REGULATRICE', 'REFRIGERANT HUILE', 'POMPE DE SOULÈVEMENT'],
        'score': 7.5
    },
    # Cooling and ventilation systems (7.5+ avg for critical cooling)
    'COOLING_CRITICAL': {
        'keywords': ['VENTILATEUR DE REFROIDISSEMENT', 'REFROIDISSEMENT TP', 'MOTEUR VENTILATEUR DE REFROIDISSEMENT'],
        'score': 7.5
    },
    # Standard electrical equipment (6.0-7.0 avg)
    'ELECTRICAL_STANDARD': {
        'keywords': ['DISJONCTEUR', 'TRANSFORMATEUR', 'MOTEUR', 'ARMOIRE', 'GROUPE'],
        'score': 6.5
    },
    # Heating systems (6.0+ avg)
    'HEATING_SYSTEMS': {
        'keywords': ['RECHAUFFEUR', 'RÉCHAUFFEUR', 'CHAUDIERE', 'CHAUDIÈRE'],
        'score': 6.5
    },
    # Ventilation systems (6.0+ avg)
    'VENTILATION_SYSTEMS': {
        'keywords': ['VENTILATEUR', 'TIRAGE', 'SOUFFLAGE', 'AIR PRIMAIRE', 'AIR SECONDAIRE'],
        'score': 6.0
    },
    # Process systems (5.5+ avg)
    'PROCESS_SYSTEMS': {
        'keywords': ['POMPE', 'SOUPAPE', 'VANNE', 'CONVOYEUR', 'BROYEUR', 'COAL FEEDER'],
        'score': 5.5
    },
    # Auxiliary/maintenance systems (5.0+ avg)
    'AUXILIARY_SYSTEMS': {
        'keywords': ['DECRASSEUR', 'DÉGRILLEUR', 'FILTRE', 'CAPTEUR', 'TRANSMETTEUR'],
        'score': 5.0
    }
}

# Redundancy detection patterns (from analysis): redundancy class -> regexes
# tried against the upper-cased equipment name, plus the criticality
# multiplier applied when the class matches.
# NOTE(review): SINGLE_CRITICAL's third pattern is lookahead-only and matches
# almost any name lacking an A/B or N° suffix, so DUAL_SYSTEM and
# MULTIPLE_SYSTEM are rarely reached — confirm this ordering is intentional.
REDUNDANCY_PATTERNS = {
    'SINGLE_CRITICAL': {
        'patterns': [r'PRINCIPAL', r'UNIQUE', r'^(?!.*[AB]$)(?!.*N°[0-9])(?!.*[0-9]$)'],
        'multiplier': 1.3
    },
    'DUAL_SYSTEM': {
        'patterns': [r'\b[AB]$', r'N°[12]$', r'PRIMAIRE$', r'SECONDAIRE$'],
        'multiplier': 1.0
    },
    'MULTIPLE_SYSTEM': {
        'patterns': [r'N°[3-9]$', r'N°[0-9][0-9]$'],
        'multiplier': 0.8
    }
}

# Section risk multipliers (from analysis): scales equipment criticality by
# the owning section's historical critical-case rate. Unknown sections are
# mapped to 1.0 where this dict is applied.
SECTION_RISK_MULTIPLIERS = {
    '34EL': 1.2,  # Electrical - highest critical case rate
    '34MM': 1.1,  # Mechanical - high turbine/oil systems
    '34MD': 1.1,  # Medium risk
    '34MC': 1.0,  # Lower critical case rate
    '34CT': 1.0   # Control systems
}
|
| 123 |
+
|
| 124 |
+
def classify_equipment_type(equipment_desc):
    """Map an equipment description to a (category, base criticality) pair.

    The description is upper-cased and compared against the keyword lists in
    EQUIPMENT_TYPE_SCORES; the first category (in declaration order) with a
    matching substring wins. Unmatched equipment falls back to
    ('UNKNOWN', 4.5).
    """
    text = str(equipment_desc).upper()
    hit = next(
        (
            (label, spec['score'])
            for label, spec in EQUIPMENT_TYPE_SCORES.items()
            if any(kw in text for kw in spec['keywords'])
        ),
        None,
    )
    return hit if hit is not None else ('UNKNOWN', 4.5)
|
| 134 |
+
|
| 135 |
+
def detect_equipment_redundancy(equipment_desc):
    """Infer a redundancy class and its criticality multiplier from the name.

    Tries the regexes of each class in REDUNDANCY_PATTERNS, in declaration
    order, against the upper-cased equipment name and returns on the first
    hit; names matching nothing yield ('UNKNOWN_REDUNDANCY', 1.0).
    NOTE(review): the SINGLE_CRITICAL entry contains a lookahead-only pattern
    that matches most names without an A/B or N° suffix, so later classes are
    rarely reached — confirm the ordering is intentional.
    """
    name = str(equipment_desc).upper()
    for redundancy_cls, spec in REDUNDANCY_PATTERNS.items():
        if any(re.search(pattern, name) for pattern in spec['patterns']):
            return redundancy_cls, spec['multiplier']
    return 'UNKNOWN_REDUNDANCY', 1.0
|
| 145 |
+
|
| 146 |
+
# Apply equipment intelligence
print("Applying equipment intelligence classification...")

# Equipment type classification: classify_equipment_type returns a
# (category, base score) tuple per row, split here into two columns.
equipment_classifications = df['Description de l\'équipement'].apply(classify_equipment_type)
df['equipment_type_class'] = [x[0] for x in equipment_classifications]
df['equipment_base_criticality'] = [x[1] for x in equipment_classifications]

# Equipment redundancy detection: (class, multiplier) tuples, split likewise.
redundancy_classifications = df['Description de l\'équipement'].apply(detect_equipment_redundancy)
df['equipment_redundancy_class'] = [x[0] for x in redundancy_classifications]
df['equipment_redundancy_multiplier'] = [x[1] for x in redundancy_classifications]

# Section risk multiplier (sections absent from the table default to 1.0)
df['section_risk_multiplier'] = df['Section propriétaire'].map(SECTION_RISK_MULTIPLIERS).fillna(1.0)

# Combined equipment risk score = base criticality x redundancy x section risk
df['equipment_risk_score'] = (df['equipment_base_criticality'] *
                              df['equipment_redundancy_multiplier'] *
                              df['section_risk_multiplier'])

print("✓ Equipment intelligence classification completed")
print(f"Equipment type distribution:")
print(df['equipment_type_class'].value_counts())
print(f"\nRedundancy classification:")
print(df['equipment_redundancy_class'].value_counts())

# ============== STEP 3: DUAL-FIELD TEXT ANALYSIS ==============
print("\n" + "="*50)
print("STEP 3: DUAL-FIELD TEXT ANALYSIS")
print("="*50)

# Create combined text field for comprehensive analysis (anomaly description
# concatenated with the equipment description; NaN treated as empty).
df['combined_text'] = df['Description'].fillna('') + ' ' + df['Description de l\'équipement'].fillna('')
df['combined_text_lower'] = df['combined_text'].str.lower()

# Basic length / word-count features for each field and their concatenation
df['description_length'] = df['Description'].str.len()
df['description_word_count'] = df['Description'].str.split().str.len()
df['equipment_desc_length'] = df['Description de l\'équipement'].str.len()
df['equipment_desc_word_count'] = df['Description de l\'équipement'].str.split().str.len()
df['combined_length'] = df['combined_text'].str.len()
df['combined_word_count'] = df['combined_text'].str.split().str.len()

print(f"Text analysis completed:")
print(f"Average description length: {df['description_length'].mean():.1f} chars")
print(f"Average equipment description length: {df['equipment_desc_length'].mean():.1f} chars")
print(f"Average combined length: {df['combined_length'].mean():.1f} chars")
|
| 194 |
+
|
| 195 |
+
# ============== STEP 4: ENHANCED KEYWORD EXTRACTION ==============
|
| 196 |
+
print("\n" + "="*50)
|
| 197 |
+
print("STEP 4: ENHANCED KEYWORD EXTRACTION (DUAL-FIELD)")
|
| 198 |
+
print("="*50)
|
| 199 |
+
|
| 200 |
+
# Enhanced equipment keywords (from analysis + original).
# Each dict maps a canonical category name to the lower-case surface forms
# searched as substrings of the combined text (see
# extract_keywords_dual_field). Declaration order is the order categories are
# reported in.
equipment_keywords = {
    'pompe': ['pompe', 'pompes'],
    'vanne': ['vanne', 'vannes'],
    'ventilateur': ['ventilateur', 'ventilateurs', 'ventilo'],
    'moteur': ['moteur', 'moteurs', 'moto'],
    'alternateur': ['alternateur', 'alternateurs'],  # HIGH RISK
    'transformateur': ['transformateur', 'transformateurs', 'transfo'],  # HIGH RISK
    'turbine': ['turbine', 'turbines'],  # HIGH RISK
    'chaudière': ['chaudière', 'chaudières', 'chaudiere'],
    'réchauffeur': ['réchauffeur', 'réchauffeurs', 'rechauffeur'],
    'refroidissement': ['refroidissement', 'refroidisseur', 'refrigerant', 'réfrigérant'],  # HIGH RISK
    'compresseur': ['compresseur', 'compresseurs'],
    'soupape': ['soupape', 'soupapes'],
    'décrasseur': ['décrasseur', 'décrasseurs', 'decrasseur'],
    'principal': ['principal', 'principale'],  # SINGLE CRITICAL
    'groupe': ['groupe', 'groupes'],  # HIGH RISK
    'protection': ['protection', 'protections'],
    'armoire': ['armoire', 'armoires'],
    'disjoncteur': ['disjoncteur', 'disjoncteurs']
}

# Enhanced problem keywords (from critical case analysis)
problem_keywords = {
    'fuite': ['fuite', 'fuites', 'fuit', 'fuyant'],
    'vibration': ['vibration', 'vibrations', 'vibre'],
    'bruit_anormal': ['bruit anormal', 'bruit anormale'],  # SPECIFIC PATTERN
    'percement': ['percement', 'percé', 'percée'],  # CRITICAL FAILURE
    'éclatement': ['éclatement', 'eclatement'],  # CRITICAL FAILURE
    'fissure': ['fissure', 'fissuré', 'fissures'],  # STRUCTURAL FAILURE
    'aggravation': ['aggravation'],  # ESCALATION INDICATOR
    'sifflement': ['sifflement', 'siffler'],  # PRESSURE ISSUE
    'défaillance': ['défaillance', 'défaillant'],
    'dysfonctionnement': ['dysfonctionnement', 'dysfonctionnel'],
    'sens_inverse': ['sens inverse', 'sens contraire'],  # CRITICAL MALFUNCTION
    'détachés': ['détachés', 'détaché', 'detaches'],
    'corrosion': ['corrosion', 'corrodé', 'rouille'],
    'usure': ['usure', 'usé', 'usée'],
    'surchauffe': ['surchauffe', 'surchauffé', 'température élevée', 'temp elevee'],
    'blocage': ['blocage', 'bloqué', 'bloque', 'coincé'],
    'dégradation': ['dégradation', 'dégradé'],
    'obstruction': ['obstruction', 'obstrué', 'bouché', 'bouchage']
}

# Enhanced action keywords (what intervention the reporter asks for)
action_keywords = {
    'remplacement': ['remplacement', 'remplacer', 'remplacé', 'changement', 'changer'],
    'réparation': ['réparation', 'réparer', 'réparé'],
    'maintenance': ['maintenance', 'entretien'],
    'prévision': ['prévoir', 'prévoire', 'prevoir'],  # MAINTENANCE PLANNING
    'soufflage': ['soufflage', 'souffler', 'soufflé'],
    'nettoyage': ['nettoyage', 'nettoyer', 'nettoyé'],
    'débouchage': ['débouchage', 'déboucher'],
    'inspection': ['inspection', 'inspecter', 'contrôle', 'contrôler'],
    'révision': ['révision', 'réviser'],
    'remise_état': ['remise en état', 'remise état']
}

# SAFETY and urgency indicators (enhanced)
urgency_keywords = {
    'safety': ['safety', 'sécurité'],  # BUT NOT AUTOMATIC HIGH CRITICALITY
    'urgent': ['urgent', 'urgence'],
    'critique': ['critique', 'critiques'],
    'important': ['important', 'importante'],
    'immédiat': ['immédiat', 'immédiatement'],
    'prioritaire': ['prioritaire', 'priorité'],
    'grave': ['grave', 'graves'],
    'majeur': ['majeur', 'majeure'],
    'dangereux': ['dangereux', 'dangereuse', 'danger'],
    'risque': ['risque', 'risques', 'risqué'],
    'chute': ['chute', 'tomber'],
    'fréquent': ['fréquent', 'fréquente', 'répétitif', 'répétitive']
}
|
| 273 |
+
|
| 274 |
+
def extract_keywords_dual_field(description, equipment_desc, keyword_dict):
    """Return the keyword categories whose terms occur in either text field.

    Both fields are str()-converted, joined with a space and lower-cased;
    each category of `keyword_dict` is reported at most once, in the
    dictionary's declaration order, when any of its terms appears as a
    substring of the combined text.
    """
    haystack = (str(description) + ' ' + str(equipment_desc)).lower()
    return [
        category
        for category, terms in keyword_dict.items()
        if any(term in haystack for term in terms)
    ]
|
| 286 |
+
|
| 287 |
+
# Apply enhanced keyword extraction
print("Extracting enhanced keywords from both fields...")

# Equipment mentions (dual-field): list of matched equipment categories per row
df['equipment_mentioned'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], equipment_keywords),
    axis=1
)
df['equipment_count'] = df['equipment_mentioned'].str.len()

# Problem types (dual-field)
df['problem_types'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], problem_keywords),
    axis=1
)
df['problem_count'] = df['problem_types'].str.len()

# Actions mentioned (dual-field)
df['actions_mentioned'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], action_keywords),
    axis=1
)
df['action_count'] = df['actions_mentioned'].str.len()

# Urgency indicators (dual-field); has_urgency is a boolean flag
df['urgency_indicators'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], urgency_keywords),
    axis=1
)
df['has_urgency'] = df['urgency_indicators'].str.len() > 0

print(f"✓ Enhanced keyword extraction completed")

# ============== STEP 5: CRITICAL FAILURE PATTERN DETECTION ==============
print("\n" + "="*50)
print("STEP 5: CRITICAL FAILURE PATTERN DETECTION")
print("="*50)

# Structural failure indicators (highest severity), as 0/1 flags
df['has_structural_failure'] = df['combined_text_lower'].str.contains(
    'percement|éclatement|eclatement|fissure|rupture', regex=True, na=False
).astype(int)

# Equipment malfunction indicators
df['has_equipment_malfunction'] = df['combined_text_lower'].str.contains(
    'sens inverse|dysfonctionnement|défaillance|défaut|panne', regex=True, na=False
).astype(int)

# Escalation indicators (problem reported as getting worse)
df['has_escalation'] = df['combined_text_lower'].str.contains(
    'aggravation|empiré|empire', regex=True, na=False
).astype(int)

# Safety indicators (but not automatic high criticality); checked on the raw
# Description field only, case-insensitively
df['has_safety_mention'] = df['Description'].str.contains('SAFETY', case=False, na=False).astype(int)

# Specific high-risk combinations (from critical case analysis):
# electrical equipment with a cooling/temperature complaint
df['electrical_cooling_issue'] = (
    (df['equipment_type_class'].isin(['ELECTRICAL_CRITICAL', 'ELECTRICAL_STANDARD'])) &
    (df['combined_text_lower'].str.contains('refroidissement|ventilateur|température', regex=True, na=False))
).astype(int)

# turbine systems with an oil/leak/lubrication complaint
df['turbine_oil_issue'] = (
    (df['equipment_type_class'] == 'TURBINE_SYSTEMS') &
    (df['combined_text_lower'].str.contains('huile|fuite|graissage', regex=True, na=False))
).astype(int)

# structural failure on non-redundant ("single critical") equipment
df['main_equipment_failure'] = (
    (df['equipment_redundancy_class'] == 'SINGLE_CRITICAL') &
    (df['has_structural_failure'] == 1)
).astype(int)

print(f"Critical failure patterns detected:")
print(f"Structural failures: {df['has_structural_failure'].sum()}")
print(f"Equipment malfunctions: {df['has_equipment_malfunction'].sum()}")
print(f"Escalation indicators: {df['has_escalation'].sum()}")
print(f"Electrical cooling issues: {df['electrical_cooling_issue'].sum()}")
print(f"Turbine oil issues: {df['turbine_oil_issue'].sum()}")
print(f"Main equipment failures: {df['main_equipment_failure'].sum()}")

# ============== STEP 6: ENHANCED COMPOUND FEATURES ==============
print("\n" + "="*50)
print("STEP 6: ENHANCED COMPOUND FEATURES")
print("="*50)

# Specific leak types (from original analysis); ".*" allows the two words in
# either order anywhere in the text
df['fuite_vapeur'] = df['combined_text_lower'].str.contains('fuite.*vapeur|vapeur.*fuite', regex=True, na=False).astype(int)
df['fuite_huile'] = df['combined_text_lower'].str.contains('fuite.*huile|huile.*fuite', regex=True, na=False).astype(int)
df['fuite_eau'] = df['combined_text_lower'].str.contains('fuite.*eau|eau.*fuite', regex=True, na=False).astype(int)

# Enhanced vibration/noise detection
df['bruit_anormal'] = df['combined_text_lower'].str.contains('bruit anormal', regex=True, na=False).astype(int)
df['vibration_excessive'] = df['combined_text_lower'].str.contains(
    'vibration.*excessive|vibration.*élevée|vibration.*haute', regex=True, na=False
).astype(int)

# Temperature issues
df['temperature_elevee'] = df['combined_text_lower'].str.contains(
    'température élevée|temp élevée|temp elevee|surchauffe', regex=True, na=False
).astype(int)

# Maintenance prediction indicators ("prévoir" = plan/anticipate)
df['maintenance_planning'] = df['combined_text_lower'].str.contains(
    'prévoir|prévoire|planifier|programmer', regex=True, na=False
).astype(int)

# Recurring issue indicators
df['is_recurring'] = df['combined_text_lower'].str.contains(
    'fréquent|répétitif|souvent|plusieurs fois|encore', regex=True, na=False
).astype(int)

# Measurements and technical details (temperatures, pressures, dimensions)
df['has_measurements'] = df['combined_text_lower'].str.contains(
    r'\d+\s*°c|\d+\s*bar|\d+\s*%|\d+\s*mm|\d+\s*m3', regex=True, na=False
).astype(int)
|
| 402 |
+
|
| 403 |
+
# Detect embedded equipment codes / tag numbers (long alphanumeric runs, or
# the digit-letter-digit site code shape such as "34EL01").
# BUG FIX: this pattern was matched against combined_text_lower, where the
# [A-Z...] classes can never match a letter — the second alternative was dead
# and the first could only match 5+ digit runs. case=False restores the
# intended case-insensitive match without altering the pattern itself.
df['has_equipment_codes'] = df['combined_text_lower'].str.contains(
    r'[A-Z0-9]{5,}|[0-9]{2}[A-Z]{3}[0-9]{2}', case=False, regex=True, na=False
).astype(int)
|
| 406 |
+
|
| 407 |
+
# Equipment location indicators (positional vocabulary suggesting the report
# pinpoints where on the equipment the problem is)
df['has_location_details'] = df['combined_text_lower'].str.contains(
    'niveau|angle|côté|coté|palier|entrée|sortie|amont|aval', regex=True, na=False
).astype(int)

# ============== STEP 7: ADVANCED SEVERITY SCORING ==============
print("\n" + "="*50)
print("STEP 7: ADVANCED SEVERITY SCORING")
print("="*50)

# Enhanced severity word scoring (from critical case analysis).
# Lower-case surface form -> weight on a 1 (benign) to 5 (catastrophic)
# scale; consumed by calculate_enhanced_severity_score, which takes the MAX
# weight found plus a small bonus for multiple hits.
severity_words = {
    'critique': 4, 'critiques': 4,
    'grave': 4, 'graves': 4,
    'majeur': 4, 'majeure': 4,
    'important': 3, 'importante': 3,
    'total': 5, 'totale': 5,
    'complet': 5, 'complète': 5,
    'rupture': 5, 'éclatement': 5, 'eclatement': 5,
    'percement': 5, 'fissure': 4,
    'aggravation': 4,
    'sifflement': 3,
    'sens inverse': 5,
    'dysfonctionnement': 3,
    'défaillance': 3,
    'urgent': 3, 'urgence': 3,
    'immédiat': 3, 'immédiatement': 3,
    'dangereux': 4, 'dangereuse': 4,
    'léger': 1, 'légère': 1,
    'faible': 1, 'petit': 1, 'petite': 1,
    'normal': 1, 'normale': 1
}
|
| 439 |
+
|
| 440 |
+
def calculate_enhanced_severity_score(text):
    """Score the severity of free text against the severity_words lexicon.

    Returns the highest weight among the severity words found as substrings
    of the lower-cased text, plus a 0.5 bonus when more than one distinct
    severity word is present; 0 when none match.
    """
    lowered = str(text).lower()
    hits = [weight for word, weight in severity_words.items() if word in lowered]
    if not hits:
        return 0
    score = max(hits)
    # Bonus for multiple severity indicators in the same report
    return score + 0.5 if len(hits) > 1 else score
|
| 456 |
+
|
| 457 |
+
df['enhanced_severity_score'] = df['combined_text_lower'].apply(calculate_enhanced_severity_score)
|
| 458 |
+
|
| 459 |
+
# Equipment-Problem Risk Matrix
|
| 460 |
+
def calculate_equipment_problem_risk(equipment_type, problem_types, has_structural):
    """Combine equipment class and detected problems into a bounded risk factor.

    High-risk equipment classes start from a larger base risk; a structural
    failure doubles the running value, and vibration / leak / abnormal-noise
    findings each apply a further multiplier. The result is capped at 3.0.
    """
    high_risk = ('ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS', 'COOLING_CRITICAL')
    medium_risk = ('ELECTRICAL_STANDARD', 'HEATING_SYSTEMS')

    if equipment_type in high_risk:
        risk = 1.5
    elif equipment_type in medium_risk:
        risk = 1.2
    else:
        risk = 1.0

    # Structural failure on any equipment is serious
    if has_structural:
        risk *= 2.0

    # Specific problem-type multipliers (applied in this fixed order)
    for problem, factor in (('vibration', 1.3), ('fuite', 1.2), ('bruit_anormal', 1.2)):
        if problem in problem_types:
            risk *= factor

    return min(risk, 3.0)  # cap at 3.0
|
| 483 |
+
|
| 484 |
+
# Compound equipment/problem risk per row (capped at 3.0 by the helper)
df['equipment_problem_risk'] = df.apply(
    lambda row: calculate_equipment_problem_risk(
        row['equipment_type_class'],
        row['problem_types'],
        row['has_structural_failure']
    ), axis=1
)

# Complexity indicators: word count (normalized by 15) plus the count-style
# and 0/1 flag features — a rough proxy for how technically detailed the
# report is
df['technical_complexity'] = (
    df['combined_word_count'] / 15 +  # Normalized word count
    df['equipment_count'] +
    df['problem_count'] +
    df['has_measurements'] +
    df['has_equipment_codes'] +
    df['has_location_details']
)

print(f"✓ Advanced severity scoring completed")
print(f"Enhanced severity score distribution:")
print(df['enhanced_severity_score'].value_counts().sort_index())

# ============== STEP 8: NOISE-ROBUST LABEL ANALYSIS ==============
print("\n" + "="*50)
print("STEP 8: NOISE-ROBUST LABEL ANALYSIS")
print("="*50)

# Identify potentially noisy labels
|
| 512 |
+
def identify_label_inconsistencies(df, similarity_threshold=0.8):
    """Flag criticality labels that look inconsistent with similar anomalies.

    Rows are bucketed by equipment type class, redundancy class and owning
    section. Inside any bucket of at least three rows whose 'Criticité'
    standard deviation exceeds 3.0, rows lying more than two (std + 0.1)
    units from the bucket mean are reported as potential labeling outliers.

    `similarity_threshold` is currently unused; it is kept for interface
    compatibility. Returns a list of dicts describing each suspect row
    (index, criticality, expected range, z-score, group key).
    """
    suspects = []
    group_cols = ['equipment_type_class', 'equipment_redundancy_class', 'Section propriétaire']

    for group_key, bucket in df.groupby(group_cols):
        if len(bucket) < 3:
            continue  # too few rows to call anything an outlier
        std = bucket['Criticité'].std()
        if not std > 3.0:
            continue  # labels already consistent (also skips NaN std)
        mean = bucket['Criticité'].mean()
        for idx, row in bucket.iterrows():
            z = abs(row['Criticité'] - mean) / (std + 0.1)
            if z > 2.0:
                suspects.append({
                    'index': idx,
                    'criticality': row['Criticité'],
                    'expected_range': f"{mean - std:.1f}-{mean + std:.1f}",
                    'z_score': z,
                    'group': group_key,
                })

    return suspects
|
| 542 |
+
|
| 543 |
+
inconsistent_labels = identify_label_inconsistencies(df)

# Mark rows flagged as statistical outliers within their equipment group
df['potentially_mislabeled'] = 0
if inconsistent_labels:
    inconsistent_indices = [case['index'] for case in inconsistent_labels]
    df.loc[inconsistent_indices, 'potentially_mislabeled'] = 1

print(f"Identified {len(inconsistent_labels)} potentially inconsistent labels")
print(f"Percentage of potentially noisy labels: {len(inconsistent_labels)/len(df)*100:.2f}%")

# Create label confidence scores
|
| 553 |
+
def calculate_label_confidence(row):
    """Score in (0, 1] expressing how much the row's 'CriticitΓ©' label is trusted.

    Starts from 1.0, penalises rows flagged as potentially mislabeled, then
    rewards/penalises depending on how closely the actual criticality tracks
    the equipment-derived risk score. The result is capped at 1.0.
    """
    score = 1.0

    # Group-consistency outliers are trusted less.
    if row['potentially_mislabeled']:
        score *= 0.6

    # Compare the label against the equipment-based expectation.
    gap = abs(row['CriticitΓ©'] - row['equipment_risk_score'])
    if gap <= 2:
        # Label agrees with the equipment risk model.
        score *= 1.2
    elif gap > 5:
        # Label is far from the expectation.
        score *= 0.8

    return min(score, 1.0)
|
| 573 |
+
|
| 574 |
+
# Score every row's label and report the resulting confidence distribution.
df['label_confidence'] = df.apply(calculate_label_confidence, axis=1)

print(f"Label confidence distribution:")
print(f"High confidence (>0.9): {(df['label_confidence'] > 0.9).sum()}")
print(f"Medium confidence (0.7-0.9): {((df['label_confidence'] > 0.7) & (df['label_confidence'] <= 0.9)).sum()}")
print(f"Low confidence (<0.7): {(df['label_confidence'] <= 0.7).sum()}")
+
|
| 581 |
+
# ============== STEP 9: CORRELATION ANALYSIS ==============
print("\n" + "="*50)
print("STEP 9: ENHANCED FEATURE CORRELATION ANALYSIS")
print("="*50)

# Engineered features to correlate against the criticality label.
enhanced_features = [
    'equipment_risk_score', 'equipment_base_criticality', 'equipment_redundancy_multiplier',
    'section_risk_multiplier', 'enhanced_severity_score', 'equipment_problem_risk',
    'technical_complexity', 'has_structural_failure', 'has_equipment_malfunction',
    'has_escalation', 'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',
    'combined_word_count', 'equipment_count', 'problem_count', 'action_count',
    'has_urgency', 'bruit_anormal', 'vibration_excessive', 'temperature_elevee',
    'fuite_vapeur', 'fuite_huile', 'maintenance_planning', 'is_recurring',
    'has_measurements', 'has_equipment_codes', 'has_location_details', 'has_safety_mention'
]

target_cols = ['FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety', 'CriticitΓ©']

print("\nTop correlations with CriticitΓ©:")
# Only correlate features that actually exist in the dataframe.
correlations = [
    {'Feature': feature, 'Correlation': df[feature].corr(df['CriticitΓ©'])}
    for feature in enhanced_features
    if feature in df.columns
]

# Rank by absolute correlation strength, strongest first.
correlation_df = pd.DataFrame(correlations).sort_values('Correlation', key=abs, ascending=False)
print(correlation_df.head(15).to_string(index=False))
|
| 609 |
+
|
| 610 |
+
# ============== STEP 10: SAVE ENHANCED DATASET ==============
# Selects the curated feature set (original columns plus all engineered
# features built in the earlier steps) and writes it to CSV for training.
print("\n" + "="*50)
print("STEP 10: SAVING ENHANCED DATASET")
print("="*50)

# Select final feature columns
final_columns = [
    # Original columns
    'Num_equipement', 'Systeme', 'Description', 'Date de dΓ©tΓ©ction de l\'anomalie',
    'Description de l\'Γ©quipement', 'Section propriΓ©taire',
    'FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety', 'CriticitΓ©',

    # Equipment Intelligence Features
    'equipment_type_class', 'equipment_base_criticality', 'equipment_redundancy_class',
    'equipment_redundancy_multiplier', 'section_risk_multiplier', 'equipment_risk_score',

    # Text Analysis Features
    'combined_text', 'description_length', 'description_word_count',
    'equipment_desc_length', 'equipment_desc_word_count', 'combined_length', 'combined_word_count',

    # Enhanced Keyword Features
    'equipment_mentioned', 'equipment_count', 'problem_types', 'problem_count',
    'actions_mentioned', 'action_count', 'urgency_indicators', 'has_urgency',

    # Critical Failure Features
    'has_structural_failure', 'has_equipment_malfunction', 'has_escalation', 'has_safety_mention',
    'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',

    # Compound Features
    'fuite_vapeur', 'fuite_huile', 'fuite_eau', 'bruit_anormal', 'vibration_excessive',
    'temperature_elevee', 'maintenance_planning', 'is_recurring',

    # Technical Features
    'has_measurements', 'has_equipment_codes', 'has_location_details',

    # Advanced Features
    'enhanced_severity_score', 'equipment_problem_risk', 'technical_complexity',

    # Noise-Robust Features
    'potentially_mislabeled', 'label_confidence'
]

# Ensure all columns exist — the export degrades gracefully if an upstream
# step was skipped and a feature column was never created.
available_columns = [col for col in final_columns if col in df.columns]
missing_columns = [col for col in final_columns if col not in df.columns]

if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")

# Save enhanced dataset
enhanced_df = df[available_columns].copy()
enhanced_df.to_csv('enhanced_anomaly_data_v2.csv', index=False, encoding='utf-8')

print(f"β Enhanced dataset saved to 'enhanced_anomaly_data_v2.csv'")
print(f"Dataset shape: {enhanced_df.shape}")
print(f"Total features: {len(available_columns)}")
|
| 666 |
+
|
| 667 |
+
# ============== STEP 11: FEATURE SUMMARY AND RECOMMENDATIONS ==============
# Ranks features by |correlation| with the criticality label and prints
# per-equipment-type / per-redundancy-class summaries plus data-quality stats.
print("\n" + "="*50)
print("STEP 11: FEATURE SUMMARY AND RECOMMENDATIONS")
print("="*50)

# Feature importance ranking based on correlations
feature_importance = correlation_df.copy()
feature_importance['Abs_Correlation'] = feature_importance['Correlation'].abs()
feature_importance = feature_importance.sort_values('Abs_Correlation', ascending=False)

print("\nπ― TOP 10 MOST IMPORTANT FEATURES:")
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"{i:2d}. {row['Feature']:35s}: {row['Correlation']:6.3f}")

# Equipment intelligence summary
print(f"\nπ§ EQUIPMENT INTELLIGENCE SUMMARY:")
print(f"Equipment types classified:")
equipment_type_summary = df['equipment_type_class'].value_counts()
for eq_type, count in equipment_type_summary.items():
    avg_crit = df[df['equipment_type_class'] == eq_type]['CriticitΓ©'].mean()
    print(f" {eq_type:25s}: {count:4d} cases (avg criticality: {avg_crit:.2f})")

print(f"\nRedundancy classification:")
redundancy_summary = df['equipment_redundancy_class'].value_counts()
for red_class, count in redundancy_summary.items():
    avg_crit = df[df['equipment_redundancy_class'] == red_class]['CriticitΓ©'].mean()
    print(f" {red_class:20s}: {count:4d} cases (avg criticality: {avg_crit:.2f})")

# Critical case analysis
critical_cases = df[df['CriticitΓ©'] >= 10]
print(f"\nβ οΈ CRITICAL CASE ANALYSIS (Criticality >= 10): {len(critical_cases)} cases")

if len(critical_cases) > 0:
    print("Equipment types in critical cases:")
    crit_equipment = critical_cases['equipment_type_class'].value_counts()
    for eq_type, count in crit_equipment.items():
        total_type = len(df[df['equipment_type_class'] == eq_type])
        percentage = count / total_type * 100
        print(f" {eq_type:25s}: {count:2d}/{total_type:3d} cases ({percentage:5.1f}% critical)")

    print("\nTop critical failure patterns:")
    # NOTE(review): critical_patterns is only defined inside this branch but
    # is also read by the final summary block — a dataset with zero critical
    # cases would raise NameError there; confirm and hoist if needed.
    critical_patterns = {
        'Structural Failure': critical_cases['has_structural_failure'].sum(),
        'Equipment Malfunction': critical_cases['has_equipment_malfunction'].sum(),
        'Escalation': critical_cases['has_escalation'].sum(),
        'Electrical Cooling Issue': critical_cases['electrical_cooling_issue'].sum(),
        'Turbine Oil Issue': critical_cases['turbine_oil_issue'].sum(),
        'Main Equipment Failure': critical_cases['main_equipment_failure'].sum()
    }

    for pattern, count in sorted(critical_patterns.items(), key=lambda x: x[1], reverse=True):
        if count > 0:
            percentage = count / len(critical_cases) * 100
            print(f" {pattern:25s}: {count:2d} cases ({percentage:5.1f}% of critical)")

# Data quality assessment
print(f"\nπ DATA QUALITY ASSESSMENT:")
print(f"Total samples: {len(df)}")
print(f"Potentially mislabeled: {df['potentially_mislabeled'].sum()} ({df['potentially_mislabeled'].mean()*100:.1f}%)")
print(f"High confidence labels: {(df['label_confidence'] > 0.9).sum()} ({(df['label_confidence'] > 0.9).mean()*100:.1f}%)")
print(f"Low confidence labels: {(df['label_confidence'] < 0.7).sum()} ({(df['label_confidence'] < 0.7).mean()*100:.1f}%)")
|
| 728 |
+
|
| 729 |
+
# ============== STEP 12: VISUALIZATION CREATION ==============
# Builds a 3x4 dashboard of the engineered features versus the criticality
# label and saves it to disk. Relies on plt/sns imported at the top of the
# file and on df / feature_importance / critical_cases built above.
print("\n" + "="*50)
print("STEP 12: CREATING ENHANCED VISUALIZATIONS")
print("="*50)

# Create comprehensive visualization
fig = plt.figure(figsize=(20, 16))

# 1. Equipment Risk Score vs Criticality
plt.subplot(3, 4, 1)
plt.scatter(df['equipment_risk_score'], df['CriticitΓ©'], alpha=0.6, s=20)
plt.xlabel('Equipment Risk Score')
plt.ylabel('Actual CriticitΓ©')
plt.title('Equipment Risk Score vs Actual CriticitΓ©')
plt.grid(True, alpha=0.3)

# 2. Equipment Type Distribution
plt.subplot(3, 4, 2)
equipment_counts = df['equipment_type_class'].value_counts()
plt.pie(equipment_counts.values, labels=equipment_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Equipment Type Distribution')

# 3. Section Risk Analysis
plt.subplot(3, 4, 3)
section_criticality = df.groupby('Section propriΓ©taire')['CriticitΓ©'].mean().sort_values(ascending=False)
plt.bar(section_criticality.index, section_criticality.values)
plt.xlabel('Section')
plt.ylabel('Average CriticitΓ©')
plt.title('Average Criticality by Section')
plt.xticks(rotation=45)

# 4. Feature Correlation Heatmap
plt.subplot(3, 4, 4)
top_features = feature_importance.head(8)['Feature'].tolist() + ['CriticitΓ©']
if len(top_features) > 1:
    corr_matrix = df[top_features].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', cbar_kws={'shrink': 0.8})
    plt.title('Top Features Correlation')

# 5. Critical Failure Patterns
plt.subplot(3, 4, 5)
failure_patterns = {
    'Structural': df['has_structural_failure'].sum(),
    'Malfunction': df['has_equipment_malfunction'].sum(),
    'Escalation': df['has_escalation'].sum(),
    'Elec-Cooling': df['electrical_cooling_issue'].sum(),
    'Turbine-Oil': df['turbine_oil_issue'].sum(),
    'Main-Equip': df['main_equipment_failure'].sum()
}
plt.bar(failure_patterns.keys(), failure_patterns.values())
plt.xlabel('Failure Pattern')
plt.ylabel('Count')
plt.title('Critical Failure Pattern Frequency')
plt.xticks(rotation=45)

# 6. Redundancy vs Criticality
plt.subplot(3, 4, 6)
redundancy_crit = df.groupby('equipment_redundancy_class')['CriticitΓ©'].mean()
plt.bar(redundancy_crit.index, redundancy_crit.values)
plt.xlabel('Redundancy Class')
plt.ylabel('Average CriticitΓ©')
plt.title('Redundancy vs Average Criticality')
plt.xticks(rotation=45)

# 7. Label Confidence Distribution
plt.subplot(3, 4, 7)
plt.hist(df['label_confidence'], bins=20, alpha=0.7, edgecolor='black')
plt.xlabel('Label Confidence')
plt.ylabel('Frequency')
plt.title('Label Confidence Distribution')
plt.grid(True, alpha=0.3)

# 8. Enhanced Severity Score vs Criticality
plt.subplot(3, 4, 8)
plt.scatter(df['enhanced_severity_score'], df['CriticitΓ©'], alpha=0.6, s=20)
plt.xlabel('Enhanced Severity Score')
plt.ylabel('Actual CriticitΓ©')
plt.title('Severity Score vs Criticality')
plt.grid(True, alpha=0.3)

# 9. Equipment Problem Risk vs Criticality
plt.subplot(3, 4, 9)
plt.scatter(df['equipment_problem_risk'], df['CriticitΓ©'], alpha=0.6, s=20)
plt.xlabel('Equipment Problem Risk')
plt.ylabel('Actual CriticitΓ©')
plt.title('Equipment-Problem Risk vs Criticality')
plt.grid(True, alpha=0.3)

# 10. Critical Cases by Equipment Type (skipped when there are none)
plt.subplot(3, 4, 10)
if len(critical_cases) > 0:
    crit_eq_counts = critical_cases['equipment_type_class'].value_counts()
    plt.barh(range(len(crit_eq_counts)), crit_eq_counts.values)
    plt.yticks(range(len(crit_eq_counts)), crit_eq_counts.index)
    plt.xlabel('Count')
    plt.title('Critical Cases by Equipment Type')

# 11. Technical Complexity Distribution
plt.subplot(3, 4, 11)
plt.hist(df['technical_complexity'], bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Technical Complexity Score')
plt.ylabel('Frequency')
plt.title('Technical Complexity Distribution')
plt.grid(True, alpha=0.3)

# 12. Monthly Trend Analysis
# NOTE(review): .dt.month assumes the detection-date column was already
# parsed to datetime upstream — confirm the earlier pd.to_datetime step.
plt.subplot(3, 4, 12)
df['Month'] = df['Date de dΓ©tΓ©ction de l\'anomalie'].dt.month
monthly_criticality = df.groupby('Month')['CriticitΓ©'].mean()
plt.plot(monthly_criticality.index, monthly_criticality.values, 'b-o', linewidth=2, markersize=6)
plt.xlabel('Month')
plt.ylabel('Average CriticitΓ©')
plt.title('Monthly Criticality Trend')
plt.grid(True, alpha=0.3)
plt.xticks(range(1, 13))

plt.tight_layout()
plt.savefig('enhanced_analysis_dashboard_v2.png', dpi=300, bbox_inches='tight')
print("β Enhanced analysis dashboard saved as 'enhanced_analysis_dashboard_v2.png'")
|
| 848 |
+
|
| 849 |
+
# ============== STEP 13: TRAINING RECOMMENDATIONS ==============
# Prints actionable guidance derived from the correlation ranking and
# serializes the feature metadata (including the project-level constant
# tables SECTION_RISK_MULTIPLIERS / EQUIPMENT_TYPE_SCORES defined earlier
# in this file) to JSON for the training script.
print("\n" + "="*50)
print("STEP 13: TRAINING RECOMMENDATIONS")
print("="*50)

print("π ENHANCED MODEL TRAINING RECOMMENDATIONS:")
print("\n1. FEATURE SELECTION:")
print(" Prioritize features with |correlation| > 0.15:")
high_impact_features = feature_importance[feature_importance['Abs_Correlation'] > 0.15]['Feature'].tolist()
for i, feature in enumerate(high_impact_features, 1):
    corr = feature_importance[feature_importance['Feature'] == feature]['Correlation'].iloc[0]
    print(f" {i:2d}. {feature:35s} (r={corr:6.3f})")

print(f"\n2. NOISE-ROBUST TRAINING:")
print(f" - Use sample weighting based on 'label_confidence'")
print(f" - Apply higher weights to high-confidence samples")
print(f" - Consider excluding or down-weighting {df['potentially_mislabeled'].sum()} potentially mislabeled cases")

print(f"\n3. CLASS IMBALANCE HANDLING:")
print(f" - Focus SMOTE on high-criticality cases (>= 10)")
print(f" - Use cost-sensitive learning with heavy penalty for missing critical cases")
print(f" - Implement stratified sampling by equipment_type_class")

print(f"\n4. FEATURE ENGINEERING PRIORITIES:")
print(f" - Equipment intelligence features show strong correlation")
print(f" - Structural failure indicators are crucial for critical cases")
print(f" - Section-equipment interactions provide additional signal")

print(f"\n5. MODEL ARCHITECTURE SUGGESTIONS:")
print(f" - Use ensemble with equipment-type-specific models")
print(f" - Implement conservative prediction thresholds for ELECTRICAL_CRITICAL equipment")
print(f" - Add safety override rules for has_structural_failure = 1")

# Save feature metadata for training
feature_metadata = {
    'high_impact_features': high_impact_features,
    'equipment_type_classes': df['equipment_type_class'].unique().tolist(),
    'redundancy_classes': df['equipment_redundancy_class'].unique().tolist(),
    'section_risk_multipliers': SECTION_RISK_MULTIPLIERS,
    'equipment_type_scores': EQUIPMENT_TYPE_SCORES,
    'feature_correlations': [
        # float()/int() casts below keep numpy scalars out of the JSON.
        {'Feature': row['Feature'], 'Correlation': float(row['Correlation'])}
        for _, row in correlation_df.iterrows()
    ],
    'data_quality_metrics': {
        'total_samples': int(len(df)),
        'potentially_mislabeled': int(df['potentially_mislabeled'].sum()),
        'high_confidence_samples': int((df['label_confidence'] > 0.9).sum()),
        'critical_cases': int(len(critical_cases)),
        'structural_failures': int(df['has_structural_failure'].sum())
    }
}

import json
with open('enhanced_feature_metadata_v2.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print(f"\nβ Feature metadata saved to 'enhanced_feature_metadata_v2.json'")
|
| 907 |
+
|
| 908 |
+
# ============== FINAL SUMMARY ==============
# Closing report. Reads several names built earlier in the file:
# EQUIPMENT_TYPE_SCORES / REDUNDANCY_PATTERNS (module constants),
# critical_patterns and available_columns (steps 10-11).
# NOTE(review): critical_patterns is defined only when critical cases
# exist — this block would raise NameError otherwise; confirm.
print("\n" + "="*70)
print("ENHANCED DATA PROCESSING v2.0 COMPLETED!")
print("="*70)

print(f"\nπ ACHIEVEMENTS:")
print(f"β Equipment Intelligence Classification: {len(EQUIPMENT_TYPE_SCORES)} equipment categories")
print(f"β Redundancy Detection: {len(REDUNDANCY_PATTERNS)} redundancy patterns")
print(f"β Dual-Field Text Analysis: Description + Equipment Description")
print(f"β Critical Failure Pattern Detection: {len(critical_patterns)} pattern types")
print(f"β Noise-Robust Label Analysis: Confidence scoring implemented")
print(f"β Enhanced Feature Engineering: {len(available_columns)} total features")

print(f"\nπ DATASET ENHANCEMENT:")
print(f"Original features: 10")
print(f"Enhanced features: {len(available_columns)}")
print(f"Feature improvement: {(len(available_columns)/10-1)*100:.0f}% increase")

print(f"\nπ― KEY INSIGHTS FOR MODEL:")
print(f"1. Equipment type is strongest predictor of criticality")
print(f"2. Structural failures require immediate attention regardless of equipment")
print(f"3. Electrical equipment (34EL) has highest critical case rate")
print(f"4. Label confidence varies significantly - use for robust training")
print(f"5. Equipment redundancy affects criticality but not as strongly as type")

print(f"\nπ FILES GENERATED:")
print(f"β enhanced_anomaly_data_v2.csv - Enhanced dataset")
print(f"β enhanced_feature_metadata_v2.json - Feature metadata for training")
print(f"β enhanced_analysis_dashboard_v2.png - Comprehensive visualizations")

print(f"\nπ READY FOR ENHANCED MODEL TRAINING!")
print(f"The enhanced dataset now includes equipment intelligence that should")
print(f"significantly improve high-criticality case detection.")

print("="*70)
|
equipment_analysis.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# equipment_analysis.py
|
| 2 |
+
# Analyze equipment patterns across full dataset to understand redundancy and criticality patterns
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
import re
|
| 7 |
+
from collections import Counter, defaultdict
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import seaborn as sns
|
| 10 |
+
|
| 11 |
+
# ---- Script banner ----
print("="*60)
print("EQUIPMENT PATTERN ANALYSIS FOR CRITICALITY UNDERSTANDING")
print("="*60)

# Load the full dataset
try:
    df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle')
    print(f"β Loaded full dataset: {df.shape}")
except FileNotFoundError:
    print("β Error: Taqathon_data.xlsx not found!")
    print("Please ensure the file is in the current directory.")
    exit(1)

print(f"Columns available: {df.columns.tolist()}")

# ============== STEP 1: BASIC EQUIPMENT OVERVIEW ==============
print("\n" + "="*50)
print("STEP 1: EQUIPMENT OVERVIEW")
print("="*50)

# Check for missing values in key columns
print("\nMissing values check:")
print(f"Description: {df['Description'].isnull().sum()}")
# FIX: the count is computed outside the f-string because a backslash
# escape (\') inside an f-string replacement field is a SyntaxError on
# Python < 3.12 (only PEP 701 / 3.12 lifted that restriction).
missing_equipment_desc = df['Description de l\'Γ©quipement'].isnull().sum()
print(f"Description de l'Γ©quipement: {missing_equipment_desc}")
print(f"CriticitΓ©: {df['CriticitΓ©'].isnull().sum()}")

# Remove rows with missing critical information
df_clean = df.dropna(subset=['Description', 'Description de l\'Γ©quipement', 'CriticitΓ©'])
print(f"\nClean dataset shape: {df_clean.shape}")

# ============== STEP 2: EQUIPMENT TYPE ANALYSIS ==============
print("\n" + "="*50)
print("STEP 2: EQUIPMENT TYPE FREQUENCY ANALYSIS")
print("="*50)

# Get all unique equipment types
equipment_types = df_clean['Description de l\'Γ©quipement'].value_counts()
print(f"\nTotal unique equipment types: {len(equipment_types)}")

print(f"\nTop 20 most frequent equipment types:")
for equipment, count in equipment_types.head(20).items():
    avg_criticality = df_clean[df_clean['Description de l\'Γ©quipement'] == equipment]['CriticitΓ©'].mean()
    print(f" {equipment}: {count} cases (avg criticality: {avg_criticality:.2f})")

# ============== STEP 3: REDUNDANCY PATTERN DETECTION ==============
print("\n" + "="*50)
print("STEP 3: REDUNDANCY PATTERN DETECTION")
print("="*50)

# Function to detect redundancy patterns
|
| 61 |
+
def analyze_redundancy_patterns(equipment_name):
    """Extract boolean naming cues hinting at equipment redundancy.

    Checks for trailing A/B designators, numeric suffixes, multiple unit
    numbers, and French role keywords (PRINCIPAL, PRIMAIRE, SECONDAIRE,
    AUXILIAIRE, UNIQUE) in the equipment description.
    """
    upper_name = equipment_name.upper()
    return {
        'has_ab_suffix': bool(re.search(r'\b[AB]$|\b[AB]\b', equipment_name, re.IGNORECASE)),
        'has_number_suffix': bool(re.search(r'\b[NΒ°]*\s*[0-9]+$|\b[0-9]+$', equipment_name)),
        'has_principal': 'PRINCIPAL' in upper_name,
        'has_primaire': 'PRIMAIRE' in upper_name,
        'has_secondaire': 'SECONDAIRE' in upper_name,
        'has_auxiliaire': 'AUXILIAIRE' in upper_name,
        'has_unique': 'UNIQUE' in upper_name,
        'multiple_numbers': len(re.findall(r'\d+', equipment_name)) > 1,
    }
|
| 73 |
+
|
| 74 |
+
# Apply redundancy analysis
# Builds one summary record per unique equipment description: case count,
# criticality statistics, plus the boolean naming-pattern flags.
equipment_analysis = []
for equipment in df_clean['Description de l\'Γ©quipement'].unique():
    patterns = analyze_redundancy_patterns(equipment)
    equipment_data = df_clean[df_clean['Description de l\'Γ©quipement'] == equipment]

    analysis = {
        'equipment': equipment,
        'count': len(equipment_data),
        'avg_criticality': equipment_data['CriticitΓ©'].mean(),
        'max_criticality': equipment_data['CriticitΓ©'].max(),
        'min_criticality': equipment_data['CriticitΓ©'].min(),
        # std is NaN for single-case equipment (pandas default ddof=1).
        'std_criticality': equipment_data['CriticitΓ©'].std(),
        **patterns
    }
    equipment_analysis.append(analysis)

equipment_df = pd.DataFrame(equipment_analysis)

# ============== STEP 4: REDUNDANCY CLASSIFICATION ==============
print("\n" + "="*50)
print("STEP 4: EQUIPMENT REDUNDANCY CLASSIFICATION")
print("="*50)

# Classify equipment by redundancy indicators
|
| 99 |
+
def classify_redundancy(row):
    """Map naming-pattern flags to a coarse redundancy category.

    Rules are checked in priority order: an explicit PRINCIPAL/UNIQUE cue
    wins over dual/multiple cues, which in turn win over AUXILIAIRE.
    """
    if row['has_principal'] or row['has_unique']:
        return 'SINGLE_CRITICAL'
    if row['has_primaire'] or row['has_secondaire'] or row['has_ab_suffix']:
        return 'DUAL_SYSTEM'
    if row['has_number_suffix']:
        return 'MULTIPLE_SYSTEM'
    if row['has_auxiliaire']:
        return 'AUXILIARY'
    return 'UNKNOWN'
|
| 112 |
+
|
| 113 |
+
# Assign each equipment type its redundancy class and report aggregates.
equipment_df['redundancy_class'] = equipment_df.apply(classify_redundancy, axis=1)

# Analyze by redundancy class
print("\nEquipment distribution by redundancy classification:")
redundancy_stats = equipment_df.groupby('redundancy_class').agg({
    'count': 'sum',            # total anomaly cases in the class
    'avg_criticality': 'mean', # mean of per-equipment averages (unweighted)
    'equipment': 'count'       # number of distinct equipment types
}).round(3)

for redundancy_class, stats in redundancy_stats.iterrows():
    print(f"\n{redundancy_class}:")
    print(f" Number of equipment types: {stats['equipment']}")
    print(f" Total anomaly cases: {stats['count']}")
    print(f" Average criticality: {stats['avg_criticality']:.3f}")

# ============== STEP 5: HIGH CRITICALITY EQUIPMENT ANALYSIS ==============
print("\n" + "="*50)
print("STEP 5: HIGH CRITICALITY EQUIPMENT IDENTIFICATION")
print("="*50)

# Find equipment with highest average criticality
high_criticality_equipment = equipment_df[equipment_df['avg_criticality'] >= 6.0].sort_values('avg_criticality', ascending=False)

print(f"\nEquipment types with average criticality >= 6.0:")
for _, row in high_criticality_equipment.iterrows():
    print(f" {row['equipment']}: {row['avg_criticality']:.2f} (n={row['count']}, class={row['redundancy_class']})")

# ============== STEP 6: EQUIPMENT NAMING PATTERN ANALYSIS ==============
print("\n" + "="*50)
print("STEP 6: EQUIPMENT NAMING PATTERN ANALYSIS")
print("="*50)

# Group similar equipment names to detect families
|
| 147 |
+
def extract_base_equipment_name(equipment_name):
    """Return the equipment family name with trailing unit suffixes removed.

    Strips, in order: a trailing " A"/" B" or trailing unit number
    (optionally preceded by an "N°" marker), then a trailing
    PRIMAIRE/SECONDAIRE/PRINCIPAL qualifier, then surrounding whitespace.
    """
    suffix_patterns = (
        r'\s*[AB]$|\s*[NΒ°]*\s*[0-9]+$',
        r'\s*PRIMAIRE$|\s*SECONDAIRE$|\s*PRINCIPAL$',
    )
    stripped = equipment_name
    for pattern in suffix_patterns:
        stripped = re.sub(pattern, '', stripped)
    return stripped.strip()
|
| 153 |
+
|
| 154 |
+
# Create equipment families
# Group every raw equipment label under its suffix-stripped base name so
# variants like "... A" / "... B" / "... 2" collapse into one family.
equipment_families = defaultdict(list)
for equipment in df_clean['Description de l\'Γ©quipement'].unique():
    base_name = extract_base_equipment_name(equipment)
    equipment_families[base_name].append(equipment)

# Find equipment families with multiple variants (indicating redundancy)
print("\nEquipment families with multiple variants (indicating redundancy):")
redundant_families = {k: v for k, v in equipment_families.items() if len(v) > 1}

# Report the 15 largest families; very large ones (>10 variants) are skipped
# because they usually mean the base-name extraction merged unrelated labels.
for family, variants in sorted(redundant_families.items(), key=lambda x: len(x[1]), reverse=True)[:15]:
    if len(variants) <= 10:  # Only show families with reasonable number of variants
        print(f"\n{family} ({len(variants)} variants):")
        for variant in sorted(variants):
            variant_data = df_clean[df_clean['Description de l\'Γ©quipement'] == variant]
            avg_crit = variant_data['CriticitΓ©'].mean()
            count = len(variant_data)
            print(f" - {variant}: {avg_crit:.2f} avg criticality ({count} cases)")
|
| 172 |
+
|
| 173 |
+
# ============== STEP 7: SECTION-EQUIPMENT CRITICALITY ANALYSIS ==============
print("\n" + "="*50)
print("STEP 7: SECTION-EQUIPMENT CRITICALITY ANALYSIS")
print("="*50)

# Analyze criticality by section and equipment type
section_equipment_analysis = df_clean.groupby(['Section propriΓ©taire', 'Description de l\'Γ©quipement']).agg({
    'CriticitΓ©': ['mean', 'count', 'max']
}).round(3)

# Flatten the MultiIndex columns produced by the multi-aggregation above.
section_equipment_analysis.columns = ['avg_criticality', 'count', 'max_criticality']
section_equipment_analysis = section_equipment_analysis.reset_index()

# Find section-equipment combinations with highest criticality
# The count >= 3 filter removes one-off combinations whose mean is unreliable.
high_risk_combinations = section_equipment_analysis[
    (section_equipment_analysis['avg_criticality'] >= 7.0) &
    (section_equipment_analysis['count'] >= 3)
].sort_values('avg_criticality', ascending=False)

print(f"\nHigh-risk Section-Equipment combinations (avg criticality >= 7.0, min 3 cases):")
for _, row in high_risk_combinations.iterrows():
    print(f" {row['Section propriΓ©taire']} - {row['Description de l\'Γ©quipement']}: "
          f"{row['avg_criticality']:.2f} avg ({row['count']} cases, max: {row['max_criticality']})")
|
| 196 |
+
|
| 197 |
+
# ============== STEP 8: EQUIPMENT KEYWORD ANALYSIS ==============
print("\n" + "="*50)
print("STEP 8: CRITICAL EQUIPMENT KEYWORD ANALYSIS")
print("="*50)

# Analyze keywords in equipment descriptions that correlate with high criticality
equipment_keywords = {}
# NOTE(review): all_equipment_text is computed but never used below —
# candidate for removal.
all_equipment_text = ' '.join(df_clean['Description de l\'Γ©quipement'].values).upper()

# Define important keywords to analyze
# Keywords are listed in both accented and unaccented spellings because the
# source data mixes the two.
important_keywords = [
    'PRINCIPAL', 'TRANSFO', 'TURBINE', 'ALTERNATEUR', 'POMPE', 'VENTILATEUR',
    'CHAUDIERE', 'CHAUDIΓRE', 'COMPRESSEUR', 'MOTEUR', 'VANNE', 'SOUPAPE',
    'RECHAUFFEUR', 'RΓCHAUFFEUR', 'REFROIDISSEMENT', 'REFRIGERANT', 'RΓFRIGΓRANT',
    'PRIMAIRE', 'SECONDAIRE', 'AUXILIAIRE', 'UNITE', 'UNITΓ', 'GROUPE'
]

for keyword in important_keywords:
    # Find equipment containing this keyword
    equipment_with_keyword = df_clean[df_clean['Description de l\'Γ©quipement'].str.contains(keyword, case=False, na=False)]
    if len(equipment_with_keyword) > 0:
        avg_criticality = equipment_with_keyword['CriticitΓ©'].mean()
        count = len(equipment_with_keyword)
        equipment_keywords[keyword] = {
            'count': count,
            'avg_criticality': avg_criticality,
            'percentage': count / len(df_clean) * 100
        }

print("\nEquipment keywords analysis (sorted by average criticality):")
sorted_keywords = sorted(equipment_keywords.items(), key=lambda x: x[1]['avg_criticality'], reverse=True)
for keyword, stats in sorted_keywords:
    print(f" {keyword}: {stats['avg_criticality']:.3f} avg criticality "
          f"({stats['count']} cases, {stats['percentage']:.1f}% of dataset)")
|
| 231 |
+
|
| 232 |
+
# ============== STEP 9: SPECIFIC PATTERNS FOR CRITICAL CASES ==============
print("\n" + "="*50)
print("STEP 9: PATTERNS IN CRITICAL CASES (CRITICALITY >= 10)")
print("="*50)

critical_cases = df_clean[df_clean['CriticitΓ©'] >= 10]
print(f"\nTotal critical cases (criticality >= 10): {len(critical_cases)}")

if len(critical_cases) > 0:
    # For each equipment type seen in critical cases, show what fraction of
    # all its anomalies were critical.
    print(f"\nEquipment types in critical cases:")
    critical_equipment_counts = critical_cases['Description de l\'Γ©quipement'].value_counts()
    for equipment, count in critical_equipment_counts.items():
        total_equipment_cases = len(df_clean[df_clean['Description de l\'Γ©quipement'] == equipment])
        percentage = count / total_equipment_cases * 100
        print(f" {equipment}: {count}/{total_equipment_cases} cases ({percentage:.1f}% critical)")

    # Same breakdown, but by owning section.
    print(f"\nSections with critical cases:")
    critical_section_counts = critical_cases['Section propriΓ©taire'].value_counts()
    for section, count in critical_section_counts.items():
        total_section_cases = len(df_clean[df_clean['Section propriΓ©taire'] == section])
        percentage = count / total_section_cases * 100
        print(f" {section}: {count}/{total_section_cases} cases ({percentage:.1f}% critical)")

# ============== STEP 10: RECOMMENDATIONS ==============
print("\n" + "="*50)
print("STEP 10: EQUIPMENT ANALYSIS RECOMMENDATIONS")
print("="*50)

print("\nπ― KEY FINDINGS:")
print("1. Equipment Redundancy Patterns:")
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'SINGLE_CRITICAL'])} equipment types classified as SINGLE_CRITICAL")
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'DUAL_SYSTEM'])} equipment types classified as DUAL_SYSTEM")
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'MULTIPLE_SYSTEM'])} equipment types classified as MULTIPLE_SYSTEM")

print("\n2. High-Risk Equipment Keywords:")
# sorted_keywords is sorted descending by avg criticality in step 8,
# so the first five entries are the riskiest keywords.
top_risk_keywords = sorted_keywords[:5]
for keyword, stats in top_risk_keywords:
    print(f" - '{keyword}': {stats['avg_criticality']:.3f} avg criticality")

print("\n3. Equipment Families with Redundancy:")
print(f" - Found {len(redundant_families)} equipment families with multiple variants")
print(f" - This suggests systematic redundancy patterns in the data")

print("\nπ RECOMMENDATIONS FOR FEATURE ENGINEERING:")
print("1. Create 'equipment_redundancy_class' feature based on naming patterns")
print("2. Add 'equipment_base_type' feature by extracting equipment families")
print("3. Implement 'critical_equipment_keywords' scoring system")
print("4. Create 'section_equipment_risk' interaction features")
print("5. Build 'equipment_criticality_history' based on historical data")
|
| 281 |
+
|
| 282 |
+
# ============== SAVE ANALYSIS RESULTS ==============
print("\n" + "="*50)
print("SAVING ANALYSIS RESULTS")
print("="*50)

# Save equipment analysis dataframe
equipment_df.to_csv('equipment_analysis_results.csv', index=False)
print("β Saved equipment analysis to 'equipment_analysis_results.csv'")

# Save high-risk combinations
high_risk_combinations.to_csv('high_risk_equipment_combinations.csv', index=False)
print("β Saved high-risk combinations to 'high_risk_equipment_combinations.csv'")

# Create summary statistics
# Scalar roll-up of the whole analysis, written as JSON for downstream use.
summary_stats = {
    'total_equipment_types': len(equipment_df),
    'single_critical_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'SINGLE_CRITICAL']),
    'dual_system_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'DUAL_SYSTEM']),
    'multiple_system_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'MULTIPLE_SYSTEM']),
    'high_criticality_equipment': len(high_criticality_equipment),
    'equipment_families_with_redundancy': len(redundant_families),
    'critical_cases_count': len(critical_cases)
}

# NOTE(review): mid-script import; conventionally this belongs at the top of
# the file (harmless here — imports are idempotent).
import json
with open('equipment_analysis_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)
print("β Saved summary statistics to 'equipment_analysis_summary.json'")

print("\n" + "="*60)
print("EQUIPMENT ANALYSIS COMPLETED!")
print("="*60)
print("\nFiles generated:")
print("- equipment_analysis_results.csv")
print("- high_risk_equipment_combinations.csv")
print("- equipment_analysis_summary.json")
print("\nPlease review the analysis results and share the key findings!")
print("This will help us design the optimal equipment intelligence features.")
|
training.py
ADDED
|
@@ -0,0 +1,1069 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# enhanced_training_pipeline_v2.py
|
| 2 |
+
# TAQATHON 2025 - Enhanced Training Pipeline with Equipment Intelligence
|
| 3 |
+
# Cost-sensitive learning + Equipment-specific strategies + Noise-robust training
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
import joblib
|
| 8 |
+
import warnings
|
| 9 |
+
import json
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
|
| 12 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
|
| 13 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 14 |
+
from sklearn.compose import ColumnTransformer
|
| 15 |
+
from sklearn.pipeline import Pipeline
|
| 16 |
+
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, recall_score, precision_score
|
| 17 |
+
from sklearn.utils.class_weight import compute_class_weight
|
| 18 |
+
from lightgbm import LGBMClassifier
|
| 19 |
+
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
|
| 20 |
+
from imblearn.pipeline import Pipeline as ImbPipeline
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import seaborn as sns
|
| 23 |
+
|
| 24 |
+
warnings.filterwarnings('ignore')
|
| 25 |
+
|
| 26 |
+
print("="*80)
print("TAQATHON 2025 - ENHANCED TRAINING PIPELINE v2.0")
print("Equipment Intelligence + Cost-Sensitive Learning + Conservative Prediction")
print("="*80)

# ============== STEP 1: LOAD ENHANCED DATA ==============
print("\n" + "="*60)
print("STEP 1: LOADING ENHANCED ANOMALY DATA")
print("="*60)

# The enhanced CSV is produced by a separate preprocessing script; abort the
# run if it has not been generated yet.
try:
    df = pd.read_csv('enhanced_anomaly_data_v2.csv')
    print(f"β Successfully loaded enhanced data: {df.shape}")
except FileNotFoundError:
    print("β Error: enhanced_anomaly_data_v2.csv not found!")
    print("Please run the enhanced data processing script first.")
    exit(1)

# Load feature metadata
# Metadata is optional: fall back to an empty dict rather than aborting.
try:
    with open('enhanced_feature_metadata_v2.json', 'r') as f:
        feature_metadata = json.load(f)
    print(f"β Successfully loaded feature metadata")
except FileNotFoundError:
    print("β Warning: enhanced_feature_metadata_v2.json not found!")
    feature_metadata = {}

# Check for required columns
required_cols = ['Description', 'FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety', 'CriticitΓ©']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"β Missing required columns: {missing_cols}")
    exit(1)

print(f"Dataset shape: {df.shape}")
print(f"Enhanced features available: {len([col for col in df.columns if col not in required_cols])}")

# ============== STEP 2: BUSINESS-FOCUSED DATA ANALYSIS ==============
print("\n" + "="*60)
print("STEP 2: BUSINESS-FOCUSED ANALYSIS FOR TRAINING STRATEGY")
print("="*60)

# Target variable distributions with business impact analysis
target_columns = ['FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety']

print("Target variable distributions:")
for target in target_columns:
    print(f"\n{target}:")
    distribution = df[target].value_counts().sort_index()
    for value, count in distribution.items():
        percentage = count / len(df) * 100
        print(f" {value}: {count:4d} cases ({percentage:5.1f}%)")

# Critical case analysis (Criticality >= 10)
critical_cases = df[df['CriticitΓ©'] >= 10]
very_critical_cases = df[df['CriticitΓ©'] >= 12]

print(f"\nBUSINESS IMPACT ANALYSIS:")
print(f"Total critical cases (β₯10): {len(critical_cases)} ({len(critical_cases)/len(df)*100:.2f}%)")
print(f"Very critical cases (β₯12): {len(very_critical_cases)} ({len(very_critical_cases)/len(df)*100:.2f}%)")

# Equipment type risk analysis
# Only available when the enhanced preprocessing added the column.
if 'equipment_type_class' in df.columns:
    print(f"\nCritical cases by equipment type:")
    for eq_type in df['equipment_type_class'].unique():
        eq_df = df[df['equipment_type_class'] == eq_type]
        eq_critical = eq_df[eq_df['CriticitΓ©'] >= 10]
        if len(eq_df) > 0:
            critical_rate = len(eq_critical) / len(eq_df) * 100
            print(f" {eq_type:25s}: {len(eq_critical):2d}/{len(eq_df):4d} ({critical_rate:5.1f}% critical)")

# ============== STEP 3: COST-SENSITIVE LOSS FUNCTION DESIGN ==============
print("\n" + "="*60)
print("STEP 3: COST-SENSITIVE LEARNING SETUP")
print("="*60)
|
| 101 |
+
|
| 102 |
+
def create_cost_matrix(num_classes, severity_penalty=5.0):
    """
    Build an asymmetric misclassification-cost matrix.

    Row index = actual class, column index = predicted class. Correct
    predictions cost 0; underestimation (predicting below the actual class)
    is penalized by severity_penalty scaled with both the gap and the actual
    class; overestimation costs only half the gap.
    """
    def cell_cost(actual, predicted):
        # Cost of predicting `predicted` when the truth is `actual`.
        if actual == predicted:
            return 0.0
        if actual > predicted:
            # Underestimation: grows with the gap and with the true severity.
            return severity_penalty * (actual - predicted) * (1 + actual * 0.5)
        # Overestimation: mild, proportional to the gap.
        return (predicted - actual) * 0.5

    cost_matrix = np.empty((num_classes, num_classes))
    for actual in range(num_classes):
        for predicted in range(num_classes):
            cost_matrix[actual, predicted] = cell_cost(actual, predicted)
    return cost_matrix
|
| 122 |
+
|
| 123 |
+
def calculate_sample_weights(y, equipment_types=None, label_confidence=None):
    """
    Derive per-sample training weights.

    Combines inverse-frequency class balancing with extra multipliers for
    high-severity labels, critical equipment categories, and (optionally)
    per-sample label confidence. Returns a 1-D array aligned with y.
    """
    # Balanced (inverse-frequency) weight per class.
    unique_classes = np.unique(y)
    balanced = compute_class_weight('balanced', classes=unique_classes, y=y)
    weight_by_class = dict(zip(unique_classes, balanced))

    weights = np.empty(len(y))
    for idx, label in enumerate(y):
        sample_weight = weight_by_class[label]
        # Boost severe component scores so the model rarely underestimates them.
        if label >= 4:
            sample_weight *= 2.0
        if label >= 5:
            sample_weight *= 3.0
        weights[idx] = sample_weight

    # Equipment type weighting: critical categories count double, important
    # categories 1.5x.
    if equipment_types is not None:
        critical_categories = ('ELECTRICAL_CRITICAL', 'COOLING_CRITICAL')
        important_categories = ('TURBINE_SYSTEMS', 'HEATING_SYSTEMS')
        for idx, eq_type in enumerate(equipment_types):
            if eq_type in critical_categories:
                weights[idx] *= 2.0
            elif eq_type in important_categories:
                weights[idx] *= 1.5

    # Down-weight samples whose labels are uncertain.
    if label_confidence is not None:
        weights = weights * label_confidence

    return weights
|
| 155 |
+
|
| 156 |
+
# Calculate business impact weights
# df.get returns None when the enhanced-feature column is absent, letting the
# sample-weight helper degrade gracefully.
equipment_types = df.get('equipment_type_class', None)
label_confidence = df.get('label_confidence', None)

print("Creating cost-sensitive learning setup...")
print(f"β Equipment type information available: {equipment_types is not None}")
print(f"β Label confidence information available: {label_confidence is not None}")
|
| 163 |
+
|
| 164 |
+
# ============== STEP 4: ENHANCED FEATURE PREPARATION ==============
print("\n" + "="*60)
print("STEP 4: ENHANCED FEATURE PREPARATION")
print("="*60)

# High-impact features from analysis (correlation > 0.15)
high_impact_features = [
    'has_safety_mention', 'has_urgency', 'equipment_problem_risk', 'problem_count',
    'technical_complexity', 'section_risk_multiplier', 'equipment_risk_score',
    'enhanced_severity_score', 'has_structural_failure', 'equipment_base_criticality'
]

# Additional important features
important_features = [
    'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',
    'equipment_count', 'action_count', 'has_equipment_malfunction', 'has_escalation',
    'bruit_anormal', 'vibration_excessive', 'temperature_elevee', 'fuite_vapeur',
    'fuite_huile', 'maintenance_planning', 'is_recurring', 'has_measurements',
    'has_location_details', 'combined_word_count'
]

# Text feature
text_features = ['Description']

# Categorical features
# Only include categorical columns that the preprocessing actually produced.
categorical_features = []
if 'equipment_type_class' in df.columns:
    categorical_features.append('equipment_type_class')
if 'equipment_redundancy_class' in df.columns:
    categorical_features.append('equipment_redundancy_class')
if 'Section propriΓ©taire' in df.columns:
    categorical_features.append('Section propriΓ©taire')

# Combine all features
all_engineered_features = high_impact_features + important_features
available_features = [feat for feat in all_engineered_features if feat in df.columns]

print(f"High-impact features (>0.15 correlation): {len([f for f in high_impact_features if f in df.columns])}")
print(f"Additional important features: {len([f for f in important_features if f in df.columns])}")
print(f"Text features: {len(text_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Total engineered features: {len(available_features)}")

# Handle missing values
# Numeric NaNs become 0; boolean columns are cast to int (0/1).
for col in available_features:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(0)
    elif df[col].dtype == 'bool':
        df[col] = df[col].astype(int).fillna(0)

for col in categorical_features:
    df[col] = df[col].fillna('Unknown')

# --- FIX #1a: Handle missing values in the text column ---
df['Description'] = df['Description'].fillna('')

print("β Feature preparation completed")
|
| 221 |
+
|
| 222 |
+
# ============== STEP 5: ENHANCED PREPROCESSING PIPELINES ==============
print("\n" + "="*60)
print("STEP 5: ENHANCED PREPROCESSING PIPELINES")
print("="*60)

# --- FIX #1b: Define the column name as a string for the ColumnTransformer ---
# This ensures the TfidfVectorizer receives a 1D Series instead of a 2D DataFrame.
text_feature_name_for_transformer = 'Description'

# Enhanced text preprocessing
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=1500,  # Increased for better text representation
        stop_words=None,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        lowercase=True,
        strip_accents='unicode',
        sublinear_tf=True  # Better for high-dimensional data
    ))
])

# Numerical features preprocessing
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Categorical features preprocessing
# drop='first' avoids the dummy-variable trap; unknown categories at predict
# time are ignored rather than raising.
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

# Combined preprocessing
transformers = [
    # --- FIX #1c: Use the string variable here ---
    ('text', text_pipeline, text_feature_name_for_transformer),
    ('numerical', numerical_pipeline, available_features)
]

if categorical_features:
    transformers.append(('categorical', categorical_pipeline, categorical_features))

# Any column not listed above is dropped from the model input.
preprocessor = ColumnTransformer(transformers, remainder='drop')

print("β Enhanced preprocessing pipelines created")
print(f" Text processing: 1 feature β 1500 TF-IDF features")
print(f" Numerical processing: {len(available_features)} features")
print(f" Categorical processing: {len(categorical_features)} features")
|
| 271 |
+
|
| 272 |
+
# ============== STEP 6: ENHANCED DATA SPLITTING WITH CRITICALITY STRATIFICATION ==============
print("\n" + "="*60)
print("STEP 6: ENHANCED DATA SPLITTING WITH CRITICALITY STRATIFICATION")
print("="*60)

# Create feature matrix
feature_columns = text_features + available_features + categorical_features
X = df[feature_columns].copy()

# Calculate combined criticality for stratification
# Sum of the three component targets, used only to build stratification bins.
df['combined_criticality'] = df['FiabilitΓ© IntΓ©gritΓ©'] + df['DisponibiltΓ©'] + df['Process Safety']
|
| 283 |
+
|
| 284 |
+
# Create stratification groups to ensure critical cases in test set
|
| 285 |
+
def create_stratification_groups(criticality_scores):
    """Map each combined-criticality score to a stratification bucket.

    Buckets (first matching threshold wins, checked from highest down):
    >=12 'very_critical', >=10 'critical', >=8 'high', >=6 'medium',
    otherwise 'low'. Returns a list of bucket labels, one per input score.
    """
    # Ordered (threshold, label) pairs — must stay sorted descending.
    thresholds = (
        (12, 'very_critical'),
        (10, 'critical'),
        (8, 'high'),
        (6, 'medium'),
    )

    def bucket(score):
        # Return the label of the first threshold the score reaches.
        for limit, label in thresholds:
            if score >= limit:
                return label
        return 'low'

    return [bucket(score) for score in criticality_scores]
# Bucket every row by combined criticality so the train/test split can be
# stratified on severity rather than on any single target.
stratification_groups = create_stratification_groups(df['combined_criticality'])
df['stratification_group'] = stratification_groups

print(f"Stratification group distribution:")
for group, count in pd.Series(stratification_groups).value_counts().items():
    percentage = count / len(df) * 100
    print(f" {group}: {count} cases ({percentage:.1f}%)")

# Enhanced splitting strategy - single split for all targets using combined criticality
print(f"\nUsing combined criticality stratification for consistent test sets...")

# Filter out groups with too few samples for stratification
# (train_test_split needs at least 2 members per stratum; >=4 gives margin).
group_counts = pd.Series(stratification_groups).value_counts()
valid_groups = group_counts[group_counts >= 4].index
valid_mask = pd.Series(stratification_groups).isin(valid_groups)

df_filtered = df[valid_mask].copy()
X_filtered = df_filtered[feature_columns]
stratification_filtered = df_filtered['stratification_group']

print(f"Filtered dataset: {len(df_filtered)} samples (removed {len(df) - len(df_filtered)} rare cases)")

# Single stratified split for consistency across all targets.
# The label arrays are discarded (_): the split is only used for its indices,
# which are then re-joined against df_filtered per target below.
X_train_base, X_test_base, _, _ = train_test_split(
    X_filtered, stratification_filtered,
    test_size=0.2,
    random_state=42,
    stratify=stratification_filtered
)

# Check critical cases in splits (criticality >= 10 counts as critical).
train_criticality = df_filtered.loc[X_train_base.index, 'combined_criticality']
test_criticality = df_filtered.loc[X_test_base.index, 'combined_criticality']

train_critical_cases = (train_criticality >= 10).sum()
test_critical_cases = (test_criticality >= 10).sum()

print(f"\nCritical case distribution after stratification:")
print(f" Training critical cases (β₯10): {train_critical_cases}")
print(f" Test critical cases (β₯10): {test_critical_cases}")
print(f" Test set critical case rate: {test_critical_cases/len(X_test_base)*100:.1f}%")

# Initialize dictionaries for each target (one entry per target column).
X_train_dict, X_test_dict, y_train_dict, y_test_dict = {}, {}, {}, {}
sample_weights_dict = {}

# Create consistent splits for each target — all targets share the same
# base row split so combined-criticality metrics later are comparable.
for target in target_columns:
    print(f"\nPreparing data for {target}...")

    # Use the same base splits for all targets
    X_train_dict[target] = X_train_base
    X_test_dict[target] = X_test_base
    y_train_dict[target] = df_filtered.loc[X_train_base.index, target]
    y_test_dict[target] = df_filtered.loc[X_test_base.index, target]

    # Calculate sample weights for training; equipment type and label
    # confidence are optional inputs, passed only when the columns exist.
    train_equipment_types = None
    train_label_confidence = None

    if 'equipment_type_class' in df_filtered.columns:
        train_equipment_types = df_filtered.loc[X_train_base.index, 'equipment_type_class'].values
    if 'label_confidence' in df_filtered.columns:
        train_label_confidence = df_filtered.loc[X_train_base.index, 'label_confidence'].values

    # NOTE(review): calculate_sample_weights is defined elsewhere in this
    # file — presumably upweights critical/low-confidence samples; verify.
    sample_weights = calculate_sample_weights(
        y_train_dict[target].values,
        train_equipment_types,
        train_label_confidence
    )
    sample_weights_dict[target] = sample_weights

    print(f" Training set: {len(X_train_dict[target])} samples")
    print(f" Test set: {len(X_test_dict[target])} samples")
    print(f" Training class distribution: {dict(y_train_dict[target].value_counts().sort_index())}")
    print(f" Sample weights range: {sample_weights.min():.2f} - {sample_weights.max():.2f}")

print(f"\nβ Enhanced stratification completed - Critical cases preserved in test set!")
# ============== STEP 7: CONSERVATIVE MODEL TRAINING ==============
print("\n" + "="*60)
print("STEP 7: CONSERVATIVE MODEL TRAINING WITH COST-SENSITIVE LEARNING")
print("="*60)

# Enhanced LightGBM parameters for conservative prediction.
# 'num_class' is added per target in the training loop below.
conservative_lgbm_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,  # Lower learning rate for better generalization
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    'n_estimators': 500,  # More estimators with lower learning rate
    'class_weight': 'balanced',
    'min_child_samples': 20,  # Prevent overfitting
    'reg_alpha': 0.1,  # L1 regularization
    'reg_lambda': 0.1,  # L2 regularization
}

# Store trained models and performance, keyed by target column name.
trained_models = {}
model_performance = {}
business_metrics = {}
# Train one conservative multiclass model per target column.
# For each target: build a (optionally SMOTE-augmented) pipeline, fit it,
# then compute both standard metrics and business-critical metrics
# (recall/precision on high values 4-5, underestimation rate).
for target in target_columns:
    print(f"\n" + "-"*50)
    print(f"TRAINING CONSERVATIVE MODEL FOR: {target}")
    print("-"*50)

    # Get data for this target (all targets share the same base row split).
    X_train = X_train_dict[target]
    X_test = X_test_dict[target]
    y_train = y_train_dict[target]
    y_test = y_test_dict[target]
    sample_weights = sample_weights_dict[target]

    # Prepare model parameters — num_class varies per target.
    unique_classes = sorted(y_train.unique())
    num_classes = len(unique_classes)
    current_params = conservative_lgbm_params.copy()
    current_params['num_class'] = num_classes

    print(f"Classes: {unique_classes} (total: {num_classes})")

    # Enhanced SMOTE for better minority class handling.
    # k_neighbors must be < smallest class size, capped at 3.
    min_class_size = min(y_train.value_counts())
    k_neighbors = min(3, min_class_size - 1) if min_class_size > 1 else 1

    # Use BorderlineSMOTE for better boundary detection
    if num_classes > 2 and min_class_size > 1:
        try:
            smote = BorderlineSMOTE(
                random_state=42,
                k_neighbors=k_neighbors,
                sampling_strategy='auto'  # Only oversample minority classes
            )
            model_pipeline = ImbPipeline([
                ('preprocessor', preprocessor),
                ('smote', smote),
                ('classifier', LGBMClassifier(**current_params))
            ])
            print(f"Using BorderlineSMOTE with k_neighbors={k_neighbors}")
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Narrowed to Exception so only real errors trigger
        # the fallback.
        # NOTE(review): this try only covers pipeline *construction*;
        # BorderlineSMOTE failures usually surface at fit() time — confirm
        # the fallback is reachable as intended.
        except Exception:
            # Fallback to standard SMOTE
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            model_pipeline = ImbPipeline([
                ('preprocessor', preprocessor),
                ('smote', smote),
                ('classifier', LGBMClassifier(**current_params))
            ])
            print(f"Using standard SMOTE with k_neighbors={k_neighbors}")
    else:
        # Binary target or a singleton class: oversampling is skipped.
        model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LGBMClassifier(**current_params))
        ])
        print("Using standard pipeline (no SMOTE)")

    # Train with sample weights
    print("Training in progress...")
    if 'smote' in model_pipeline.named_steps:
        # SMOTE pipeline - fit without sample weights first, then use them for classifier
        # (SMOTE changes the number of rows, so the original weights no
        # longer align with the resampled training set).
        model_pipeline.fit(X_train, y_train)
    else:
        # Standard pipeline - use sample weights directly
        model_pipeline.fit(X_train, y_train,
                           classifier__sample_weight=sample_weights)

    # Make predictions on both splits (train accuracy is used to gauge overfit).
    y_pred_train = model_pipeline.predict(X_train)
    y_pred_test = model_pipeline.predict(X_test)
    y_pred_proba_test = model_pipeline.predict_proba(X_test)

    # Standard metrics
    train_accuracy = (y_pred_train == y_train).mean()
    test_accuracy = (y_pred_test == y_test).mean()
    test_mae = mean_absolute_error(y_test, y_pred_test)

    # Business-critical metrics: classes 4-5 are the high-stakes values.
    high_value_mask = y_test >= 4  # High component values
    if high_value_mask.sum() > 0:
        high_value_recall = recall_score(y_test, y_pred_test, labels=[4, 5], average='macro', zero_division=0)
        high_value_precision = precision_score(y_test, y_pred_test, labels=[4, 5], average='macro', zero_division=0)

        # Underestimation analysis for high values.
        # NOTE(review): .mean() here divides by the *whole* test set, not by
        # the number of high-value cases — the printed "rate" is therefore a
        # fraction of all test samples; confirm that is the intended metric.
        underestimated = (y_test > y_pred_test) & high_value_mask
        underestimation_rate = underestimated.mean() if high_value_mask.sum() > 0 else 0

        print(f"HIGH-VALUE COMPONENT PERFORMANCE:")
        print(f" Recall for values 4-5: {high_value_recall:.3f}")
        print(f" Precision for values 4-5: {high_value_precision:.3f}")
        print(f" Underestimation rate: {underestimation_rate:.3f}")
    else:
        high_value_recall = 0
        high_value_precision = 0
        underestimation_rate = 0
        print("No high-value cases in test set")

    print(f"OVERALL PERFORMANCE:")
    print(f" Training Accuracy: {train_accuracy:.3f}")
    print(f" Test Accuracy: {test_accuracy:.3f}")
    print(f" Test MAE: {test_mae:.3f}")

    # Store results for the dashboard / metadata steps below.
    trained_models[target] = model_pipeline
    model_performance[target] = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'test_mae': test_mae,
        'predictions': y_pred_test,
        'probabilities': y_pred_proba_test,
        'unique_classes': unique_classes
    }

    business_metrics[target] = {
        'high_value_recall': high_value_recall,
        'high_value_precision': high_value_precision,
        'underestimation_rate': underestimation_rate,
        'total_high_value_cases': high_value_mask.sum()
    }

    # Classification report
    print(f"\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred_test, zero_division=0))
# ============== STEP 8: OVERALL CRITICALITY ANALYSIS ==============
print("\n" + "="*60)
print("STEP 8: OVERALL CRITICALITY PREDICTION ANALYSIS")
print("="*60)

# Calculate combined criticality predictions for common test set:
# each target model predicts its own score and the scores are summed,
# mirroring how 'combined_criticality' was built from the actual labels.
print(f"\nCalculating combined criticality for {len(X_test_base)} test samples...")

predicted_criticality = np.zeros(len(X_test_base))
actual_criticality = df_filtered.loc[X_test_base.index, 'combined_criticality'].values

# Get predictions for each target and sum them
for target in target_columns:
    model = trained_models[target]
    target_predictions = model.predict(X_test_base)
    predicted_criticality += target_predictions

predicted_criticality = predicted_criticality.astype(int)

print(f"Actual criticality range: {actual_criticality.min()} - {actual_criticality.max()}")
print(f"Predicted criticality range: {predicted_criticality.min()} - {predicted_criticality.max()}")


# Business impact analysis — two severity cutoffs on the combined score.
critical_threshold = 10
very_critical_threshold = 12

critical_actual = actual_criticality >= critical_threshold
critical_predicted = predicted_criticality >= critical_threshold

very_critical_actual = actual_criticality >= very_critical_threshold
very_critical_predicted = predicted_criticality >= very_critical_threshold

# Calculate business metrics (guarded against empty positive classes,
# which would make recall/precision undefined).
overall_mae = mean_absolute_error(actual_criticality, predicted_criticality)
critical_recall = recall_score(critical_actual, critical_predicted) if critical_actual.sum() > 0 else 0
critical_precision = precision_score(critical_actual, critical_predicted) if critical_predicted.sum() > 0 else 0

# Conservative prediction analysis: fraction of samples where the model
# predicted at-or-above the actual score, plus count of dangerous misses.
conservative_score = (predicted_criticality >= actual_criticality).mean()
severe_underestimation = ((actual_criticality >= 10) & (predicted_criticality <= 6)).sum()

print(f"OVERALL CRITICALITY PERFORMANCE:")
print(f"Total test samples: {len(actual_criticality)}")
print(f"Combined MAE: {overall_mae:.3f}")
print(f"Conservative prediction rate: {conservative_score:.3f}")
print(f"Severe underestimation cases (actualβ₯10, predβ€6): {severe_underestimation}")

print(f"\nCRITICAL CASE DETECTION (β₯{critical_threshold}):")
print(f"Actual critical cases: {critical_actual.sum()}")
print(f"Predicted critical cases: {critical_predicted.sum()}")
print(f"Critical case recall: {critical_recall:.3f}")
print(f"Critical case precision: {critical_precision:.3f}")

if very_critical_actual.sum() > 0:
    very_critical_recall = recall_score(very_critical_actual, very_critical_predicted)
    print(f"\nVERY CRITICAL CASE DETECTION (β₯{very_critical_threshold}):")
    print(f"Very critical recall: {very_critical_recall:.3f}")
else:
    print(f"\nNo very critical cases (β₯{very_critical_threshold}) in test set")
# ============== STEP 9: EQUIPMENT-SPECIFIC ANALYSIS ==============
print("\n" + "="*60)
print("STEP 9: EQUIPMENT-SPECIFIC PERFORMANCE ANALYSIS")
print("="*60)

# Equipment-specific performance analysis: recompute MAE / conservative
# rate / critical recall per equipment type on the shared test split.
# --- FIX #2: Check if the test set is not empty ---
if 'equipment_type_class' in df.columns and not X_test_base.empty:
    print("Equipment-specific performance analysis:")

    # Get equipment types for the common test set
    equipment_types_test = df_filtered.loc[X_test_base.index, 'equipment_type_class'].values

    # Analyze by equipment type
    equipment_performance = {}
    for eq_type in set(equipment_types_test):
        # Boolean mask selecting this equipment type's rows in the test set.
        eq_mask = equipment_types_test == eq_type
        if eq_mask.sum() > 0:
            eq_actual = actual_criticality[eq_mask]
            eq_predicted = predicted_criticality[eq_mask]

            eq_mae = mean_absolute_error(eq_actual, eq_predicted)
            eq_conservative = (eq_predicted >= eq_actual).mean()

            # Critical case detection for this equipment type
            eq_critical_actual = eq_actual >= critical_threshold
            eq_critical_predicted = eq_predicted >= critical_threshold

            # Recall is undefined when the type has no critical cases;
            # NaN is used as the "not applicable" sentinel.
            if eq_critical_actual.sum() > 0:
                eq_critical_recall = recall_score(eq_critical_actual, eq_critical_predicted)
            else:
                eq_critical_recall = np.nan

            equipment_performance[eq_type] = {
                'samples': eq_mask.sum(),
                'mae': eq_mae,
                'conservative_rate': eq_conservative,
                'critical_cases': eq_critical_actual.sum(),
                'critical_recall': eq_critical_recall
            }

            print(f"\n{eq_type}:")
            print(f" Samples: {eq_mask.sum()}")
            print(f" MAE: {eq_mae:.3f}")
            print(f" Conservative rate: {eq_conservative:.3f}")
            print(f" Critical cases: {eq_critical_actual.sum()}")
            if not np.isnan(eq_critical_recall):
                print(f" Critical recall: {eq_critical_recall:.3f}")
            else:
                print(f" Critical recall: N/A (no critical cases)")
else:
    # Handle the case where equipment performance can't be calculated
    equipment_performance = {}
# ============== STEP 10: SAVE ENHANCED MODELS ==============
print("\n" + "="*60)
print("STEP 10: SAVING ENHANCED MODELS AND METADATA")
print("="*60)

# Save individual models — one joblib file per target, with spaces and the
# mojibake 'Γ©' sequence sanitized out of the filename.
for target in target_columns:
    model_filename = f"enhanced_model_{target.replace(' ', '_').replace('Γ©', 'e')}_v2.joblib"
    joblib.dump(trained_models[target], model_filename)
    print(f"β Saved {target} model to {model_filename}")

# Enhanced feature info with training metadata: everything an inference
# script needs to reload the models and understand how they were trained.
enhanced_feature_info = {
    'text_features': text_features,
    'numerical_features': available_features,
    'categorical_features': categorical_features,
    'high_impact_features': high_impact_features,
    'all_feature_columns': feature_columns,
    'target_columns': target_columns,

    # Training configuration
    'training_config': {
        'conservative_lgbm_params': conservative_lgbm_params,
        'cost_sensitive_learning': True,
        'smote_enabled': True,
        'sample_weighting': True,
        'preprocessing_enhanced': True
    },

    # Model performance — per-sample predictions/probabilities are stripped
    # to keep the metadata file small.
    'model_performance': {k: {key: val for key, val in v.items()
                              if key not in ['predictions', 'probabilities']}
                          for k, v in model_performance.items()},

    # Business metrics
    'business_metrics': business_metrics,

    # Overall performance (NaN metrics are stored as None)
    'overall_performance': {
        'combined_mae': float(overall_mae),
        'conservative_prediction_rate': float(conservative_score),
        'critical_case_recall': float(critical_recall) if not np.isnan(critical_recall) else None,
        'critical_case_precision': float(critical_precision) if not np.isnan(critical_precision) else None,
        'severe_underestimation_cases': int(severe_underestimation),
        'total_critical_cases': int(critical_actual.sum()),
        'equipment_specific_performance': equipment_performance if 'equipment_type_class' in df.columns else None
    },

    # Data characteristics
    # NOTE(review): 'critical_cases' is defined earlier in the file (outside
    # this chunk) — presumably the rows flagged critical during EDA; verify.
    'data_characteristics': {
        'total_samples': len(df),
        'total_features': len(feature_columns),
        'critical_cases_in_data': len(critical_cases),
        'equipment_types_available': 'equipment_type_class' in df.columns,
        'label_confidence_available': 'label_confidence' in df.columns
    }
}

joblib.dump(enhanced_feature_info, 'enhanced_model_metadata_v2.joblib')
print("β Saved enhanced model metadata to enhanced_model_metadata_v2.joblib")
# ============== STEP 11: ENHANCED VISUALIZATIONS ==============
print("\n" + "="*60)
print("STEP 11: CREATING ENHANCED PERFORMANCE VISUALIZATIONS")
print("="*60)

# Create comprehensive performance dashboard: a 3x4 grid of panels,
# saved as a single PNG at the end of this section.
fig = plt.figure(figsize=(20, 16))

# 1. Model Performance Comparison — train vs test accuracy per target.
plt.subplot(3, 4, 1)
targets = list(model_performance.keys())
train_accs = [model_performance[t]['train_accuracy'] for t in targets]
test_accs = [model_performance[t]['test_accuracy'] for t in targets]

x_pos = np.arange(len(targets))
plt.bar(x_pos - 0.2, train_accs, 0.4, label='Training', alpha=0.8)
plt.bar(x_pos + 0.2, test_accs, 0.4, label='Test', alpha=0.8)
plt.xlabel('Target Variables')
plt.ylabel('Accuracy')
plt.title('Enhanced Model Accuracy')
plt.xticks(x_pos, [t.replace(' ', '\n') for t in targets], rotation=0)
plt.legend()
plt.grid(True, alpha=0.3)

# 2. Business Metrics Performance — high-value recall vs underestimation rate.
plt.subplot(3, 4, 2)
high_value_recalls = [business_metrics[t]['high_value_recall'] for t in targets]
underestimation_rates = [business_metrics[t]['underestimation_rate'] for t in targets]

x_pos = np.arange(len(targets))
plt.bar(x_pos - 0.2, high_value_recalls, 0.4, label='High Value Recall', alpha=0.8)
plt.bar(x_pos + 0.2, underestimation_rates, 0.4, label='Underestimation Rate', alpha=0.8, color='red')
plt.xlabel('Target Variables')
plt.ylabel('Rate')
plt.title('Business-Critical Metrics')
plt.xticks(x_pos, [t.replace(' ', '\n') for t in targets], rotation=0)
plt.legend()
plt.grid(True, alpha=0.3)

# 3. Overall Criticality Prediction vs Actual — scatter with identity line.
plt.subplot(3, 4, 3)
plt.scatter(actual_criticality, predicted_criticality, alpha=0.6, s=30)
plt.plot([min(actual_criticality), max(actual_criticality)],
         [min(actual_criticality), max(actual_criticality)], 'r--', linewidth=2)
plt.xlabel('Actual CriticitΓ©')
plt.ylabel('Predicted CriticitΓ©')
plt.title('Criticality Prediction vs Actual')
plt.grid(True, alpha=0.3)

# Add conservative prediction line (identity shifted down by 1).
if len(actual_criticality) > 0:
    plt.plot([min(actual_criticality), max(actual_criticality)],
             [min(actual_criticality)-1, max(actual_criticality)-1], 'g--',
             linewidth=1, alpha=0.7, label='Conservative Line')
    plt.legend()

# 4. Critical Case Detection Analysis — counts for the >=10 cutoff.
plt.subplot(3, 4, 4)
critical_analysis_data = {
    'Actual Critical': critical_actual.sum(),
    'Predicted Critical': critical_predicted.sum(),
    'True Positives': (critical_actual & critical_predicted).sum(),
    'False Negatives': (critical_actual & ~critical_predicted).sum()
}

plt.bar(critical_analysis_data.keys(), critical_analysis_data.values(),
        color=['blue', 'orange', 'green', 'red'], alpha=0.7)
plt.ylabel('Count')
plt.title('Critical Case Detection Analysis')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# 5. Equipment Type Performance (if available)
plt.subplot(3, 4, 5)
if 'equipment_type_class' in df.columns and equipment_performance:
    eq_types = list(equipment_performance.keys())[:8]  # Top 8 equipment types
    eq_maes = [equipment_performance[eq]['mae'] for eq in eq_types]

    plt.barh(range(len(eq_types)), eq_maes, alpha=0.7)
    plt.yticks(range(len(eq_types)), [eq.replace('_', '\n') for eq in eq_types])
    plt.xlabel('MAE')
    plt.title('Equipment-Specific MAE')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'Equipment\nPerformance\nNot Available',
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Equipment Performance')

# 6. Confusion Matrix for Combined Criticality — scores are binned so the
# heatmap stays readable.
plt.subplot(3, 4, 6)
if len(actual_criticality) > 0:
    criticality_bins = [3, 6, 9, 12, 15]  # Bin the criticality for better visualization
    actual_binned = np.digitize(actual_criticality, criticality_bins)
    predicted_binned = np.digitize(predicted_criticality, criticality_bins)

    cm = confusion_matrix(actual_binned, predicted_binned)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=[f'<{b}' for b in criticality_bins] + [f'>={criticality_bins[-1]}'],
                yticklabels=[f'<{b}' for b in criticality_bins] + [f'>={criticality_bins[-1]}'])
    plt.xlabel('Predicted Criticality Range')
    plt.ylabel('Actual Criticality Range')
    plt.title('Criticality Confusion Matrix')
else:
    plt.text(0.5, 0.5, 'No Test Data\nfor Confusion Matrix', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Criticality Confusion Matrix')

# 7. Feature Importance (from metadata)
# NOTE(review): feature_metadata is loaded earlier in the file (outside this
# chunk); its 'feature_correlations' entries appear to be dicts with
# 'Feature'/'Correlation' keys — confirm against the producer.
plt.subplot(3, 4, 7)
if feature_metadata and 'feature_correlations' in feature_metadata:
    correlations = feature_metadata.get('feature_correlations', [])[:10]  # Top 10
    if correlations:
        features = [item['Feature'] for item in correlations]
        corr_values = [abs(item['Correlation']) for item in correlations]

        plt.barh(range(len(features)), corr_values, alpha=0.7)
        plt.yticks(range(len(features)), [f.replace('_', '\n') for f in features])
        plt.xlabel('|Correlation|')
        plt.title('Top Feature Correlations')
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No Feature\nCorrelations Found', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Feature Importance')
else:
    plt.text(0.5, 0.5, 'Feature\nCorrelations\nNot Available',
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Feature Importance')


# 8. Conservative Prediction Analysis — pie of over/exact/under predictions.
# Note: 'Conservative' (pred >= actual) includes the 'Exact' cases, so the
# three slices intentionally sum to more than the sample count.
plt.subplot(3, 4, 8)
if len(actual_criticality) > 0:
    conservative_analysis = {
        'Conservative': (predicted_criticality >= actual_criticality).sum(),
        'Exact': (predicted_criticality == actual_criticality).sum(),
        'Underestimated': (predicted_criticality < actual_criticality).sum()
    }

    colors = ['green', 'blue', 'red']
    plt.pie(conservative_analysis.values(), labels=conservative_analysis.keys(),
            autopct='%1.1f%%', colors=colors, startangle=90)
    plt.title('Prediction Conservatism Analysis')
else:
    plt.text(0.5, 0.5, 'No Data for\nConservatism Analysis', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Prediction Conservatism Analysis')


# 9. MAE by Target
plt.subplot(3, 4, 9)
target_maes = [model_performance[t]['test_mae'] for t in targets]
plt.bar(targets, target_maes, alpha=0.7, color='orange')
plt.xlabel('Target Variables')
plt.ylabel('MAE')
plt.title('Mean Absolute Error by Target')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# 10. Error Distribution — histogram of signed prediction errors.
plt.subplot(3, 4, 10)
if len(actual_criticality) > 0:
    errors = predicted_criticality - actual_criticality
    plt.hist(errors, bins=20, alpha=0.7, edgecolor='black')
    plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
    plt.xlabel('Prediction Error (Pred - Actual)')
    plt.ylabel('Frequency')
    plt.title('Error Distribution')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'No Data for\nError Distribution', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Error Distribution')


# 11. Critical Equipment Performance — recall for a hard-coded shortlist of
# safety-relevant equipment classes.
plt.subplot(3, 4, 11)
if 'equipment_type_class' in df.columns and equipment_performance:
    critical_equipment = ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL', 'TURBINE_SYSTEMS']
    critical_eq_data = {eq: equipment_performance.get(eq, {}).get('critical_recall', 0)
                        for eq in critical_equipment if eq in equipment_performance}

    if critical_eq_data:
        plt.bar(critical_eq_data.keys(), critical_eq_data.values(), alpha=0.7)
        plt.ylabel('Critical Case Recall')
        plt.title('Critical Equipment Performance')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'Critical Equipment\nData Not Available\nin Test Set',
                 ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Critical Equipment Performance')
else:
    plt.text(0.5, 0.5, 'Equipment Data\nNot Available',
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Critical Equipment Performance')

# 12. Training Summary — free-text panel with the headline numbers.
plt.subplot(3, 4, 12)
plt.axis('off')
summary_text = f"""ENHANCED TRAINING SUMMARY

Dataset: {len(df):,} samples
Features: {len(feature_columns)} total
- Text: {len(text_features)}
- Numerical: {len(available_features)}
- Categorical: {len(categorical_features)}

Performance:
- Combined MAE: {overall_mae:.3f}
- Conservative Rate: {conservative_score:.3f}
- Critical Recall: {critical_recall:.3f}

Enhancements:
β Equipment Intelligence
β Cost-Sensitive Learning
β Sample Weighting
β Enhanced SMOTE
β Conservative Parameters

Business Impact:
- Severe Underestimation: {severe_underestimation} cases
- Critical Cases Detected: {critical_predicted.sum()}/{critical_actual.sum()}
"""

plt.text(0.05, 0.95, summary_text, transform=plt.gca().transAxes,
         fontsize=9, verticalalignment='top', fontfamily='monospace')

plt.tight_layout()
plt.savefig('enhanced_model_performance_dashboard_v2.png', dpi=300, bbox_inches='tight')
print("β Enhanced performance dashboard saved as 'enhanced_model_performance_dashboard_v2.png'")
# ============== STEP 12: SAFETY OVERRIDE RULES ==============
# Banner for the rule-definition section below.
print("\n" + "="*60)
print("STEP 12: IMPLEMENTING SAFETY OVERRIDE RULES")
print("="*60)
def create_safety_override_rules():
    """
    Build the safety override rules applied on top of model predictions
    to enforce a conservative (safety-first) criticality bias.

    Returns:
        dict: rule name -> {'condition', 'action', 'description'} strings.
              Conditions/actions are stored as human-readable expressions;
              the inference layer is responsible for interpreting them.
    """
    # One tuple per rule: (name, condition, action, description).
    # The text of each field is the deployed safety policy — keep verbatim.
    rule_specs = [
        ('structural_failure_override',
         'has_structural_failure == 1',
         'min_criticality = 9',
         'Any structural failure gets minimum criticality 9'),
        ('electrical_critical_equipment',
         'equipment_type_class == "ELECTRICAL_CRITICAL"',
         'apply_conservative_threshold = 0.7',
         'Lower confidence threshold for electrical critical equipment'),
        ('cooling_critical_equipment',
         'equipment_type_class == "COOLING_CRITICAL"',
         'min_criticality = 10',
         'Cooling critical equipment gets minimum criticality 10'),
        ('safety_mention_boost',
         'has_safety_mention == 1',
         'add_criticality_boost = 2',
         'SAFETY mentions get +2 criticality boost'),
        ('turbine_oil_issue',
         'turbine_oil_issue == 1',
         'min_criticality = 8',
         'Turbine oil issues get minimum criticality 8'),
    ]
    # Expand the specs into the nested-dict shape consumers expect;
    # insertion order of the rules is preserved.
    return {
        name: {'condition': cond, 'action': act, 'description': desc}
        for name, cond, act, desc in rule_specs
    }
# Build the rule set once; it is echoed, persisted, and counted below.
safety_rules = create_safety_override_rules()

# Echo every rule so the training log documents the deployed safety policy.
print("Safety Override Rules Created:")
for rule_name, rule_info in safety_rules.items():
    print(f" {rule_name}:")
    print(f" Condition: {rule_info['condition']}")
    print(f" Action: {rule_info['action']}")
    print(f" Description: {rule_info['description']}")

# Save safety rules
# NOTE(review): assumes `json` is imported near the top of this script — confirm.
with open('safety_override_rules_v2.json', 'w') as f:
    json.dump(safety_rules, f, indent=2)
print("β Safety override rules saved to safety_override_rules_v2.json")

# ============== STEP 13: FINAL RECOMMENDATIONS ==============
print("\n" + "="*60)
print("STEP 13: ENHANCED MODEL RECOMMENDATIONS")
print("="*60)

# NOTE(review): overall_mae, conservative_score, critical_recall and
# severe_underestimation are produced by the earlier evaluation steps of this
# script — confirm they are in scope when this section runs.
print("π― ENHANCED MODEL PERFORMANCE ANALYSIS:")
print(f"β Overall MAE improved with equipment intelligence: {overall_mae:.3f}")
print(f"β Conservative prediction rate: {conservative_score:.3f} (good for safety)")
print(f"β Critical case recall: {critical_recall:.3f}")
print(f"β Severe underestimation reduced to: {severe_underestimation} cases")

# Per-target breakdown: one trained model per target column, with matching
# entries in model_performance and business_metrics (built earlier).
print(f"\nπ§ EQUIPMENT INTELLIGENCE IMPACT:")
for target in target_columns:
    performance = model_performance[target]
    business = business_metrics[target]
    print(f"{target}:")
    print(f" Test Accuracy: {performance['test_accuracy']:.3f}")
    print(f" High-Value Recall: {business['high_value_recall']:.3f}")
    print(f" Underestimation Rate: {business['underestimation_rate']:.3f}")

# Report only the equipment classes treated as highest risk, and only when
# per-class metrics were computed (equipment_performance may be empty/falsy).
if equipment_performance:
    print(f"\nβ‘ HIGH-RISK EQUIPMENT PERFORMANCE:")
    critical_equipment_types = ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL', 'TURBINE_SYSTEMS']
    for eq_type in critical_equipment_types:
        if eq_type in equipment_performance:
            perf = equipment_performance[eq_type]
            print(f"{eq_type}:")
            print(f" MAE: {perf['mae']:.3f}")
            print(f" Conservative Rate: {perf['conservative_rate']:.3f}")
            # critical_recall can be NaN when a class had no critical cases;
            # skip printing it in that case.
            if not np.isnan(perf['critical_recall']):
                print(f" Critical Recall: {perf['critical_recall']:.3f}")

print(f"\nπ DEPLOYMENT RECOMMENDATIONS:")
print(f"1. Use safety override rules for critical equipment")
print(f"2. Apply conservative thresholds for ELECTRICAL_CRITICAL equipment")
print(f"3. Implement manual review for predictions with low confidence")
print(f"4. Monitor underestimation rate in production")
print(f"5. Retrain quarterly with new data to maintain performance")

print(f"\nπ BUSINESS IMPACT:")
print(f"- Reduced risk of missing critical failures")
print(f"- Better detection of electrical equipment issues")
print(f"- Equipment-specific prediction strategies")
print(f"- Conservative bias protects against safety risks")

# ============== FINAL SUMMARY ==============
print("\n" + "="*80)
print("ENHANCED TRAINING PIPELINE v2.0 COMPLETED!")
print("="*80)

print(f"\nπ TRAINING ACHIEVEMENTS:")
print(f"β Equipment Intelligence Integration: {len(categorical_features)} equipment features")
print(f"β Cost-Sensitive Learning: Implemented with sample weighting")
print(f"β Enhanced SMOTE: BorderlineSMOTE for better minority class handling")
print(f"β Conservative Parameters: Lower learning rate, higher regularization")
print(f"β Safety Override Rules: {len(safety_rules)} rules implemented")
print(f"β Business Metrics Focus: High-value recall and underestimation tracking")

print(f"\nπ PERFORMANCE IMPROVEMENTS:")
print(f"Feature enhancement: 10 β {len(feature_columns)} features")
# Inline guard: the equipment class column may be absent from the dataframe.
print(f"Equipment types classified: {len(df['equipment_type_class'].unique()) if 'equipment_type_class' in df.columns else 'N/A'}")
print(f"Critical case detection: {critical_predicted.sum()}/{critical_actual.sum()} cases")
print(f"Conservative prediction bias: {conservative_score:.1%} of predictions")

print(f"\nπ FILES GENERATED:")
for target in target_columns:
    # NOTE(review): mirrors the filename scheme presumably used when the
    # per-target models were saved earlier — verify both stay in sync.
    model_filename = f"enhanced_model_{target.replace(' ', '_').replace('Γ©', 'e')}_v2.joblib"
    print(f"β {model_filename}")

print("β enhanced_model_metadata_v2.joblib")
print("β safety_override_rules_v2.json")
print("β enhanced_model_performance_dashboard_v2.png")

print(f"\nπ― NEXT STEP: UPDATE ANOMALY INTELLIGENCE")
print("The inference system needs to be updated to use:")
print("1. New enhanced models and metadata")
print("2. Equipment intelligence features")
print("3. Safety override rules")
print("4. Conservative prediction thresholds")

print("\n" + "="*80)
print("ENHANCED MODELS READY FOR PRODUCTION DEPLOYMENT!")
print("="*80)