bneay committed on
Commit
4256453
·
1 Parent(s): 5fa44a2

py scripts

Browse files
Files changed (4) hide show
  1. anomaly_intelligence.py +1260 -0
  2. descritption_v2.py +942 -0
  3. equipment_analysis.py +319 -0
  4. training.py +1069 -0
anomaly_intelligence.py ADDED
@@ -0,0 +1,1260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # enhanced_anomaly_intelligence_v2.py
2
+ # TAQATHON 2025 - Production Anomaly Intelligence with Equipment Intelligence
3
+ # Enhanced for single and batch processing with safety override rules
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import joblib
8
+ import json
9
+ import re
10
+ from datetime import datetime
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ import warnings
13
+ from typing import Union, List, Dict, Any
14
+ import time
15
+
16
+ warnings.filterwarnings('ignore')
17
+
18
+ class EnhancedAnomalyIntelligence:
19
+ """
20
+ Enhanced Production-ready Anomaly Intelligence System v2.0
21
+ Features: Equipment Intelligence + Safety Override Rules + Conservative Prediction
22
+ """
23
+
24
    def __init__(self):
        """Initialise empty model/metadata slots; heavy artifact loading is deferred to _load_models()."""
        self.models = {}                 # target name -> fitted classifier (loaded from *.joblib)
        self.model_metadata = None       # dict loaded from enhanced_model_metadata_v2.joblib
        self.safety_rules = None         # safety override rules (JSON), or {} when the file is missing
        self.embeddings = None           # np.ndarray of historical anomaly embeddings (optional)
        self.embedding_metadata = None   # metadata aligned with self.embeddings (optional)
        self.sentence_model = None       # SentenceTransformer used for similarity search (optional)
        self._models_loaded = False      # lazy-load guard checked by _load_models()

        # Equipment intelligence configuration
        # NOTE(review): these two dicts are never populated by the code visible
        # in this file - confirm whether they are set externally or vestigial.
        self.equipment_type_scores = {}
        self.section_risk_multipliers = {}
36
+
37
+ def _load_models(self):
38
+ """Load all enhanced models and metadata (called once)"""
39
+ if self._models_loaded:
40
+ return
41
+
42
+ print("Loading enhanced models and metadata...")
43
+
44
+ try:
45
+ # Load enhanced model metadata
46
+ self.model_metadata = joblib.load('enhanced_model_metadata_v2.joblib')
47
+ target_columns = self.model_metadata['target_columns']
48
+
49
+ # Load enhanced trained models
50
+ for target in target_columns:
51
+ model_filename = f"enhanced_model_{target.replace(' ', '_').replace('Γ©', 'e')}_v2.joblib"
52
+ self.models[target] = joblib.load(model_filename)
53
+ print(f"βœ“ Loaded {target} model")
54
+
55
+ # Load safety override rules
56
+ try:
57
+ with open('safety_override_rules_v2.json', 'r') as f:
58
+ self.safety_rules = json.load(f)
59
+ print("βœ“ Loaded safety override rules")
60
+ except FileNotFoundError:
61
+ print("⚠️ Warning: safety_override_rules_v2.json not found - safety rules disabled")
62
+ self.safety_rules = {}
63
+
64
+ # Load embeddings and metadata for similarity search
65
+ try:
66
+ self.embeddings = np.load('anomaly_embeddings.npy')
67
+ self.embedding_metadata = joblib.load('embedding_metadata.joblib')
68
+ print("βœ“ Loaded similarity search embeddings")
69
+ except FileNotFoundError:
70
+ print("⚠️ Warning: Embedding files not found - similarity search disabled")
71
+ self.embeddings = None
72
+ self.embedding_metadata = None
73
+
74
+ # Load sentence transformer
75
+ try:
76
+ from sentence_transformers import SentenceTransformer
77
+ try:
78
+ self.sentence_model = SentenceTransformer('dangvantuan/sentence-camembert-large')
79
+ print("βœ“ Loaded French CamemBERT model")
80
+ except:
81
+ try:
82
+ self.sentence_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
83
+ print("βœ“ Loaded multilingual model")
84
+ except:
85
+ self.sentence_model = SentenceTransformer('distiluse-base-multilingual-cased')
86
+ print("βœ“ Loaded basic multilingual model")
87
+ except Exception as e:
88
+ print(f"⚠️ Warning: Could not load sentence transformer: {e}")
89
+ self.sentence_model = None
90
+
91
+ # Extract equipment intelligence configuration
92
+ if 'training_config' in self.model_metadata:
93
+ training_config = self.model_metadata['training_config']
94
+ print("βœ“ Loaded training configuration")
95
+
96
+ self._models_loaded = True
97
+ print("βœ“ All enhanced models loaded successfully")
98
+
99
+ except Exception as e:
100
+ raise Exception(f"Failed to load enhanced models: {str(e)}")
101
+
102
+ def predict_single(self, anomaly_data: Dict,
103
+ confidence_threshold: float = 0.7,
104
+ include_similar: bool = True,
105
+ format_type: str = 'rich',
106
+ apply_safety_rules: bool = True) -> Dict:
107
+ """
108
+ Enhanced single anomaly prediction with equipment intelligence and safety rules
109
+
110
+ Args:
111
+ anomaly_data: Dictionary with anomaly information
112
+ confidence_threshold: Threshold for flagging manual review
113
+ include_similar: Whether to include similar anomalies
114
+ format_type: 'rich' for UI, 'simple' for database
115
+ apply_safety_rules: Whether to apply safety override rules
116
+ """
117
+ self._load_models()
118
+
119
+ try:
120
+ # Extract and prepare enhanced features
121
+ enhanced_features = self._extract_enhanced_features_single(anomaly_data)
122
+
123
+ # Make base predictions
124
+ predictions, confidences, probabilities = self._predict_criticality(enhanced_features)
125
+
126
+ # Apply safety override rules if enabled
127
+ if apply_safety_rules and self.safety_rules:
128
+ predictions = self._apply_safety_override_rules(enhanced_features, predictions)
129
+
130
+ # Calculate enhanced metrics
131
+ total_criticality = sum(predictions.values())
132
+ overall_confidence = np.mean(list(confidences.values()))
133
+
134
+ # Enhanced business logic for manual review
135
+ needs_review = self._determine_manual_review_need(
136
+ enhanced_features, predictions, overall_confidence, confidence_threshold
137
+ )
138
+
139
+ # Equipment-specific risk assessment
140
+ equipment_risk_assessment = self._assess_equipment_risk(enhanced_features, predictions)
141
+
142
+ # Find similar anomalies
143
+ similar_anomalies = []
144
+ if include_similar and self.sentence_model is not None:
145
+ similar_anomalies = self._find_similar_anomalies(
146
+ anomaly_data.get('Description', ''), top_k=3
147
+ )
148
+
149
+ # Format response based on type
150
+ if format_type == 'simple':
151
+ return self._format_simple_response(
152
+ anomaly_data, predictions, total_criticality,
153
+ overall_confidence, needs_review, equipment_risk_assessment
154
+ )
155
+ else:
156
+ return self._format_rich_response(
157
+ anomaly_data, predictions, confidences,
158
+ total_criticality, overall_confidence,
159
+ similar_anomalies, needs_review, confidence_threshold,
160
+ equipment_risk_assessment, enhanced_features
161
+ )
162
+
163
+ except Exception as e:
164
+ return {
165
+ 'error': f'Enhanced prediction failed: {str(e)}',
166
+ 'timestamp': datetime.now().isoformat(),
167
+ 'input_description': anomaly_data.get('Description', 'N/A')
168
+ }
169
+
170
+ def predict_batch(self, anomaly_list: List[Dict],
171
+ confidence_threshold: float = 0.7,
172
+ include_similar: bool = False,
173
+ format_type: str = 'simple',
174
+ apply_safety_rules: bool = True) -> List[Dict]:
175
+ """
176
+ Enhanced batch prediction with equipment intelligence
177
+
178
+ Args:
179
+ anomaly_list: List of anomaly dictionaries
180
+ confidence_threshold: Threshold for flagging manual review
181
+ include_similar: Whether to include similar anomalies (slower for batch)
182
+ format_type: 'rich' for UI, 'simple' for database
183
+ apply_safety_rules: Whether to apply safety override rules
184
+ """
185
+ self._load_models()
186
+
187
+ print(f"Processing enhanced batch of {len(anomaly_list)} anomalies...")
188
+ start_time = time.time()
189
+
190
+ results = []
191
+
192
+ try:
193
+ # Extract enhanced features for all anomalies
194
+ all_features = []
195
+ for anomaly_data in anomaly_list:
196
+ enhanced_features = self._extract_enhanced_features_single(anomaly_data)
197
+ all_features.append(enhanced_features)
198
+
199
+ # Create batch DataFrame with all enhanced features
200
+ batch_df = pd.DataFrame(all_features)
201
+
202
+ # Make batch predictions
203
+ batch_predictions = {}
204
+ batch_confidences = {}
205
+
206
+ target_columns = self.model_metadata['target_columns']
207
+ for target in target_columns:
208
+ model = self.models[target]
209
+ preds = model.predict(batch_df)
210
+ probas = model.predict_proba(batch_df)
211
+ confs = np.max(probas, axis=1)
212
+
213
+ batch_predictions[target] = preds
214
+ batch_confidences[target] = confs
215
+
216
+ # Process results with enhanced logic
217
+ for i, anomaly_data in enumerate(anomaly_list):
218
+ # Get base predictions
219
+ predictions = {target: int(batch_predictions[target][i])
220
+ for target in target_columns}
221
+ confidences = {target: float(batch_confidences[target][i])
222
+ for target in target_columns}
223
+
224
+ enhanced_features = all_features[i]
225
+
226
+ # Apply safety override rules if enabled
227
+ if apply_safety_rules and self.safety_rules:
228
+ predictions = self._apply_safety_override_rules(enhanced_features, predictions)
229
+
230
+ total_criticality = sum(predictions.values())
231
+ overall_confidence = np.mean(list(confidences.values()))
232
+
233
+ # Enhanced business logic
234
+ needs_review = self._determine_manual_review_need(
235
+ enhanced_features, predictions, overall_confidence, confidence_threshold
236
+ )
237
+
238
+ equipment_risk_assessment = self._assess_equipment_risk(enhanced_features, predictions)
239
+
240
+ # Find similar anomalies (optional for batch)
241
+ similar_anomalies = []
242
+ if include_similar and self.sentence_model is not None:
243
+ similar_anomalies = self._find_similar_anomalies(
244
+ anomaly_data.get('Description', ''), top_k=2
245
+ )
246
+
247
+ # Format response
248
+ if format_type == 'simple':
249
+ result = self._format_simple_response(
250
+ anomaly_data, predictions, total_criticality,
251
+ overall_confidence, needs_review, equipment_risk_assessment
252
+ )
253
+ else:
254
+ result = self._format_rich_response(
255
+ anomaly_data, predictions, confidences,
256
+ total_criticality, overall_confidence,
257
+ similar_anomalies, needs_review, confidence_threshold,
258
+ equipment_risk_assessment, enhanced_features
259
+ )
260
+
261
+ results.append(result)
262
+
263
+ processing_time = time.time() - start_time
264
+ print(f"βœ“ Enhanced batch processing completed in {processing_time:.2f}s")
265
+ print(f" Average time per anomaly: {processing_time/len(anomaly_list):.3f}s")
266
+
267
+ flagged_count = sum(1 for r in results if r.get('needs_manual_review', False))
268
+ safety_overrides = sum(1 for r in results if r.get('safety_override_applied', False))
269
+
270
+ print(f" Flagged for manual review: {flagged_count}/{len(anomaly_list)} ({flagged_count/len(anomaly_list)*100:.1f}%)")
271
+ print(f" Safety overrides applied: {safety_overrides}/{len(anomaly_list)} ({safety_overrides/len(anomaly_list)*100:.1f}%)")
272
+
273
+ return results
274
+
275
+ except Exception as e:
276
+ # Return error for all items in batch
277
+ error_result = {
278
+ 'error': f'Enhanced batch prediction failed: {str(e)}',
279
+ 'timestamp': datetime.now().isoformat()
280
+ }
281
+ return [error_result] * len(anomaly_list)
282
+
283
+ def _extract_enhanced_features_single(self, anomaly_data: Dict) -> Dict:
284
+ """Extract enhanced features including equipment intelligence"""
285
+
286
+ # Create temporary DataFrame for feature engineering
287
+ temp_df = pd.DataFrame([anomaly_data])
288
+
289
+ # Apply enhanced feature engineering (matching training pipeline)
290
+ enhanced_features = self._extract_enhanced_features(temp_df)
291
+
292
+ # Prepare feature dict with all required features
293
+ feature_columns = self.model_metadata.get('all_feature_columns', [])
294
+
295
+ input_data = {}
296
+
297
+ # Text feature
298
+ input_data['Description'] = anomaly_data.get('Description', '')
299
+
300
+ # Enhanced numerical features
301
+ numerical_features = self.model_metadata.get('numerical_features', [])
302
+ for feat in numerical_features:
303
+ if feat in enhanced_features.columns:
304
+ value = enhanced_features[feat].iloc[0]
305
+ # Ensure proper type conversion
306
+ if pd.isna(value):
307
+ input_data[feat] = 0.0
308
+ elif isinstance(value, (bool, np.bool_)):
309
+ input_data[feat] = float(value)
310
+ else:
311
+ input_data[feat] = float(value)
312
+ else:
313
+ input_data[feat] = 0.0
314
+
315
+ # Categorical features
316
+ categorical_features = self.model_metadata.get('categorical_features', [])
317
+ for feat in categorical_features:
318
+ input_data[feat] = anomaly_data.get(feat, 'Unknown')
319
+
320
+ return input_data
321
+
322
+ def _extract_enhanced_features(self, df):
323
+ """Extract enhanced features (matching training pipeline logic)"""
324
+ import re
325
+
326
+ features_df = df.copy()
327
+
328
+ # Create combined text field
329
+ features_df['combined_text'] = features_df['Description'].fillna('') + ' ' + features_df.get('Description de l\'Γ©quipement', '').fillna('')
330
+ features_df['combined_text_lower'] = features_df['combined_text'].str.lower()
331
+
332
+ # Basic text features
333
+ features_df['description_length'] = features_df['Description'].str.len()
334
+ features_df['description_word_count'] = features_df['Description'].str.split().str.len()
335
+ features_df['equipment_desc_length'] = features_df.get('Description de l\'Γ©quipement', '').str.len()
336
+ features_df['equipment_desc_word_count'] = features_df.get('Description de l\'Γ©quipement', '').str.split().str.len()
337
+ features_df['combined_length'] = features_df['combined_text'].str.len()
338
+ features_df['combined_word_count'] = features_df['combined_text'].str.split().str.len()
339
+
340
+ # Equipment intelligence classification
341
+ def classify_equipment_type(equipment_desc):
342
+ """Classify equipment based on training analysis"""
343
+ equipment_upper = str(equipment_desc).upper()
344
+
345
+ # Equipment type scoring (from training pipeline)
346
+ if any(keyword in equipment_upper for keyword in ['ALTERNATEUR', 'TRANSFO PRINCIPAL', 'PROTECTION ALTERNATEUR']):
347
+ return 'ELECTRICAL_CRITICAL', 8.0
348
+ elif any(keyword in equipment_upper for keyword in ['VENTILATEUR DE REFROIDISSEMENT', 'REFROIDISSEMENT TP', 'MOTEUR VENTILATEUR DE REFROIDISSEMENT']):
349
+ return 'COOLING_CRITICAL', 7.5
350
+ elif any(keyword in equipment_upper for keyword in ['TURBINE', 'SOUPAPE REGULATRICE', 'REFRIGERANT HUILE', 'POMPE DE SOULÈVEMENT']):
351
+ return 'TURBINE_SYSTEMS', 7.5
352
+ elif any(keyword in equipment_upper for keyword in ['DISJONCTEUR', 'TRANSFORMATEUR', 'MOTEUR', 'ARMOIRE', 'GROUPE']):
353
+ return 'ELECTRICAL_STANDARD', 6.5
354
+ elif any(keyword in equipment_upper for keyword in ['RECHAUFFEUR', 'RΓ‰CHAUFFEUR', 'CHAUDIERE', 'CHAUDIÈRE']):
355
+ return 'HEATING_SYSTEMS', 6.5
356
+ elif any(keyword in equipment_upper for keyword in ['VENTILATEUR', 'TIRAGE', 'SOUFFLAGE', 'AIR PRIMAIRE', 'AIR SECONDAIRE']):
357
+ return 'VENTILATION_SYSTEMS', 6.0
358
+ elif any(keyword in equipment_upper for keyword in ['POMPE', 'SOUPAPE', 'VANNE', 'CONVOYEUR', 'BROYEUR', 'COAL FEEDER']):
359
+ return 'PROCESS_SYSTEMS', 5.5
360
+ elif any(keyword in equipment_upper for keyword in ['DECRASSEUR', 'DÉGRILLEUR', 'FILTRE', 'CAPTEUR', 'TRANSMETTEUR']):
361
+ return 'AUXILIARY_SYSTEMS', 5.0
362
+ else:
363
+ return 'UNKNOWN', 4.5
364
+
365
+ def detect_equipment_redundancy(equipment_desc):
366
+ """Detect equipment redundancy based on naming patterns"""
367
+ equipment_upper = str(equipment_desc).upper()
368
+
369
+ if any(pattern in equipment_upper for pattern in ['PRINCIPAL', 'UNIQUE']):
370
+ return 'SINGLE_CRITICAL', 1.3
371
+ elif any(re.search(pattern, equipment_upper) for pattern in [r'\b[AB]$', r'NΒ°[12]$', r'PRIMAIRE$', r'SECONDAIRE$']):
372
+ return 'DUAL_SYSTEM', 1.0
373
+ elif any(re.search(pattern, equipment_upper) for pattern in [r'NΒ°[3-9]$', r'NΒ°[0-9][0-9]$']):
374
+ return 'MULTIPLE_SYSTEM', 0.8
375
+ else:
376
+ return 'UNKNOWN_REDUNDANCY', 1.0
377
+
378
+ # Apply equipment intelligence
379
+ if 'Description de l\'Γ©quipement' in features_df.columns:
380
+ equipment_classifications = features_df['Description de l\'Γ©quipement'].apply(classify_equipment_type)
381
+ features_df['equipment_type_class'] = [x[0] for x in equipment_classifications]
382
+ features_df['equipment_base_criticality'] = [x[1] for x in equipment_classifications]
383
+
384
+ redundancy_classifications = features_df['Description de l\'Γ©quipement'].apply(detect_equipment_redundancy)
385
+ features_df['equipment_redundancy_class'] = [x[0] for x in redundancy_classifications]
386
+ features_df['equipment_redundancy_multiplier'] = [x[1] for x in redundancy_classifications]
387
+ else:
388
+ features_df['equipment_type_class'] = 'UNKNOWN'
389
+ features_df['equipment_base_criticality'] = 4.5
390
+ features_df['equipment_redundancy_class'] = 'UNKNOWN_REDUNDANCY'
391
+ features_df['equipment_redundancy_multiplier'] = 1.0
392
+
393
+ # Section risk multiplier
394
+ section_risk_multipliers = {'34EL': 1.2, '34MM': 1.1, '34MD': 1.1, '34MC': 1.0, '34CT': 1.0}
395
+ features_df['section_risk_multiplier'] = features_df.get('Section propriΓ©taire', '').map(section_risk_multipliers).fillna(1.0)
396
+
397
+ # Combined equipment risk score
398
+ features_df['equipment_risk_score'] = (features_df['equipment_base_criticality'] *
399
+ features_df['equipment_redundancy_multiplier'] *
400
+ features_df['section_risk_multiplier'])
401
+
402
+ # Enhanced keyword extraction
403
+ def extract_keywords_dual_field(description, equipment_desc, keyword_dict):
404
+ """Extract keywords from both description and equipment description"""
405
+ combined_text = (str(description) + ' ' + str(equipment_desc)).lower()
406
+ found_keywords = []
407
+
408
+ for category, keywords in keyword_dict.items():
409
+ for keyword in keywords:
410
+ if keyword in combined_text:
411
+ found_keywords.append(category)
412
+ break
413
+
414
+ return found_keywords
415
+
416
+ # Keyword dictionaries (from training pipeline)
417
+ equipment_keywords = {
418
+ 'pompe': ['pompe', 'pompes'],
419
+ 'vanne': ['vanne', 'vannes'],
420
+ 'ventilateur': ['ventilateur', 'ventilateurs', 'ventilo'],
421
+ 'moteur': ['moteur', 'moteurs', 'moto'],
422
+ 'alternateur': ['alternateur', 'alternateurs'],
423
+ 'transformateur': ['transformateur', 'transformateurs', 'transfo'],
424
+ 'turbine': ['turbine', 'turbines'],
425
+ 'principal': ['principal', 'principale'],
426
+ 'groupe': ['groupe', 'groupes']
427
+ }
428
+
429
+ problem_keywords = {
430
+ 'fuite': ['fuite', 'fuites', 'fuit', 'fuyant'],
431
+ 'vibration': ['vibration', 'vibrations', 'vibre'],
432
+ 'bruit_anormal': ['bruit anormal', 'bruit anormale'],
433
+ 'percement': ['percement', 'percΓ©', 'percΓ©e'],
434
+ 'Γ©clatement': ['Γ©clatement', 'eclatement'],
435
+ 'fissure': ['fissure', 'fissurΓ©', 'fissures'],
436
+ 'aggravation': ['aggravation'],
437
+ 'sifflement': ['sifflement', 'siffler'],
438
+ 'dΓ©faillance': ['dΓ©faillance', 'dΓ©faillant'],
439
+ 'dysfonctionnement': ['dysfonctionnement', 'dysfonctionnel'],
440
+ 'sens_inverse': ['sens inverse', 'sens contraire'],
441
+ 'surchauffe': ['surchauffe', 'surchauffΓ©', 'tempΓ©rature Γ©levΓ©e', 'temp elevee']
442
+ }
443
+
444
+ action_keywords = {
445
+ 'maintenance': ['maintenance', 'entretien'],
446
+ 'prΓ©vision': ['prΓ©voir', 'prΓ©voire', 'prevoir'],
447
+ 'remplacement': ['remplacement', 'remplacer', 'remplacΓ©']
448
+ }
449
+
450
+ urgency_keywords = {
451
+ 'safety': ['safety', 'sΓ©curitΓ©'],
452
+ 'urgent': ['urgent', 'urgence'],
453
+ 'critique': ['critique', 'critiques'],
454
+ 'important': ['important', 'importante']
455
+ }
456
+
457
+ # Apply keyword extraction
458
+ description_col = features_df['Description']
459
+ equipment_col = features_df.get('Description de l\'Γ©quipement', '')
460
+
461
+ features_df['equipment_mentioned'] = features_df.apply(
462
+ lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), equipment_keywords),
463
+ axis=1
464
+ )
465
+ features_df['equipment_count'] = features_df['equipment_mentioned'].str.len()
466
+
467
+ features_df['problem_types'] = features_df.apply(
468
+ lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), problem_keywords),
469
+ axis=1
470
+ )
471
+ features_df['problem_count'] = features_df['problem_types'].str.len()
472
+
473
+ features_df['actions_mentioned'] = features_df.apply(
474
+ lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), action_keywords),
475
+ axis=1
476
+ )
477
+ features_df['action_count'] = features_df['actions_mentioned'].str.len()
478
+
479
+ features_df['urgency_indicators'] = features_df.apply(
480
+ lambda row: extract_keywords_dual_field(row['Description'], row.get('Description de l\'Γ©quipement', ''), urgency_keywords),
481
+ axis=1
482
+ )
483
+ features_df['has_urgency'] = (features_df['urgency_indicators'].str.len() > 0).astype(int)
484
+
485
+ # Critical failure pattern detection
486
+ features_df['has_structural_failure'] = features_df['combined_text_lower'].str.contains(
487
+ 'percement|Γ©clatement|eclatement|fissure|rupture', regex=True, na=False
488
+ ).astype(int)
489
+
490
+ features_df['has_equipment_malfunction'] = features_df['combined_text_lower'].str.contains(
491
+ 'sens inverse|dysfonctionnement|dΓ©faillance|dΓ©faut|panne', regex=True, na=False
492
+ ).astype(int)
493
+
494
+ features_df['has_escalation'] = features_df['combined_text_lower'].str.contains(
495
+ 'aggravation|empirΓ©|empire', regex=True, na=False
496
+ ).astype(int)
497
+
498
+ features_df['has_safety_mention'] = features_df['Description'].str.contains('SAFETY', case=False, na=False).astype(int)
499
+
500
+ # Specific high-risk combinations
501
+ features_df['electrical_cooling_issue'] = (
502
+ (features_df['equipment_type_class'].isin(['ELECTRICAL_CRITICAL', 'ELECTRICAL_STANDARD'])) &
503
+ (features_df['combined_text_lower'].str.contains('refroidissement|ventilateur|tempΓ©rature', regex=True, na=False))
504
+ ).astype(int)
505
+
506
+ features_df['turbine_oil_issue'] = (
507
+ (features_df['equipment_type_class'] == 'TURBINE_SYSTEMS') &
508
+ (features_df['combined_text_lower'].str.contains('huile|fuite|graissage', regex=True, na=False))
509
+ ).astype(int)
510
+
511
+ features_df['main_equipment_failure'] = (
512
+ (features_df['equipment_redundancy_class'] == 'SINGLE_CRITICAL') &
513
+ (features_df['has_structural_failure'] == 1)
514
+ ).astype(int)
515
+
516
+ # Enhanced compound features
517
+ features_df['fuite_vapeur'] = features_df['combined_text_lower'].str.contains('fuite.*vapeur|vapeur.*fuite', regex=True, na=False).astype(int)
518
+ features_df['fuite_huile'] = features_df['combined_text_lower'].str.contains('fuite.*huile|huile.*fuite', regex=True, na=False).astype(int)
519
+ features_df['fuite_eau'] = features_df['combined_text_lower'].str.contains('fuite.*eau|eau.*fuite', regex=True, na=False).astype(int)
520
+ features_df['bruit_anormal'] = features_df['combined_text_lower'].str.contains('bruit anormal', regex=True, na=False).astype(int)
521
+ features_df['vibration_excessive'] = features_df['combined_text_lower'].str.contains('vibration.*excessive|vibration.*Γ©levΓ©e', regex=True, na=False).astype(int)
522
+ features_df['temperature_elevee'] = features_df['combined_text_lower'].str.contains('tempΓ©rature Γ©levΓ©e|temp Γ©levΓ©e|temp elevee', regex=True, na=False).astype(int)
523
+ features_df['maintenance_planning'] = features_df['combined_text_lower'].str.contains('prΓ©voir|prΓ©voire|planifier', regex=True, na=False).astype(int)
524
+ features_df['is_recurring'] = features_df['combined_text_lower'].str.contains('frΓ©quent|rΓ©pΓ©titif|souvent', regex=True, na=False).astype(int)
525
+
526
+ # Technical features
527
+ features_df['has_measurements'] = features_df['combined_text_lower'].str.contains(r'\d+\s*Β°c|\d+\s*bar|\d+\s*%', regex=True, na=False).astype(int)
528
+ features_df['has_equipment_codes'] = features_df['combined_text_lower'].str.contains(r'[A-Z0-9]{5,}', regex=True, na=False).astype(int)
529
+ features_df['has_location_details'] = features_df['combined_text_lower'].str.contains('niveau|angle|cΓ΄tΓ©|palier', regex=True, na=False).astype(int)
530
+
531
+ # Enhanced severity scoring
532
+ severity_words = {
533
+ 'critique': 4, 'grave': 4, 'majeur': 4, 'important': 3,
534
+ 'total': 5, 'complet': 5, 'rupture': 5, 'Γ©clatement': 5,
535
+ 'percement': 5, 'fissure': 4, 'aggravation': 4, 'urgent': 3
536
+ }
537
+
538
+ def calculate_enhanced_severity_score(text):
539
+ text = str(text).lower()
540
+ max_score = 0
541
+ for word, weight in severity_words.items():
542
+ if word in text:
543
+ max_score = max(max_score, weight)
544
+ return max_score
545
+
546
+ features_df['enhanced_severity_score'] = features_df['combined_text_lower'].apply(calculate_enhanced_severity_score)
547
+
548
+ # Equipment-Problem Risk Matrix
549
+ def calculate_equipment_problem_risk(equipment_type, problem_types, has_structural):
550
+ base_risk = 1.0
551
+
552
+ if equipment_type in ['ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS', 'COOLING_CRITICAL']:
553
+ base_risk = 1.5
554
+ elif equipment_type in ['ELECTRICAL_STANDARD', 'HEATING_SYSTEMS']:
555
+ base_risk = 1.2
556
+
557
+ if has_structural:
558
+ base_risk *= 2.0
559
+
560
+ if 'vibration' in problem_types:
561
+ base_risk *= 1.3
562
+ if 'fuite' in problem_types:
563
+ base_risk *= 1.2
564
+
565
+ return min(base_risk, 3.0)
566
+
567
+ features_df['equipment_problem_risk'] = features_df.apply(
568
+ lambda row: calculate_equipment_problem_risk(
569
+ row['equipment_type_class'],
570
+ row['problem_types'],
571
+ row['has_structural_failure']
572
+ ), axis=1
573
+ )
574
+
575
+ # Technical complexity
576
+ features_df['technical_complexity'] = (
577
+ features_df['combined_word_count'] / 15 +
578
+ features_df['equipment_count'] +
579
+ features_df['problem_count'] +
580
+ features_df['has_measurements'] +
581
+ features_df['has_equipment_codes'] +
582
+ features_df['has_location_details']
583
+ )
584
+
585
+ # Fill missing values and ensure proper types
586
+ numeric_columns = features_df.select_dtypes(include=[np.number]).columns
587
+ features_df[numeric_columns] = features_df[numeric_columns].fillna(0)
588
+
589
+ for col in features_df.select_dtypes(include=[np.integer, np.floating, bool]).columns:
590
+ features_df[col] = pd.to_numeric(features_df[col], errors='coerce').fillna(0)
591
+
592
+ return features_df
593
+
594
+ def _predict_criticality(self, input_data: Dict) -> tuple:
595
+ """Make criticality predictions using enhanced models"""
596
+
597
+ # Convert to DataFrame
598
+ input_df = pd.DataFrame([input_data])
599
+
600
+ target_columns = self.model_metadata['target_columns']
601
+ predictions = {}
602
+ confidences = {}
603
+ probabilities = {}
604
+
605
+ for target in target_columns:
606
+ model = self.models[target]
607
+ pred = model.predict(input_df)[0]
608
+ pred_proba = model.predict_proba(input_df)[0]
609
+ confidence = np.max(pred_proba)
610
+
611
+ predictions[target] = int(pred)
612
+ confidences[target] = float(confidence)
613
+ probabilities[target] = [float(x) for x in pred_proba]
614
+
615
+ return predictions, confidences, probabilities
616
+
617
+ def _apply_safety_override_rules(self, enhanced_features: Dict, predictions: Dict) -> Dict:
618
+ """Apply safety override rules to predictions"""
619
+
620
+ def _apply_safety_override_rules(self, enhanced_features: Dict, predictions: Dict) -> Dict:
621
+ """Apply safety override rules to predictions"""
622
+
623
+ if not self.safety_rules:
624
+ return predictions
625
+
626
+ modified_predictions = predictions.copy()
627
+ safety_override_applied = False
628
+
629
+ # Rule 1: Structural failure override
630
+ if enhanced_features.get('has_structural_failure', 0) == 1:
631
+ # Ensure minimum criticality of 9 for structural failures
632
+ total_current = sum(modified_predictions.values())
633
+ if total_current < 9:
634
+ # Boost Process Safety to 5 first (most critical for structural failures)
635
+ if modified_predictions['Process Safety'] < 5:
636
+ modified_predictions['Process Safety'] = 5
637
+ safety_override_applied = True
638
+
639
+ # Then boost FiabilitΓ© if still needed
640
+ total_after_safety = sum(modified_predictions.values())
641
+ if total_after_safety < 9:
642
+ needed_boost = 9 - total_after_safety
643
+ new_fiabilite = min(5, modified_predictions['FiabilitΓ© IntΓ©gritΓ©'] + needed_boost)
644
+ modified_predictions['FiabilitΓ© IntΓ©gritΓ©'] = new_fiabilite
645
+ safety_override_applied = True
646
+
647
+ # Rule 2: Cooling critical equipment override
648
+ if enhanced_features.get('equipment_type_class', '') == 'COOLING_CRITICAL':
649
+ # Ensure minimum criticality of 10 for cooling critical equipment
650
+ total_current = sum(modified_predictions.values())
651
+ if total_current < 10:
652
+ # Boost all components proportionally
653
+ needed_boost = 10 - total_current
654
+ for component in modified_predictions:
655
+ if modified_predictions[component] < 5:
656
+ boost = min(2, needed_boost // 3 + 1)
657
+ modified_predictions[component] = min(5, modified_predictions[component] + boost)
658
+ needed_boost -= boost
659
+ safety_override_applied = True
660
+ if needed_boost <= 0:
661
+ break
662
+
663
+ # Rule 3: Safety mention boost
664
+ if enhanced_features.get('has_safety_mention', 0) == 1:
665
+ # Add +2 to Process Safety for safety mentions
666
+ if modified_predictions['Process Safety'] < 5:
667
+ boost = min(2, 5 - modified_predictions['Process Safety'])
668
+ modified_predictions['Process Safety'] += boost
669
+ safety_override_applied = True
670
+
671
+ # Rule 4: Turbine oil issue override
672
+ if enhanced_features.get('turbine_oil_issue', 0) == 1:
673
+ # Ensure minimum criticality of 8 for turbine oil issues
674
+ total_current = sum(modified_predictions.values())
675
+ if total_current < 8:
676
+ # Boost FiabilitΓ© and DisponibilitΓ© (oil issues affect both)
677
+ needed_boost = 8 - total_current
678
+ for component in ['FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©']:
679
+ if needed_boost > 0 and modified_predictions[component] < 4:
680
+ boost = min(2, needed_boost)
681
+ modified_predictions[component] = min(5, modified_predictions[component] + boost)
682
+ needed_boost -= boost
683
+ safety_override_applied = True
684
+
685
+ # Rule 5: Electrical critical equipment override
686
+ if enhanced_features.get('equipment_type_class', '') == 'ELECTRICAL_CRITICAL':
687
+ # Conservative boost for electrical critical equipment
688
+ for component in modified_predictions:
689
+ if modified_predictions[component] >= 3: # Only boost already elevated predictions
690
+ boost = min(1, 5 - modified_predictions[component])
691
+ if boost > 0:
692
+ modified_predictions[component] += boost
693
+ safety_override_applied = True
694
+
695
+ return modified_predictions
696
+
697
+ def _determine_manual_review_need(self, enhanced_features: Dict, predictions: Dict,
698
+ overall_confidence: float, confidence_threshold: float) -> bool:
699
+ """Enhanced logic to determine if manual review is needed"""
700
+
701
+ # Base confidence check
702
+ if overall_confidence < confidence_threshold:
703
+ return True
704
+
705
+ # Critical equipment always needs review for high predictions
706
+ if enhanced_features.get('equipment_type_class', '') in ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL', 'TURBINE_SYSTEMS']:
707
+ if sum(predictions.values()) >= 8:
708
+ return True
709
+
710
+ # Structural failures always need review
711
+ if enhanced_features.get('has_structural_failure', 0) == 1:
712
+ return True
713
+
714
+ # Safety mentions need review
715
+ if enhanced_features.get('has_safety_mention', 0) == 1:
716
+ return True
717
+
718
+ # High criticality cases need review
719
+ if sum(predictions.values()) >= 10:
720
+ return True
721
+
722
+ # Equipment malfunction with high-risk equipment
723
+ if (enhanced_features.get('has_equipment_malfunction', 0) == 1 and
724
+ enhanced_features.get('equipment_type_class', '') in ['ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS']):
725
+ return True
726
+
727
+ return False
728
+
729
+ def _assess_equipment_risk(self, enhanced_features: Dict, predictions: Dict) -> Dict:
730
+ """Assess equipment-specific risk factors"""
731
+
732
+ equipment_type = enhanced_features.get('equipment_type_class', 'UNKNOWN')
733
+ total_criticality = sum(predictions.values())
734
+
735
+ risk_assessment = {
736
+ 'equipment_type': equipment_type,
737
+ 'redundancy_class': enhanced_features.get('equipment_redundancy_class', 'UNKNOWN'),
738
+ 'base_risk_score': enhanced_features.get('equipment_risk_score', 4.5),
739
+ 'risk_level': 'LOW',
740
+ 'risk_factors': [],
741
+ 'business_impact': 'MINOR'
742
+ }
743
+
744
+ # Determine risk level based on equipment type and criticality
745
+ if equipment_type == 'COOLING_CRITICAL':
746
+ risk_assessment['risk_level'] = 'CRITICAL'
747
+ risk_assessment['business_impact'] = 'SEVERE'
748
+ risk_assessment['risk_factors'].append('Critical cooling system failure')
749
+ elif equipment_type == 'ELECTRICAL_CRITICAL':
750
+ if total_criticality >= 8:
751
+ risk_assessment['risk_level'] = 'HIGH'
752
+ risk_assessment['business_impact'] = 'MAJOR'
753
+ else:
754
+ risk_assessment['risk_level'] = 'MEDIUM'
755
+ risk_assessment['business_impact'] = 'MODERATE'
756
+ risk_assessment['risk_factors'].append('Electrical critical infrastructure')
757
+ elif equipment_type == 'TURBINE_SYSTEMS':
758
+ if total_criticality >= 8:
759
+ risk_assessment['risk_level'] = 'HIGH'
760
+ risk_assessment['business_impact'] = 'MAJOR'
761
+ else:
762
+ risk_assessment['risk_level'] = 'MEDIUM'
763
+ risk_assessment['business_impact'] = 'MODERATE'
764
+ risk_assessment['risk_factors'].append('Turbine system component')
765
+
766
+ # Add specific risk factors
767
+ if enhanced_features.get('has_structural_failure', 0) == 1:
768
+ risk_assessment['risk_factors'].append('Structural integrity compromise')
769
+ risk_assessment['risk_level'] = 'HIGH'
770
+
771
+ if enhanced_features.get('has_safety_mention', 0) == 1:
772
+ risk_assessment['risk_factors'].append('Safety concern flagged')
773
+
774
+ if enhanced_features.get('equipment_redundancy_class', '') == 'SINGLE_CRITICAL':
775
+ risk_assessment['risk_factors'].append('Single point of failure')
776
+
777
+ if enhanced_features.get('turbine_oil_issue', 0) == 1:
778
+ risk_assessment['risk_factors'].append('Turbine lubrication system issue')
779
+
780
+ if enhanced_features.get('electrical_cooling_issue', 0) == 1:
781
+ risk_assessment['risk_factors'].append('Electrical equipment cooling problem')
782
+
783
+ # Determine business impact based on total criticality and equipment type
784
+ if total_criticality >= 12:
785
+ risk_assessment['business_impact'] = 'SEVERE'
786
+ elif total_criticality >= 10:
787
+ risk_assessment['business_impact'] = 'MAJOR'
788
+ elif total_criticality >= 8:
789
+ risk_assessment['business_impact'] = 'MODERATE'
790
+
791
+ return risk_assessment
792
+
793
+ def _find_similar_anomalies(self, description: str, top_k: int = 3) -> List[Dict]:
794
+ """Find similar historical anomalies"""
795
+
796
+ if not description or self.sentence_model is None or self.embeddings is None:
797
+ return []
798
+
799
+ try:
800
+ # Encode new description
801
+ new_embedding = self.sentence_model.encode([description])
802
+
803
+ # Calculate similarities
804
+ similarities = cosine_similarity(new_embedding, self.embeddings)[0]
805
+
806
+ # Get top k most similar
807
+ top_indices = np.argsort(similarities)[::-1]
808
+
809
+ similar_anomalies = []
810
+ for idx in top_indices[:top_k*2]: # Check more to filter
811
+ similarity_score = float(similarities[idx])
812
+
813
+ # Skip if too similar (likely duplicate) or too dissimilar
814
+ if similarity_score > 0.99 or similarity_score < 0.15:
815
+ continue
816
+
817
+ if len(similar_anomalies) >= top_k:
818
+ break
819
+
820
+ similar_anomalies.append({
821
+ 'description': self.embedding_metadata['descriptions'][idx],
822
+ 'criticality': int(self.embedding_metadata['criticality_scores'][idx]),
823
+ 'similarity_score': round(similarity_score, 3),
824
+ 'section': self.embedding_metadata.get('sections', ['Unknown'])[idx],
825
+ 'equipment_mentioned': self.embedding_metadata.get('equipment_mentioned', [[]])[idx]
826
+ })
827
+
828
+ return similar_anomalies
829
+
830
+ except Exception as e:
831
+ print(f"Warning: Similarity search failed: {e}")
832
+ return []
833
+
834
+ def _format_simple_response(self, anomaly_data: Dict, predictions: Dict,
835
+ total_criticality: int, overall_confidence: float,
836
+ needs_review: bool, equipment_risk_assessment: Dict) -> Dict:
837
+ """Format simple response for database insertion"""
838
+
839
+ return {
840
+ 'timestamp': datetime.now().isoformat(),
841
+ 'input_description': anomaly_data.get('Description', ''),
842
+ 'input_section': anomaly_data.get('Section propriΓ©taire', ''),
843
+ 'input_equipment': anomaly_data.get('Description de l\'Γ©quipement', ''),
844
+
845
+ # Predictions
846
+ 'predicted_criticite': total_criticality,
847
+ 'predicted_fiabilite': predictions['FiabilitΓ© IntΓ©gritΓ©'],
848
+ 'predicted_disponibilite': predictions['DisponibiltΓ©'],
849
+ 'predicted_safety': predictions['Process Safety'],
850
+
851
+ # AI Metrics
852
+ 'ai_confidence': round(overall_confidence, 3),
853
+ 'needs_manual_review': bool(needs_review),
854
+
855
+ # Equipment Intelligence
856
+ 'equipment_type': equipment_risk_assessment['equipment_type'],
857
+ 'equipment_risk_level': equipment_risk_assessment['risk_level'],
858
+ 'business_impact': equipment_risk_assessment['business_impact'],
859
+ 'safety_override_applied': any(pred > 3 for pred in predictions.values()),
860
+
861
+ # Metadata
862
+ 'model_version': '2.0_enhanced',
863
+ 'processing_timestamp': datetime.now().isoformat()
864
+ }
865
+
866
+ def _format_rich_response(self, anomaly_data: Dict, predictions: Dict,
867
+ confidences: Dict, total_criticality: int,
868
+ overall_confidence: float, similar_anomalies: List,
869
+ needs_review: bool, confidence_threshold: float,
870
+ equipment_risk_assessment: Dict, enhanced_features: Dict) -> Dict:
871
+ """Format rich response for UI display"""
872
+
873
+ # Calculate additional metrics
874
+ reliability_score = self._calculate_reliability_score(
875
+ confidences, enhanced_features, equipment_risk_assessment
876
+ )
877
+
878
+ return {
879
+ 'timestamp': datetime.now().isoformat(),
880
+ 'input_description': anomaly_data.get('Description', ''),
881
+ 'input_section': anomaly_data.get('Section propriΓ©taire', ''),
882
+ 'input_equipment': anomaly_data.get('Description de l\'Γ©quipement', ''),
883
+
884
+ 'predictions': {
885
+ 'criticite_totale': total_criticality,
886
+ 'components': {
887
+ 'fiabilite_integrite': predictions['FiabilitΓ© IntΓ©gritΓ©'],
888
+ 'disponibilite': predictions['DisponibiltΓ©'],
889
+ 'process_safety': predictions['Process Safety']
890
+ }
891
+ },
892
+
893
+ 'confidence': {
894
+ 'overall_confidence': round(overall_confidence, 3),
895
+ 'reliability_score': round(reliability_score, 3),
896
+ 'component_confidence': {
897
+ 'fiabilite_integrite': round(confidences['FiabilitΓ© IntΓ©gritΓ©'], 3),
898
+ 'disponibilite': round(confidences['DisponibiltΓ©'], 3),
899
+ 'process_safety': round(confidences['Process Safety'], 3)
900
+ },
901
+ 'needs_manual_review': bool(needs_review),
902
+ 'confidence_threshold': confidence_threshold,
903
+ 'recommendation': self._get_confidence_recommendation(reliability_score)
904
+ },
905
+
906
+ 'equipment_intelligence': {
907
+ 'equipment_type': equipment_risk_assessment['equipment_type'],
908
+ 'redundancy_class': equipment_risk_assessment['redundancy_class'],
909
+ 'risk_level': equipment_risk_assessment['risk_level'],
910
+ 'business_impact': equipment_risk_assessment['business_impact'],
911
+ 'risk_factors': equipment_risk_assessment['risk_factors'],
912
+ 'base_risk_score': round(equipment_risk_assessment['base_risk_score'], 2)
913
+ },
914
+
915
+ 'safety_analysis': {
916
+ 'structural_failure_detected': bool(enhanced_features.get('has_structural_failure', 0)),
917
+ 'safety_mention_present': bool(enhanced_features.get('has_safety_mention', 0)),
918
+ 'equipment_malfunction_detected': bool(enhanced_features.get('has_equipment_malfunction', 0)),
919
+ 'escalation_detected': bool(enhanced_features.get('has_escalation', 0)),
920
+ 'safety_override_applied': any(pred > 3 for pred in predictions.values()),
921
+ 'urgency_level': self._determine_urgency_level(total_criticality, reliability_score, equipment_risk_assessment)
922
+ },
923
+
924
+ 'similar_anomalies': similar_anomalies,
925
+
926
+ 'analysis': {
927
+ 'problem_types_detected': enhanced_features.get('problem_types', []),
928
+ 'equipment_mentioned': enhanced_features.get('equipment_mentioned', []),
929
+ 'severity_score': enhanced_features.get('enhanced_severity_score', 0),
930
+ 'technical_complexity': round(enhanced_features.get('technical_complexity', 0), 2),
931
+ 'pattern_indicators': self._identify_critical_patterns(enhanced_features)
932
+ },
933
+
934
+ 'model_metadata': {
935
+ 'version': '2.0_enhanced',
936
+ 'features_used': len([k for k in enhanced_features.keys() if k != 'Description']),
937
+ 'equipment_intelligence_enabled': True,
938
+ 'safety_rules_enabled': bool(self.safety_rules)
939
+ }
940
+ }
941
+
942
+ def _calculate_reliability_score(self, confidences: Dict, enhanced_features: Dict,
943
+ equipment_risk_assessment: Dict) -> float:
944
+ """Calculate enhanced reliability score"""
945
+
946
+ # Base prediction confidence
947
+ prediction_confidence = np.mean(list(confidences.values()))
948
+
949
+ # Model agreement (lower std = higher agreement)
950
+ model_agreement = 1.0 - (np.std(list(confidences.values())) / max(np.mean(list(confidences.values())), 0.1))
951
+
952
+ # Feature completeness
953
+ has_description = len(enhanced_features.get('Description', '')) > 10
954
+ has_equipment = enhanced_features.get('equipment_type_class', 'UNKNOWN') != 'UNKNOWN'
955
+ has_section = enhanced_features.get('Section propriΓ©taire', 'Unknown') != 'Unknown'
956
+ feature_completeness = (has_description + has_equipment + has_section) / 3
957
+
958
+ # Equipment intelligence confidence boost
959
+ equipment_confidence_boost = 0.0
960
+ if equipment_risk_assessment['equipment_type'] != 'UNKNOWN':
961
+ equipment_confidence_boost = 0.1
962
+
963
+ # Pattern detection confidence
964
+ pattern_confidence = 0.0
965
+ if enhanced_features.get('has_safety_mention', 0) == 1:
966
+ pattern_confidence += 0.1
967
+ if enhanced_features.get('has_structural_failure', 0) == 1:
968
+ pattern_confidence += 0.15
969
+ if enhanced_features.get('equipment_problem_risk', 0) > 1.5:
970
+ pattern_confidence += 0.1
971
+
972
+ # Combine all factors
973
+ reliability_score = (
974
+ prediction_confidence * 0.4 +
975
+ model_agreement * 0.25 +
976
+ feature_completeness * 0.2 +
977
+ equipment_confidence_boost +
978
+ pattern_confidence
979
+ )
980
+
981
+ return min(reliability_score, 1.0)
982
+
983
+ def _get_confidence_recommendation(self, reliability_score: float) -> str:
984
+ """Get confidence-based recommendation"""
985
+ if reliability_score >= 0.85:
986
+ return "Very high confidence - Prediction highly reliable"
987
+ elif reliability_score >= 0.75:
988
+ return "High confidence - Prediction can be trusted"
989
+ elif reliability_score >= 0.65:
990
+ return "Medium confidence - Consider expert review for critical decisions"
991
+ elif reliability_score >= 0.5:
992
+ return "Low confidence - Manual review recommended"
993
+ else:
994
+ return "Very low confidence - Expert assessment required"
995
+
996
+ def _determine_urgency_level(self, total_criticality: int, reliability_score: float,
997
+ equipment_risk_assessment: Dict) -> str:
998
+ """Determine enhanced urgency level"""
999
+
1000
+ # Adjust criticality by reliability and equipment risk
1001
+ adjusted_criticality = total_criticality * reliability_score
1002
+
1003
+ # Equipment type urgency multiplier
1004
+ equipment_urgency_multiplier = 1.0
1005
+ if equipment_risk_assessment['equipment_type'] in ['COOLING_CRITICAL', 'ELECTRICAL_CRITICAL']:
1006
+ equipment_urgency_multiplier = 1.3
1007
+ elif equipment_risk_assessment['equipment_type'] in ['TURBINE_SYSTEMS']:
1008
+ equipment_urgency_multiplier = 1.2
1009
+
1010
+ final_urgency_score = adjusted_criticality * equipment_urgency_multiplier
1011
+
1012
+ if final_urgency_score >= 14:
1013
+ return "EMERGENCY - Immediate shutdown may be required"
1014
+ elif final_urgency_score >= 12:
1015
+ return "CRITICAL - Immediate action required (within 1 hour)"
1016
+ elif final_urgency_score >= 9:
1017
+ return "HIGH - Action required within 24 hours"
1018
+ elif final_urgency_score >= 6:
1019
+ return "MEDIUM - Action required within 1 week"
1020
+ else:
1021
+ return "LOW - Routine maintenance scheduling"
1022
+
1023
+ def _identify_critical_patterns(self, enhanced_features: Dict) -> List[str]:
1024
+ """Identify critical patterns in the anomaly"""
1025
+
1026
+ patterns = []
1027
+
1028
+ if enhanced_features.get('has_structural_failure', 0) == 1:
1029
+ patterns.append('Structural failure detected')
1030
+
1031
+ if enhanced_features.get('has_safety_mention', 0) == 1:
1032
+ patterns.append('Safety concern explicitly mentioned')
1033
+
1034
+ if enhanced_features.get('electrical_cooling_issue', 0) == 1:
1035
+ patterns.append('Electrical equipment cooling issue')
1036
+
1037
+ if enhanced_features.get('turbine_oil_issue', 0) == 1:
1038
+ patterns.append('Turbine lubrication system problem')
1039
+
1040
+ if enhanced_features.get('main_equipment_failure', 0) == 1:
1041
+ patterns.append('Critical single-point equipment failure')
1042
+
1043
+ if enhanced_features.get('has_escalation', 0) == 1:
1044
+ patterns.append('Problem escalation indicated')
1045
+
1046
+ if enhanced_features.get('vibration_excessive', 0) == 1:
1047
+ patterns.append('Excessive vibration detected')
1048
+
1049
+ if enhanced_features.get('temperature_elevee', 0) == 1:
1050
+ patterns.append('High temperature condition')
1051
+
1052
+ if enhanced_features.get('enhanced_severity_score', 0) >= 4:
1053
+ patterns.append('High severity language detected')
1054
+
1055
+ return patterns
1056
+
1057
+
1058
# ============== CONVENIENCE FUNCTIONS ==============

# Lazily-created process-wide singleton; populated on the first call to
# get_enhanced_ai_instance() and reused by the convenience wrappers below.
_enhanced_ai_instance = None
1062
+
1063
def get_enhanced_ai_instance():
    """Return the process-wide EnhancedAnomalyIntelligence singleton.

    The instance is constructed lazily on the first call and cached in the
    module-level ``_enhanced_ai_instance`` for all subsequent calls.
    """
    global _enhanced_ai_instance
    if _enhanced_ai_instance is None:
        _enhanced_ai_instance = EnhancedAnomalyIntelligence()
    return _enhanced_ai_instance
1069
+
1070
def predict_anomaly_single_enhanced(anomaly_data: Dict, **kwargs) -> Dict:
    """Predict a single anomaly via the shared enhanced AI instance.

    All keyword arguments are forwarded unchanged to
    ``EnhancedAnomalyIntelligence.predict_single``.
    """
    return get_enhanced_ai_instance().predict_single(anomaly_data, **kwargs)
1074
+
1075
def predict_anomaly_batch_enhanced(anomaly_list: List[Dict], **kwargs) -> List[Dict]:
    """Predict a batch of anomalies via the shared enhanced AI instance.

    All keyword arguments are forwarded unchanged to
    ``EnhancedAnomalyIntelligence.predict_batch``.
    """
    return get_enhanced_ai_instance().predict_batch(anomaly_list, **kwargs)
1079
+
1080
def process_excel_upload_enhanced(excel_data: pd.DataFrame,
                                  confidence_threshold: float = 0.7) -> pd.DataFrame:
    """
    Run enhanced AI predictions over an uploaded Excel sheet.

    Args:
        excel_data: DataFrame parsed from the uploaded Excel file.
        confidence_threshold: Confidence below which rows are flagged for review.

    Returns:
        A copy of *excel_data* extended with AI prediction columns and empty
        human-verification columns.
    """
    # Batch-predict in 'simple' format; similarity search is skipped for speed.
    predictions = predict_anomaly_batch_enhanced(
        excel_data.to_dict('records'),
        confidence_threshold=confidence_threshold,
        include_similar=False,
        format_type='simple',
        apply_safety_rules=True
    )

    result_df = excel_data.copy()

    # (column name, prediction key, default, optional cast) in output order.
    ai_columns = [
        ('AI_Predicted_Criticite', 'predicted_criticite', 0, None),
        ('AI_Predicted_Fiabilite', 'predicted_fiabilite', 0, None),
        ('AI_Predicted_Disponibilite', 'predicted_disponibilite', 0, None),
        ('AI_Predicted_Safety', 'predicted_safety', 0, None),
        ('AI_Confidence', 'ai_confidence', 0.0, None),
        ('AI_Needs_Review', 'needs_manual_review', True, bool),
        ('AI_Equipment_Type', 'equipment_type', 'UNKNOWN', None),
        ('AI_Risk_Level', 'equipment_risk_level', 'LOW', None),
        ('AI_Business_Impact', 'business_impact', 'MINOR', None),
        ('AI_Safety_Override', 'safety_override_applied', False, bool),
    ]
    for column, key, default, cast in ai_columns:
        values = [p.get(key, default) for p in predictions]
        result_df[column] = [cast(v) for v in values] if cast else values

    # Placeholders to be filled in during human verification.
    result_df['Human_Verified'] = False
    for column in ('Human_Criticite', 'Human_Fiabilite', 'Human_Disponibilite', 'Human_Safety'):
        result_df[column] = None
    result_df['Correction_Reason'] = ''
    result_df['Verified_At'] = None
    result_df['Verified_By'] = ''
    result_df['Expert_Notes'] = ''

    return result_df
1134
+
1135
+
1136
# ============== ENHANCED EXAMPLE USAGE ==============

if __name__ == "__main__":

    # Example 1: Enhanced single anomaly prediction
    print("="*70)
    print("TESTING ENHANCED SINGLE ANOMALY PREDICTION")
    print("="*70)

    single_anomaly = {
        'Description': 'SAFETY : fuite vapeur importante sur TRANSFO PRINCIPAL, température élevée detectée, vibration excessive',
        'Section propriétaire': '34EL',
        'Description de l\'équipement': 'TRANSFO PRINCIPAL'
    }

    # Rich format adds equipment intelligence, safety analysis and similar cases.
    result = predict_anomaly_single_enhanced(
        single_anomaly,
        format_type='rich',
        apply_safety_rules=True,
        include_similar=True
    )

    print("Enhanced rich format result:")
    print(f"Predicted Criticality: {result['predictions']['criticite_totale']}")
    print(f"Equipment Type: {result['equipment_intelligence']['equipment_type']}")
    print(f"Risk Level: {result['equipment_intelligence']['risk_level']}")
    print(f"Business Impact: {result['equipment_intelligence']['business_impact']}")
    print(f"Safety Override Applied: {result['safety_analysis']['safety_override_applied']}")
    print(f"Urgency Level: {result['safety_analysis']['urgency_level']}")
    print(f"Risk Factors: {result['equipment_intelligence']['risk_factors']}")

    # Example 2: Enhanced batch processing
    print("\n" + "="*70)
    print("TESTING ENHANCED BATCH PREDICTION")
    print("="*70)

    batch_anomalies = [
        {
            'Description': 'vibration excessive ALTERNATEUR, bruit anormal détecté',
            'Section propriétaire': '34EL',
            'Description de l\'équipement': 'ALTERNATEUR'
        },
        {
            'Description': 'fuite huile système hydraulique TURBINE, pression basse',
            'Section propriétaire': '34MM',
            'Description de l\'équipement': 'TURBINE'
        },
        {
            'Description': 'maintenance préventive DECRASSEUR à prévoir',
            'Section propriétaire': '34MC',
            'Description de l\'équipement': 'DECRASSEUR'
        },
        {
            'Description': 'percement conduite vapeur VENTILATEUR DE REFROIDISSEMENT TP',
            'Section propriétaire': '34EL',
            'Description de l\'équipement': 'VENTILATEUR DE REFROIDISSEMENT TP'
        }
    ]

    # Simple format is the flat layout used for database inserts.
    batch_results = predict_anomaly_batch_enhanced(
        batch_anomalies,
        confidence_threshold=0.7,
        format_type='simple',
        apply_safety_rules=True
    )

    print("Enhanced batch results:")
    for i, result in enumerate(batch_results):
        print(f"\nAnomaly {i+1}:")
        print(f" Equipment Type: {result.get('equipment_type', 'N/A')}")
        print(f" Criticité: {result.get('predicted_criticite', 'N/A')}")
        print(f" Risk Level: {result.get('equipment_risk_level', 'N/A')}")
        print(f" Business Impact: {result.get('business_impact', 'N/A')}")
        print(f" Confidence: {result.get('ai_confidence', 'N/A')}")
        print(f" Safety Override: {result.get('safety_override_applied', 'N/A')}")
        print(f" Needs Review: {result.get('needs_manual_review', 'N/A')}")

    # Example 3: Enhanced Excel processing simulation
    print("\n" + "="*70)
    print("TESTING ENHANCED EXCEL PROCESSING")
    print("="*70)

    # Simulate Excel data with various equipment types
    excel_df = pd.DataFrame([
        {
            'Description': 'problème refroidissement TRANSFO PRINCIPAL',
            'Section propriétaire': '34EL',
            'Description de l\'équipement': 'TRANSFO PRINCIPAL',
            'Date de détéction de l\'anomalie': '2025-01-15'
        },
        {
            'Description': 'SAFETY : éclatement tube chaudière, fissure détectée',
            'Section propriétaire': '34MD',
            'Description de l\'équipement': 'CHAUDIERE',
            'Date de détéction de l\'anomalie': '2025-01-16'
        },
        {
            'Description': 'maintenance POMPE A prévoir',
            'Section propriétaire': '34MC',
            'Description de l\'équipement': 'POMPE',
            'Date de détéction de l\'anomalie': '2025-01-17'
        }
    ])

    processed_df = process_excel_upload_enhanced(excel_df, confidence_threshold=0.7)

    print("Enhanced processed Excel columns:")
    enhanced_columns = [col for col in processed_df.columns if col.startswith('AI_')]
    print(enhanced_columns)

    print("\nSample of enhanced processed data:")
    display_cols = ['Description', 'AI_Predicted_Criticite', 'AI_Equipment_Type',
                    'AI_Risk_Level', 'AI_Business_Impact', 'AI_Safety_Override', 'AI_Needs_Review']
    print(processed_df[display_cols].to_string(index=False))

    print("\n" + "🎯" + "="*68)
    print("ENHANCED ANOMALY INTELLIGENCE v2.0 TESTS COMPLETED SUCCESSFULLY!")
    print("="*70)
    print("✓ Equipment Intelligence Integration")
    print("✓ Safety Override Rules")
    print("✓ Enhanced Risk Assessment")
    print("✓ Conservative Prediction Bias")
    print("✓ Business Impact Analysis")
    print("✓ Production-Ready Performance")
    print("="*70)
descritption_v2.py ADDED
@@ -0,0 +1,942 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# enhanced_data_processing_v2.py
# TAQATHON 2025 - Enhanced Data Processing with Equipment Intelligence
# Incorporates dual-field analysis + equipment criticality patterns from analysis

import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("TAQATHON 2025 - ENHANCED DATA PROCESSING v2.0")
print("Equipment Intelligence + Dual-Field Analysis + Noise-Robust Features")
print("="*70)

# ============== STEP 1: LOAD DATA AND BASIC SETUP ==============
print("\n" + "="*50)
print("STEP 1: LOADING DATA AND BASIC SETUP")
print("="*50)

# Load the anomaly dataset from the 'Oracle' sheet of the workbook.
try:
    df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle')
    print(f"✓ Successfully loaded dataset: {df.shape}")
except FileNotFoundError:
    print("❌ Error: Taqathon_data.xlsx not found!")
    # NOTE(review): exit() relies on the site module; sys.exit(1) is more robust.
    exit(1)

print("Columns:", df.columns.tolist())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Clean data: rows without both text fields cannot be analysed downstream.
df = df.dropna(subset=['Description', 'Description de l\'équipement'])
print(f"After removing missing key fields: {df.shape}")

# Convert date column to datetime
df['Date de détéction de l\'anomalie'] = pd.to_datetime(df['Date de détéction de l\'anomalie'])

# Remove exact duplicate rows
df = df.drop_duplicates()
print(f"After removing duplicates: {df.shape}")
49
+
50
# ============== STEP 2: EQUIPMENT INTELLIGENCE SETUP ==============
print("\n" + "="*50)
print("STEP 2: EQUIPMENT INTELLIGENCE CLASSIFICATION")
print("="*50)

# Based on our analysis - Equipment Type Criticality Scores.
# Each category maps matching keywords to a base criticality score;
# NOTE(review): scores presumably derived from the historical criticality
# analysis -- confirm before retuning.
EQUIPMENT_TYPE_SCORES = {
    # High-risk electrical equipment (8.0+ avg criticality)
    'ELECTRICAL_CRITICAL': {
        'keywords': ['ALTERNATEUR', 'TRANSFO PRINCIPAL', 'PROTECTION ALTERNATEUR'],
        'score': 8.0
    },
    # Turbine and power generation systems (7.0+ avg)
    'TURBINE_SYSTEMS': {
        'keywords': ['TURBINE', 'SOUPAPE REGULATRICE', 'REFRIGERANT HUILE', 'POMPE DE SOULÈVEMENT'],
        'score': 7.5
    },
    # Cooling and ventilation systems (7.5+ avg for critical cooling)
    'COOLING_CRITICAL': {
        'keywords': ['VENTILATEUR DE REFROIDISSEMENT', 'REFROIDISSEMENT TP', 'MOTEUR VENTILATEUR DE REFROIDISSEMENT'],
        'score': 7.5
    },
    # Standard electrical equipment (6.0-7.0 avg)
    'ELECTRICAL_STANDARD': {
        'keywords': ['DISJONCTEUR', 'TRANSFORMATEUR', 'MOTEUR', 'ARMOIRE', 'GROUPE'],
        'score': 6.5
    },
    # Heating systems (6.0+ avg)
    'HEATING_SYSTEMS': {
        'keywords': ['RECHAUFFEUR', 'RÉCHAUFFEUR', 'CHAUDIERE', 'CHAUDIÈRE'],
        'score': 6.5
    },
    # Ventilation systems (6.0+ avg)
    'VENTILATION_SYSTEMS': {
        'keywords': ['VENTILATEUR', 'TIRAGE', 'SOUFFLAGE', 'AIR PRIMAIRE', 'AIR SECONDAIRE'],
        'score': 6.0
    },
    # Process systems (5.5+ avg)
    'PROCESS_SYSTEMS': {
        'keywords': ['POMPE', 'SOUPAPE', 'VANNE', 'CONVOYEUR', 'BROYEUR', 'COAL FEEDER'],
        'score': 5.5
    },
    # Auxiliary/maintenance systems (5.0+ avg)
    'AUXILIARY_SYSTEMS': {
        'keywords': ['DECRASSEUR', 'DÉGRILLEUR', 'FILTRE', 'CAPTEUR', 'TRANSMETTEUR'],
        'score': 5.0
    }
}

# Redundancy detection patterns (from analysis).
# Order matters: classes are tried top-to-bottom and the first regex hit wins.
REDUNDANCY_PATTERNS = {
    'SINGLE_CRITICAL': {
        # The third pattern is a catch-all of negative lookaheads (no A/B or
        # numeric suffix) -- it matches most names not claimed by the suffixes.
        'patterns': [r'PRINCIPAL', r'UNIQUE', r'^(?!.*[AB]$)(?!.*N°[0-9])(?!.*[0-9]$)'],
        'multiplier': 1.3
    },
    'DUAL_SYSTEM': {
        'patterns': [r'\b[AB]$', r'N°[12]$', r'PRIMAIRE$', r'SECONDAIRE$'],
        'multiplier': 1.0
    },
    'MULTIPLE_SYSTEM': {
        'patterns': [r'N°[3-9]$', r'N°[0-9][0-9]$'],
        'multiplier': 0.8
    }
}

# Section risk multipliers (from analysis)
SECTION_RISK_MULTIPLIERS = {
    '34EL': 1.2,  # Electrical - highest critical case rate
    '34MM': 1.1,  # Mechanical - high turbine/oil systems
    '34MD': 1.1,  # Medium risk
    '34MC': 1.0,  # Lower critical case rate
    '34CT': 1.0   # Control systems
}
123
+
124
def classify_equipment_type(equipment_desc):
    """Return (category, base criticality score) for an equipment description.

    Categories and scores come from EQUIPMENT_TYPE_SCORES; the first keyword
    match wins (dict insertion order), and unmatched equipment falls back to
    ('UNKNOWN', 4.5).
    """
    text = str(equipment_desc).upper()

    for category, info in EQUIPMENT_TYPE_SCORES.items():
        if any(keyword in text for keyword in info['keywords']):
            return category, info['score']

    return 'UNKNOWN', 4.5
134
+
135
def detect_equipment_redundancy(equipment_desc):
    """Return (redundancy class, criticality multiplier) for an equipment name.

    Classes are tried in REDUNDANCY_PATTERNS insertion order and the first
    regex hit wins; names matching nothing fall back to
    ('UNKNOWN_REDUNDANCY', 1.0).

    NOTE(review): SINGLE_CRITICAL's third pattern is a broad negative-lookahead
    catch-all, so names such as '... PRIMAIRE' can be classified SINGLE_CRITICAL
    before the DUAL_SYSTEM patterns are ever tried -- confirm this ordering is
    intended.
    """
    equipment_upper = str(equipment_desc).upper()

    for redundancy_class, info in REDUNDANCY_PATTERNS.items():
        for pattern in info['patterns']:
            if re.search(pattern, equipment_upper):
                return redundancy_class, info['multiplier']

    return 'UNKNOWN_REDUNDANCY', 1.0
145
+
146
# Apply equipment intelligence
# Derives per-row equipment features on the shared DataFrame `df`:
# type class + base criticality, redundancy class + multiplier, a section
# multiplier, and their product as a combined equipment risk score.
print("Applying equipment intelligence classification...")

# Equipment type classification: each apply() returns a (class, score) tuple,
# unpacked into two columns below.
equipment_classifications = df['Description de l\'Γ©quipement'].apply(classify_equipment_type)
df['equipment_type_class'] = [x[0] for x in equipment_classifications]
df['equipment_base_criticality'] = [x[1] for x in equipment_classifications]

# Equipment redundancy detection: same tuple-unpacking pattern.
redundancy_classifications = df['Description de l\'Γ©quipement'].apply(detect_equipment_redundancy)
df['equipment_redundancy_class'] = [x[0] for x in redundancy_classifications]
df['equipment_redundancy_multiplier'] = [x[1] for x in redundancy_classifications]

# Section risk multiplier; sections missing from the table default to 1.0.
df['section_risk_multiplier'] = df['Section propriΓ©taire'].map(SECTION_RISK_MULTIPLIERS).fillna(1.0)

# Combined equipment risk score = base criticality scaled by both multipliers.
df['equipment_risk_score'] = (df['equipment_base_criticality'] *
                              df['equipment_redundancy_multiplier'] *
                              df['section_risk_multiplier'])

print("βœ“ Equipment intelligence classification completed")
print(f"Equipment type distribution:")
print(df['equipment_type_class'].value_counts())
print(f"\nRedundancy classification:")
print(df['equipment_redundancy_class'].value_counts())
172
+
173
# ============== STEP 3: DUAL-FIELD TEXT ANALYSIS ==============
# Builds a combined anomaly + equipment text field plus simple length/word
# count statistics for each field and for the combination.
print("\n" + "="*50)
print("STEP 3: DUAL-FIELD TEXT ANALYSIS")
print("="*50)

# Create combined text field for comprehensive analysis
# (NaN fields become '' so concatenation never produces NaN).
df['combined_text'] = df['Description'].fillna('') + ' ' + df['Description de l\'Γ©quipement'].fillna('')
df['combined_text_lower'] = df['combined_text'].str.lower()

# Basic text features for both fields
# NOTE(review): lengths on the raw (non-filled) columns can be NaN for
# missing descriptions, unlike combined_* which are always numeric.
df['description_length'] = df['Description'].str.len()
df['description_word_count'] = df['Description'].str.split().str.len()
df['equipment_desc_length'] = df['Description de l\'Γ©quipement'].str.len()
df['equipment_desc_word_count'] = df['Description de l\'Γ©quipement'].str.split().str.len()
df['combined_length'] = df['combined_text'].str.len()
df['combined_word_count'] = df['combined_text'].str.split().str.len()

print(f"Text analysis completed:")
print(f"Average description length: {df['description_length'].mean():.1f} chars")
print(f"Average equipment description length: {df['equipment_desc_length'].mean():.1f} chars")
print(f"Average combined length: {df['combined_length'].mean():.1f} chars")
194
+
195
# ============== STEP 4: ENHANCED KEYWORD EXTRACTION ==============
print("\n" + "="*50)
print("STEP 4: ENHANCED KEYWORD EXTRACTION (DUAL-FIELD)")
print("="*50)

# Enhanced equipment keywords (from analysis + original)
# Category -> lower-case surface variants matched as substrings of the
# combined (description + equipment description) text.
equipment_keywords = {
    'pompe': ['pompe', 'pompes'],
    'vanne': ['vanne', 'vannes'],
    'ventilateur': ['ventilateur', 'ventilateurs', 'ventilo'],
    'moteur': ['moteur', 'moteurs', 'moto'],
    'alternateur': ['alternateur', 'alternateurs'],  # HIGH RISK
    'transformateur': ['transformateur', 'transformateurs', 'transfo'],  # HIGH RISK
    'turbine': ['turbine', 'turbines'],  # HIGH RISK
    'chaudière': ['chaudière', 'chaudières', 'chaudiere'],
    'rΓ©chauffeur': ['rΓ©chauffeur', 'rΓ©chauffeurs', 'rechauffeur'],
    'refroidissement': ['refroidissement', 'refroidisseur', 'refrigerant', 'rΓ©frigΓ©rant'],  # HIGH RISK
    'compresseur': ['compresseur', 'compresseurs'],
    'soupape': ['soupape', 'soupapes'],
    'dΓ©crasseur': ['dΓ©crasseur', 'dΓ©crasseurs', 'decrasseur'],
    'principal': ['principal', 'principale'],  # SINGLE CRITICAL
    'groupe': ['groupe', 'groupes'],  # HIGH RISK
    'protection': ['protection', 'protections'],
    'armoire': ['armoire', 'armoires'],
    'disjoncteur': ['disjoncteur', 'disjoncteurs']
}
221
+
222
# Enhanced problem keywords (from critical case analysis)
# Failure-mode category -> surface variants; used for substring matching on
# the combined text by extract_keywords_dual_field().
problem_keywords = {
    'fuite': ['fuite', 'fuites', 'fuit', 'fuyant'],
    'vibration': ['vibration', 'vibrations', 'vibre'],
    'bruit_anormal': ['bruit anormal', 'bruit anormale'],  # SPECIFIC PATTERN
    'percement': ['percement', 'percΓ©', 'percΓ©e'],  # CRITICAL FAILURE
    'Γ©clatement': ['Γ©clatement', 'eclatement'],  # CRITICAL FAILURE
    'fissure': ['fissure', 'fissurΓ©', 'fissures'],  # STRUCTURAL FAILURE
    'aggravation': ['aggravation'],  # ESCALATION INDICATOR
    'sifflement': ['sifflement', 'siffler'],  # PRESSURE ISSUE
    'dΓ©faillance': ['dΓ©faillance', 'dΓ©faillant'],
    'dysfonctionnement': ['dysfonctionnement', 'dysfonctionnel'],
    'sens_inverse': ['sens inverse', 'sens contraire'],  # CRITICAL MALFUNCTION
    'dΓ©tachΓ©s': ['dΓ©tachΓ©s', 'dΓ©tachΓ©', 'detaches'],
    'corrosion': ['corrosion', 'corrodΓ©', 'rouille'],
    'usure': ['usure', 'usΓ©', 'usΓ©e'],
    'surchauffe': ['surchauffe', 'surchauffΓ©', 'tempΓ©rature Γ©levΓ©e', 'temp elevee'],
    'blocage': ['blocage', 'bloquΓ©', 'bloque', 'coincΓ©'],
    'dΓ©gradation': ['dΓ©gradation', 'dΓ©gradΓ©'],
    'obstruction': ['obstruction', 'obstruΓ©', 'bouchΓ©', 'bouchage']
}
243
+
244
# Enhanced action keywords
# Maintenance-action category -> surface variants (substring matched).
action_keywords = {
    'remplacement': ['remplacement', 'remplacer', 'remplacΓ©', 'changement', 'changer'],
    'rΓ©paration': ['rΓ©paration', 'rΓ©parer', 'rΓ©parΓ©'],
    'maintenance': ['maintenance', 'entretien'],
    'prΓ©vision': ['prΓ©voir', 'prΓ©voire', 'prevoir'],  # MAINTENANCE PLANNING
    'soufflage': ['soufflage', 'souffler', 'soufflΓ©'],
    'nettoyage': ['nettoyage', 'nettoyer', 'nettoyΓ©'],
    'dΓ©bouchage': ['dΓ©bouchage', 'dΓ©boucher'],
    'inspection': ['inspection', 'inspecter', 'contrΓ΄le', 'contrΓ΄ler'],
    'rΓ©vision': ['rΓ©vision', 'rΓ©viser'],
    'remise_Γ©tat': ['remise en Γ©tat', 'remise Γ©tat']
}
257
+
258
# SAFETY and urgency indicators (enhanced)
# Urgency/severity category -> surface variants. Presence of any category
# sets has_urgency; it does not directly set a criticality level.
urgency_keywords = {
    'safety': ['safety', 'sΓ©curitΓ©'],  # BUT NOT AUTOMATIC HIGH CRITICALITY
    'urgent': ['urgent', 'urgence'],
    'critique': ['critique', 'critiques'],
    'important': ['important', 'importante'],
    'immΓ©diat': ['immΓ©diat', 'immΓ©diatement'],
    'prioritaire': ['prioritaire', 'prioritΓ©'],
    'grave': ['grave', 'graves'],
    'majeur': ['majeur', 'majeure'],
    'dangereux': ['dangereux', 'dangereuse', 'danger'],
    'risque': ['risque', 'risques', 'risquΓ©'],
    'chute': ['chute', 'tomber'],
    'frΓ©quent': ['frΓ©quent', 'frΓ©quente', 'rΓ©pΓ©titif', 'rΓ©pΓ©titive']
}
273
+
274
def extract_keywords_dual_field(description, equipment_desc, keyword_dict):
    """Collect keyword categories present in either text field.

    Both fields are str()-coerced, joined with a space and lower-cased; a
    category is reported once, in dict order, as soon as any of its keyword
    variants occurs as a substring of the combined text.
    """
    haystack = ' '.join((str(description), str(equipment_desc))).lower()
    return [
        category
        for category, keywords in keyword_dict.items()
        if any(keyword in haystack for keyword in keywords)
    ]
286
+
287
# Apply enhanced keyword extraction
# For each keyword family, store the list of matched categories and its
# count; all four families scan both the anomaly and equipment descriptions.
print("Extracting enhanced keywords from both fields...")

# Equipment mentions (dual-field)
df['equipment_mentioned'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'Γ©quipement'], equipment_keywords),
    axis=1
)
# .str.len() on a list-valued column yields the list length per row.
df['equipment_count'] = df['equipment_mentioned'].str.len()

# Problem types (dual-field)
df['problem_types'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'Γ©quipement'], problem_keywords),
    axis=1
)
df['problem_count'] = df['problem_types'].str.len()

# Actions mentioned (dual-field)
df['actions_mentioned'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'Γ©quipement'], action_keywords),
    axis=1
)
df['action_count'] = df['actions_mentioned'].str.len()

# Urgency indicators (dual-field); has_urgency is a boolean flag.
df['urgency_indicators'] = df.apply(
    lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'Γ©quipement'], urgency_keywords),
    axis=1
)
df['has_urgency'] = df['urgency_indicators'].str.len() > 0

print(f"βœ“ Enhanced keyword extraction completed")
319
+
320
# ============== STEP 5: CRITICAL FAILURE PATTERN DETECTION ==============
# Regex-based 0/1 indicator columns on the lower-cased combined text, plus
# three compound equipment+text risk combinations.
print("\n" + "="*50)
print("STEP 5: CRITICAL FAILURE PATTERN DETECTION")
print("="*50)

# Structural failure indicators (highest severity)
df['has_structural_failure'] = df['combined_text_lower'].str.contains(
    'percement|Γ©clatement|eclatement|fissure|rupture', regex=True, na=False
).astype(int)

# Equipment malfunction indicators
df['has_equipment_malfunction'] = df['combined_text_lower'].str.contains(
    'sens inverse|dysfonctionnement|dΓ©faillance|dΓ©faut|panne', regex=True, na=False
).astype(int)

# Escalation indicators
df['has_escalation'] = df['combined_text_lower'].str.contains(
    'aggravation|empirΓ©|empire', regex=True, na=False
).astype(int)

# Safety indicators (but not automatic high criticality)
# Note: checks the raw Description only, case-insensitively.
df['has_safety_mention'] = df['Description'].str.contains('SAFETY', case=False, na=False).astype(int)

# Specific high-risk combinations (from critical case analysis)
df['electrical_cooling_issue'] = (
    (df['equipment_type_class'].isin(['ELECTRICAL_CRITICAL', 'ELECTRICAL_STANDARD'])) &
    (df['combined_text_lower'].str.contains('refroidissement|ventilateur|tempΓ©rature', regex=True, na=False))
).astype(int)

df['turbine_oil_issue'] = (
    (df['equipment_type_class'] == 'TURBINE_SYSTEMS') &
    (df['combined_text_lower'].str.contains('huile|fuite|graissage', regex=True, na=False))
).astype(int)

# Non-redundant ("single critical") equipment with structural damage.
df['main_equipment_failure'] = (
    (df['equipment_redundancy_class'] == 'SINGLE_CRITICAL') &
    (df['has_structural_failure'] == 1)
).astype(int)

print(f"Critical failure patterns detected:")
print(f"Structural failures: {df['has_structural_failure'].sum()}")
print(f"Equipment malfunctions: {df['has_equipment_malfunction'].sum()}")
print(f"Escalation indicators: {df['has_escalation'].sum()}")
print(f"Electrical cooling issues: {df['electrical_cooling_issue'].sum()}")
print(f"Turbine oil issues: {df['turbine_oil_issue'].sum()}")
print(f"Main equipment failures: {df['main_equipment_failure'].sum()}")
366
+
367
# ============== STEP 6: ENHANCED COMPOUND FEATURES ==============
# More regex indicator columns: leak sub-types, noise/vibration, temperature,
# planning/recurrence cues, and technical-detail markers.
print("\n" + "="*50)
print("STEP 6: ENHANCED COMPOUND FEATURES")
print("="*50)

# Specific leak types (from original analysis)
# 'fuite.*vapeur|vapeur.*fuite' accepts either word order within the text.
df['fuite_vapeur'] = df['combined_text_lower'].str.contains('fuite.*vapeur|vapeur.*fuite', regex=True, na=False).astype(int)
df['fuite_huile'] = df['combined_text_lower'].str.contains('fuite.*huile|huile.*fuite', regex=True, na=False).astype(int)
df['fuite_eau'] = df['combined_text_lower'].str.contains('fuite.*eau|eau.*fuite', regex=True, na=False).astype(int)

# Enhanced vibration/noise detection
df['bruit_anormal'] = df['combined_text_lower'].str.contains('bruit anormal', regex=True, na=False).astype(int)
df['vibration_excessive'] = df['combined_text_lower'].str.contains(
    'vibration.*excessive|vibration.*Γ©levΓ©e|vibration.*haute', regex=True, na=False
).astype(int)

# Temperature issues
df['temperature_elevee'] = df['combined_text_lower'].str.contains(
    'tempΓ©rature Γ©levΓ©e|temp Γ©levΓ©e|temp elevee|surchauffe', regex=True, na=False
).astype(int)

# Maintenance prediction indicators
df['maintenance_planning'] = df['combined_text_lower'].str.contains(
    'prΓ©voir|prΓ©voire|planifier|programmer', regex=True, na=False
).astype(int)

# Recurring issue indicators
df['is_recurring'] = df['combined_text_lower'].str.contains(
    'frΓ©quent|rΓ©pΓ©titif|souvent|plusieurs fois|encore', regex=True, na=False
).astype(int)

# Measurements and technical details
df['has_measurements'] = df['combined_text_lower'].str.contains(
    r'\d+\s*Β°c|\d+\s*bar|\d+\s*%|\d+\s*mm|\d+\s*m3', regex=True, na=False
).astype(int)

# NOTE(review): this pattern is matched against LOWER-CASED text, so the
# [A-Z] classes can never match letters — in practice only digit runs of 5+
# are detected. Adding case=False (or [a-z0-9]) would restore the intended
# alphanumeric-code detection; left unchanged here to preserve behavior.
df['has_equipment_codes'] = df['combined_text_lower'].str.contains(
    r'[A-Z0-9]{5,}|[0-9]{2}[A-Z]{3}[0-9]{2}', regex=True, na=False
).astype(int)

# Equipment location indicators
df['has_location_details'] = df['combined_text_lower'].str.contains(
    'niveau|angle|cΓ΄tΓ©|cotΓ©|palier|entrΓ©e|sortie|amont|aval', regex=True, na=False
).astype(int)
411
+
412
# ============== STEP 7: ADVANCED SEVERITY SCORING ==============
print("\n" + "="*50)
print("STEP 7: ADVANCED SEVERITY SCORING")
print("="*50)

# Enhanced severity word scoring (from critical case analysis)
# Word (lower-case) -> severity weight 1..5. The score of a text is the
# maximum matched weight, with a +0.5 bonus when several words match
# (see calculate_enhanced_severity_score).
severity_words = {
    'critique': 4, 'critiques': 4,
    'grave': 4, 'graves': 4,
    'majeur': 4, 'majeure': 4,
    'important': 3, 'importante': 3,
    'total': 5, 'totale': 5,
    'complet': 5, 'complète': 5,
    'rupture': 5, 'Γ©clatement': 5, 'eclatement': 5,
    'percement': 5, 'fissure': 4,
    'aggravation': 4,
    'sifflement': 3,
    'sens inverse': 5,
    'dysfonctionnement': 3,
    'dΓ©faillance': 3,
    'urgent': 3, 'urgence': 3,
    'immΓ©diat': 3, 'immΓ©diatement': 3,
    'dangereux': 4, 'dangereuse': 4,
    # Low-severity words intentionally score 1 so that benign wording
    # still produces a non-zero signal.
    'léger': 1, 'légère': 1,
    'faible': 1, 'petit': 1, 'petite': 1,
    'normal': 1, 'normale': 1
}
439
+
440
def calculate_enhanced_severity_score(text, severity_map=None):
    """Score text severity from a weighted severity-word table.

    Parameters
    ----------
    text : str-like
        Free text; coerced with str() and lower-cased before matching.
    severity_map : dict[str, int] | None
        Word -> weight table. Defaults to the module-level ``severity_words``
        (parameter added for testability/reuse; existing single-argument
        callers are unaffected).

    Returns
    -------
    float
        The highest matched weight, plus 0.5 when more than one distinct
        severity word is present; 0 when nothing matches.
    """
    if severity_map is None:
        severity_map = severity_words
    text = str(text).lower()
    max_score = 0
    word_count = 0

    for word, weight in severity_map.items():
        # Whole-word match. The previous substring test produced false
        # positives such as 'normal' inside 'anormal' and 'total' inside
        # 'totale'; \b-anchoring restores the intended word-level matching
        # (multi-word entries like 'sens inverse' still match).
        if re.search(r'\b' + re.escape(word) + r'\b', text):
            max_score = max(max_score, weight)
            word_count += 1

    # Bonus for multiple severity indicators
    if word_count > 1:
        max_score += 0.5

    return max_score
456
+
457
# Per-row severity score from the weighted severity-word table.
df['enhanced_severity_score'] = df['combined_text_lower'].apply(calculate_enhanced_severity_score)
458
+
459
# Equipment-Problem Risk Matrix
def calculate_equipment_problem_risk(equipment_type, problem_types, has_structural):
    """Compound risk from equipment class, problem keywords and structural damage.

    Starts from a base determined by the equipment class, doubles it on
    structural failure, applies per-problem multipliers, and caps the
    result at 3.0.
    """
    # Base risk tier by equipment class.
    if equipment_type in ('ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS', 'COOLING_CRITICAL'):
        risk = 1.5
    elif equipment_type in ('ELECTRICAL_STANDARD', 'HEATING_SYSTEMS'):
        risk = 1.2
    else:
        risk = 1.0

    # Structural failure on any equipment is serious.
    if has_structural:
        risk *= 2.0

    # Specific problem type multipliers (order preserved for float stability).
    for problem, factor in (('vibration', 1.3), ('fuite', 1.2), ('bruit_anormal', 1.2)):
        if problem in problem_types:
            risk *= factor

    return min(risk, 3.0)  # Cap at 3.0
483
+
484
# Row-wise compound equipment/problem risk.
df['equipment_problem_risk'] = df.apply(
    lambda row: calculate_equipment_problem_risk(
        row['equipment_type_class'],
        row['problem_types'],
        row['has_structural_failure']
    ), axis=1
)

# Complexity indicators
# Additive proxy for how technically detailed a report is: normalized word
# count plus the keyword/marker counts produced in earlier steps.
df['technical_complexity'] = (
    df['combined_word_count'] / 15 +  # Normalized word count
    df['equipment_count'] +
    df['problem_count'] +
    df['has_measurements'] +
    df['has_equipment_codes'] +
    df['has_location_details']
)

print(f"βœ“ Advanced severity scoring completed")
print(f"Enhanced severity score distribution:")
print(df['enhanced_severity_score'].value_counts().sort_index())
505
+
506
# ============== STEP 8: NOISE-ROBUST LABEL ANALYSIS ==============
# Flags criticality labels that look inconsistent with similar anomalies
# and derives a per-row label confidence weight.
print("\n" + "="*50)
print("STEP 8: NOISE-ROBUST LABEL ANALYSIS")
print("="*50)

# Identify potentially noisy labels
512
def identify_label_inconsistencies(df, similarity_threshold=0.8):
    """Flag criticality labels that are statistical outliers among peers.

    Rows are grouped by equipment type class, redundancy class and owning
    section. Inside any group of at least 3 rows whose criticality standard
    deviation exceeds 3.0, rows more than 2 sigma from the group mean are
    reported as dicts (index, criticality, expected_range, z_score, group).

    NOTE(review): ``similarity_threshold`` is accepted for interface
    stability but currently unused.
    """
    grouped = df.groupby([
        'equipment_type_class',
        'equipment_redundancy_class',
        'Section propriΓ©taire'
    ])

    flagged = []
    for group_key, members in grouped:
        if len(members) < 3:
            continue  # too few rows to call anything an outlier
        crit_std = members['CriticitΓ©'].std()
        crit_mean = members['CriticitΓ©'].mean()
        if not crit_std > 3.0:
            continue  # low variance: labels are mutually consistent
        for idx, row in members.iterrows():
            # +0.1 keeps the denominator safely away from zero.
            z_score = abs(row['CriticitΓ©'] - crit_mean) / (crit_std + 0.1)
            if z_score > 2.0:
                flagged.append({
                    'index': idx,
                    'criticality': row['CriticitΓ©'],
                    'expected_range': f"{crit_mean - crit_std:.1f}-{crit_mean + crit_std:.1f}",
                    'z_score': z_score,
                    'group': group_key,
                })

    return flagged
542
+
543
# Mark outlier rows in-place: 1 in 'potentially_mislabeled' for every index
# reported by the inconsistency scan, 0 otherwise.
inconsistent_labels = identify_label_inconsistencies(df)
df['potentially_mislabeled'] = 0
if inconsistent_labels:
    inconsistent_indices = [case['index'] for case in inconsistent_labels]
    df.loc[inconsistent_indices, 'potentially_mislabeled'] = 1

print(f"Identified {len(inconsistent_labels)} potentially inconsistent labels")
print(f"Percentage of potentially noisy labels: {len(inconsistent_labels)/len(df)*100:.2f}%")

# Create label confidence scores
553
def calculate_label_confidence(row):
    """Heuristic confidence in a row's criticality label, capped at 1.0.

    Penalizes rows flagged as potentially mislabeled; rewards labels that
    agree with the equipment risk score (gap <= 2) and penalizes large
    disagreements (gap > 5).
    """
    confidence = 1.0

    # Statistical outlier within its peer group -> lower trust.
    if row['potentially_mislabeled']:
        confidence *= 0.6

    # Compare the label against what the equipment profile would predict.
    gap = abs(row['CriticitΓ©'] - row['equipment_risk_score'])
    if gap <= 2:
        confidence *= 1.2
    elif gap > 5:
        confidence *= 0.8

    return min(confidence, 1.0)
573
+
574
# Per-row confidence weight (usable later as a training sample weight).
df['label_confidence'] = df.apply(calculate_label_confidence, axis=1)

print(f"Label confidence distribution:")
print(f"High confidence (>0.9): {(df['label_confidence'] > 0.9).sum()}")
print(f"Medium confidence (0.7-0.9): {((df['label_confidence'] > 0.7) & (df['label_confidence'] <= 0.9)).sum()}")
print(f"Low confidence (<0.7): {(df['label_confidence'] <= 0.7).sum()}")
580
+
581
# ============== STEP 9: CORRELATION ANALYSIS ==============
# Pearson correlation of each engineered feature against CriticitΓ©.
print("\n" + "="*50)
print("STEP 9: ENHANCED FEATURE CORRELATION ANALYSIS")
print("="*50)

# Enhanced feature list
enhanced_features = [
    'equipment_risk_score', 'equipment_base_criticality', 'equipment_redundancy_multiplier',
    'section_risk_multiplier', 'enhanced_severity_score', 'equipment_problem_risk',
    'technical_complexity', 'has_structural_failure', 'has_equipment_malfunction',
    'has_escalation', 'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',
    'combined_word_count', 'equipment_count', 'problem_count', 'action_count',
    'has_urgency', 'bruit_anormal', 'vibration_excessive', 'temperature_elevee',
    'fuite_vapeur', 'fuite_huile', 'maintenance_planning', 'is_recurring',
    'has_measurements', 'has_equipment_codes', 'has_location_details', 'has_safety_mention'
]

# NOTE(review): target_cols is defined here but only 'CriticitΓ©' is used below.
target_cols = ['FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety', 'CriticitΓ©']

print("\nTop correlations with CriticitΓ©:")
correlations = []
for feature in enhanced_features:
    if feature in df.columns:
        corr = df[feature].corr(df['CriticitΓ©'])
        correlations.append({'Feature': feature, 'Correlation': corr})

# Sort by absolute correlation so strong negative signals rank as well.
correlation_df = pd.DataFrame(correlations).sort_values('Correlation', key=abs, ascending=False)
print(correlation_df.head(15).to_string(index=False))
609
+
610
# ============== STEP 10: SAVE ENHANCED DATASET ==============
# Writes the selected original + engineered columns to CSV, tolerating
# columns that were not produced (they are dropped with a warning).
print("\n" + "="*50)
print("STEP 10: SAVING ENHANCED DATASET")
print("="*50)

# Select final feature columns
final_columns = [
    # Original columns
    'Num_equipement', 'Systeme', 'Description', 'Date de dΓ©tΓ©ction de l\'anomalie',
    'Description de l\'Γ©quipement', 'Section propriΓ©taire',
    'FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety', 'CriticitΓ©',

    # Equipment Intelligence Features
    'equipment_type_class', 'equipment_base_criticality', 'equipment_redundancy_class',
    'equipment_redundancy_multiplier', 'section_risk_multiplier', 'equipment_risk_score',

    # Text Analysis Features
    'combined_text', 'description_length', 'description_word_count',
    'equipment_desc_length', 'equipment_desc_word_count', 'combined_length', 'combined_word_count',

    # Enhanced Keyword Features
    'equipment_mentioned', 'equipment_count', 'problem_types', 'problem_count',
    'actions_mentioned', 'action_count', 'urgency_indicators', 'has_urgency',

    # Critical Failure Features
    'has_structural_failure', 'has_equipment_malfunction', 'has_escalation', 'has_safety_mention',
    'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',

    # Compound Features
    'fuite_vapeur', 'fuite_huile', 'fuite_eau', 'bruit_anormal', 'vibration_excessive',
    'temperature_elevee', 'maintenance_planning', 'is_recurring',

    # Technical Features
    'has_measurements', 'has_equipment_codes', 'has_location_details',

    # Advanced Features
    'enhanced_severity_score', 'equipment_problem_risk', 'technical_complexity',

    # Noise-Robust Features
    'potentially_mislabeled', 'label_confidence'
]

# Ensure all columns exist (missing ones are reported, not fatal).
available_columns = [col for col in final_columns if col in df.columns]
missing_columns = [col for col in final_columns if col not in df.columns]

if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")

# Save enhanced dataset
enhanced_df = df[available_columns].copy()
enhanced_df.to_csv('enhanced_anomaly_data_v2.csv', index=False, encoding='utf-8')

print(f"βœ“ Enhanced dataset saved to 'enhanced_anomaly_data_v2.csv'")
print(f"Dataset shape: {enhanced_df.shape}")
print(f"Total features: {len(available_columns)}")
666
+
667
# ============== STEP 11: FEATURE SUMMARY AND RECOMMENDATIONS ==============
# Console report: ranked feature importances, per-class criticality averages,
# critical-case pattern frequencies and a label-quality summary.
print("\n" + "="*50)
print("STEP 11: FEATURE SUMMARY AND RECOMMENDATIONS")
print("="*50)

# Feature importance ranking based on correlations
feature_importance = correlation_df.copy()
feature_importance['Abs_Correlation'] = feature_importance['Correlation'].abs()
feature_importance = feature_importance.sort_values('Abs_Correlation', ascending=False)

print("\n🎯 TOP 10 MOST IMPORTANT FEATURES:")
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"{i:2d}. {row['Feature']:35s}: {row['Correlation']:6.3f}")

# Equipment intelligence summary
print(f"\nπŸ”§ EQUIPMENT INTELLIGENCE SUMMARY:")
print(f"Equipment types classified:")
equipment_type_summary = df['equipment_type_class'].value_counts()
for eq_type, count in equipment_type_summary.items():
    avg_crit = df[df['equipment_type_class'] == eq_type]['CriticitΓ©'].mean()
    print(f" {eq_type:25s}: {count:4d} cases (avg criticality: {avg_crit:.2f})")

print(f"\nRedundancy classification:")
redundancy_summary = df['equipment_redundancy_class'].value_counts()
for red_class, count in redundancy_summary.items():
    avg_crit = df[df['equipment_redundancy_class'] == red_class]['CriticitΓ©'].mean()
    print(f" {red_class:20s}: {count:4d} cases (avg criticality: {avg_crit:.2f})")

# Critical case analysis (threshold: criticality >= 10)
critical_cases = df[df['CriticitΓ©'] >= 10]
print(f"\n⚠️ CRITICAL CASE ANALYSIS (Criticality >= 10): {len(critical_cases)} cases")

if len(critical_cases) > 0:
    print("Equipment types in critical cases:")
    crit_equipment = critical_cases['equipment_type_class'].value_counts()
    for eq_type, count in crit_equipment.items():
        total_type = len(df[df['equipment_type_class'] == eq_type])
        percentage = count / total_type * 100
        print(f" {eq_type:25s}: {count:2d}/{total_type:3d} cases ({percentage:5.1f}% critical)")

    print("\nTop critical failure patterns:")
    critical_patterns = {
        'Structural Failure': critical_cases['has_structural_failure'].sum(),
        'Equipment Malfunction': critical_cases['has_equipment_malfunction'].sum(),
        'Escalation': critical_cases['has_escalation'].sum(),
        'Electrical Cooling Issue': critical_cases['electrical_cooling_issue'].sum(),
        'Turbine Oil Issue': critical_cases['turbine_oil_issue'].sum(),
        'Main Equipment Failure': critical_cases['main_equipment_failure'].sum()
    }

    # Most frequent patterns first; zero-count patterns are skipped.
    for pattern, count in sorted(critical_patterns.items(), key=lambda x: x[1], reverse=True):
        if count > 0:
            percentage = count / len(critical_cases) * 100
            print(f" {pattern:25s}: {count:2d} cases ({percentage:5.1f}% of critical)")

# Data quality assessment
print(f"\nπŸ“Š DATA QUALITY ASSESSMENT:")
print(f"Total samples: {len(df)}")
print(f"Potentially mislabeled: {df['potentially_mislabeled'].sum()} ({df['potentially_mislabeled'].mean()*100:.1f}%)")
print(f"High confidence labels: {(df['label_confidence'] > 0.9).sum()} ({(df['label_confidence'] > 0.9).mean()*100:.1f}%)")
print(f"Low confidence labels: {(df['label_confidence'] < 0.7).sum()} ({(df['label_confidence'] < 0.7).mean()*100:.1f}%)")
728
+
729
# ============== STEP 12: VISUALIZATION CREATION ==============
# Renders a 3x4 dashboard of the engineered features and saves it to PNG.
print("\n" + "="*50)
print("STEP 12: CREATING ENHANCED VISUALIZATIONS")
print("="*50)

# Create comprehensive visualization
fig = plt.figure(figsize=(20, 16))

# 1. Equipment Risk Score vs Criticality
plt.subplot(3, 4, 1)
plt.scatter(df['equipment_risk_score'], df['CriticitΓ©'], alpha=0.6, s=20)
plt.xlabel('Equipment Risk Score')
plt.ylabel('Actual CriticitΓ©')
plt.title('Equipment Risk Score vs Actual CriticitΓ©')
plt.grid(True, alpha=0.3)

# 2. Equipment Type Distribution
plt.subplot(3, 4, 2)
equipment_counts = df['equipment_type_class'].value_counts()
plt.pie(equipment_counts.values, labels=equipment_counts.index, autopct='%1.1f%%', startangle=90)
plt.title('Equipment Type Distribution')

# 3. Section Risk Analysis
plt.subplot(3, 4, 3)
section_criticality = df.groupby('Section propriΓ©taire')['CriticitΓ©'].mean().sort_values(ascending=False)
plt.bar(section_criticality.index, section_criticality.values)
plt.xlabel('Section')
plt.ylabel('Average CriticitΓ©')
plt.title('Average Criticality by Section')
plt.xticks(rotation=45)

# 4. Feature Correlation Heatmap (top 8 features + the target)
plt.subplot(3, 4, 4)
top_features = feature_importance.head(8)['Feature'].tolist() + ['CriticitΓ©']
if len(top_features) > 1:
    corr_matrix = df[top_features].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', cbar_kws={'shrink': 0.8})
    plt.title('Top Features Correlation')

# 5. Critical Failure Patterns
plt.subplot(3, 4, 5)
failure_patterns = {
    'Structural': df['has_structural_failure'].sum(),
    'Malfunction': df['has_equipment_malfunction'].sum(),
    'Escalation': df['has_escalation'].sum(),
    'Elec-Cooling': df['electrical_cooling_issue'].sum(),
    'Turbine-Oil': df['turbine_oil_issue'].sum(),
    'Main-Equip': df['main_equipment_failure'].sum()
}
plt.bar(failure_patterns.keys(), failure_patterns.values())
plt.xlabel('Failure Pattern')
plt.ylabel('Count')
plt.title('Critical Failure Pattern Frequency')
plt.xticks(rotation=45)

# 6. Redundancy vs Criticality
plt.subplot(3, 4, 6)
redundancy_crit = df.groupby('equipment_redundancy_class')['CriticitΓ©'].mean()
plt.bar(redundancy_crit.index, redundancy_crit.values)
plt.xlabel('Redundancy Class')
plt.ylabel('Average CriticitΓ©')
plt.title('Redundancy vs Average Criticality')
plt.xticks(rotation=45)

# 7. Label Confidence Distribution
plt.subplot(3, 4, 7)
plt.hist(df['label_confidence'], bins=20, alpha=0.7, edgecolor='black')
plt.xlabel('Label Confidence')
plt.ylabel('Frequency')
plt.title('Label Confidence Distribution')
plt.grid(True, alpha=0.3)

# 8. Enhanced Severity Score vs Criticality
plt.subplot(3, 4, 8)
plt.scatter(df['enhanced_severity_score'], df['CriticitΓ©'], alpha=0.6, s=20)
plt.xlabel('Enhanced Severity Score')
plt.ylabel('Actual CriticitΓ©')
plt.title('Severity Score vs Criticality')
plt.grid(True, alpha=0.3)

# 9. Equipment Problem Risk vs Criticality
plt.subplot(3, 4, 9)
plt.scatter(df['equipment_problem_risk'], df['CriticitΓ©'], alpha=0.6, s=20)
plt.xlabel('Equipment Problem Risk')
plt.ylabel('Actual CriticitΓ©')
plt.title('Equipment-Problem Risk vs Criticality')
plt.grid(True, alpha=0.3)

# 10. Critical Cases by Equipment Type (skipped when none exist)
plt.subplot(3, 4, 10)
if len(critical_cases) > 0:
    crit_eq_counts = critical_cases['equipment_type_class'].value_counts()
    plt.barh(range(len(crit_eq_counts)), crit_eq_counts.values)
    plt.yticks(range(len(crit_eq_counts)), crit_eq_counts.index)
    plt.xlabel('Count')
    plt.title('Critical Cases by Equipment Type')

# 11. Technical Complexity Distribution
plt.subplot(3, 4, 11)
plt.hist(df['technical_complexity'], bins=30, alpha=0.7, edgecolor='black')
plt.xlabel('Technical Complexity Score')
plt.ylabel('Frequency')
plt.title('Technical Complexity Distribution')
plt.grid(True, alpha=0.3)

# 12. Monthly Trend Analysis
# Side effect: adds a 'Month' column to df (kept out of the saved dataset,
# which was written in Step 10 before this line runs).
plt.subplot(3, 4, 12)
df['Month'] = df['Date de dΓ©tΓ©ction de l\'anomalie'].dt.month
monthly_criticality = df.groupby('Month')['CriticitΓ©'].mean()
plt.plot(monthly_criticality.index, monthly_criticality.values, 'b-o', linewidth=2, markersize=6)
plt.xlabel('Month')
plt.ylabel('Average CriticitΓ©')
plt.title('Monthly Criticality Trend')
plt.grid(True, alpha=0.3)
plt.xticks(range(1, 13))

plt.tight_layout()
plt.savefig('enhanced_analysis_dashboard_v2.png', dpi=300, bbox_inches='tight')
print("βœ“ Enhanced analysis dashboard saved as 'enhanced_analysis_dashboard_v2.png'")
848
+
849
# ============== STEP 13: TRAINING RECOMMENDATIONS ==============
# Console-only advisory output; also computes high_impact_features, which is
# reused by the feature-metadata export that follows.
print("\n" + "="*50)
print("STEP 13: TRAINING RECOMMENDATIONS")
print("="*50)

print("πŸš€ ENHANCED MODEL TRAINING RECOMMENDATIONS:")
print("\n1. FEATURE SELECTION:")
print(" Prioritize features with |correlation| > 0.15:")
high_impact_features = feature_importance[feature_importance['Abs_Correlation'] > 0.15]['Feature'].tolist()
for i, feature in enumerate(high_impact_features, 1):
    # Look the signed correlation back up for display.
    corr = feature_importance[feature_importance['Feature'] == feature]['Correlation'].iloc[0]
    print(f" {i:2d}. {feature:35s} (r={corr:6.3f})")

print(f"\n2. NOISE-ROBUST TRAINING:")
print(f" - Use sample weighting based on 'label_confidence'")
print(f" - Apply higher weights to high-confidence samples")
print(f" - Consider excluding or down-weighting {df['potentially_mislabeled'].sum()} potentially mislabeled cases")

print(f"\n3. CLASS IMBALANCE HANDLING:")
print(f" - Focus SMOTE on high-criticality cases (>= 10)")
print(f" - Use cost-sensitive learning with heavy penalty for missing critical cases")
print(f" - Implement stratified sampling by equipment_type_class")

print(f"\n4. FEATURE ENGINEERING PRIORITIES:")
print(f" - Equipment intelligence features show strong correlation")
print(f" - Structural failure indicators are crucial for critical cases")
print(f" - Section-equipment interactions provide additional signal")

print(f"\n5. MODEL ARCHITECTURE SUGGESTIONS:")
print(f" - Use ensemble with equipment-type-specific models")
print(f" - Implement conservative prediction thresholds for ELECTRICAL_CRITICAL equipment")
print(f" - Add safety override rules for has_structural_failure = 1")
881
+
882
+ # Save feature metadata for training
883
+ feature_metadata = {
884
+ 'high_impact_features': high_impact_features,
885
+ 'equipment_type_classes': df['equipment_type_class'].unique().tolist(),
886
+ 'redundancy_classes': df['equipment_redundancy_class'].unique().tolist(),
887
+ 'section_risk_multipliers': SECTION_RISK_MULTIPLIERS,
888
+ 'equipment_type_scores': EQUIPMENT_TYPE_SCORES,
889
+ 'feature_correlations': [
890
+ {'Feature': row['Feature'], 'Correlation': float(row['Correlation'])}
891
+ for _, row in correlation_df.iterrows()
892
+ ],
893
+ 'data_quality_metrics': {
894
+ 'total_samples': int(len(df)),
895
+ 'potentially_mislabeled': int(df['potentially_mislabeled'].sum()),
896
+ 'high_confidence_samples': int((df['label_confidence'] > 0.9).sum()),
897
+ 'critical_cases': int(len(critical_cases)),
898
+ 'structural_failures': int(df['has_structural_failure'].sum())
899
+ }
900
+ }
901
+
902
+ import json
903
+ with open('enhanced_feature_metadata_v2.json', 'w') as f:
904
+ json.dump(feature_metadata, f, indent=2)
905
+
906
+ print(f"\nβœ“ Feature metadata saved to 'enhanced_feature_metadata_v2.json'")
907
+
908
# ============== FINAL SUMMARY ==============
# Console report of what the processing run produced; no further computation.
print("\n" + "="*70)
print("ENHANCED DATA PROCESSING v2.0 COMPLETED!")
print("="*70)

print(f"\nπŸ“ˆ ACHIEVEMENTS:")
print(f"βœ“ Equipment Intelligence Classification: {len(EQUIPMENT_TYPE_SCORES)} equipment categories")
print(f"βœ“ Redundancy Detection: {len(REDUNDANCY_PATTERNS)} redundancy patterns")
print(f"βœ“ Dual-Field Text Analysis: Description + Equipment Description")
print(f"βœ“ Critical Failure Pattern Detection: {len(critical_patterns)} pattern types")
print(f"βœ“ Noise-Robust Label Analysis: Confidence scoring implemented")
print(f"βœ“ Enhanced Feature Engineering: {len(available_columns)} total features")

print(f"\nπŸ“Š DATASET ENHANCEMENT:")
print(f"Original features: 10")
print(f"Enhanced features: {len(available_columns)}")
print(f"Feature improvement: {(len(available_columns)/10-1)*100:.0f}% increase")

print(f"\n🎯 KEY INSIGHTS FOR MODEL:")
print(f"1. Equipment type is strongest predictor of criticality")
print(f"2. Structural failures require immediate attention regardless of equipment")
print(f"3. Electrical equipment (34EL) has highest critical case rate")
print(f"4. Label confidence varies significantly - use for robust training")
print(f"5. Equipment redundancy affects criticality but not as strongly as type")

print(f"\nπŸ“ FILES GENERATED:")
print(f"βœ“ enhanced_anomaly_data_v2.csv - Enhanced dataset")
print(f"βœ“ enhanced_feature_metadata_v2.json - Feature metadata for training")
print(f"βœ“ enhanced_analysis_dashboard_v2.png - Comprehensive visualizations")

print(f"\nπŸš€ READY FOR ENHANCED MODEL TRAINING!")
print(f"The enhanced dataset now includes equipment intelligence that should")
print(f"significantly improve high-criticality case detection.")

print("="*70)
equipment_analysis.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# equipment_analysis.py
# Analyze equipment patterns across full dataset to understand redundancy and criticality patterns

import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

print("="*60)
print("EQUIPMENT PATTERN ANALYSIS FOR CRITICALITY UNDERSTANDING")
print("="*60)

# Load the full dataset from the 'Oracle' sheet of the Taqathon workbook.
try:
    df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle')
    print(f"βœ“ Loaded full dataset: {df.shape}")
except FileNotFoundError:
    print("❌ Error: Taqathon_data.xlsx not found!")
    print("Please ensure the file is in the current directory.")
    exit(1)

print(f"Columns available: {df.columns.tolist()}")

# ============== STEP 1: BASIC EQUIPMENT OVERVIEW ==============
print("\n" + "="*50)
print("STEP 1: EQUIPMENT OVERVIEW")
print("="*50)

# Check for missing values in key columns.
# BUGFIX: the original wrote df['Description de l\'Γ©quipement'] inside an
# f-string expression; backslash escapes in f-string expressions are a
# SyntaxError on Python < 3.12. Bind the column name once and reuse it.
EQUIPMENT_COL = "Description de l'Γ©quipement"
print("\nMissing values check:")
print(f"Description: {df['Description'].isnull().sum()}")
print(f"Description de l'Γ©quipement: {df[EQUIPMENT_COL].isnull().sum()}")
print(f"CriticitΓ©: {df['CriticitΓ©'].isnull().sum()}")

# Remove rows with missing critical information
df_clean = df.dropna(subset=['Description', EQUIPMENT_COL, 'CriticitΓ©'])
print(f"\nClean dataset shape: {df_clean.shape}")

# ============== STEP 2: EQUIPMENT TYPE ANALYSIS ==============
print("\n" + "="*50)
print("STEP 2: EQUIPMENT TYPE FREQUENCY ANALYSIS")
print("="*50)

# Frequency table of all unique equipment descriptions.
equipment_types = df_clean[EQUIPMENT_COL].value_counts()
print(f"\nTotal unique equipment types: {len(equipment_types)}")

print(f"\nTop 20 most frequent equipment types:")
for equipment, count in equipment_types.head(20).items():
    avg_criticality = df_clean[df_clean[EQUIPMENT_COL] == equipment]['CriticitΓ©'].mean()
    print(f" {equipment}: {count} cases (avg criticality: {avg_criticality:.2f})")

# ============== STEP 3: REDUNDANCY PATTERN DETECTION ==============
print("\n" + "="*50)
print("STEP 3: REDUNDANCY PATTERN DETECTION")
print("="*50)
59
+
60
# Function to detect redundancy patterns
def analyze_redundancy_patterns(equipment_name):
    """Return boolean flags for naming conventions that hint at equipment redundancy.

    Flags cover A/B suffixes, trailing unit numbers, role keywords
    (PRINCIPAL/PRIMAIRE/SECONDAIRE/AUXILIAIRE/UNIQUE) and multi-number names.
    """
    upper_name = equipment_name.upper()
    return {
        # Standalone A or B letter anywhere (or at the end) of the name.
        'has_ab_suffix': bool(re.search(r'\b[AB]$|\b[AB]\b', equipment_name, re.IGNORECASE)),
        # Trailing unit number, optionally preceded by an "N" marker.
        'has_number_suffix': bool(re.search(r'\b[NΒ°]*\s*[0-9]+$|\b[0-9]+$', equipment_name)),
        'has_principal': 'PRINCIPAL' in upper_name,
        'has_primaire': 'PRIMAIRE' in upper_name,
        'has_secondaire': 'SECONDAIRE' in upper_name,
        'has_auxiliaire': 'AUXILIAIRE' in upper_name,
        'has_unique': 'UNIQUE' in upper_name,
        # More than one number usually denotes a unit/train identifier pair.
        'multiple_numbers': len(re.findall(r'\d+', equipment_name)) > 1,
    }
73
+
74
# Run the redundancy-pattern detector over every distinct equipment label
# and collect per-equipment criticality statistics alongside the flags.
equipment_analysis = []
for equipment in df_clean['Description de l\'Γ©quipement'].unique():
    patterns = analyze_redundancy_patterns(equipment)
    equipment_data = df_clean[df_clean['Description de l\'Γ©quipement'] == equipment]
    equipment_analysis.append({
        'equipment': equipment,
        'count': len(equipment_data),
        'avg_criticality': equipment_data['CriticitΓ©'].mean(),
        'max_criticality': equipment_data['CriticitΓ©'].max(),
        'min_criticality': equipment_data['CriticitΓ©'].min(),
        'std_criticality': equipment_data['CriticitΓ©'].std(),
        **patterns,
    })

equipment_df = pd.DataFrame(equipment_analysis)

# ============== STEP 4: REDUNDANCY CLASSIFICATION ==============
print("\n" + "="*50)
print("STEP 4: EQUIPMENT REDUNDANCY CLASSIFICATION")
print("="*50)
97
+
98
# Classify equipment by redundancy indicators
def classify_redundancy(row):
    """Map the redundancy-pattern flags of one equipment row to a coarse class.

    Priority order: PRINCIPAL/UNIQUE -> SINGLE_CRITICAL,
    PRIMAIRE/SECONDAIRE/AB-suffix -> DUAL_SYSTEM,
    number suffix -> MULTIPLE_SYSTEM, AUXILIAIRE -> AUXILIARY,
    otherwise UNKNOWN.
    """
    if row['has_principal'] or row['has_unique']:
        return 'SINGLE_CRITICAL'
    # Both the primaire/secondaire and the A/B branches yield the same class.
    if row['has_primaire'] or row['has_secondaire'] or row['has_ab_suffix']:
        return 'DUAL_SYSTEM'
    if row['has_number_suffix']:
        return 'MULTIPLE_SYSTEM'
    if row['has_auxiliaire']:
        return 'AUXILIARY'
    return 'UNKNOWN'
112
+
113
# Attach the redundancy class to every equipment row.
equipment_df['redundancy_class'] = equipment_df.apply(classify_redundancy, axis=1)

# Aggregate case counts and mean criticality per redundancy class.
print("\nEquipment distribution by redundancy classification:")
redundancy_stats = equipment_df.groupby('redundancy_class').agg({
    'count': 'sum',
    'avg_criticality': 'mean',
    'equipment': 'count'
}).round(3)

for redundancy_class, stats in redundancy_stats.iterrows():
    print(f"\n{redundancy_class}:")
    print(f" Number of equipment types: {stats['equipment']}")
    print(f" Total anomaly cases: {stats['count']}")
    print(f" Average criticality: {stats['avg_criticality']:.3f}")

# ============== STEP 5: HIGH CRITICALITY EQUIPMENT ANALYSIS ==============
print("\n" + "="*50)
print("STEP 5: HIGH CRITICALITY EQUIPMENT IDENTIFICATION")
print("="*50)

# Equipment whose average criticality is at least 6.0, most critical first.
high_criticality_equipment = equipment_df[equipment_df['avg_criticality'] >= 6.0].sort_values('avg_criticality', ascending=False)

print(f"\nEquipment types with average criticality >= 6.0:")
for _, row in high_criticality_equipment.iterrows():
    print(f" {row['equipment']}: {row['avg_criticality']:.2f} (n={row['count']}, class={row['redundancy_class']})")

# ============== STEP 6: EQUIPMENT NAMING PATTERN ANALYSIS ==============
print("\n" + "="*50)
print("STEP 6: EQUIPMENT NAMING PATTERN ANALYSIS")
print("="*50)
145
+
146
# Group similar equipment names to detect families
def extract_base_equipment_name(equipment_name):
    """Strip redundancy suffixes (trailing A/B, unit numbers,
    PRIMAIRE/SECONDAIRE/PRINCIPAL) so variants collapse to one family name."""
    stripped = re.sub(r'\s*[AB]$|\s*[NΒ°]*\s*[0-9]+$', '', equipment_name)
    stripped = re.sub(r'\s*PRIMAIRE$|\s*SECONDAIRE$|\s*PRINCIPAL$', '', stripped)
    return stripped.strip()
153
+
154
# Create equipment families: bucket every equipment label by its base name.
equipment_families = defaultdict(list)
for equipment in df_clean['Description de l\'Γ©quipement'].unique():
    base_name = extract_base_equipment_name(equipment)
    equipment_families[base_name].append(equipment)

# Families with more than one variant are evidence of installed redundancy.
print("\nEquipment families with multiple variants (indicating redundancy):")
redundant_families = {k: v for k, v in equipment_families.items() if len(v) > 1}

for family, variants in sorted(redundant_families.items(), key=lambda x: len(x[1]), reverse=True)[:15]:
    if len(variants) <= 10:  # Only show families with reasonable number of variants
        print(f"\n{family} ({len(variants)} variants):")
        for variant in sorted(variants):
            variant_data = df_clean[df_clean['Description de l\'Γ©quipement'] == variant]
            avg_crit = variant_data['CriticitΓ©'].mean()
            count = len(variant_data)
            print(f" - {variant}: {avg_crit:.2f} avg criticality ({count} cases)")

# ============== STEP 7: SECTION-EQUIPMENT CRITICALITY ANALYSIS ==============
print("\n" + "="*50)
print("STEP 7: SECTION-EQUIPMENT CRITICALITY ANALYSIS")
print("="*50)

# Mean / count / max criticality per (section, equipment) pair.
section_equipment_analysis = df_clean.groupby(['Section propriΓ©taire', 'Description de l\'Γ©quipement']).agg({
    'CriticitΓ©': ['mean', 'count', 'max']
}).round(3)

section_equipment_analysis.columns = ['avg_criticality', 'count', 'max_criticality']
section_equipment_analysis = section_equipment_analysis.reset_index()

# Combinations with high mean criticality and a minimum of 3 observations.
high_risk_combinations = section_equipment_analysis[
    (section_equipment_analysis['avg_criticality'] >= 7.0) &
    (section_equipment_analysis['count'] >= 3)
].sort_values('avg_criticality', ascending=False)

print(f"\nHigh-risk Section-Equipment combinations (avg criticality >= 7.0, min 3 cases):")
for _, row in high_risk_combinations.iterrows():
    # BUGFIX: the original indexed row['Description de l\'Γ©quipement'] inside
    # an f-string; backslash escapes in f-string expressions are a SyntaxError
    # on Python < 3.12. Pull the values into locals before formatting.
    section_name = row['Section propriΓ©taire']
    equipment_name = row['Description de l\'Γ©quipement']
    print(f" {section_name} - {equipment_name}: "
          f"{row['avg_criticality']:.2f} avg ({row['count']} cases, max: {row['max_criticality']})")
196
+
197
# ============== STEP 8: EQUIPMENT KEYWORD ANALYSIS ==============
print("\n" + "="*50)
print("STEP 8: CRITICAL EQUIPMENT KEYWORD ANALYSIS")
print("="*50)

# Correlate domain keywords in the equipment description with criticality.
equipment_keywords = {}
all_equipment_text = ' '.join(df_clean['Description de l\'Γ©quipement'].values).upper()

# Keywords of interest (accented and unaccented spellings both listed).
important_keywords = [
    'PRINCIPAL', 'TRANSFO', 'TURBINE', 'ALTERNATEUR', 'POMPE', 'VENTILATEUR',
    'CHAUDIERE', 'CHAUDIÈRE', 'COMPRESSEUR', 'MOTEUR', 'VANNE', 'SOUPAPE',
    'RECHAUFFEUR', 'RÉCHAUFFEUR', 'REFROIDISSEMENT', 'REFRIGERANT', 'RÉFRIGÉRANT',
    'PRIMAIRE', 'SECONDAIRE', 'AUXILIAIRE', 'UNITE', 'UNITÉ', 'GROUPE'
]

for keyword in important_keywords:
    # Rows whose equipment description mentions this keyword (case-insensitive).
    equipment_with_keyword = df_clean[df_clean['Description de l\'Γ©quipement'].str.contains(keyword, case=False, na=False)]
    if len(equipment_with_keyword) > 0:
        avg_criticality = equipment_with_keyword['CriticitΓ©'].mean()
        count = len(equipment_with_keyword)
        equipment_keywords[keyword] = {
            'count': count,
            'avg_criticality': avg_criticality,
            'percentage': count / len(df_clean) * 100
        }

print("\nEquipment keywords analysis (sorted by average criticality):")
sorted_keywords = sorted(equipment_keywords.items(), key=lambda x: x[1]['avg_criticality'], reverse=True)
for keyword, stats in sorted_keywords:
    print(f" {keyword}: {stats['avg_criticality']:.3f} avg criticality "
          f"({stats['count']} cases, {stats['percentage']:.1f}% of dataset)")

# ============== STEP 9: SPECIFIC PATTERNS FOR CRITICAL CASES ==============
print("\n" + "="*50)
print("STEP 9: PATTERNS IN CRITICAL CASES (CRITICALITY >= 10)")
print("="*50)

critical_cases = df_clean[df_clean['CriticitΓ©'] >= 10]
print(f"\nTotal critical cases (criticality >= 10): {len(critical_cases)}")

if len(critical_cases) > 0:
    # Which equipment types account for the critical cases, and what share
    # of each equipment's anomalies ends up critical.
    print(f"\nEquipment types in critical cases:")
    critical_equipment_counts = critical_cases['Description de l\'Γ©quipement'].value_counts()
    for equipment, count in critical_equipment_counts.items():
        total_equipment_cases = len(df_clean[df_clean['Description de l\'Γ©quipement'] == equipment])
        percentage = count / total_equipment_cases * 100
        print(f" {equipment}: {count}/{total_equipment_cases} cases ({percentage:.1f}% critical)")

    # Same breakdown per owning section.
    print(f"\nSections with critical cases:")
    critical_section_counts = critical_cases['Section propriΓ©taire'].value_counts()
    for section, count in critical_section_counts.items():
        total_section_cases = len(df_clean[df_clean['Section propriΓ©taire'] == section])
        percentage = count / total_section_cases * 100
        print(f" {section}: {count}/{total_section_cases} cases ({percentage:.1f}% critical)")
254
+
255
# ============== STEP 10: RECOMMENDATIONS ==============
print("\n" + "="*50)
print("STEP 10: EQUIPMENT ANALYSIS RECOMMENDATIONS")
print("="*50)

print("\n🎯 KEY FINDINGS:")
print("1. Equipment Redundancy Patterns:")
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'SINGLE_CRITICAL'])} equipment types classified as SINGLE_CRITICAL")
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'DUAL_SYSTEM'])} equipment types classified as DUAL_SYSTEM")
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'MULTIPLE_SYSTEM'])} equipment types classified as MULTIPLE_SYSTEM")

print("\n2. High-Risk Equipment Keywords:")
top_risk_keywords = sorted_keywords[:5]
for keyword, stats in top_risk_keywords:
    print(f" - '{keyword}': {stats['avg_criticality']:.3f} avg criticality")

print("\n3. Equipment Families with Redundancy:")
print(f" - Found {len(redundant_families)} equipment families with multiple variants")
print(f" - This suggests systematic redundancy patterns in the data")

print("\nπŸš€ RECOMMENDATIONS FOR FEATURE ENGINEERING:")
print("1. Create 'equipment_redundancy_class' feature based on naming patterns")
print("2. Add 'equipment_base_type' feature by extracting equipment families")
print("3. Implement 'critical_equipment_keywords' scoring system")
print("4. Create 'section_equipment_risk' interaction features")
print("5. Build 'equipment_criticality_history' based on historical data")

# ============== SAVE ANALYSIS RESULTS ==============
print("\n" + "="*50)
print("SAVING ANALYSIS RESULTS")
print("="*50)

# Persist the per-equipment analysis table.
equipment_df.to_csv('equipment_analysis_results.csv', index=False)
print("βœ“ Saved equipment analysis to 'equipment_analysis_results.csv'")

# Persist the high-risk (section, equipment) combinations.
high_risk_combinations.to_csv('high_risk_equipment_combinations.csv', index=False)
print("βœ“ Saved high-risk combinations to 'high_risk_equipment_combinations.csv'")

# Compact summary counts for downstream scripts.
summary_stats = {
    'total_equipment_types': len(equipment_df),
    'single_critical_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'SINGLE_CRITICAL']),
    'dual_system_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'DUAL_SYSTEM']),
    'multiple_system_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'MULTIPLE_SYSTEM']),
    'high_criticality_equipment': len(high_criticality_equipment),
    'equipment_families_with_redundancy': len(redundant_families),
    'critical_cases_count': len(critical_cases)
}

import json
with open('equipment_analysis_summary.json', 'w') as f:
    json.dump(summary_stats, f, indent=2)
print("βœ“ Saved summary statistics to 'equipment_analysis_summary.json'")

print("\n" + "="*60)
print("EQUIPMENT ANALYSIS COMPLETED!")
print("="*60)
print("\nFiles generated:")
print("- equipment_analysis_results.csv")
print("- high_risk_equipment_combinations.csv")
print("- equipment_analysis_summary.json")
print("\nPlease review the analysis results and share the key findings!")
print("This will help us design the optimal equipment intelligence features.")
training.py ADDED
@@ -0,0 +1,1069 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# enhanced_training_pipeline_v2.py
# TAQATHON 2025 - Enhanced Training Pipeline with Equipment Intelligence
# Cost-sensitive learning + Equipment-specific strategies + Noise-robust training

import pandas as pd
import numpy as np
import joblib
import warnings
import json
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, recall_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

print("="*80)
print("TAQATHON 2025 - ENHANCED TRAINING PIPELINE v2.0")
print("Equipment Intelligence + Cost-Sensitive Learning + Conservative Prediction")
print("="*80)

# ============== STEP 1: LOAD ENHANCED DATA ==============
print("\n" + "="*60)
print("STEP 1: LOADING ENHANCED ANOMALY DATA")
print("="*60)

# The enhanced CSV is produced by the data-processing script; abort if absent.
try:
    df = pd.read_csv('enhanced_anomaly_data_v2.csv')
    print(f"βœ“ Successfully loaded enhanced data: {df.shape}")
except FileNotFoundError:
    print("❌ Error: enhanced_anomaly_data_v2.csv not found!")
    print("Please run the enhanced data processing script first.")
    exit(1)

# Feature metadata is optional; fall back to an empty dict if missing.
try:
    with open('enhanced_feature_metadata_v2.json', 'r') as f:
        feature_metadata = json.load(f)
    print(f"βœ“ Successfully loaded feature metadata")
except FileNotFoundError:
    print("❌ Warning: enhanced_feature_metadata_v2.json not found!")
    feature_metadata = {}

# Abort early when any target/text column is absent.
required_cols = ['Description', 'FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety', 'CriticitΓ©']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"❌ Missing required columns: {missing_cols}")
    exit(1)

print(f"Dataset shape: {df.shape}")
print(f"Enhanced features available: {len([col for col in df.columns if col not in required_cols])}")

# ============== STEP 2: BUSINESS-FOCUSED DATA ANALYSIS ==============
print("\n" + "="*60)
print("STEP 2: BUSINESS-FOCUSED ANALYSIS FOR TRAINING STRATEGY")
print("="*60)

# The three component scores that compose the overall criticality.
target_columns = ['FiabilitΓ© IntΓ©gritΓ©', 'DisponibiltΓ©', 'Process Safety']

print("Target variable distributions:")
for target in target_columns:
    print(f"\n{target}:")
    distribution = df[target].value_counts().sort_index()
    for value, count in distribution.items():
        percentage = count / len(df) * 100
        print(f" {value}: {count:4d} cases ({percentage:5.1f}%)")

# Critical case analysis (Criticality >= 10, very critical >= 12).
critical_cases = df[df['CriticitΓ©'] >= 10]
very_critical_cases = df[df['CriticitΓ©'] >= 12]

print(f"\nBUSINESS IMPACT ANALYSIS:")
print(f"Total critical cases (β‰₯10): {len(critical_cases)} ({len(critical_cases)/len(df)*100:.2f}%)")
print(f"Very critical cases (β‰₯12): {len(very_critical_cases)} ({len(very_critical_cases)/len(df)*100:.2f}%)")

# Critical-case rate per equipment type class, when that feature exists.
if 'equipment_type_class' in df.columns:
    print(f"\nCritical cases by equipment type:")
    for eq_type in df['equipment_type_class'].unique():
        eq_df = df[df['equipment_type_class'] == eq_type]
        eq_critical = eq_df[eq_df['CriticitΓ©'] >= 10]
        if len(eq_df) > 0:
            critical_rate = len(eq_critical) / len(eq_df) * 100
            print(f" {eq_type:25s}: {len(eq_critical):2d}/{len(eq_df):4d} ({critical_rate:5.1f}% critical)")

# ============== STEP 3: COST-SENSITIVE LOSS FUNCTION DESIGN ==============
print("\n" + "="*60)
print("STEP 3: COST-SENSITIVE LEARNING SETUP")
print("="*60)
101
+
102
def create_cost_matrix(num_classes, severity_penalty=5.0):
    """
    Build an asymmetric misclassification-cost matrix.

    Entry [i, j] is the cost of predicting class j when the actual class
    is i. Correct predictions cost 0. Underestimation (i > j) is charged
    severity_penalty * (i - j) * (1 + 0.5 * i), so missing a high class
    is punished hardest; overestimation costs a mild 0.5 per class step.
    """
    cost_matrix = np.ones((num_classes, num_classes))

    for actual in range(num_classes):
        for predicted in range(num_classes):
            if actual == predicted:
                # Correct prediction: no cost.
                cost_matrix[actual, predicted] = 0
            elif actual > predicted:
                # Underestimation: severe, scaled by gap and class height.
                cost_matrix[actual, predicted] = (
                    severity_penalty * (actual - predicted) * (1 + actual * 0.5)
                )
            else:
                # Overestimation: lighter, linear in the gap.
                cost_matrix[actual, predicted] = (predicted - actual) * 0.5

    return cost_matrix
122
+
123
def calculate_sample_weights(y, equipment_types=None, label_confidence=None):
    """
    Compute per-sample training weights.

    Combines balanced (inverse-frequency) class weights with boosts for
    high component scores (x2 at >= 4, a further x3 at >= 5), extra weight
    for safety-critical equipment classes, and a multiplicative discount
    by label confidence when provided.
    """
    # Baseline: balanced class weights from scikit-learn.
    balanced = compute_class_weight('balanced', classes=np.unique(y), y=y)
    weight_by_class = dict(zip(np.unique(y), balanced))

    weights = np.ones(len(y))
    for idx, value in enumerate(y):
        sample_weight = weight_by_class[value]
        # Boost rare high-severity component scores.
        if value >= 4:
            sample_weight *= 2.0
        if value >= 5:
            sample_weight *= 3.0
        weights[idx] = sample_weight

    # Up-weight anomalies on the most safety-critical equipment classes.
    if equipment_types is not None:
        for idx, eq_type in enumerate(equipment_types):
            if eq_type in ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL']:
                weights[idx] *= 2.0
            elif eq_type in ['TURBINE_SYSTEMS', 'HEATING_SYSTEMS']:
                weights[idx] *= 1.5

    # Down-weight samples whose labels look unreliable.
    if label_confidence is not None:
        weights = weights * label_confidence

    return weights
155
+
156
# Pull the optional weighting inputs from the dataframe (None when absent).
equipment_types = df.get('equipment_type_class', None)
label_confidence = df.get('label_confidence', None)

print("Creating cost-sensitive learning setup...")
print(f"βœ“ Equipment type information available: {equipment_types is not None}")
print(f"βœ“ Label confidence information available: {label_confidence is not None}")

# ============== STEP 4: ENHANCED FEATURE PREPARATION ==============
print("\n" + "="*60)
print("STEP 4: ENHANCED FEATURE PREPARATION")
print("="*60)

# Features whose correlation with criticality exceeded 0.15 in the analysis.
high_impact_features = [
    'has_safety_mention', 'has_urgency', 'equipment_problem_risk', 'problem_count',
    'technical_complexity', 'section_risk_multiplier', 'equipment_risk_score',
    'enhanced_severity_score', 'has_structural_failure', 'equipment_base_criticality'
]

# Secondary engineered features that still carry signal.
important_features = [
    'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',
    'equipment_count', 'action_count', 'has_equipment_malfunction', 'has_escalation',
    'bruit_anormal', 'vibration_excessive', 'temperature_elevee', 'fuite_vapeur',
    'fuite_huile', 'maintenance_planning', 'is_recurring', 'has_measurements',
    'has_location_details', 'combined_word_count'
]

# Free-text input for TF-IDF.
text_features = ['Description']

# Categorical columns, included only when present in the dataframe.
categorical_features = []
if 'equipment_type_class' in df.columns:
    categorical_features.append('equipment_type_class')
if 'equipment_redundancy_class' in df.columns:
    categorical_features.append('equipment_redundancy_class')
if 'Section propriΓ©taire' in df.columns:
    categorical_features.append('Section propriΓ©taire')

# Keep only the engineered features that actually exist in this dataset.
all_engineered_features = high_impact_features + important_features
available_features = [feat for feat in all_engineered_features if feat in df.columns]

print(f"High-impact features (>0.15 correlation): {len([f for f in high_impact_features if f in df.columns])}")
print(f"Additional important features: {len([f for f in important_features if f in df.columns])}")
print(f"Text features: {len(text_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Total engineered features: {len(available_features)}")

# Impute numeric features with 0 and coerce booleans to 0/1.
for col in available_features:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(0)
    elif df[col].dtype == 'bool':
        df[col] = df[col].astype(int).fillna(0)

# Impute categoricals with an explicit 'Unknown' bucket.
for col in categorical_features:
    df[col] = df[col].fillna('Unknown')

# Missing descriptions become empty strings so TF-IDF never sees NaN.
df['Description'] = df['Description'].fillna('')

print("βœ“ Feature preparation completed")
221
+
222
+ # ============== STEP 5: ENHANCED PREPROCESSING PIPELINES ==============
223
+ print("\n" + "="*60)
224
+ print("STEP 5: ENHANCED PREPROCESSING PIPELINES")
225
+ print("="*60)
226
+
227
+ # --- FIX #1b: Define the column name as a string for the ColumnTransformer ---
228
+ # This ensures the TfidfVectorizer receives a 1D Series instead of a 2D DataFrame.
229
+ text_feature_name_for_transformer = 'Description'
230
+
231
+ # Enhanced text preprocessing
232
+ text_pipeline = Pipeline([
233
+ ('tfidf', TfidfVectorizer(
234
+ max_features=1500, # Increased for better text representation
235
+ stop_words=None,
236
+ ngram_range=(1, 2),
237
+ min_df=2,
238
+ max_df=0.95,
239
+ lowercase=True,
240
+ strip_accents='unicode',
241
+ sublinear_tf=True # Better for high-dimensional data
242
+ ))
243
+ ])
244
+
245
+ # Numerical features preprocessing
246
+ numerical_pipeline = Pipeline([
247
+ ('scaler', StandardScaler())
248
+ ])
249
+
250
+ # Categorical features preprocessing
251
+ categorical_pipeline = Pipeline([
252
+ ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
253
+ ])
254
+
255
+ # Combined preprocessing
256
+ transformers = [
257
+ # --- FIX #1c: Use the string variable here ---
258
+ ('text', text_pipeline, text_feature_name_for_transformer),
259
+ ('numerical', numerical_pipeline, available_features)
260
+ ]
261
+
262
+ if categorical_features:
263
+ transformers.append(('categorical', categorical_pipeline, categorical_features))
264
+
265
+ preprocessor = ColumnTransformer(transformers, remainder='drop')
266
+
267
+ print("βœ“ Enhanced preprocessing pipelines created")
268
+ print(f" Text processing: 1 feature β†’ 1500 TF-IDF features")
269
+ print(f" Numerical processing: {len(available_features)} features")
270
+ print(f" Categorical processing: {len(categorical_features)} features")
271
+
272
# ============== STEP 6: ENHANCED DATA SPLITTING WITH CRITICALITY STRATIFICATION ==============
print("\n" + "=" * 60)
print("STEP 6: ENHANCED DATA SPLITTING WITH CRITICALITY STRATIFICATION")
print("=" * 60)

# Assemble the model input matrix: free text + engineered numerics + categoricals.
feature_columns = [*text_features, *available_features, *categorical_features]
X = df.loc[:, feature_columns].copy()

# Combined criticality = sum of the three target scores; it is used below
# only to build stratification groups for the train/test split.
df['combined_criticality'] = (
    df['Fiabilité Intégrité'] + df['Disponibilté'] + df['Process Safety']
)
# Create stratification groups to ensure critical cases in test set
def create_stratification_groups(criticality_scores):
    """Create stratification groups ensuring critical cases in test set.

    Maps each combined-criticality score to one of five bands:
    >=12 'very_critical', >=10 'critical', >=8 'high', >=6 'medium',
    otherwise 'low'. Returns one label per input score, in order.
    """
    # Highest threshold first so the first match wins.
    bands = [
        (12, 'very_critical'),
        (10, 'critical'),
        (8, 'high'),
        (6, 'medium'),
    ]
    labels = []
    for score in criticality_scores:
        label = 'low'  # default band when no threshold is reached
        for threshold, name in bands:
            if score >= threshold:
                label = name
                break
        labels.append(label)
    return labels
# Label every row with its criticality band, then stratify the split on it.
stratification_groups = create_stratification_groups(df['combined_criticality'])
df['stratification_group'] = stratification_groups

print(f"Stratification group distribution:")
for group, count in pd.Series(stratification_groups).value_counts().items():
    percentage = count / len(df) * 100
    print(f" {group}: {count} cases ({percentage:.1f}%)")

# Enhanced splitting strategy - single split for all targets using combined criticality
print(f"\nUsing combined criticality stratification for consistent test sets...")

# Filter out groups with too few samples for stratification
# (train_test_split's stratify needs at least 2 members per class; >=4 keeps
# both splits populated).
group_counts = pd.Series(stratification_groups).value_counts()
valid_groups = group_counts[group_counts >= 4].index
# NOTE(review): pd.Series(stratification_groups) gets a fresh RangeIndex;
# boolean indexing of df with it assumes df's index is also 0..n-1 -- confirm
# upstream that df was reset_index'ed, otherwise alignment could misfilter.
valid_mask = pd.Series(stratification_groups).isin(valid_groups)

df_filtered = df[valid_mask].copy()
X_filtered = df_filtered[feature_columns]
stratification_filtered = df_filtered['stratification_group']

print(f"Filtered dataset: {len(df_filtered)} samples (removed {len(df) - len(df_filtered)} rare cases)")

# Single stratified split for consistency across all targets.
# The group labels themselves are discarded (only the row partition matters).
X_train_base, X_test_base, _, _ = train_test_split(
    X_filtered, stratification_filtered,
    test_size=0.2,
    random_state=42,
    stratify=stratification_filtered
)

# Check critical cases in splits (look up criticality by the split row index)
train_criticality = df_filtered.loc[X_train_base.index, 'combined_criticality']
test_criticality = df_filtered.loc[X_test_base.index, 'combined_criticality']

train_critical_cases = (train_criticality >= 10).sum()
test_critical_cases = (test_criticality >= 10).sum()

print(f"\nCritical case distribution after stratification:")
print(f" Training critical cases (≥10): {train_critical_cases}")
print(f" Test critical cases (≥10): {test_critical_cases}")
print(f" Test set critical case rate: {test_critical_cases/len(X_test_base)*100:.1f}%")

# Initialize dictionaries for each target
X_train_dict, X_test_dict, y_train_dict, y_test_dict = {}, {}, {}, {}
sample_weights_dict = {}

# Create consistent splits for each target: every target shares the SAME
# base row split; only the y column (and its sample weights) differ.
for target in target_columns:
    print(f"\nPreparing data for {target}...")

    # Use the same base splits for all targets
    X_train_dict[target] = X_train_base
    X_test_dict[target] = X_test_base
    y_train_dict[target] = df_filtered.loc[X_train_base.index, target]
    y_test_dict[target] = df_filtered.loc[X_test_base.index, target]

    # Calculate sample weights for training (optional inputs default to None
    # when the corresponding columns are absent)
    train_equipment_types = None
    train_label_confidence = None

    if 'equipment_type_class' in df_filtered.columns:
        train_equipment_types = df_filtered.loc[X_train_base.index, 'equipment_type_class'].values
    if 'label_confidence' in df_filtered.columns:
        train_label_confidence = df_filtered.loc[X_train_base.index, 'label_confidence'].values

    # calculate_sample_weights is defined earlier in this file (not visible
    # in this chunk); it combines class, equipment, and confidence weighting.
    sample_weights = calculate_sample_weights(
        y_train_dict[target].values,
        train_equipment_types,
        train_label_confidence
    )
    sample_weights_dict[target] = sample_weights

    print(f" Training set: {len(X_train_dict[target])} samples")
    print(f" Test set: {len(X_test_dict[target])} samples")
    print(f" Training class distribution: {dict(y_train_dict[target].value_counts().sort_index())}")
    print(f" Sample weights range: {sample_weights.min():.2f} - {sample_weights.max():.2f}")

print(f"\n✓ Enhanced stratification completed - Critical cases preserved in test set!")
# ============== STEP 7: CONSERVATIVE MODEL TRAINING ==============
print("\n" + "="*60)
print("STEP 7: CONSERVATIVE MODEL TRAINING WITH COST-SENSITIVE LEARNING")
print("="*60)

# Enhanced LightGBM parameters for conservative prediction
conservative_lgbm_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,  # Lower learning rate for better generalization
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    'n_estimators': 500,  # More estimators with lower learning rate
    'class_weight': 'balanced',
    'min_child_samples': 20,  # Prevent overfitting
    'reg_alpha': 0.1,  # L1 regularization
    'reg_lambda': 0.1,  # L2 regularization
}

# Store trained models and performance
trained_models = {}
model_performance = {}
business_metrics = {}

for target in target_columns:
    print(f"\n" + "-"*50)
    print(f"TRAINING CONSERVATIVE MODEL FOR: {target}")
    print("-"*50)

    # Get data for this target (all targets share the same base row split)
    X_train = X_train_dict[target]
    X_test = X_test_dict[target]
    y_train = y_train_dict[target]
    y_test = y_test_dict[target]
    sample_weights = sample_weights_dict[target]

    # Prepare model parameters
    unique_classes = sorted(y_train.unique())
    num_classes = len(unique_classes)
    current_params = conservative_lgbm_params.copy()
    current_params['num_class'] = num_classes

    print(f"Classes: {unique_classes} (total: {num_classes})")

    # Enhanced SMOTE for better minority class handling: k must stay below
    # the smallest class size or SMOTE cannot find enough neighbors.
    min_class_size = min(y_train.value_counts())
    k_neighbors = min(3, min_class_size - 1) if min_class_size > 1 else 1

    # Use BorderlineSMOTE for better boundary detection
    if num_classes > 2 and min_class_size > 1:
        try:
            smote = BorderlineSMOTE(
                random_state=42,
                k_neighbors=k_neighbors,
                sampling_strategy='auto'  # Only oversample minority classes
            )
            model_pipeline = ImbPipeline([
                ('preprocessor', preprocessor),
                ('smote', smote),
                ('classifier', LGBMClassifier(**current_params))
            ])
            print(f"Using BorderlineSMOTE with k_neighbors={k_neighbors}")
        except Exception:
            # FIX: was a bare `except:` (would also swallow KeyboardInterrupt
            # / SystemExit). Fallback to standard SMOTE on any failure.
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            model_pipeline = ImbPipeline([
                ('preprocessor', preprocessor),
                ('smote', smote),
                ('classifier', LGBMClassifier(**current_params))
            ])
            print(f"Using standard SMOTE with k_neighbors={k_neighbors}")
    else:
        model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LGBMClassifier(**current_params))
        ])
        print("Using standard pipeline (no SMOTE)")

    # Train with sample weights
    print("Training in progress...")
    if 'smote' in model_pipeline.named_steps:
        # SMOTE pipeline: the resampler changes the number of rows, so the
        # precomputed per-row weights cannot be passed through -- they are
        # intentionally dropped here (class balance is handled by SMOTE +
        # class_weight='balanced' instead).
        model_pipeline.fit(X_train, y_train)
    else:
        # Standard pipeline - use sample weights directly
        model_pipeline.fit(X_train, y_train,
                           classifier__sample_weight=sample_weights)

    # Make predictions
    y_pred_train = model_pipeline.predict(X_train)
    y_pred_test = model_pipeline.predict(X_test)
    y_pred_proba_test = model_pipeline.predict_proba(X_test)

    # Standard metrics
    train_accuracy = (y_pred_train == y_train).mean()
    test_accuracy = (y_pred_test == y_test).mean()
    test_mae = mean_absolute_error(y_test, y_pred_test)

    # Business-critical metrics
    high_value_mask = y_test >= 4  # High component values
    if high_value_mask.sum() > 0:
        high_value_recall = recall_score(y_test, y_pred_test, labels=[4, 5], average='macro', zero_division=0)
        high_value_precision = precision_score(y_test, y_pred_test, labels=[4, 5], average='macro', zero_division=0)

        # Underestimation analysis for high values.
        # FIX: the original used `underestimated.mean()`, which divides by
        # the WHOLE test set and therefore dilutes the rate. The intended
        # quantity is the share of high-value cases that were underestimated.
        underestimated = (y_test > y_pred_test) & high_value_mask
        underestimation_rate = underestimated.sum() / high_value_mask.sum()

        print(f"HIGH-VALUE COMPONENT PERFORMANCE:")
        print(f" Recall for values 4-5: {high_value_recall:.3f}")
        print(f" Precision for values 4-5: {high_value_precision:.3f}")
        print(f" Underestimation rate: {underestimation_rate:.3f}")
    else:
        high_value_recall = 0
        high_value_precision = 0
        underestimation_rate = 0
        print("No high-value cases in test set")

    print(f"OVERALL PERFORMANCE:")
    print(f" Training Accuracy: {train_accuracy:.3f}")
    print(f" Test Accuracy: {test_accuracy:.3f}")
    print(f" Test MAE: {test_mae:.3f}")

    # Store results
    trained_models[target] = model_pipeline
    model_performance[target] = {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'test_mae': test_mae,
        'predictions': y_pred_test,
        'probabilities': y_pred_proba_test,
        'unique_classes': unique_classes
    }

    business_metrics[target] = {
        'high_value_recall': high_value_recall,
        'high_value_precision': high_value_precision,
        'underestimation_rate': underestimation_rate,
        'total_high_value_cases': high_value_mask.sum()
    }

    # Classification report
    print(f"\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred_test, zero_division=0))
# ============== STEP 8: OVERALL CRITICALITY ANALYSIS ==============
print("\n" + "="*60)
print("STEP 8: OVERALL CRITICALITY PREDICTION ANALYSIS")
print("="*60)

# Calculate combined criticality predictions for common test set
print(f"\nCalculating combined criticality for {len(X_test_base)} test samples...")

predicted_criticality = np.zeros(len(X_test_base))
actual_criticality = df_filtered.loc[X_test_base.index, 'combined_criticality'].values

# Get predictions for each target and sum them -- mirrors how
# 'combined_criticality' was built (sum of the three target scores).
for target in target_columns:
    model = trained_models[target]
    target_predictions = model.predict(X_test_base)
    predicted_criticality += target_predictions

# Class labels are integral, so the float accumulator casts losslessly.
predicted_criticality = predicted_criticality.astype(int)

print(f"Actual criticality range: {actual_criticality.min()} - {actual_criticality.max()}")
print(f"Predicted criticality range: {predicted_criticality.min()} - {predicted_criticality.max()}")


# Business impact analysis: two alert thresholds on the combined score
critical_threshold = 10
very_critical_threshold = 12

critical_actual = actual_criticality >= critical_threshold
critical_predicted = predicted_criticality >= critical_threshold

very_critical_actual = actual_criticality >= very_critical_threshold
very_critical_predicted = predicted_criticality >= very_critical_threshold

# Calculate business metrics (guards avoid recall/precision on empty positives)
overall_mae = mean_absolute_error(actual_criticality, predicted_criticality)
critical_recall = recall_score(critical_actual, critical_predicted) if critical_actual.sum() > 0 else 0
critical_precision = precision_score(critical_actual, critical_predicted) if critical_predicted.sum() > 0 else 0

# Conservative prediction analysis: share of predictions at or above actual,
# and count of dangerous misses (actual >=10 predicted as <=6)
conservative_score = (predicted_criticality >= actual_criticality).mean()
severe_underestimation = ((actual_criticality >= 10) & (predicted_criticality <= 6)).sum()

print(f"OVERALL CRITICALITY PERFORMANCE:")
print(f"Total test samples: {len(actual_criticality)}")
print(f"Combined MAE: {overall_mae:.3f}")
print(f"Conservative prediction rate: {conservative_score:.3f}")
print(f"Severe underestimation cases (actual≥10, pred≤6): {severe_underestimation}")

print(f"\nCRITICAL CASE DETECTION (≥{critical_threshold}):")
print(f"Actual critical cases: {critical_actual.sum()}")
print(f"Predicted critical cases: {critical_predicted.sum()}")
print(f"Critical case recall: {critical_recall:.3f}")
print(f"Critical case precision: {critical_precision:.3f}")

if very_critical_actual.sum() > 0:
    very_critical_recall = recall_score(very_critical_actual, very_critical_predicted)
    print(f"\nVERY CRITICAL CASE DETECTION (≥{very_critical_threshold}):")
    print(f"Very critical recall: {very_critical_recall:.3f}")
else:
    print(f"\nNo very critical cases (≥{very_critical_threshold}) in test set")
# ============== STEP 9: EQUIPMENT-SPECIFIC ANALYSIS ==============
print("\n" + "="*60)
print("STEP 9: EQUIPMENT-SPECIFIC PERFORMANCE ANALYSIS")
print("="*60)

# Equipment-specific performance analysis
# --- FIX #2: Check if the test set is not empty ---
if 'equipment_type_class' in df.columns and not X_test_base.empty:
    print("Equipment-specific performance analysis:")

    # Get equipment types for the common test set (aligned with
    # actual_criticality / predicted_criticality by position)
    equipment_types_test = df_filtered.loc[X_test_base.index, 'equipment_type_class'].values

    # Analyze by equipment type
    equipment_performance = {}
    for eq_type in set(equipment_types_test):
        eq_mask = equipment_types_test == eq_type
        if eq_mask.sum() > 0:
            eq_actual = actual_criticality[eq_mask]
            eq_predicted = predicted_criticality[eq_mask]

            eq_mae = mean_absolute_error(eq_actual, eq_predicted)
            eq_conservative = (eq_predicted >= eq_actual).mean()

            # Critical case detection for this equipment type
            eq_critical_actual = eq_actual >= critical_threshold
            eq_critical_predicted = eq_predicted >= critical_threshold

            # Recall is undefined when this equipment type has no critical
            # cases; NaN flags that for the reporting below.
            if eq_critical_actual.sum() > 0:
                eq_critical_recall = recall_score(eq_critical_actual, eq_critical_predicted)
            else:
                eq_critical_recall = np.nan

            equipment_performance[eq_type] = {
                'samples': eq_mask.sum(),
                'mae': eq_mae,
                'conservative_rate': eq_conservative,
                'critical_cases': eq_critical_actual.sum(),
                'critical_recall': eq_critical_recall
            }

            print(f"\n{eq_type}:")
            print(f" Samples: {eq_mask.sum()}")
            print(f" MAE: {eq_mae:.3f}")
            print(f" Conservative rate: {eq_conservative:.3f}")
            print(f" Critical cases: {eq_critical_actual.sum()}")
            if not np.isnan(eq_critical_recall):
                print(f" Critical recall: {eq_critical_recall:.3f}")
            else:
                print(f" Critical recall: N/A (no critical cases)")
else:
    # Handle the case where equipment performance can't be calculated;
    # later steps iterate this dict, so it must always exist.
    equipment_performance = {}
# ============== STEP 10: SAVE ENHANCED MODELS ==============
print("\n" + "="*60)
print("STEP 10: SAVING ENHANCED MODELS AND METADATA")
print("="*60)

# Save individual models -- one joblib file per target, with spaces and
# accented 'é' normalized so the filename is filesystem-safe.
for target in target_columns:
    model_filename = f"enhanced_model_{target.replace(' ', '_').replace('é', 'e')}_v2.joblib"
    joblib.dump(trained_models[target], model_filename)
    print(f"✓ Saved {target} model to {model_filename}")

# Enhanced feature info with training metadata
enhanced_feature_info = {
    'text_features': text_features,
    'numerical_features': available_features,
    'categorical_features': categorical_features,
    'high_impact_features': high_impact_features,
    'all_feature_columns': feature_columns,
    'target_columns': target_columns,

    # Training configuration
    'training_config': {
        'conservative_lgbm_params': conservative_lgbm_params,
        'cost_sensitive_learning': True,
        'smote_enabled': True,
        'sample_weighting': True,
        'preprocessing_enhanced': True
    },

    # Model performance (per-sample prediction arrays stripped to keep the
    # metadata file small)
    'model_performance': {k: {key: val for key, val in v.items()
                              if key not in ['predictions', 'probabilities']}
                          for k, v in model_performance.items()},

    # Business metrics
    'business_metrics': business_metrics,

    # Overall performance
    'overall_performance': {
        'combined_mae': float(overall_mae),
        'conservative_prediction_rate': float(conservative_score),
        'critical_case_recall': float(critical_recall) if not np.isnan(critical_recall) else None,
        'critical_case_precision': float(critical_precision) if not np.isnan(critical_precision) else None,
        'severe_underestimation_cases': int(severe_underestimation),
        'total_critical_cases': int(critical_actual.sum()),
        'equipment_specific_performance': equipment_performance if 'equipment_type_class' in df.columns else None
    },

    # Data characteristics
    # NOTE(review): `critical_cases` is defined earlier in the file (not in
    # this chunk) -- presumably the rows flagged critical during analysis.
    'data_characteristics': {
        'total_samples': len(df),
        'total_features': len(feature_columns),
        'critical_cases_in_data': len(critical_cases),
        'equipment_types_available': 'equipment_type_class' in df.columns,
        'label_confidence_available': 'label_confidence' in df.columns
    }
}

joblib.dump(enhanced_feature_info, 'enhanced_model_metadata_v2.joblib')
print("✓ Saved enhanced model metadata to enhanced_model_metadata_v2.joblib")
# ============== STEP 11: ENHANCED VISUALIZATIONS ==============
# Builds a 3x4 matplotlib dashboard summarizing per-target and combined
# performance, then saves it to PNG. Subplots with data dependencies fall
# back to a placeholder text when their inputs are unavailable.
print("\n" + "="*60)
print("STEP 11: CREATING ENHANCED PERFORMANCE VISUALIZATIONS")
print("="*60)

# Create comprehensive performance dashboard
fig = plt.figure(figsize=(20, 16))

# 1. Model Performance Comparison (train vs test accuracy per target)
plt.subplot(3, 4, 1)
targets = list(model_performance.keys())
train_accs = [model_performance[t]['train_accuracy'] for t in targets]
test_accs = [model_performance[t]['test_accuracy'] for t in targets]

x_pos = np.arange(len(targets))
plt.bar(x_pos - 0.2, train_accs, 0.4, label='Training', alpha=0.8)
plt.bar(x_pos + 0.2, test_accs, 0.4, label='Test', alpha=0.8)
plt.xlabel('Target Variables')
plt.ylabel('Accuracy')
plt.title('Enhanced Model Accuracy')
plt.xticks(x_pos, [t.replace(' ', '\n') for t in targets], rotation=0)
plt.legend()
plt.grid(True, alpha=0.3)

# 2. Business Metrics Performance
plt.subplot(3, 4, 2)
high_value_recalls = [business_metrics[t]['high_value_recall'] for t in targets]
underestimation_rates = [business_metrics[t]['underestimation_rate'] for t in targets]

x_pos = np.arange(len(targets))
plt.bar(x_pos - 0.2, high_value_recalls, 0.4, label='High Value Recall', alpha=0.8)
plt.bar(x_pos + 0.2, underestimation_rates, 0.4, label='Underestimation Rate', alpha=0.8, color='red')
plt.xlabel('Target Variables')
plt.ylabel('Rate')
plt.title('Business-Critical Metrics')
plt.xticks(x_pos, [t.replace(' ', '\n') for t in targets], rotation=0)
plt.legend()
plt.grid(True, alpha=0.3)

# 3. Overall Criticality Prediction vs Actual (scatter + identity line)
plt.subplot(3, 4, 3)
plt.scatter(actual_criticality, predicted_criticality, alpha=0.6, s=30)
plt.plot([min(actual_criticality), max(actual_criticality)],
         [min(actual_criticality), max(actual_criticality)], 'r--', linewidth=2)
plt.xlabel('Actual Criticité')
plt.ylabel('Predicted Criticité')
plt.title('Criticality Prediction vs Actual')
plt.grid(True, alpha=0.3)

# Add conservative prediction line (identity shifted down by 1)
if len(actual_criticality) > 0:
    plt.plot([min(actual_criticality), max(actual_criticality)],
             [min(actual_criticality)-1, max(actual_criticality)-1], 'g--',
             linewidth=1, alpha=0.7, label='Conservative Line')
    plt.legend()

# 4. Critical Case Detection Analysis
plt.subplot(3, 4, 4)
critical_analysis_data = {
    'Actual Critical': critical_actual.sum(),
    'Predicted Critical': critical_predicted.sum(),
    'True Positives': (critical_actual & critical_predicted).sum(),
    'False Negatives': (critical_actual & ~critical_predicted).sum()
}

plt.bar(critical_analysis_data.keys(), critical_analysis_data.values(),
        color=['blue', 'orange', 'green', 'red'], alpha=0.7)
plt.ylabel('Count')
plt.title('Critical Case Detection Analysis')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# 5. Equipment Type Performance (if available)
plt.subplot(3, 4, 5)
if 'equipment_type_class' in df.columns and equipment_performance:
    eq_types = list(equipment_performance.keys())[:8]  # Top 8 equipment types
    eq_maes = [equipment_performance[eq]['mae'] for eq in eq_types]

    plt.barh(range(len(eq_types)), eq_maes, alpha=0.7)
    plt.yticks(range(len(eq_types)), [eq.replace('_', '\n') for eq in eq_types])
    plt.xlabel('MAE')
    plt.title('Equipment-Specific MAE')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'Equipment\nPerformance\nNot Available',
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Equipment Performance')

# 6. Confusion Matrix for Combined Criticality
plt.subplot(3, 4, 6)
if len(actual_criticality) > 0:
    criticality_bins = [3, 6, 9, 12, 15]  # Bin the criticality for better visualization
    actual_binned = np.digitize(actual_criticality, criticality_bins)
    predicted_binned = np.digitize(predicted_criticality, criticality_bins)

    # NOTE(review): the tick-label lists have 6 entries while the matrix
    # only has as many rows/cols as bins actually observed -- if some bins
    # are empty the labels may misalign; confirm on real data.
    cm = confusion_matrix(actual_binned, predicted_binned)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=[f'<{b}' for b in criticality_bins] + [f'>={criticality_bins[-1]}'],
                yticklabels=[f'<{b}' for b in criticality_bins] + [f'>={criticality_bins[-1]}'])
    plt.xlabel('Predicted Criticality Range')
    plt.ylabel('Actual Criticality Range')
    plt.title('Criticality Confusion Matrix')
else:
    plt.text(0.5, 0.5, 'No Test Data\nfor Confusion Matrix', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Criticality Confusion Matrix')

# 7. Feature Importance (from metadata)
# `feature_metadata` is loaded earlier in the file (not in this chunk).
plt.subplot(3, 4, 7)
if feature_metadata and 'feature_correlations' in feature_metadata:
    correlations = feature_metadata.get('feature_correlations', [])[:10]  # Top 10
    if correlations:
        features = [item['Feature'] for item in correlations]
        corr_values = [abs(item['Correlation']) for item in correlations]

        plt.barh(range(len(features)), corr_values, alpha=0.7)
        plt.yticks(range(len(features)), [f.replace('_', '\n') for f in features])
        plt.xlabel('|Correlation|')
        plt.title('Top Feature Correlations')
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No Feature\nCorrelations Found', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Feature Importance')
else:
    plt.text(0.5, 0.5, 'Feature\nCorrelations\nNot Available',
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Feature Importance')


# 8. Conservative Prediction Analysis (pie of over/exact/under prediction)
plt.subplot(3, 4, 8)
if len(actual_criticality) > 0:
    conservative_analysis = {
        'Conservative': (predicted_criticality >= actual_criticality).sum(),
        'Exact': (predicted_criticality == actual_criticality).sum(),
        'Underestimated': (predicted_criticality < actual_criticality).sum()
    }

    colors = ['green', 'blue', 'red']
    plt.pie(conservative_analysis.values(), labels=conservative_analysis.keys(),
            autopct='%1.1f%%', colors=colors, startangle=90)
    plt.title('Prediction Conservatism Analysis')
else:
    plt.text(0.5, 0.5, 'No Data for\nConservatism Analysis', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Prediction Conservatism Analysis')


# 9. MAE by Target
plt.subplot(3, 4, 9)
target_maes = [model_performance[t]['test_mae'] for t in targets]
plt.bar(targets, target_maes, alpha=0.7, color='orange')
plt.xlabel('Target Variables')
plt.ylabel('MAE')
plt.title('Mean Absolute Error by Target')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# 10. Error Distribution (signed errors; right of 0 = overestimation)
plt.subplot(3, 4, 10)
if len(actual_criticality) > 0:
    errors = predicted_criticality - actual_criticality
    plt.hist(errors, bins=20, alpha=0.7, edgecolor='black')
    plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
    plt.xlabel('Prediction Error (Pred - Actual)')
    plt.ylabel('Frequency')
    plt.title('Error Distribution')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'No Data for\nError Distribution', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Error Distribution')


# 11. Critical Equipment Performance (recall for a fixed high-risk subset)
plt.subplot(3, 4, 11)
if 'equipment_type_class' in df.columns and equipment_performance:
    critical_equipment = ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL', 'TURBINE_SYSTEMS']
    critical_eq_data = {eq: equipment_performance.get(eq, {}).get('critical_recall', 0)
                        for eq in critical_equipment if eq in equipment_performance}

    if critical_eq_data:
        plt.bar(critical_eq_data.keys(), critical_eq_data.values(), alpha=0.7)
        plt.ylabel('Critical Case Recall')
        plt.title('Critical Equipment Performance')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'Critical Equipment\nData Not Available\nin Test Set',
                 ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Critical Equipment Performance')
else:
    plt.text(0.5, 0.5, 'Equipment Data\nNot Available',
             ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Critical Equipment Performance')

# 12. Training Summary (text panel, axes hidden)
plt.subplot(3, 4, 12)
plt.axis('off')
summary_text = f"""ENHANCED TRAINING SUMMARY

Dataset: {len(df):,} samples
Features: {len(feature_columns)} total
- Text: {len(text_features)}
- Numerical: {len(available_features)}
- Categorical: {len(categorical_features)}

Performance:
- Combined MAE: {overall_mae:.3f}
- Conservative Rate: {conservative_score:.3f}
- Critical Recall: {critical_recall:.3f}

Enhancements:
✓ Equipment Intelligence
✓ Cost-Sensitive Learning
✓ Sample Weighting
✓ Enhanced SMOTE
✓ Conservative Parameters

Business Impact:
- Severe Underestimation: {severe_underestimation} cases
- Critical Cases Detected: {critical_predicted.sum()}/{critical_actual.sum()}
"""

plt.text(0.05, 0.95, summary_text, transform=plt.gca().transAxes,
         fontsize=9, verticalalignment='top', fontfamily='monospace')

plt.tight_layout()
plt.savefig('enhanced_model_performance_dashboard_v2.png', dpi=300, bbox_inches='tight')
print("✓ Enhanced performance dashboard saved as 'enhanced_model_performance_dashboard_v2.png'")
# ============== STEP 12: SAFETY OVERRIDE RULES ==============
# Section banner only; the rule definitions follow below.
print("\n" + "="*60)
print("STEP 12: IMPLEMENTING SAFETY OVERRIDE RULES")
print("="*60)
def create_safety_override_rules():
    """
    Create safety override rules for conservative prediction.

    Returns a dict keyed by rule name; each value carries a 'condition'
    (feature expression that triggers the rule), an 'action' (adjustment
    to apply), and a human-readable 'description'.
    """
    # (name, condition, action, description) specs, expanded below.
    rule_specs = [
        ('structural_failure_override',
         'has_structural_failure == 1',
         'min_criticality = 9',
         'Any structural failure gets minimum criticality 9'),
        ('electrical_critical_equipment',
         'equipment_type_class == "ELECTRICAL_CRITICAL"',
         'apply_conservative_threshold = 0.7',
         'Lower confidence threshold for electrical critical equipment'),
        ('cooling_critical_equipment',
         'equipment_type_class == "COOLING_CRITICAL"',
         'min_criticality = 10',
         'Cooling critical equipment gets minimum criticality 10'),
        ('safety_mention_boost',
         'has_safety_mention == 1',
         'add_criticality_boost = 2',
         'SAFETY mentions get +2 criticality boost'),
        ('turbine_oil_issue',
         'turbine_oil_issue == 1',
         'min_criticality = 8',
         'Turbine oil issues get minimum criticality 8'),
    ]
    return {
        name: {'condition': condition, 'action': action, 'description': description}
        for name, condition, action, description in rule_specs
    }
# Materialize the rule set, echo it for the run log, and persist it so the
# inference service can load the exact same overrides.
safety_rules = create_safety_override_rules()

print("Safety Override Rules Created:")
for rule_name, rule_info in safety_rules.items():
    print(f" {rule_name}:")
    # Each rule carries the same three fields; print them in a fixed order.
    for label in ('Condition', 'Action', 'Description'):
        print(f" {label}: {rule_info[label.lower()]}")

# Save safety rules
with open('safety_override_rules_v2.json', 'w') as f:
    json.dump(safety_rules, f, indent=2)
print("βœ“ Safety override rules saved to safety_override_rules_v2.json")
987
# ============== STEP 13: FINAL RECOMMENDATIONS ==============
step13_banner = "=" * 60
print("\n" + step13_banner)
print("STEP 13: ENHANCED MODEL RECOMMENDATIONS")
print(step13_banner)

# Headline metrics (overall_mae, conservative_score, critical_recall,
# severe_underestimation) are computed earlier in the pipeline.
print("🎯 ENHANCED MODEL PERFORMANCE ANALYSIS:")
print(f"βœ“ Overall MAE improved with equipment intelligence: {overall_mae:.3f}")
print(f"βœ“ Conservative prediction rate: {conservative_score:.3f} (good for safety)")
print(f"βœ“ Critical case recall: {critical_recall:.3f}")
print(f"βœ“ Severe underestimation reduced to: {severe_underestimation} cases")
998
+ print(f"\nπŸ”§ EQUIPMENT INTELLIGENCE IMPACT:")
999
+ for target in target_columns:
1000
+ performance = model_performance[target]
1001
+ business = business_metrics[target]
1002
+ print(f"{target}:")
1003
+ print(f" Test Accuracy: {performance['test_accuracy']:.3f}")
1004
+ print(f" High-Value Recall: {business['high_value_recall']:.3f}")
1005
+ print(f" Underestimation Rate: {business['underestimation_rate']:.3f}")
1006
+
1007
# Report only the equipment classes with the highest failure impact, and only
# when per-equipment metrics were computed at all.
if equipment_performance:
    print(f"\n⚑ HIGH-RISK EQUIPMENT PERFORMANCE:")
    critical_equipment_types = ['ELECTRICAL_CRITICAL', 'COOLING_CRITICAL', 'TURBINE_SYSTEMS']
    for eq_type in critical_equipment_types:
        if eq_type not in equipment_performance:
            continue  # class absent from this run's evaluation
        perf = equipment_performance[eq_type]
        print(f"{eq_type}:")
        print(f" MAE: {perf['mae']:.3f}")
        print(f" Conservative Rate: {perf['conservative_rate']:.3f}")
        # critical_recall is NaN when the class had no critical samples,
        # so skip the line rather than print 'nan'.
        if not np.isnan(perf['critical_recall']):
            print(f" Critical Recall: {perf['critical_recall']:.3f}")
1019
+ print(f"\nπŸš€ DEPLOYMENT RECOMMENDATIONS:")
1020
+ print(f"1. Use safety override rules for critical equipment")
1021
+ print(f"2. Apply conservative thresholds for ELECTRICAL_CRITICAL equipment")
1022
+ print(f"3. Implement manual review for predictions with low confidence")
1023
+ print(f"4. Monitor underestimation rate in production")
1024
+ print(f"5. Retrain quarterly with new data to maintain performance")
1025
+
1026
+ print(f"\nπŸ“Š BUSINESS IMPACT:")
1027
+ print(f"- Reduced risk of missing critical failures")
1028
+ print(f"- Better detection of electrical equipment issues")
1029
+ print(f"- Equipment-specific prediction strategies")
1030
+ print(f"- Conservative bias protects against safety risks")
1031
+
1032
# ============== FINAL SUMMARY ==============
final_banner = "=" * 80
print("\n" + final_banner)
print("ENHANCED TRAINING PIPELINE v2.0 COMPLETED!")
print(final_banner)

print(f"\nπŸ“ˆ TRAINING ACHIEVEMENTS:")
print(f"βœ“ Equipment Intelligence Integration: {len(categorical_features)} equipment features")
print(f"βœ“ Cost-Sensitive Learning: Implemented with sample weighting")
print(f"βœ“ Enhanced SMOTE: BorderlineSMOTE for better minority class handling")
print(f"βœ“ Conservative Parameters: Lower learning rate, higher regularization")
print(f"βœ“ Safety Override Rules: {len(safety_rules)} rules implemented")
print(f"βœ“ Business Metrics Focus: High-value recall and underestimation tracking")

print(f"\nπŸ“Š PERFORMANCE IMPROVEMENTS:")
print(f"Feature enhancement: 10 β†’ {len(feature_columns)} features")
# 'equipment_type_class' may be missing when equipment classification was
# skipped upstream; fall back to the literal 'N/A' in that case.
equipment_type_count = (len(df['equipment_type_class'].unique())
                        if 'equipment_type_class' in df.columns else 'N/A')
print(f"Equipment types classified: {equipment_type_count}")
print(f"Critical case detection: {critical_predicted.sum()}/{critical_actual.sum()} cases")
print(f"Conservative prediction bias: {conservative_score:.1%} of predictions")
1051
+ print(f"\nπŸ“ FILES GENERATED:")
1052
+ for target in target_columns:
1053
+ model_filename = f"enhanced_model_{target.replace(' ', '_').replace('Γ©', 'e')}_v2.joblib"
1054
+ print(f"βœ“ {model_filename}")
1055
+
1056
+ print("βœ“ enhanced_model_metadata_v2.joblib")
1057
+ print("βœ“ safety_override_rules_v2.json")
1058
+ print("βœ“ enhanced_model_performance_dashboard_v2.png")
1059
+
1060
+ print(f"\n🎯 NEXT STEP: UPDATE ANOMALY INTELLIGENCE")
1061
+ print("The inference system needs to be updated to use:")
1062
+ print("1. New enhanced models and metadata")
1063
+ print("2. Equipment intelligence features")
1064
+ print("3. Safety override rules")
1065
+ print("4. Conservative prediction thresholds")
1066
+
1067
+ print("\n" + "="*80)
1068
+ print("ENHANCED MODELS READY FOR PRODUCTION DEPLOYMENT!")
1069
+ print("="*80)