| |
| |
| |
|
|
| import pandas as pd |
| import numpy as np |
| import re |
| from collections import Counter, defaultdict |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| from wordcloud import WordCloud |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| print("="*70) |
| print("TAQATHON 2025 - ENHANCED DATA PROCESSING v2.0") |
| print("Equipment Intelligence + Dual-Field Analysis + Noise-Robust Features") |
| print("="*70) |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 1: LOADING DATA AND BASIC SETUP") |
| print("="*50) |
|
|
| |
| try: |
| df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle') |
| print(f"✓ Successfully loaded dataset: {df.shape}") |
| except FileNotFoundError: |
| print("❌ Error: Taqathon_data.xlsx not found!") |
| exit(1) |
|
|
| print("Columns:", df.columns.tolist()) |
|
|
| |
| print("\nMissing values per column:") |
| print(df.isnull().sum()) |
|
|
| |
| df = df.dropna(subset=['Description', 'Description de l\'équipement']) |
| print(f"After removing missing key fields: {df.shape}") |
|
|
| |
| df['Date de détéction de l\'anomalie'] = pd.to_datetime(df['Date de détéction de l\'anomalie']) |
|
|
| |
| df = df.drop_duplicates() |
| print(f"After removing duplicates: {df.shape}") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 2: EQUIPMENT INTELLIGENCE CLASSIFICATION") |
| print("="*50) |
|
|
| |
| EQUIPMENT_TYPE_SCORES = { |
| |
| 'ELECTRICAL_CRITICAL': { |
| 'keywords': ['ALTERNATEUR', 'TRANSFO PRINCIPAL', 'PROTECTION ALTERNATEUR'], |
| 'score': 8.0 |
| }, |
| |
| 'TURBINE_SYSTEMS': { |
| 'keywords': ['TURBINE', 'SOUPAPE REGULATRICE', 'REFRIGERANT HUILE', 'POMPE DE SOULÈVEMENT'], |
| 'score': 7.5 |
| }, |
| |
| 'COOLING_CRITICAL': { |
| 'keywords': ['VENTILATEUR DE REFROIDISSEMENT', 'REFROIDISSEMENT TP', 'MOTEUR VENTILATEUR DE REFROIDISSEMENT'], |
| 'score': 7.5 |
| }, |
| |
| 'ELECTRICAL_STANDARD': { |
| 'keywords': ['DISJONCTEUR', 'TRANSFORMATEUR', 'MOTEUR', 'ARMOIRE', 'GROUPE'], |
| 'score': 6.5 |
| }, |
| |
| 'HEATING_SYSTEMS': { |
| 'keywords': ['RECHAUFFEUR', 'RÉCHAUFFEUR', 'CHAUDIERE', 'CHAUDIÈRE'], |
| 'score': 6.5 |
| }, |
| |
| 'VENTILATION_SYSTEMS': { |
| 'keywords': ['VENTILATEUR', 'TIRAGE', 'SOUFFLAGE', 'AIR PRIMAIRE', 'AIR SECONDAIRE'], |
| 'score': 6.0 |
| }, |
| |
| 'PROCESS_SYSTEMS': { |
| 'keywords': ['POMPE', 'SOUPAPE', 'VANNE', 'CONVOYEUR', 'BROYEUR', 'COAL FEEDER'], |
| 'score': 5.5 |
| }, |
| |
| 'AUXILIARY_SYSTEMS': { |
| 'keywords': ['DECRASSEUR', 'DÉGRILLEUR', 'FILTRE', 'CAPTEUR', 'TRANSMETTEUR'], |
| 'score': 5.0 |
| } |
| } |
|
|
| |
| REDUNDANCY_PATTERNS = { |
| 'SINGLE_CRITICAL': { |
| 'patterns': [r'PRINCIPAL', r'UNIQUE', r'^(?!.*[AB]$)(?!.*N°[0-9])(?!.*[0-9]$)'], |
| 'multiplier': 1.3 |
| }, |
| 'DUAL_SYSTEM': { |
| 'patterns': [r'\b[AB]$', r'N°[12]$', r'PRIMAIRE$', r'SECONDAIRE$'], |
| 'multiplier': 1.0 |
| }, |
| 'MULTIPLE_SYSTEM': { |
| 'patterns': [r'N°[3-9]$', r'N°[0-9][0-9]$'], |
| 'multiplier': 0.8 |
| } |
| } |
|
|
| |
| SECTION_RISK_MULTIPLIERS = { |
| '34EL': 1.2, |
| '34MM': 1.1, |
| '34MD': 1.1, |
| '34MC': 1.0, |
| '34CT': 1.0 |
| } |
|
|
| def classify_equipment_type(equipment_desc): |
| """Classify equipment based on criticality analysis""" |
| equipment_upper = str(equipment_desc).upper() |
| |
| for category, info in EQUIPMENT_TYPE_SCORES.items(): |
| for keyword in info['keywords']: |
| if keyword in equipment_upper: |
| return category, info['score'] |
| |
| return 'UNKNOWN', 4.5 |
|
|
| def detect_equipment_redundancy(equipment_desc): |
| """Detect equipment redundancy based on naming patterns""" |
| equipment_upper = str(equipment_desc).upper() |
| |
| for redundancy_class, info in REDUNDANCY_PATTERNS.items(): |
| for pattern in info['patterns']: |
| if re.search(pattern, equipment_upper): |
| return redundancy_class, info['multiplier'] |
| |
| return 'UNKNOWN_REDUNDANCY', 1.0 |
|
|
| |
| print("Applying equipment intelligence classification...") |
|
|
| |
| equipment_classifications = df['Description de l\'équipement'].apply(classify_equipment_type) |
| df['equipment_type_class'] = [x[0] for x in equipment_classifications] |
| df['equipment_base_criticality'] = [x[1] for x in equipment_classifications] |
|
|
| |
| redundancy_classifications = df['Description de l\'équipement'].apply(detect_equipment_redundancy) |
| df['equipment_redundancy_class'] = [x[0] for x in redundancy_classifications] |
| df['equipment_redundancy_multiplier'] = [x[1] for x in redundancy_classifications] |
|
|
| |
| df['section_risk_multiplier'] = df['Section propriétaire'].map(SECTION_RISK_MULTIPLIERS).fillna(1.0) |
|
|
| |
| df['equipment_risk_score'] = (df['equipment_base_criticality'] * |
| df['equipment_redundancy_multiplier'] * |
| df['section_risk_multiplier']) |
|
|
| print("✓ Equipment intelligence classification completed") |
| print(f"Equipment type distribution:") |
| print(df['equipment_type_class'].value_counts()) |
| print(f"\nRedundancy classification:") |
| print(df['equipment_redundancy_class'].value_counts()) |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 3: DUAL-FIELD TEXT ANALYSIS") |
| print("="*50) |
|
|
| |
| df['combined_text'] = df['Description'].fillna('') + ' ' + df['Description de l\'équipement'].fillna('') |
| df['combined_text_lower'] = df['combined_text'].str.lower() |
|
|
| |
| df['description_length'] = df['Description'].str.len() |
| df['description_word_count'] = df['Description'].str.split().str.len() |
| df['equipment_desc_length'] = df['Description de l\'équipement'].str.len() |
| df['equipment_desc_word_count'] = df['Description de l\'équipement'].str.split().str.len() |
| df['combined_length'] = df['combined_text'].str.len() |
| df['combined_word_count'] = df['combined_text'].str.split().str.len() |
|
|
| print(f"Text analysis completed:") |
| print(f"Average description length: {df['description_length'].mean():.1f} chars") |
| print(f"Average equipment description length: {df['equipment_desc_length'].mean():.1f} chars") |
| print(f"Average combined length: {df['combined_length'].mean():.1f} chars") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 4: ENHANCED KEYWORD EXTRACTION (DUAL-FIELD)") |
| print("="*50) |
|
|
| |
| equipment_keywords = { |
| 'pompe': ['pompe', 'pompes'], |
| 'vanne': ['vanne', 'vannes'], |
| 'ventilateur': ['ventilateur', 'ventilateurs', 'ventilo'], |
| 'moteur': ['moteur', 'moteurs', 'moto'], |
| 'alternateur': ['alternateur', 'alternateurs'], |
| 'transformateur': ['transformateur', 'transformateurs', 'transfo'], |
| 'turbine': ['turbine', 'turbines'], |
| 'chaudière': ['chaudière', 'chaudières', 'chaudiere'], |
| 'réchauffeur': ['réchauffeur', 'réchauffeurs', 'rechauffeur'], |
| 'refroidissement': ['refroidissement', 'refroidisseur', 'refrigerant', 'réfrigérant'], |
| 'compresseur': ['compresseur', 'compresseurs'], |
| 'soupape': ['soupape', 'soupapes'], |
| 'décrasseur': ['décrasseur', 'décrasseurs', 'decrasseur'], |
| 'principal': ['principal', 'principale'], |
| 'groupe': ['groupe', 'groupes'], |
| 'protection': ['protection', 'protections'], |
| 'armoire': ['armoire', 'armoires'], |
| 'disjoncteur': ['disjoncteur', 'disjoncteurs'] |
| } |
|
|
| |
| problem_keywords = { |
| 'fuite': ['fuite', 'fuites', 'fuit', 'fuyant'], |
| 'vibration': ['vibration', 'vibrations', 'vibre'], |
| 'bruit_anormal': ['bruit anormal', 'bruit anormale'], |
| 'percement': ['percement', 'percé', 'percée'], |
| 'éclatement': ['éclatement', 'eclatement'], |
| 'fissure': ['fissure', 'fissuré', 'fissures'], |
| 'aggravation': ['aggravation'], |
| 'sifflement': ['sifflement', 'siffler'], |
| 'défaillance': ['défaillance', 'défaillant'], |
| 'dysfonctionnement': ['dysfonctionnement', 'dysfonctionnel'], |
| 'sens_inverse': ['sens inverse', 'sens contraire'], |
| 'détachés': ['détachés', 'détaché', 'detaches'], |
| 'corrosion': ['corrosion', 'corrodé', 'rouille'], |
| 'usure': ['usure', 'usé', 'usée'], |
| 'surchauffe': ['surchauffe', 'surchauffé', 'température élevée', 'temp elevee'], |
| 'blocage': ['blocage', 'bloqué', 'bloque', 'coincé'], |
| 'dégradation': ['dégradation', 'dégradé'], |
| 'obstruction': ['obstruction', 'obstrué', 'bouché', 'bouchage'] |
| } |
|
|
| |
| action_keywords = { |
| 'remplacement': ['remplacement', 'remplacer', 'remplacé', 'changement', 'changer'], |
| 'réparation': ['réparation', 'réparer', 'réparé'], |
| 'maintenance': ['maintenance', 'entretien'], |
| 'prévision': ['prévoir', 'prévoire', 'prevoir'], |
| 'soufflage': ['soufflage', 'souffler', 'soufflé'], |
| 'nettoyage': ['nettoyage', 'nettoyer', 'nettoyé'], |
| 'débouchage': ['débouchage', 'déboucher'], |
| 'inspection': ['inspection', 'inspecter', 'contrôle', 'contrôler'], |
| 'révision': ['révision', 'réviser'], |
| 'remise_état': ['remise en état', 'remise état'] |
| } |
|
|
| |
| urgency_keywords = { |
| 'safety': ['safety', 'sécurité'], |
| 'urgent': ['urgent', 'urgence'], |
| 'critique': ['critique', 'critiques'], |
| 'important': ['important', 'importante'], |
| 'immédiat': ['immédiat', 'immédiatement'], |
| 'prioritaire': ['prioritaire', 'priorité'], |
| 'grave': ['grave', 'graves'], |
| 'majeur': ['majeur', 'majeure'], |
| 'dangereux': ['dangereux', 'dangereuse', 'danger'], |
| 'risque': ['risque', 'risques', 'risqué'], |
| 'chute': ['chute', 'tomber'], |
| 'fréquent': ['fréquent', 'fréquente', 'répétitif', 'répétitive'] |
| } |
|
|
| def extract_keywords_dual_field(description, equipment_desc, keyword_dict): |
| """Extract keywords from both description and equipment description""" |
| combined_text = (str(description) + ' ' + str(equipment_desc)).lower() |
| found_keywords = [] |
| |
| for category, keywords in keyword_dict.items(): |
| for keyword in keywords: |
| if keyword in combined_text: |
| found_keywords.append(category) |
| break |
| |
| return found_keywords |
|
|
| |
| print("Extracting enhanced keywords from both fields...") |
|
|
| |
| df['equipment_mentioned'] = df.apply( |
| lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], equipment_keywords), |
| axis=1 |
| ) |
| df['equipment_count'] = df['equipment_mentioned'].str.len() |
|
|
| |
| df['problem_types'] = df.apply( |
| lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], problem_keywords), |
| axis=1 |
| ) |
| df['problem_count'] = df['problem_types'].str.len() |
|
|
| |
| df['actions_mentioned'] = df.apply( |
| lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], action_keywords), |
| axis=1 |
| ) |
| df['action_count'] = df['actions_mentioned'].str.len() |
|
|
| |
| df['urgency_indicators'] = df.apply( |
| lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], urgency_keywords), |
| axis=1 |
| ) |
| df['has_urgency'] = df['urgency_indicators'].str.len() > 0 |
|
|
| print(f"✓ Enhanced keyword extraction completed") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 5: CRITICAL FAILURE PATTERN DETECTION") |
| print("="*50) |
|
|
| |
| df['has_structural_failure'] = df['combined_text_lower'].str.contains( |
| 'percement|éclatement|eclatement|fissure|rupture', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['has_equipment_malfunction'] = df['combined_text_lower'].str.contains( |
| 'sens inverse|dysfonctionnement|défaillance|défaut|panne', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['has_escalation'] = df['combined_text_lower'].str.contains( |
| 'aggravation|empiré|empire', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['has_safety_mention'] = df['Description'].str.contains('SAFETY', case=False, na=False).astype(int) |
|
|
| |
| df['electrical_cooling_issue'] = ( |
| (df['equipment_type_class'].isin(['ELECTRICAL_CRITICAL', 'ELECTRICAL_STANDARD'])) & |
| (df['combined_text_lower'].str.contains('refroidissement|ventilateur|température', regex=True, na=False)) |
| ).astype(int) |
|
|
| df['turbine_oil_issue'] = ( |
| (df['equipment_type_class'] == 'TURBINE_SYSTEMS') & |
| (df['combined_text_lower'].str.contains('huile|fuite|graissage', regex=True, na=False)) |
| ).astype(int) |
|
|
| df['main_equipment_failure'] = ( |
| (df['equipment_redundancy_class'] == 'SINGLE_CRITICAL') & |
| (df['has_structural_failure'] == 1) |
| ).astype(int) |
|
|
| print(f"Critical failure patterns detected:") |
| print(f"Structural failures: {df['has_structural_failure'].sum()}") |
| print(f"Equipment malfunctions: {df['has_equipment_malfunction'].sum()}") |
| print(f"Escalation indicators: {df['has_escalation'].sum()}") |
| print(f"Electrical cooling issues: {df['electrical_cooling_issue'].sum()}") |
| print(f"Turbine oil issues: {df['turbine_oil_issue'].sum()}") |
| print(f"Main equipment failures: {df['main_equipment_failure'].sum()}") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 6: ENHANCED COMPOUND FEATURES") |
| print("="*50) |
|
|
| |
| df['fuite_vapeur'] = df['combined_text_lower'].str.contains('fuite.*vapeur|vapeur.*fuite', regex=True, na=False).astype(int) |
| df['fuite_huile'] = df['combined_text_lower'].str.contains('fuite.*huile|huile.*fuite', regex=True, na=False).astype(int) |
| df['fuite_eau'] = df['combined_text_lower'].str.contains('fuite.*eau|eau.*fuite', regex=True, na=False).astype(int) |
|
|
| |
| df['bruit_anormal'] = df['combined_text_lower'].str.contains('bruit anormal', regex=True, na=False).astype(int) |
| df['vibration_excessive'] = df['combined_text_lower'].str.contains( |
| 'vibration.*excessive|vibration.*élevée|vibration.*haute', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['temperature_elevee'] = df['combined_text_lower'].str.contains( |
| 'température élevée|temp élevée|temp elevee|surchauffe', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['maintenance_planning'] = df['combined_text_lower'].str.contains( |
| 'prévoir|prévoire|planifier|programmer', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['is_recurring'] = df['combined_text_lower'].str.contains( |
| 'fréquent|répétitif|souvent|plusieurs fois|encore', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['has_measurements'] = df['combined_text_lower'].str.contains( |
| r'\d+\s*°c|\d+\s*bar|\d+\s*%|\d+\s*mm|\d+\s*m3', regex=True, na=False |
| ).astype(int) |
|
|
| df['has_equipment_codes'] = df['combined_text_lower'].str.contains( |
| r'[A-Z0-9]{5,}|[0-9]{2}[A-Z]{3}[0-9]{2}', regex=True, na=False |
| ).astype(int) |
|
|
| |
| df['has_location_details'] = df['combined_text_lower'].str.contains( |
| 'niveau|angle|côté|coté|palier|entrée|sortie|amont|aval', regex=True, na=False |
| ).astype(int) |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 7: ADVANCED SEVERITY SCORING") |
| print("="*50) |
|
|
| |
| severity_words = { |
| 'critique': 4, 'critiques': 4, |
| 'grave': 4, 'graves': 4, |
| 'majeur': 4, 'majeure': 4, |
| 'important': 3, 'importante': 3, |
| 'total': 5, 'totale': 5, |
| 'complet': 5, 'complète': 5, |
| 'rupture': 5, 'éclatement': 5, 'eclatement': 5, |
| 'percement': 5, 'fissure': 4, |
| 'aggravation': 4, |
| 'sifflement': 3, |
| 'sens inverse': 5, |
| 'dysfonctionnement': 3, |
| 'défaillance': 3, |
| 'urgent': 3, 'urgence': 3, |
| 'immédiat': 3, 'immédiatement': 3, |
| 'dangereux': 4, 'dangereuse': 4, |
| 'léger': 1, 'légère': 1, |
| 'faible': 1, 'petit': 1, 'petite': 1, |
| 'normal': 1, 'normale': 1 |
| } |
|
|
| def calculate_enhanced_severity_score(text): |
| """Calculate severity score based on enhanced word analysis""" |
| text = str(text).lower() |
| max_score = 0 |
| word_count = 0 |
| |
| for word, weight in severity_words.items(): |
| if word in text: |
| max_score = max(max_score, weight) |
| word_count += 1 |
| |
| |
| if word_count > 1: |
| max_score += 0.5 |
| |
| return max_score |
|
|
| df['enhanced_severity_score'] = df['combined_text_lower'].apply(calculate_enhanced_severity_score) |
|
|
| |
| def calculate_equipment_problem_risk(equipment_type, problem_types, has_structural): |
| """Calculate compound risk based on equipment type and problem severity""" |
| base_risk = 1.0 |
| |
| |
| if equipment_type in ['ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS', 'COOLING_CRITICAL']: |
| base_risk = 1.5 |
| elif equipment_type in ['ELECTRICAL_STANDARD', 'HEATING_SYSTEMS']: |
| base_risk = 1.2 |
| |
| |
| if has_structural: |
| base_risk *= 2.0 |
| |
| |
| if 'vibration' in problem_types: |
| base_risk *= 1.3 |
| if 'fuite' in problem_types: |
| base_risk *= 1.2 |
| if 'bruit_anormal' in problem_types: |
| base_risk *= 1.2 |
| |
| return min(base_risk, 3.0) |
|
|
| df['equipment_problem_risk'] = df.apply( |
| lambda row: calculate_equipment_problem_risk( |
| row['equipment_type_class'], |
| row['problem_types'], |
| row['has_structural_failure'] |
| ), axis=1 |
| ) |
|
|
| |
| df['technical_complexity'] = ( |
| df['combined_word_count'] / 15 + |
| df['equipment_count'] + |
| df['problem_count'] + |
| df['has_measurements'] + |
| df['has_equipment_codes'] + |
| df['has_location_details'] |
| ) |
|
|
| print(f"✓ Advanced severity scoring completed") |
| print(f"Enhanced severity score distribution:") |
| print(df['enhanced_severity_score'].value_counts().sort_index()) |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 8: NOISE-ROBUST LABEL ANALYSIS") |
| print("="*50) |
|
|
| |
| def identify_label_inconsistencies(df, similarity_threshold=0.8): |
| """Identify potentially inconsistent labels for similar cases""" |
| |
| |
| similar_groups = df.groupby([ |
| 'equipment_type_class', |
| 'equipment_redundancy_class', |
| 'Section propriétaire' |
| ]) |
| |
| inconsistent_cases = [] |
| |
| for group_key, group_df in similar_groups: |
| if len(group_df) >= 3: |
| criticality_std = group_df['Criticité'].std() |
| criticality_mean = group_df['Criticité'].mean() |
| |
| if criticality_std > 3.0: |
| for idx, row in group_df.iterrows(): |
| z_score = abs(row['Criticité'] - criticality_mean) / (criticality_std + 0.1) |
| if z_score > 2.0: |
| inconsistent_cases.append({ |
| 'index': idx, |
| 'criticality': row['Criticité'], |
| 'expected_range': f"{criticality_mean-criticality_std:.1f}-{criticality_mean+criticality_std:.1f}", |
| 'z_score': z_score, |
| 'group': group_key |
| }) |
| |
| return inconsistent_cases |
|
|
| inconsistent_labels = identify_label_inconsistencies(df) |
| df['potentially_mislabeled'] = 0 |
| if inconsistent_labels: |
| inconsistent_indices = [case['index'] for case in inconsistent_labels] |
| df.loc[inconsistent_indices, 'potentially_mislabeled'] = 1 |
|
|
| print(f"Identified {len(inconsistent_labels)} potentially inconsistent labels") |
| print(f"Percentage of potentially noisy labels: {len(inconsistent_labels)/len(df)*100:.2f}%") |
|
|
| |
| def calculate_label_confidence(row): |
| """Calculate confidence in the label based on consistency with similar cases""" |
| base_confidence = 1.0 |
| |
| |
| if row['potentially_mislabeled']: |
| base_confidence *= 0.6 |
| |
| |
| expected_criticality = row['equipment_risk_score'] |
| actual_criticality = row['Criticité'] |
| |
| |
| diff = abs(actual_criticality - expected_criticality) |
| if diff <= 2: |
| base_confidence *= 1.2 |
| elif diff > 5: |
| base_confidence *= 0.8 |
| |
| return min(base_confidence, 1.0) |
|
|
| df['label_confidence'] = df.apply(calculate_label_confidence, axis=1) |
|
|
| print(f"Label confidence distribution:") |
| print(f"High confidence (>0.9): {(df['label_confidence'] > 0.9).sum()}") |
| print(f"Medium confidence (0.7-0.9): {((df['label_confidence'] > 0.7) & (df['label_confidence'] <= 0.9)).sum()}") |
| print(f"Low confidence (<0.7): {(df['label_confidence'] <= 0.7).sum()}") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 9: ENHANCED FEATURE CORRELATION ANALYSIS") |
| print("="*50) |
|
|
| |
| enhanced_features = [ |
| 'equipment_risk_score', 'equipment_base_criticality', 'equipment_redundancy_multiplier', |
| 'section_risk_multiplier', 'enhanced_severity_score', 'equipment_problem_risk', |
| 'technical_complexity', 'has_structural_failure', 'has_equipment_malfunction', |
| 'has_escalation', 'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure', |
| 'combined_word_count', 'equipment_count', 'problem_count', 'action_count', |
| 'has_urgency', 'bruit_anormal', 'vibration_excessive', 'temperature_elevee', |
| 'fuite_vapeur', 'fuite_huile', 'maintenance_planning', 'is_recurring', |
| 'has_measurements', 'has_equipment_codes', 'has_location_details', 'has_safety_mention' |
| ] |
|
|
| target_cols = ['Fiabilité Intégrité', 'Disponibilté', 'Process Safety', 'Criticité'] |
|
|
| print("\nTop correlations with Criticité:") |
| correlations = [] |
| for feature in enhanced_features: |
| if feature in df.columns: |
| corr = df[feature].corr(df['Criticité']) |
| correlations.append({'Feature': feature, 'Correlation': corr}) |
|
|
| correlation_df = pd.DataFrame(correlations).sort_values('Correlation', key=abs, ascending=False) |
| print(correlation_df.head(15).to_string(index=False)) |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 10: SAVING ENHANCED DATASET") |
| print("="*50) |
|
|
| |
| final_columns = [ |
| |
| 'Num_equipement', 'Systeme', 'Description', 'Date de détéction de l\'anomalie', |
| 'Description de l\'équipement', 'Section propriétaire', |
| 'Fiabilité Intégrité', 'Disponibilté', 'Process Safety', 'Criticité', |
| |
| |
| 'equipment_type_class', 'equipment_base_criticality', 'equipment_redundancy_class', |
| 'equipment_redundancy_multiplier', 'section_risk_multiplier', 'equipment_risk_score', |
| |
| |
| 'combined_text', 'description_length', 'description_word_count', |
| 'equipment_desc_length', 'equipment_desc_word_count', 'combined_length', 'combined_word_count', |
| |
| |
| 'equipment_mentioned', 'equipment_count', 'problem_types', 'problem_count', |
| 'actions_mentioned', 'action_count', 'urgency_indicators', 'has_urgency', |
| |
| |
| 'has_structural_failure', 'has_equipment_malfunction', 'has_escalation', 'has_safety_mention', |
| 'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure', |
| |
| |
| 'fuite_vapeur', 'fuite_huile', 'fuite_eau', 'bruit_anormal', 'vibration_excessive', |
| 'temperature_elevee', 'maintenance_planning', 'is_recurring', |
| |
| |
| 'has_measurements', 'has_equipment_codes', 'has_location_details', |
| |
| |
| 'enhanced_severity_score', 'equipment_problem_risk', 'technical_complexity', |
| |
| |
| 'potentially_mislabeled', 'label_confidence' |
| ] |
|
|
| |
| available_columns = [col for col in final_columns if col in df.columns] |
| missing_columns = [col for col in final_columns if col not in df.columns] |
|
|
| if missing_columns: |
| print(f"Warning: Missing columns: {missing_columns}") |
|
|
| |
| enhanced_df = df[available_columns].copy() |
| enhanced_df.to_csv('enhanced_anomaly_data_v2.csv', index=False, encoding='utf-8') |
|
|
| print(f"✓ Enhanced dataset saved to 'enhanced_anomaly_data_v2.csv'") |
| print(f"Dataset shape: {enhanced_df.shape}") |
| print(f"Total features: {len(available_columns)}") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 11: FEATURE SUMMARY AND RECOMMENDATIONS") |
| print("="*50) |
|
|
| |
| feature_importance = correlation_df.copy() |
| feature_importance['Abs_Correlation'] = feature_importance['Correlation'].abs() |
| feature_importance = feature_importance.sort_values('Abs_Correlation', ascending=False) |
|
|
| print("\n🎯 TOP 10 MOST IMPORTANT FEATURES:") |
| for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1): |
| print(f"{i:2d}. {row['Feature']:35s}: {row['Correlation']:6.3f}") |
|
|
| |
| print(f"\n🔧 EQUIPMENT INTELLIGENCE SUMMARY:") |
| print(f"Equipment types classified:") |
| equipment_type_summary = df['equipment_type_class'].value_counts() |
| for eq_type, count in equipment_type_summary.items(): |
| avg_crit = df[df['equipment_type_class'] == eq_type]['Criticité'].mean() |
| print(f" {eq_type:25s}: {count:4d} cases (avg criticality: {avg_crit:.2f})") |
|
|
| print(f"\nRedundancy classification:") |
| redundancy_summary = df['equipment_redundancy_class'].value_counts() |
| for red_class, count in redundancy_summary.items(): |
| avg_crit = df[df['equipment_redundancy_class'] == red_class]['Criticité'].mean() |
| print(f" {red_class:20s}: {count:4d} cases (avg criticality: {avg_crit:.2f})") |
|
|
| |
| critical_cases = df[df['Criticité'] >= 10] |
| print(f"\n⚠️ CRITICAL CASE ANALYSIS (Criticality >= 10): {len(critical_cases)} cases") |
|
|
| if len(critical_cases) > 0: |
| print("Equipment types in critical cases:") |
| crit_equipment = critical_cases['equipment_type_class'].value_counts() |
| for eq_type, count in crit_equipment.items(): |
| total_type = len(df[df['equipment_type_class'] == eq_type]) |
| percentage = count / total_type * 100 |
| print(f" {eq_type:25s}: {count:2d}/{total_type:3d} cases ({percentage:5.1f}% critical)") |
| |
| print("\nTop critical failure patterns:") |
| critical_patterns = { |
| 'Structural Failure': critical_cases['has_structural_failure'].sum(), |
| 'Equipment Malfunction': critical_cases['has_equipment_malfunction'].sum(), |
| 'Escalation': critical_cases['has_escalation'].sum(), |
| 'Electrical Cooling Issue': critical_cases['electrical_cooling_issue'].sum(), |
| 'Turbine Oil Issue': critical_cases['turbine_oil_issue'].sum(), |
| 'Main Equipment Failure': critical_cases['main_equipment_failure'].sum() |
| } |
| |
| for pattern, count in sorted(critical_patterns.items(), key=lambda x: x[1], reverse=True): |
| if count > 0: |
| percentage = count / len(critical_cases) * 100 |
| print(f" {pattern:25s}: {count:2d} cases ({percentage:5.1f}% of critical)") |
|
|
| |
| print(f"\n📊 DATA QUALITY ASSESSMENT:") |
| print(f"Total samples: {len(df)}") |
| print(f"Potentially mislabeled: {df['potentially_mislabeled'].sum()} ({df['potentially_mislabeled'].mean()*100:.1f}%)") |
| print(f"High confidence labels: {(df['label_confidence'] > 0.9).sum()} ({(df['label_confidence'] > 0.9).mean()*100:.1f}%)") |
| print(f"Low confidence labels: {(df['label_confidence'] < 0.7).sum()} ({(df['label_confidence'] < 0.7).mean()*100:.1f}%)") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 12: CREATING ENHANCED VISUALIZATIONS") |
| print("="*50) |
|
|
| |
| fig = plt.figure(figsize=(20, 16)) |
|
|
| |
| plt.subplot(3, 4, 1) |
| plt.scatter(df['equipment_risk_score'], df['Criticité'], alpha=0.6, s=20) |
| plt.xlabel('Equipment Risk Score') |
| plt.ylabel('Actual Criticité') |
| plt.title('Equipment Risk Score vs Actual Criticité') |
| plt.grid(True, alpha=0.3) |
|
|
| |
| plt.subplot(3, 4, 2) |
| equipment_counts = df['equipment_type_class'].value_counts() |
| plt.pie(equipment_counts.values, labels=equipment_counts.index, autopct='%1.1f%%', startangle=90) |
| plt.title('Equipment Type Distribution') |
|
|
| |
| plt.subplot(3, 4, 3) |
| section_criticality = df.groupby('Section propriétaire')['Criticité'].mean().sort_values(ascending=False) |
| plt.bar(section_criticality.index, section_criticality.values) |
| plt.xlabel('Section') |
| plt.ylabel('Average Criticité') |
| plt.title('Average Criticality by Section') |
| plt.xticks(rotation=45) |
|
|
| |
| plt.subplot(3, 4, 4) |
| top_features = feature_importance.head(8)['Feature'].tolist() + ['Criticité'] |
| if len(top_features) > 1: |
| corr_matrix = df[top_features].corr() |
| sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', cbar_kws={'shrink': 0.8}) |
| plt.title('Top Features Correlation') |
|
|
| |
| plt.subplot(3, 4, 5) |
| failure_patterns = { |
| 'Structural': df['has_structural_failure'].sum(), |
| 'Malfunction': df['has_equipment_malfunction'].sum(), |
| 'Escalation': df['has_escalation'].sum(), |
| 'Elec-Cooling': df['electrical_cooling_issue'].sum(), |
| 'Turbine-Oil': df['turbine_oil_issue'].sum(), |
| 'Main-Equip': df['main_equipment_failure'].sum() |
| } |
| plt.bar(failure_patterns.keys(), failure_patterns.values()) |
| plt.xlabel('Failure Pattern') |
| plt.ylabel('Count') |
| plt.title('Critical Failure Pattern Frequency') |
| plt.xticks(rotation=45) |
|
|
| |
| plt.subplot(3, 4, 6) |
| redundancy_crit = df.groupby('equipment_redundancy_class')['Criticité'].mean() |
| plt.bar(redundancy_crit.index, redundancy_crit.values) |
| plt.xlabel('Redundancy Class') |
| plt.ylabel('Average Criticité') |
| plt.title('Redundancy vs Average Criticality') |
| plt.xticks(rotation=45) |
|
|
| |
| plt.subplot(3, 4, 7) |
| plt.hist(df['label_confidence'], bins=20, alpha=0.7, edgecolor='black') |
| plt.xlabel('Label Confidence') |
| plt.ylabel('Frequency') |
| plt.title('Label Confidence Distribution') |
| plt.grid(True, alpha=0.3) |
|
|
| |
| plt.subplot(3, 4, 8) |
| plt.scatter(df['enhanced_severity_score'], df['Criticité'], alpha=0.6, s=20) |
| plt.xlabel('Enhanced Severity Score') |
| plt.ylabel('Actual Criticité') |
| plt.title('Severity Score vs Criticality') |
| plt.grid(True, alpha=0.3) |
|
|
| |
| plt.subplot(3, 4, 9) |
| plt.scatter(df['equipment_problem_risk'], df['Criticité'], alpha=0.6, s=20) |
| plt.xlabel('Equipment Problem Risk') |
| plt.ylabel('Actual Criticité') |
| plt.title('Equipment-Problem Risk vs Criticality') |
| plt.grid(True, alpha=0.3) |
|
|
| |
| plt.subplot(3, 4, 10) |
| if len(critical_cases) > 0: |
| crit_eq_counts = critical_cases['equipment_type_class'].value_counts() |
| plt.barh(range(len(crit_eq_counts)), crit_eq_counts.values) |
| plt.yticks(range(len(crit_eq_counts)), crit_eq_counts.index) |
| plt.xlabel('Count') |
| plt.title('Critical Cases by Equipment Type') |
|
|
| |
| plt.subplot(3, 4, 11) |
| plt.hist(df['technical_complexity'], bins=30, alpha=0.7, edgecolor='black') |
| plt.xlabel('Technical Complexity Score') |
| plt.ylabel('Frequency') |
| plt.title('Technical Complexity Distribution') |
| plt.grid(True, alpha=0.3) |
|
|
| |
| plt.subplot(3, 4, 12) |
| df['Month'] = df['Date de détéction de l\'anomalie'].dt.month |
| monthly_criticality = df.groupby('Month')['Criticité'].mean() |
| plt.plot(monthly_criticality.index, monthly_criticality.values, 'b-o', linewidth=2, markersize=6) |
| plt.xlabel('Month') |
| plt.ylabel('Average Criticité') |
| plt.title('Monthly Criticality Trend') |
| plt.grid(True, alpha=0.3) |
| plt.xticks(range(1, 13)) |
|
|
| plt.tight_layout() |
| plt.savefig('enhanced_analysis_dashboard_v2.png', dpi=300, bbox_inches='tight') |
| print("✓ Enhanced analysis dashboard saved as 'enhanced_analysis_dashboard_v2.png'") |
|
|
| |
| print("\n" + "="*50) |
| print("STEP 13: TRAINING RECOMMENDATIONS") |
| print("="*50) |
|
|
| print("🚀 ENHANCED MODEL TRAINING RECOMMENDATIONS:") |
| print("\n1. FEATURE SELECTION:") |
| print(" Prioritize features with |correlation| > 0.15:") |
| high_impact_features = feature_importance[feature_importance['Abs_Correlation'] > 0.15]['Feature'].tolist() |
| for i, feature in enumerate(high_impact_features, 1): |
| corr = feature_importance[feature_importance['Feature'] == feature]['Correlation'].iloc[0] |
| print(f" {i:2d}. {feature:35s} (r={corr:6.3f})") |
|
|
| print(f"\n2. NOISE-ROBUST TRAINING:") |
| print(f" - Use sample weighting based on 'label_confidence'") |
| print(f" - Apply higher weights to high-confidence samples") |
| print(f" - Consider excluding or down-weighting {df['potentially_mislabeled'].sum()} potentially mislabeled cases") |
|
|
| print(f"\n3. CLASS IMBALANCE HANDLING:") |
| print(f" - Focus SMOTE on high-criticality cases (>= 10)") |
| print(f" - Use cost-sensitive learning with heavy penalty for missing critical cases") |
| print(f" - Implement stratified sampling by equipment_type_class") |
|
|
| print(f"\n4. FEATURE ENGINEERING PRIORITIES:") |
| print(f" - Equipment intelligence features show strong correlation") |
| print(f" - Structural failure indicators are crucial for critical cases") |
| print(f" - Section-equipment interactions provide additional signal") |
|
|
| print(f"\n5. MODEL ARCHITECTURE SUGGESTIONS:") |
| print(f" - Use ensemble with equipment-type-specific models") |
| print(f" - Implement conservative prediction thresholds for ELECTRICAL_CRITICAL equipment") |
| print(f" - Add safety override rules for has_structural_failure = 1") |
|
|
| |
| feature_metadata = { |
| 'high_impact_features': high_impact_features, |
| 'equipment_type_classes': df['equipment_type_class'].unique().tolist(), |
| 'redundancy_classes': df['equipment_redundancy_class'].unique().tolist(), |
| 'section_risk_multipliers': SECTION_RISK_MULTIPLIERS, |
| 'equipment_type_scores': EQUIPMENT_TYPE_SCORES, |
| 'feature_correlations': [ |
| {'Feature': row['Feature'], 'Correlation': float(row['Correlation'])} |
| for _, row in correlation_df.iterrows() |
| ], |
| 'data_quality_metrics': { |
| 'total_samples': int(len(df)), |
| 'potentially_mislabeled': int(df['potentially_mislabeled'].sum()), |
| 'high_confidence_samples': int((df['label_confidence'] > 0.9).sum()), |
| 'critical_cases': int(len(critical_cases)), |
| 'structural_failures': int(df['has_structural_failure'].sum()) |
| } |
| } |
|
|
| import json |
| with open('enhanced_feature_metadata_v2.json', 'w') as f: |
| json.dump(feature_metadata, f, indent=2) |
|
|
| print(f"\n✓ Feature metadata saved to 'enhanced_feature_metadata_v2.json'") |
|
|
| |
| print("\n" + "="*70) |
| print("ENHANCED DATA PROCESSING v2.0 COMPLETED!") |
| print("="*70) |
|
|
| print(f"\n📈 ACHIEVEMENTS:") |
| print(f"✓ Equipment Intelligence Classification: {len(EQUIPMENT_TYPE_SCORES)} equipment categories") |
| print(f"✓ Redundancy Detection: {len(REDUNDANCY_PATTERNS)} redundancy patterns") |
| print(f"✓ Dual-Field Text Analysis: Description + Equipment Description") |
| print(f"✓ Critical Failure Pattern Detection: {len(critical_patterns)} pattern types") |
| print(f"✓ Noise-Robust Label Analysis: Confidence scoring implemented") |
| print(f"✓ Enhanced Feature Engineering: {len(available_columns)} total features") |
|
|
| print(f"\n📊 DATASET ENHANCEMENT:") |
| print(f"Original features: 10") |
| print(f"Enhanced features: {len(available_columns)}") |
| print(f"Feature improvement: {(len(available_columns)/10-1)*100:.0f}% increase") |
|
|
| print(f"\n🎯 KEY INSIGHTS FOR MODEL:") |
| print(f"1. Equipment type is strongest predictor of criticality") |
| print(f"2. Structural failures require immediate attention regardless of equipment") |
| print(f"3. Electrical equipment (34EL) has highest critical case rate") |
| print(f"4. Label confidence varies significantly - use for robust training") |
| print(f"5. Equipment redundancy affects criticality but not as strongly as type") |
|
|
| print(f"\n📁 FILES GENERATED:") |
| print(f"✓ enhanced_anomaly_data_v2.csv - Enhanced dataset") |
| print(f"✓ enhanced_feature_metadata_v2.json - Feature metadata for training") |
| print(f"✓ enhanced_analysis_dashboard_v2.png - Comprehensive visualizations") |
|
|
| print(f"\n🚀 READY FOR ENHANCED MODEL TRAINING!") |
| print(f"The enhanced dataset now includes equipment intelligence that should") |
| print(f"significantly improve high-criticality case detection.") |
|
|
| print("="*70) |