Update app.py
Browse files
app.py
CHANGED
|
@@ -11,13 +11,13 @@ from tensorflow.keras.models import load_model
|
|
| 11 |
from sklearn.preprocessing import LabelEncoder
|
| 12 |
|
| 13 |
# --- Model & Scaler Configuration ---
|
| 14 |
-
H5_MODEL_FILE = "
|
| 15 |
-
SCALER_FILE_NAME = "
|
| 16 |
# Threshold optimized in Cell 11 for better Attack Recall
|
| 17 |
PREDICTION_THRESHOLD = 0.40
|
| 18 |
-
FEATURE_COUNT = 40
|
| 19 |
|
| 20 |
-
# Pre-defined list of all feature names
|
| 21 |
FEATURE_NAMES = [
|
| 22 |
'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
|
| 23 |
'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
|
|
@@ -30,8 +30,7 @@ FEATURE_NAMES = [
|
|
| 30 |
'dst_host_srv_rerror_rate'
|
| 31 |
]
|
| 32 |
|
| 33 |
-
# List of all possible service values (
|
| 34 |
-
# NOTE: In a real system, you would need the full list from your training data.
|
| 35 |
SERVICES = [
|
| 36 |
'http', 'smtp', 'ftp_data', 'private', 'ecr_i', 'other', 'domain_u',
|
| 37 |
'finger', 'telnet', 'ftp', 'pop_3', 'courier', 'eco_i', 'imap4',
|
|
@@ -50,6 +49,25 @@ FLAGS = [
|
|
| 50 |
# List of all possible protocol types
|
| 51 |
PROTOCOLS = ['tcp', 'udp', 'icmp']
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
# Global artifacts
|
| 54 |
model = None
|
| 55 |
scaler = None
|
|
@@ -57,50 +75,61 @@ label_encoder = None
|
|
| 57 |
MAPPING = {'normal': 0, 'anomaly': 1}
|
| 58 |
|
| 59 |
|
| 60 |
-
# --- Model Loading and Initialization ---
|
| 61 |
|
| 62 |
def load_artifacts():
|
| 63 |
"""Loads the trained model and scaler globally."""
|
| 64 |
global model, scaler, label_encoder
|
| 65 |
|
| 66 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# 1. Load Scaler
|
| 69 |
try:
|
| 70 |
scaler = joblib.load(SCALER_FILE_NAME)
|
| 71 |
print(f"✓ Scaler loaded from {SCALER_FILE_NAME}")
|
| 72 |
except Exception as e:
|
| 73 |
-
print(f"Error loading scaler: {e}")
|
| 74 |
return False
|
| 75 |
|
| 76 |
# 2. Load Model
|
| 77 |
try:
|
| 78 |
# Load in Keras H5 format
|
| 79 |
-
|
|
|
|
| 80 |
print(f"✓ Model loaded from {H5_MODEL_FILE}")
|
| 81 |
except Exception as e:
|
| 82 |
-
print(f"Error loading model: {e}")
|
| 83 |
return False
|
| 84 |
|
| 85 |
# 3. Initialize Label Encoder
|
| 86 |
label_encoder = LabelEncoder()
|
| 87 |
label_encoder.fit(list(MAPPING.keys()))
|
| 88 |
print("✓ Label Encoder initialized.")
|
|
|
|
| 89 |
return True
|
| 90 |
|
| 91 |
# Load artifacts on startup
|
| 92 |
if not load_artifacts():
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
-
# --- Prediction Function ---
|
| 97 |
|
| 98 |
def predict_intrusion(*inputs):
|
| 99 |
"""
|
| 100 |
Takes 41 raw network features, preprocesses them, and makes a prediction.
|
| 101 |
"""
|
| 102 |
if model is None or scaler is None:
|
| 103 |
-
return "ERROR: Model
|
| 104 |
|
| 105 |
# 1. Create a dictionary from the inputs
|
| 106 |
raw_input_dict = {FEATURE_NAMES[i]: [inputs[i]] for i in range(len(FEATURE_NAMES))}
|
|
@@ -110,31 +139,19 @@ def predict_intrusion(*inputs):
|
|
| 110 |
categorical_cols = ['protocol_type', 'service', 'flag']
|
| 111 |
df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)
|
| 112 |
|
| 113 |
-
# 3. Re-align columns to match training data (CRITICAL
|
| 114 |
-
|
| 115 |
-
# then populates them with the values from the current input.
|
| 116 |
-
expected_features = [
|
| 117 |
-
'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
|
| 118 |
-
'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
|
| 119 |
-
'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
|
| 120 |
-
'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
|
| 121 |
-
'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
|
| 122 |
-
'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
|
| 123 |
-
'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
|
| 124 |
-
'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
|
| 125 |
-
'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', # Protocol one-hots
|
| 126 |
-
# NOTE: A real deployment needs ALL 1-hot columns defined.
|
| 127 |
-
# For this demo, we rely on the scaler.transform() to handle alignment.
|
| 128 |
-
]
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
# A full-scale alignment is too complex for this demo, so we'll
|
| 133 |
-
# rely on the subsequent scaling step to fit the 40 columns.
|
| 134 |
-
pass
|
| 135 |
|
| 136 |
# 4. Scale and Reshape for CNN
|
| 137 |
-
data_scaled = scaler.transform(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
X_processed = data_scaled.reshape(1, FEATURE_COUNT, 1)
|
| 139 |
|
| 140 |
# 5. Predict probability
|
|
@@ -161,7 +178,7 @@ def predict_intrusion(*inputs):
|
|
| 161 |
return html_output, f"{prediction_prob:.4f}"
|
| 162 |
|
| 163 |
|
| 164 |
-
# --- Gradio Interface Definition ---
|
| 165 |
|
| 166 |
# Define input components corresponding to the 41 features
|
| 167 |
input_components = [
|
|
|
|
| 11 |
from sklearn.preprocessing import LabelEncoder
|
| 12 |
|
| 13 |
# --- Model & Scaler Configuration ---
|
| 14 |
+
H5_MODEL_FILE = "intrusion_detector_model.h5"  # trained Keras model, HDF5 format
SCALER_FILE_NAME = "scaler.pkl"                # fitted scaler, joblib pickle
# Threshold optimized in Cell 11 for better Attack Recall
PREDICTION_THRESHOLD = 0.40
# Number of columns the scaler/model expect AFTER preprocessing.
# NOTE(review): predict_intrusion takes 41 raw features but checks for 40
# scaled columns — confirm against the training notebook which column is dropped.
FEATURE_COUNT = 40
|
| 19 |
|
| 20 |
+
# Pre-defined list of all feature names (41 raw features)
|
| 21 |
FEATURE_NAMES = [
|
| 22 |
'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
|
| 23 |
'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
|
|
|
|
| 30 |
'dst_host_srv_rerror_rate'
|
| 31 |
]
|
| 32 |
|
| 33 |
+
# List of all possible service values (Must be comprehensive for correct OHE alignment)
|
|
|
|
| 34 |
SERVICES = [
|
| 35 |
'http', 'smtp', 'ftp_data', 'private', 'ecr_i', 'other', 'domain_u',
|
| 36 |
'finger', 'telnet', 'ftp', 'pop_3', 'courier', 'eco_i', 'imap4',
|
|
|
|
| 49 |
# List of all possible protocol types (used to build the protocol_type_* OHE columns)
PROTOCOLS = ['tcp', 'udp', 'icmp']
|
| 51 |
|
| 52 |
+
# --- Define ALL Expected OHE Columns ---
# Column names that pd.get_dummies(..., prefix=<col>) produces for each
# categorical value; used later to re-align inference-time columns.
PROTOCOL_OHE = [f'protocol_type_{p}' for p in PROTOCOLS]
FLAG_OHE = [f'flag_{f}' for f in FLAGS]
SERVICE_OHE = [f'service_{s}' for s in SERVICES]

# The 38 numeric / binary columns that pass through without one-hot encoding.
NUMERICAL_BINARY_COLS = [
    'duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
]

# Full expected column order fed to scaler.transform().
# NOTE(review): len(MASTER_OHE_COLUMNS) = 38 + len(PROTOCOLS) + len(SERVICES)
# + len(FLAGS), which is far more than FEATURE_COUNT (40). Unless the scaler
# was fitted on a reduced column set, the shape check in predict_intrusion
# will trip — verify against the training pipeline.
MASTER_OHE_COLUMNS = NUMERICAL_BINARY_COLS + PROTOCOL_OHE + SERVICE_OHE + FLAG_OHE
|
| 69 |
+
|
| 70 |
+
|
| 71 |
# Global artifacts — populated by load_artifacts() at startup.
# (label_encoder is also a module global, declared outside this excerpt.)
model = None   # Keras model; stays None until load_artifacts() succeeds
scaler = None  # fitted scaler; stays None until load_artifacts() succeeds
MAPPING = {'normal': 0, 'anomaly': 1}  # class name -> integer label
|
| 76 |
|
| 77 |
|
| 78 |
+
# --- Model Loading and Initialization (CRITICAL STEP) ---
|
| 79 |
|
| 80 |
def load_artifacts():
    """Load the trained Keras model and fitted scaler into module globals.

    Also initializes the global LabelEncoder from MAPPING's class names so
    predictions can be translated back to labels.

    Returns:
        bool: True when every artifact loaded successfully, False otherwise.
        Failure details are printed so they show up in the deployment logs.
    """
    global model, scaler, label_encoder

    print("--- Starting Artifact Loading ---")

    # Fail fast with an actionable message when either artifact file is absent.
    if not os.path.exists(SCALER_FILE_NAME) or not os.path.exists(H5_MODEL_FILE):
        # FIX: dropped the needless f-prefix on a placeholder-free string (lint F541).
        print("CRITICAL ERROR: One or both files are missing in the current directory:")
        print(f"   Expected Scaler: {SCALER_FILE_NAME} (Exists: {os.path.exists(SCALER_FILE_NAME)})")
        print(f"   Expected Model: {H5_MODEL_FILE} (Exists: {os.path.exists(H5_MODEL_FILE)})")
        print("Please ensure both files are uploaded to the root of your Hugging Face Space.")
        return False

    # 1. Load Scaler (joblib pickle produced at training time)
    try:
        scaler = joblib.load(SCALER_FILE_NAME)
        print(f"✓ Scaler loaded from {SCALER_FILE_NAME}")
    except Exception as e:
        print(f"Error loading scaler. Check file format or compatibility: {e}")
        return False

    # 2. Load Model
    try:
        # Load in Keras H5 format.
        # compile=False skips optimizer/loss restoration (inference only) and
        # often sidesteps Keras version incompatibilities on deploy.
        model = load_model(H5_MODEL_FILE, compile=False)
        print(f"✓ Model loaded from {H5_MODEL_FILE}")
    except Exception as e:
        print(f"Error loading model. Check Keras version compatibility: {e}")
        return False

    # 3. Initialize Label Encoder on the known class names.
    label_encoder = LabelEncoder()
    label_encoder.fit(list(MAPPING.keys()))
    print("✓ Label Encoder initialized.")
    print("--- Artifact Loading Complete ---")
    return True
|
| 118 |
|
| 119 |
# Load artifacts on startup
|
| 120 |
if not load_artifacts():
|
| 121 |
+
# If loading failed, the prediction function will return the error message
|
| 122 |
+
pass
|
| 123 |
+
|
| 124 |
|
| 125 |
+
# --- Prediction Function (Same as before) ---
|
| 126 |
|
| 127 |
def predict_intrusion(*inputs):
|
| 128 |
"""
|
| 129 |
Takes 41 raw network features, preprocesses them, and makes a prediction.
|
| 130 |
"""
|
| 131 |
if model is None or scaler is None:
|
| 132 |
+
return "<h2 style='color: red; text-align: center;'>FATAL ERROR: Model Not Loaded. See Logs.</h2>", "N/A"
|
| 133 |
|
| 134 |
# 1. Create a dictionary from the inputs
|
| 135 |
raw_input_dict = {FEATURE_NAMES[i]: [inputs[i]] for i in range(len(FEATURE_NAMES))}
|
|
|
|
| 139 |
categorical_cols = ['protocol_type', 'service', 'flag']
|
| 140 |
df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)
|
| 141 |
|
| 142 |
+
# 3. Re-align columns to match training data (CRITICAL FIX)
|
| 143 |
+
df_aligned = df.reindex(columns=MASTER_OHE_COLUMNS, fill_value=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
+
# Drop the redundant categorical columns (if they weren't dropped by get_dummies)
|
| 146 |
+
df_aligned = df_aligned.drop(columns=['protocol_type', 'service', 'flag'], errors='ignore')
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
# 4. Scale and Reshape for CNN
|
| 149 |
+
data_scaled = scaler.transform(df_aligned)
|
| 150 |
+
|
| 151 |
+
# Check shape to ensure correct feature count before reshaping
|
| 152 |
+
if data_scaled.shape[1] != FEATURE_COUNT:
|
| 153 |
+
return f"SCALER ERROR: Expected {FEATURE_COUNT} features, got {data_scaled.shape[1]} after scaling.", "N/A"
|
| 154 |
+
|
| 155 |
X_processed = data_scaled.reshape(1, FEATURE_COUNT, 1)
|
| 156 |
|
| 157 |
# 5. Predict probability
|
|
|
|
| 178 |
return html_output, f"{prediction_prob:.4f}"
|
| 179 |
|
| 180 |
|
| 181 |
+
# --- Gradio Interface Definition (Same as before) ---
|
| 182 |
|
| 183 |
# Define input components corresponding to the 41 features
|
| 184 |
input_components = [
|