Inder-26 commited on
Commit
f228efb
·
1 Parent(s): 42da6ea

Data Validation done

Browse files
data_schema/schema.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ columns:
2
+ - having_IP_Address: int64
3
+ - URL_Length: int64
4
+ - Shortining_Service: int64
5
+ - having_At_Symbol: int64
6
+ - double_slash_redirecting: int64
7
+ - Prefix_Suffix: int64
8
+ - having_Sub_Domain: int64
9
+ - SSLfinal_State: int64
10
+ - Domain_registeration_length: int64
11
+ - Favicon: int64
12
+ - port: int64
13
+ - HTTPS_token: int64
14
+ - Request_URL: int64
15
+ - URL_of_Anchor: int64
16
+ - Links_in_tags: int64
17
+ - SFH: int64
18
+ - Submitting_to_email: int64
19
+ - Abnormal_URL: int64
20
+ - Redirect: int64
21
+ - on_mouseover: int64
22
+ - RightClick: int64
23
+ - popUpWidnow: int64
24
+ - Iframe: int64
25
+ - age_of_domain: int64
26
+ - DNSRecord: int64
27
+ - web_traffic: int64
28
+ - Page_Rank: int64
29
+ - Google_Index: int64
30
+ - Links_pointing_to_page: int64
31
+ - Statistical_report: int64
32
+ - Result: int64
33
+
34
+
35
+ numerical_columns:
36
+ - having_IP_Address
37
+ - URL_Length
38
+ - Shortining_Service
39
+ - having_At_Symbol
40
+ - double_slash_redirecting
41
+ - Prefix_Suffix
42
+ - having_Sub_Domain
43
+ - SSLfinal_State
44
+ - Domain_registeration_length
45
+ - Favicon
46
+ - port
47
+ - HTTPS_token
48
+ - Request_URL
49
+ - URL_of_Anchor
50
+ - Links_in_tags
51
+ - SFH
52
+ - Submitting_to_email
53
+ - Abnormal_URL
54
+ - Redirect
55
+ - on_mouseover
56
+ - RightClick
57
+ - popUpWidnow
58
+ - Iframe
59
+ - age_of_domain
60
+ - DNSRecord
61
+ - web_traffic
62
+ - Page_Rank
63
+ - Google_Index
64
+ - Links_pointing_to_page
65
+ - Statistical_report
66
+ - Result
main.py CHANGED
@@ -1,7 +1,8 @@
1
  from networksecurity.components.data_ingestion import DataIngestion
 
2
  from networksecurity.exception.exception import NetworkSecurityException
3
  from networksecurity.logging.logger import logging
4
- from networksecurity.entity.config_entity import DataIngestionConfig
5
  from networksecurity.entity.config_entity import TraningPipelineConfig
6
  import sys
7
 
@@ -13,6 +14,12 @@ if __name__ == "__main__":
13
  logging.info("Initiate the data ingestion")
14
  dataingestionartifact=dataingestion.initiate_data_ingestion()
15
  print(dataingestionartifact)
16
-
 
 
 
 
 
 
17
  except Exception as e:
18
  raise NetworkSecurityException(e, sys)
 
1
  from networksecurity.components.data_ingestion import DataIngestion
2
+ from networksecurity.components.data_validation import DataValidation
3
  from networksecurity.exception.exception import NetworkSecurityException
4
  from networksecurity.logging.logger import logging
5
+ from networksecurity.entity.config_entity import DataIngestionConfig,DataValidationConfig
6
  from networksecurity.entity.config_entity import TraningPipelineConfig
7
  import sys
8
 
 
14
  logging.info("Initiate the data ingestion")
15
  dataingestionartifact=dataingestion.initiate_data_ingestion()
16
  print(dataingestionartifact)
17
+ data_validation_config=DataValidationConfig(traningpipelineconfig)
18
+ datavalidation=DataValidation(dataingestionartifact,data_validation_config)
19
+ logging.info("Initiate the data validation")
20
+ data_validation_artifact=datavalidation.initiate_data_validation()
21
+ logging.info(f"Data validation completed {data_validation_artifact}")
22
+ print(data_validation_artifact)
23
+
24
  except Exception as e:
25
  raise NetworkSecurityException(e, sys)
networksecurity/components/data_validation.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from networksecurity.entity.artifact_entity import DataIngestionArtifact, DataValidationArtifact
2
+ from networksecurity.entity.config_entity import DataValidationConfig
3
+ from networksecurity.exception.exception import NetworkSecurityException
4
+ from networksecurity.constant.training_pipeline import SCHEMA_FILE_PATH
5
+ from networksecurity.utils.main_utils.utils import read_yaml_file
6
+ from networksecurity.utils.main_utils.utils import write_yaml_file
7
+ from networksecurity.logging.logger import logging
8
+ from scipy.stats import ks_2samp
9
+ import pandas as pd
10
+ import os,sys
11
+
12
+ class DataValidation:
13
+ def __init__(self, data_ingestion_artifact: DataIngestionArtifact,
14
+ data_validation_config: DataValidationConfig):
15
+
16
+ try:
17
+ self.data_ingestion_artifact = data_ingestion_artifact
18
+ self.data_validation_config = data_validation_config
19
+ self._schema_config = read_yaml_file(SCHEMA_FILE_PATH)
20
+ except Exception as e:
21
+ raise NetworkSecurityException(e, sys)
22
+
23
+ @staticmethod
24
+ def read_data(file_path)->pd.DataFrame:
25
+ try:
26
+ return pd.read_csv(file_path)
27
+ except Exception as e:
28
+ raise NetworkSecurityException(e, sys)
29
+
30
+ def validate_number_of_columns(self, dataframe: pd.DataFrame)->bool:
31
+ try:
32
+ number_of_columns = len(self._schema_config)
33
+ logging.info(f"Required number of columns: {number_of_columns}")
34
+ logging.info(f"DataFrame columns count: {len(dataframe.columns)}")
35
+
36
+ if len(dataframe.columns)==number_of_columns:
37
+ return True
38
+ return False
39
+
40
+ except Exception as e:
41
+ raise NetworkSecurityException(e, sys)
42
+
43
+ def validate_number_of_numerical_columns(self, dataframe: pd.DataFrame) -> bool:
44
+ try:
45
+ # Required numerical columns from schema
46
+ required_numerical_columns = self._schema_config["numerical_columns"]
47
+ required_count = len(required_numerical_columns)
48
+
49
+ # Actual numerical columns in dataframe
50
+ numerical_df = dataframe.select_dtypes(include=["int64"])
51
+ actual_count = len(numerical_df.columns)
52
+
53
+ logging.info(f"Required number of numerical columns: {required_count}")
54
+ logging.info(f"Dataframe numerical columns count: {actual_count}")
55
+
56
+ if actual_count == required_count:
57
+ return True
58
+ return False
59
+
60
+ except Exception as e:
61
+ raise NetworkSecurityException(e, sys)
62
+
63
+ def detect_dataset_drift(self, base_df, current_df, threshold=0.01)-> bool:
64
+ try:
65
+ status = True
66
+ report={}
67
+ for column in base_df.columns:
68
+ d1 = base_df[column]
69
+ d2 = current_df[column]
70
+ is_same_dist=ks_2samp(d1, d2)
71
+ if threshold <=is_same_dist.pvalue:
72
+ is_found = False
73
+ else:
74
+ is_found = True
75
+ status = False
76
+ report.update({column:{
77
+ "p_value":float(is_same_dist.pvalue),
78
+ "drift_status":is_found
79
+ }})
80
+ drift_report_file_path = self.data_validation_config.drift_report_file_path
81
+ dir_path = os.path.dirname(drift_report_file_path)
82
+ os.makedirs(dir_path, exist_ok=True)
83
+ write_yaml_file(file_path=drift_report_file_path, content=report)
84
+
85
+
86
+ except Exception as e:
87
+ raise NetworkSecurityException(e, sys)
88
+
89
+ def initiate_data_validation(self)-> DataValidationArtifact:
90
+ try:
91
+ train_file_path = self.data_ingestion_artifact.training_file_path
92
+ test_file_path = self.data_ingestion_artifact.testing_file_path
93
+
94
+ ## Read data from train and test
95
+ train_dataframe = DataValidation.read_data(train_file_path)
96
+ test_dataframe = DataValidation.read_data(test_file_path)
97
+
98
+ ## Validate number of columns
99
+ status = self.validate_number_of_columns(dataframe=train_dataframe)
100
+ if not status:
101
+ error_message = f"Train dataframe does not contain all columns \n"
102
+ status = self.validate_number_of_columns(dataframe=test_dataframe)
103
+ if not status:
104
+ error_message = f"Test dataframe does not contain all columns \n"
105
+
106
+ ## Check datadrift
107
+ status=self.detect_dataset_drift(base_df=train_dataframe, current_df=test_dataframe)
108
+ dir_path = os.path.dirname(self.data_validation_config.valid_train_file_path)
109
+ os.makedirs(dir_path, exist_ok=True)
110
+
111
+ train_dataframe.to_csv(self.data_validation_config.valid_train_file_path, index=False, header=True)
112
+ test_dataframe.to_csv(self.data_validation_config.valid_test_file_path, index=False, header=True)
113
+
114
+ data_validation_artifact = DataValidationArtifact(
115
+ validation_status=status,
116
+ valid_train_file_path=self.data_validation_config.valid_train_file_path,
117
+ valid_test_file_path=self.data_validation_config.valid_test_file_path,
118
+ invalid_train_file_path=None,
119
+ invalid_test_file_path=None,
120
+ drift_report_file_path=self.data_validation_config.drift_report_file_path
121
+ )
122
+ return data_validation_artifact
123
+
124
+ except Exception as e:
125
+ raise NetworkSecurityException(e, sys)
networksecurity/constant/training_pipeline/__init__.py CHANGED
@@ -14,6 +14,8 @@ FILE_NAME: str = "phisingkData.csv"
14
  TRAIN_FILE_NAME: str = "train.csv"
15
  TEST_FILE_NAME: str = "test.csv"
16
 
 
 
17
  """
18
  Data Ingestion realted constant start with DATA_INGESTION VAR NAME
19
  """
@@ -24,3 +26,13 @@ DATA_INGESTION_DIR_NAME: str = "data_ingestion"
24
  DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
25
  DATA_INGESTION_INGESTED_DIR: str = "ingested"
26
  DATA_INGESTION_TRAIN_TEST_SPLIT_RATION: float = 0.2
 
 
 
 
 
 
 
 
 
 
 
14
  TRAIN_FILE_NAME: str = "train.csv"
15
  TEST_FILE_NAME: str = "test.csv"
16
 
17
+ SCHEMA_FILE_PATH = os.path.join("data_schema", "schema.yaml")
18
+
19
  """
20
  Data Ingestion realted constant start with DATA_INGESTION VAR NAME
21
  """
 
26
  DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
27
  DATA_INGESTION_INGESTED_DIR: str = "ingested"
28
  DATA_INGESTION_TRAIN_TEST_SPLIT_RATION: float = 0.2
29
+
30
+
31
+ """
32
+ Data Validation related constant start with DATA_VALIDATION VAR NAME
33
+ """
34
+ DATA_VALIDATION_DIR_NAME: str ="data_validation"
35
+ DATA_VALIDATION_VALID_DIR: str ="validated"
36
+ DATA_VALIDATION_INVALID_DIR: str ="invalid"
37
+ DATA_VALIDATION_DRIFT_REPORT_DIR: str ="drift_report"
38
+ DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str ="report.yaml"
networksecurity/entity/artifact_entity.py CHANGED
@@ -3,4 +3,13 @@ from dataclasses import dataclass
3
  @dataclass
4
  class DataIngestionArtifact:
5
  training_file_path:str
6
- testing_file_path:str
 
 
 
 
 
 
 
 
 
 
3
  @dataclass
4
  class DataIngestionArtifact:
5
  training_file_path:str
6
+ testing_file_path:str
7
+
8
@dataclass
class DataValidationArtifact:
    """Artifact emitted by the data-validation pipeline stage.

    Captures the overall validation outcome plus every file location the
    stage produced, so downstream stages can pick up the validated splits
    and the drift report without recomputing any paths.
    """

    validation_status: bool          # overall outcome of the validation stage
    valid_train_file_path: str       # train split that passed validation
    valid_test_file_path: str        # test split that passed validation
    invalid_train_file_path: str     # train split that failed (may be None)
    invalid_test_file_path: str      # test split that failed (may be None)
    drift_report_file_path: str      # YAML report of per-column drift results
networksecurity/entity/config_entity.py CHANGED
@@ -30,4 +30,56 @@ class DataIngestionConfig:
30
  )
31
  self.train_test_split_ratio: float = training_pipeline.DATA_INGESTION_TRAIN_TEST_SPLIT_RATION
32
  self.collection_name: str = training_pipeline.DATA_INGESTION_COLLECTION_NAME
33
- self.database_name: str = training_pipeline.DATA_INGESTION_DATABASE_NAME
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  )
31
  self.train_test_split_ratio: float = training_pipeline.DATA_INGESTION_TRAIN_TEST_SPLIT_RATION
32
  self.collection_name: str = training_pipeline.DATA_INGESTION_COLLECTION_NAME
33
+ self.database_name: str = training_pipeline.DATA_INGESTION_DATABASE_NAME
34
+
35
+
36
class DataValidationConfig:
    """Filesystem layout for the data-validation stage.

    Every path is derived from the pipeline's artifact directory combined
    with the DATA_VALIDATION_* constants of the training_pipeline module.
    """

    def __init__(self, training_pipeline_config: TraningPipelineConfig):
        # Root directory for everything this stage writes.
        self.data_validation_dir: str = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_VALIDATION_DIR_NAME,
        )

        # Subdirectories for data that passed / failed validation.
        self.valid_data_dir: str = os.path.join(
            self.data_validation_dir,
            training_pipeline.DATA_VALIDATION_VALID_DIR,
        )
        self.invalid_data_dir: str = os.path.join(
            self.data_validation_dir,
            training_pipeline.DATA_VALIDATION_INVALID_DIR,
        )

        # Train/test file locations inside the "valid" subdirectory.
        self.valid_train_file_path: str = os.path.join(
            self.valid_data_dir, training_pipeline.TRAIN_FILE_NAME
        )
        self.valid_test_file_path: str = os.path.join(
            self.valid_data_dir, training_pipeline.TEST_FILE_NAME
        )

        # Train/test file locations inside the "invalid" subdirectory.
        self.invalid_train_file_path: str = os.path.join(
            self.invalid_data_dir, training_pipeline.TRAIN_FILE_NAME
        )
        self.invalid_test_file_path: str = os.path.join(
            self.invalid_data_dir, training_pipeline.TEST_FILE_NAME
        )

        # Location of the YAML drift report for this run.
        self.drift_report_file_path: str = os.path.join(
            self.data_validation_dir,
            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
        )
networksecurity/utils/main_utils/__init__.py ADDED
File without changes
networksecurity/utils/main_utils/utils.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ from networksecurity.exception.exception import NetworkSecurityException
3
+ from networksecurity.logging.logger import logging
4
+ import os,sys
5
+ import numpy as np
6
+ import dill
7
+ import pickle
8
+
9
def read_yaml_file(file_path: str) -> dict:
    """Load a YAML file and return its parsed contents.

    Args:
        file_path (str): Path to the YAML file.

    Returns:
        dict: The parsed YAML content.

    Raises:
        NetworkSecurityException: If the file cannot be opened or parsed.
    """
    try:
        # Binary mode is fine here: yaml.safe_load accepts byte streams.
        with open(file_path, 'rb') as yaml_file:
            parsed = yaml.safe_load(yaml_file)
        return parsed
    except Exception as exc:
        raise NetworkSecurityException(exc, sys)
23
+
24
def write_yaml_file(file_path: str, content: object, replace: bool = False) -> None:
    """Serialize *content* to a YAML file, creating parent directories as needed.

    Args:
        file_path (str): Destination path for the YAML file.
        content (object): Any YAML-serializable object.
        replace (bool): When True, delete an existing file first.

    Raises:
        NetworkSecurityException: On any I/O or serialization failure.
    """
    try:
        if replace:
            if os.path.exists(file_path):
                os.remove(file_path)
        # BUG FIX: os.path.dirname returns "" for a bare filename, and
        # os.makedirs("") raises FileNotFoundError — only create the
        # directory when there is one to create.
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        with open(file_path, 'w') as file:
            yaml.dump(content, file)
    except Exception as e:
        raise NetworkSecurityException(e, sys)
requirements.txt CHANGED
@@ -6,5 +6,6 @@ pymongo
6
  certifi
7
  pymongo[srv]==3.11
8
  scikit-learn
9
-
 
10
  #-e .
 
6
  certifi
7
  pymongo[srv]==3.11
8
  scikit-learn
9
+ dill
10
+ pyyaml
11
  #-e .