Kshitijk20 commited on
Commit
a21e473
·
0 Parent(s):

Clean deployment without binary files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. .github/workflows/main.yml +92 -0
  3. .gitignore +14 -0
  4. .python-version +1 -0
  5. Dockerfile +31 -0
  6. README.md +10 -0
  7. app.py +75 -0
  8. data_schema/schema.yaml +65 -0
  9. data_to_mongo.py +73 -0
  10. load_data_to_sqlite.py +15 -0
  11. main.py +41 -0
  12. pyproject.toml +20 -0
  13. requirements.txt +14 -0
  14. setup.py +26 -0
  15. src/__init__.py +0 -0
  16. src/__pycache__/__init__.cpython-310.pyc +0 -0
  17. src/cloud/__init__.py +0 -0
  18. src/cloud/__pycache__/__init__.cpython-310.pyc +0 -0
  19. src/cloud/__pycache__/s3_syncer.cpython-310.pyc +0 -0
  20. src/cloud/s3_syncer.py +45 -0
  21. src/components/__init__.py +0 -0
  22. src/components/__pycache__/__init__.cpython-310.pyc +0 -0
  23. src/components/__pycache__/data_ingestion.cpython-310.pyc +0 -0
  24. src/components/__pycache__/data_transformation.cpython-310.pyc +0 -0
  25. src/components/__pycache__/data_validation.cpython-310.pyc +0 -0
  26. src/components/__pycache__/model_trainer.cpython-310.pyc +0 -0
  27. src/components/data_ingestion.py +80 -0
  28. src/components/data_transformation.py +93 -0
  29. src/components/data_validation.py +104 -0
  30. src/components/model_trainer.py +140 -0
  31. src/constant/__init__.py +0 -0
  32. src/constant/__pycache__/__init__.cpython-310.pyc +0 -0
  33. src/constant/training_pipeline/__init__.py +64 -0
  34. src/constant/training_pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
  35. src/data/__init__.py +0 -0
  36. src/data/sqlite_manager.py +162 -0
  37. src/entity/__init__.py +0 -0
  38. src/entity/__pycache__/__init__.cpython-310.pyc +0 -0
  39. src/entity/__pycache__/artifact_entity.cpython-310.pyc +0 -0
  40. src/entity/__pycache__/config_entity.cpython-310.pyc +0 -0
  41. src/entity/artifact_entity.py +34 -0
  42. src/entity/config_entity.py +60 -0
  43. src/exception/__init__.py +0 -0
  44. src/exception/__pycache__/__init__.cpython-310.pyc +0 -0
  45. src/exception/__pycache__/exception.cpython-310.pyc +0 -0
  46. src/exception/exception.py +12 -0
  47. src/logging/__init__.py +0 -0
  48. src/logging/__pycache__/__init__.cpython-310.pyc +0 -0
  49. src/logging/__pycache__/logger.cpython-310.pyc +0 -0
  50. src/logging/logger.py +14 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/main.yml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
name: workflow

on:
  push:
    branches:
      - main
    paths-ignore:
      - 'README.md'

jobs:
  integration:
    name: Continuous Integration
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3

      - name: Lint code
        run: echo "Linting repository"

      - name: Run unit tests
        run: echo "Running unit tests"

  build-and-push-ecr-image:
    name: Continuous Delivery
    needs: integration
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3

      - name: Install Utilities
        run: |
          sudo apt-get update
          sudo apt-get install -y jq unzip

      # NOTE(review): configure-aws-credentials@v1 and amazon-ecr-login@v1 are
      # deprecated majors; consider bumping to the current releases.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1

      - name: Build, tag, and push image to ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }}
          IMAGE_TAG: latest
        run: |
          echo "ECR_REGISTRY: $ECR_REGISTRY"
          echo "ECR_REPOSITORY: $ECR_REPOSITORY"
          echo "IMAGE_TAG: $IMAGE_TAG"
          docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
          docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
          # BUG FIX: the `::set-output` workflow command was deprecated and has
          # been disabled by GitHub Actions; step outputs must be appended to
          # the $GITHUB_OUTPUT environment file instead.
          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"

  Continuous-Deployment:
    needs: build-and-push-ecr-image
    runs-on: self-hosted
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v1

      - name: Pull latest images
        run: |
          docker pull ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest

      #- name: Stop and remove container if running
      #  run: |
      #   docker ps -q --filter "name=networksecurity" | grep -q . && docker stop networksecurity && docker rm -fv networksecurity

      - name: Run Docker Image to serve users
        run: |
          docker run -d -p 8080:8080 --ipc="host" --name=networksecurity -e 'AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}' -e 'AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}' -e 'AWS_REGION=${{ secrets.AWS_REGION }}' ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest

      - name: Clean previous images and containers
        run: |
          docker system prune -f
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ .venv/
3
+ pycache/
4
+ __pycache__/
5
+ .ipynb_checkpoints
6
+ .vscode/
7
+ .DS_Store
8
+ *.pyc
9
+ logs/
10
+ *.log
11
+ Artifacts/
12
+ s3_sync_folder/
13
+ final_model/
14
+ data/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.13
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # for aws
2
+ # FROM python:3.10-slim-buster
3
+ # WORKDIR /app
4
+ # COPY . /app
5
+ # RUN apt update -y && apt install awscli -y
6
+ # RUN apt-get update && pip install -r requirements.txt
7
+ # CMD ["python3", "app.py"]
8
+
9
+ FROM python:3.13-slim
10
+
11
+ RUN useradd -m -u 1000 user
12
+ USER user
13
+ ENV PATH="/home/user/.local/bin:$PATH"
14
+
15
+ WORKDIR /app
16
+
17
+ # Copy requirements
18
+ COPY --chown=user ./requirements.txt requirements.txt
19
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
20
+
21
+ # Copy all application files
22
+ COPY --chown=user . /app
23
+
24
+ # Create necessary directories
25
+ RUN mkdir -p /app/data /app/final_model /app/templates
26
+
27
+ # Expose port 7860 (HF Space requirement)
28
+ EXPOSE 7860
29
+
30
+ # Run the application
31
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: NSS
3
+ emoji: 🏃
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI entry point for the network-security (phishing) model service.

Exposes:
  GET  /train   -- run the full training pipeline synchronously.
  POST /predict -- score an uploaded CSV and render the result as an HTML table.
"""
import os
import sys

import certifi
import pandas as pd
from dotenv import load_dotenv
from fastapi import FastAPI, File, Request, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response
from fastapi.templating import Jinja2Templates
from starlette.responses import RedirectResponse
from uvicorn import run as app_run

from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.pipeline.training_pipeline import Trainingpipeline
from src.utils.main_utils.utils import load_object
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
from src.constant.training_pipeline import DATA_INGESTION_COLLECTION_NAME
# NOTE(review): this constant's name is misspelled at its definition site
# ("DATBASE_NANE"); renaming belongs in src/constant/training_pipeline, so the
# import is kept as-is here.
from src.constant.training_pipeline import DATA_INGESTION_DATBASE_NANE

ca = certifi.where()  # CA bundle; needed again if the MongoDB client is re-enabled
load_dotenv()
mongo_db_uri = os.getenv("MONGO_DB_URI")

templates = Jinja2Templates(directory="./templates")

app = FastAPI()

# CORS is wide open ("*") -- acceptable for a demo Space; tighten for production.
origins = ["*"]  # renamed from misspelled `orgin`
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/train")
async def training_route():
    """Run the training pipeline and report completion.

    Raises:
        NetworkSecurityException: wrapping any pipeline failure.
    """
    try:
        training_pipeline = Trainingpipeline()
        training_pipeline.run_pipeline()
        # BUG FIX: user-facing message had a typo ("successfull !!").
        return Response("Training successful!!")
    except Exception as e:
        raise NetworkSecurityException(e, sys)


@app.post("/predict")  # predict route
async def predict_route(request: Request, file: UploadFile = File(...)):
    """Score an uploaded CSV with the persisted model and return an HTML table.

    The uploaded file may optionally contain the 'Result' target column; it is
    dropped before prediction. Predictions are also written to
    final_model/predicted.csv as a side effect.
    """
    try:
        df = pd.read_csv(file.file)
        # Drop the target column if the caller uploaded labelled data.
        if 'Result' in df.columns:
            df = df.drop(columns=['Result'])
        preprocessor = load_object(file_path="final_model/preprocessor.pkl")
        model = load_object(file_path="final_model/model.pkl")
        NSmodel = NetworkSecurityModel(preprocessing_object=preprocessor, trained_model_object=model)
        y_pred = NSmodel.predict(df)
        df['predicted_column'] = y_pred
        df.to_csv("final_model/predicted.csv")
        table_html = df.to_html(classes='table table-striped')
        return templates.TemplateResponse("table.html", {"request": request, "table": table_html})
    except Exception as e:
        raise NetworkSecurityException(e, sys)


if __name__ == "__main__":
    app_run(app, host="0.0.0.0", port=8080)
data_schema/schema.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ columns:
2
+ - having_IP_Address: int64
3
+ - URL_Length: int64
4
+ - Shortining_Service: int64
5
+ - having_At_Symbol: int64
6
+ - double_slash_redirecting: int64
7
+ - Prefix_Suffix: int64
8
+ - having_Sub_Domain: int64
9
+ - SSLfinal_State: int64
10
+ - Domain_registeration_length: int64
11
+ - Favicon: int64
12
+ - port: int64
13
+ - HTTPS_token: int64
14
+ - Request_URL: int64
15
+ - URL_of_Anchor: int64
16
+ - Links_in_tags: int64
17
+ - SFH: int64
18
+ - Submitting_to_email: int64
19
+ - Abnormal_URL: int64
20
+ - Redirect: int64
21
+ - on_mouseover: int64
22
+ - RightClick: int64
23
+ - popUpWidnow: int64
24
+ - Iframe: int64
25
+ - age_of_domain: int64
26
+ - DNSRecord: int64
27
+ - web_traffic: int64
28
+ - Page_Rank: int64
29
+ - Google_Index: int64
30
+ - Links_pointing_to_page: int64
31
+ - Statistical_report: int64
32
+ - Result: int64
33
+
34
+ numerical_columns:
35
+ - having_IP_Address
36
+ - URL_Length
37
+ - Shortining_Service
38
+ - having_At_Symbol
39
+ - double_slash_redirecting
40
+ - Prefix_Suffix
41
+ - having_Sub_Domain
42
+ - SSLfinal_State
43
+ - Domain_registeration_length
44
+ - Favicon
45
+ - port
46
+ - HTTPS_token
47
+ - Request_URL
48
+ - URL_of_Anchor
49
+ - Links_in_tags
50
+ - SFH
51
+ - Submitting_to_email
52
+ - Abnormal_URL
53
+ - Redirect
54
+ - on_mouseover
55
+ - RightClick
56
+ - popUpWidnow
57
+ - Iframe
58
+ - age_of_domain
59
+ - DNSRecord
60
+ - web_traffic
61
+ - Page_Rank
62
+ - Google_Index
63
+ - Links_pointing_to_page
64
+ - Statistical_report
65
+ - Result
data_to_mongo.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""One-off loader: convert the phishing CSV into JSON records and push them to MongoDB."""
import json
import os
import sys

import certifi
import numpy as np
import pandas as pd
# BUG FIX: this import was commented out while `pymongo.MongoClient` is used
# below, so insert_data_to_mongodb raised NameError at runtime.
import pymongo
from dotenv import load_dotenv

from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging

load_dotenv()

MONGODB_URL = os.getenv("MONGODB_URL")
print(MONGODB_URL)
ca = certifi.where()  # ca = certificate authority bundle for TLS connections


class NetworkDataExtract:
    """Thin helper around CSV -> JSON -> MongoDB insertion."""

    def __init__(self):
        try:
            pass
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def csv_to_json_converter(self, file_path):
        """Read *file_path* as CSV and return its rows as a list of dicts.

        Raises:
            NetworkSecurityException: wrapping any read/convert failure.
        """
        try:
            data = pd.read_csv(file_path)
            data.reset_index(drop=True, inplace=True)
            # Transpose + to_json gives {row_index: {col: value}}; keep values only.
            records = list(json.loads(data.T.to_json()).values())
            return records
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def insert_data_to_mongodb(self, records, database, collection):
        """Insert *records* into *database*.*collection*; return the inserted count."""
        try:
            self.database = database
            self.collection = collection
            self.records = records

            self.mongo_client = pymongo.MongoClient(MONGODB_URL)
            self.database = self.mongo_client[self.database]

            self.collection = self.database[self.collection]
            self.collection.insert_many(self.records)
            return len(self.records)
        except Exception as e:
            raise NetworkSecurityException(e, sys)


if __name__ == "__main__":
    # BUG FIX: path used a backslash ("data\phisingData.csv") -- an invalid
    # escape sequence risk and non-portable; forward slash works everywhere.
    FILE_PATH = "data/phisingData.csv"
    DATABASE = "Network_data"
    Collection = "phising_data"
    obj = NetworkDataExtract()
    records = obj.csv_to_json_converter(file_path=FILE_PATH)
    print("records converted to json")
    noOfRecords = obj.insert_data_to_mongodb(records, DATABASE, Collection)
    print(noOfRecords)
load_data_to_sqlite.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Bootstrap script: load the phishing CSV into the local SQLite database."""
from src.data.sqlite_manager import PhishingDataManager

if __name__ == "__main__":
    FILE_PATH = "data/phisingData.csv"

    print("Initializing SQLite database...")
    db_manager = PhishingDataManager()

    # BUG FIX: the original never closed the manager when insert_data_from_csv
    # raised; the try/finally guarantees the connection is released either way.
    try:
        print("Loading data from CSV...")
        count = db_manager.insert_data_from_csv(FILE_PATH)

        print(f"✅ Successfully loaded {count} records into SQLite database!")
        print("Database location: data/phishing_data.db")
    finally:
        db_manager.close()
main.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Drive the training stages end to end:
ingestion -> validation -> transformation -> model training."""
from src.components.data_ingestion import DataIngestion
from src.components.data_validation import DataValidation
from src.components.data_transformation import DataTransformation
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.config_entity import Data_ingestion_config, TrainingPipelineConfig, Data_validation_config, Data_transformation_config, Model_trainer_config
from src.components.model_trainer import ModelTrainer

import sys

if __name__ == "__main__":
    try:
        pipeline_config = TrainingPipelineConfig()

        # Stage 1: ingestion -- pull data and produce train/test splits.
        ingestion = DataIngestion(Data_ingestion_config(pipeline_config))
        logging.info("Data ingestion started")
        ingestion_artifacts = ingestion.initiate_data_ingestion()
        logging.info("Data ingestion completed")
        print("Data ingestion completed")

        # Stage 2: validation -- schema / drift checks on the splits.
        validation = DataValidation(ingestion_artifacts, Data_validation_config(pipeline_config))
        logging.info("Data validation started")
        validation_artifacts = validation.intiate_data_validation()
        logging.info("Data validation completed")
        print(validation_artifacts)

        # Stage 3: transformation -- imputation and array serialization.
        logging.info("data Transformation started")
        transformation = DataTransformation(validation_artifacts, Data_transformation_config(pipeline_config))
        transformation_artifact = transformation.initiate_data_transformation()
        print(transformation_artifact)
        logging.info("data Transformation completed")

        # Stage 4: model training on the transformed arrays.
        logging.info("Model training started")
        trainer = ModelTrainer(
            model_trainer_config=Model_trainer_config(pipeline_config),
            data_transformation_artifact=transformation_artifact,
        )
        trainer_artifact = trainer.initiate_model_trainer()
        logging.info("Model training completed")
    except Exception as e:
        raise NetworkSecurityException(e, sys)
pyproject.toml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "NetworkSecuritySystemMLProject"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ dependencies = [
8
+ "certifi>=2026.1.4",
9
+ "dagshub>=0.6.4",
10
+ "dill>=0.4.0",
11
+ "fastapi>=0.128.0",
12
+ "mlflow>=3.8.1",
13
+ "numpy>=2.4.1",
14
+ "pandas>=2.3.3",
15
+ "pyaml>=25.7.0",
16
+ "pymongo[srv]==3.6", # NOTE(review): pymongo 3.6 (2018) predates Python 3.13 support, yet requires-python is ">=3.13" — this pin cannot install there; confirm and bump to a 4.x release
17
+ "python-dotenv>=1.2.1",
18
+ "python-multipart>=0.0.21",
19
+ "uvicorn>=0.40.0",
20
+ ]
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv
2
+ pandas
3
+ numpy
4
+ pymongo[srv]==3.6
5
+ # pymongo  (removed duplicate: the bare, unpinned entry duplicated pymongo[srv]==3.6 above and could let pip resolve a conflicting version)
6
+ certifi
7
+ dill
8
+ mlflow
9
+ pyaml
10
+ dagshub
11
+ fastapi
12
+ uvicorn
13
+ python-multipart
14
+ # -e .
setup.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Packaging script for the Network Security System project."""
from typing import List


def get_requirements(file_path: str = "requirements.txt") -> List[str]:
    """Return the list of requirements read from *file_path*.

    Blank lines and the editable-install marker ``-e .`` are skipped.
    Returns an empty list when the file does not exist.
    """
    requirement_list: List[str] = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                requirement = line.strip()
                if requirement and requirement != '-e .':  # ignore empty line and -e .
                    requirement_list.append(requirement)
    except FileNotFoundError:
        print(f"Error: {file_path} file not found.")
    # BUG FIX: the original function had no return statement, so setup()
    # received install_requires=None instead of the parsed list.
    return requirement_list


if __name__ == "__main__":
    # Imported here so the module can be imported by tooling/tests without
    # triggering a setuptools command run; running `python setup.py ...`
    # behaves exactly as before.
    from setuptools import find_packages, setup

    setup(
        name="Network_Security_system",
        version="0.0.1",
        author="Kshitij",
        author_email="kshitijk146@gmail.com",
        packages=find_packages(),
        install_requires=get_requirements(),
    )
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (158 Bytes). View file
 
src/cloud/__init__.py ADDED
File without changes
src/cloud/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (164 Bytes). View file
 
src/cloud/__pycache__/s3_syncer.cpython-310.pyc ADDED
Binary file (721 Bytes). View file
 
src/cloud/s3_syncer.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import shutil


class s3sync:
    """Local-filesystem stand-in for `aws s3 sync`.

    Instead of talking to S3, folders are mirrored under *Local_sync_folder*,
    keyed by the S3 object key (scheme and bucket name stripped). Both sync
    directions use the same mapping, so round-trips are lossless.
    """

    def __init__(self, Local_sync_folder="s3_sync_folder"):
        self.Local_sync_folder = Local_sync_folder
        os.makedirs(self.Local_sync_folder, exist_ok=True)  # idempotent

    def _local_path_for(self, aws_bucket_url):
        """Map ``s3://<bucket>/<key...>`` to ``<Local_sync_folder>/<key...>``.

        BUG FIX: the original chained ``.replace()`` searched for ``"s3:/"``
        (the text before the first slash plus "/") *after* ``"s3://"`` had
        already been removed, so it never matched and the bucket name leaked
        into the local path. Splitting off the first path component strips the
        bucket as evidently intended.
        """
        without_scheme = aws_bucket_url.replace("s3://", "", 1)
        _bucket, _, key = without_scheme.partition("/")
        return os.path.join(self.Local_sync_folder, key)

    def sync_folder_to_s3(self, folder, aws_bucket_url):
        """Copy *folder* to the simulated bucket location, replacing any previous copy."""
        try:
            destination = self._local_path_for(aws_bucket_url)
            if os.path.exists(destination):
                shutil.rmtree(destination)
            shutil.copytree(folder, destination)
            print(f"Synced {folder} to {destination}")
        except Exception as e:
            # Best-effort by design: a sync failure must not break the pipeline.
            print(f"Error syncing folder to S3: {e}")

    def sync_folder_from_s3(self, folder, aws_bucket_url):
        """Copy the simulated bucket location into *folder*, replacing *folder*."""
        try:
            source = self._local_path_for(aws_bucket_url)
            if os.path.exists(folder):
                shutil.rmtree(folder)
            shutil.copytree(source, folder)
            print(f"Synced {source} to {folder}")
        except Exception as e:
            print(f"Error syncing folder from S3: {e}")
src/components/__init__.py ADDED
File without changes
src/components/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (169 Bytes). View file
 
src/components/__pycache__/data_ingestion.cpython-310.pyc ADDED
Binary file (3.31 kB). View file
 
src/components/__pycache__/data_transformation.cpython-310.pyc ADDED
Binary file (3.64 kB). View file
 
src/components/__pycache__/data_validation.cpython-310.pyc ADDED
Binary file (3.47 kB). View file
 
src/components/__pycache__/model_trainer.cpython-310.pyc ADDED
Binary file (4.67 kB). View file
 
src/components/data_ingestion.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.config_entity import Data_ingestion_config
from src.entity.artifact_entity import DataIngestionArtifact
import os, sys
import pandas as pd
from typing import List
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
# import pymongo
import numpy as np
from src.data.sqlite_manager import PhishingDataManager

load_dotenv()
# Kept for the (currently disabled) MongoDB path; ingestion now reads from SQLite.
MONGODB_URL = os.getenv("MONGODB_URL")

class DataIngestion:
    """Pull the phishing dataset out of SQLite, snapshot it to the feature
    store, and split it into train/test CSVs for the rest of the pipeline."""

    def __init__(self, data_ingestion_config: Data_ingestion_config):
        try:
            self.data_ingestion_config = data_ingestion_config
            # Manager for the local SQLite store that replaced MongoDB.
            self.db_manager = PhishingDataManager()
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Export data from SQLite instead of MongoDB"""
        try:
            # include_new_only=False: fetch the full table, not just rows
            # added since the last training run.
            df = self.db_manager.get_training_data(include_new_only=False)
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def move_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Persist *dataframe* to the configured feature-store CSV and return it unchanged."""
        try:
            feature_store_file = self.data_ingestion_config.feature_store_file_path
            dir_path = os.path.dirname(feature_store_file)
            os.makedirs(dir_path, exist_ok=True)
            dataframe.to_csv(feature_store_file, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def data_train_test_split(self, dataframe: pd.DataFrame) -> None:
        """Split *dataframe* and write the train/test CSVs to the configured paths.

        Note: no fixed random_state is passed, so splits differ between runs.
        """
        try:
            train_set, test_set = train_test_split(
                dataframe, test_size=self.data_ingestion_config.train_test_split_ratio
            )
            logging.info("Trained test spltting done on dataframe")
            dir_path = os.path.dirname(self.data_ingestion_config.train_file_path)
            os.makedirs(dir_path, exist_ok=True)
            logging.info("Exporting train and test file path")
            train_set.to_csv(
                self.data_ingestion_config.train_file_path, index = False, header = True
            )
            test_set.to_csv(
                self.data_ingestion_config.test_file_path, index = False, header = True
            )
            logging.info("Exported train and test file path.")

        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the full ingestion stage: export -> feature store -> split.

        Returns:
            DataIngestionArtifact: the paths of the written train/test CSVs.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.move_data_into_feature_store(dataframe)
            self.data_train_test_split(dataframe)

            # Mark data as used, so subsequent incremental fetches can skip it.
            self.db_manager.mark_data_as_trained()

            data_ingestion_artifact = DataIngestionArtifact(
                train_file_path=self.data_ingestion_config.train_file_path,
                test_file_path=self.data_ingestion_config.test_file_path
            )
            return data_ingestion_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)
src/components/data_transformation.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
import os
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from src.constant.training_pipeline import TARGET_COLUMN
from src.constant.training_pipeline import DATA_TRANSFORMATION_IMPUTER_PARAMS
from src.entity.artifact_entity import (
    DataTransformationArtifact,
    DataValidationArtifact,
)
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.utils.main_utils.utils import save_np_array, save_object
from src.entity.config_entity import Data_transformation_config

class DataTransformation:
    """Impute missing values with a KNN imputer and emit the transformed
    train/test arrays plus the fitted preprocessor object."""

    def __init__(self, data_validation_artifact: DataValidationArtifact, data_transformation_config: Data_transformation_config):
        try:
            self.data_validation_artifact:DataValidationArtifact = data_validation_artifact
            self.data_transformation_config:Data_transformation_config = data_transformation_config
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Load the CSV at *file_path* into a DataFrame."""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_data_transformer_object(self) -> Pipeline:
        """Build the preprocessing pipeline.

        Initialises a KNNImputer with DATA_TRANSFORMATION_IMPUTER_PARAMS
        (declared in src/constant/training_pipeline) and wraps it as the
        single "imputer" step of an sklearn Pipeline.

        Returns:
            Pipeline: the one-step imputation pipeline.
        """
        logging.info("Entered get_data_transformation_object methof of transformation class")

        try:
            knn_imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info(f"intialise knn imputer with {DATA_TRANSFORMATION_IMPUTER_PARAMS}")
            pipeline = Pipeline(steps=[("imputer", knn_imputer)])
            return pipeline
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self)-> DataTransformationArtifact:
        """Run the transformation stage: impute features, relabel the target
        from {-1, 1} to {0, 1}, and persist arrays + preprocessor.

        Returns:
            DataTransformationArtifact: paths of the saved outputs.
        """
        try:
            logging.info("Started data transformation!")
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            # training dataframe: separate features from target; map label -1 -> 0
            # so the target is a plain binary {0, 1} column.
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN],axis = 1)
            target_feature_train_df = train_df[TARGET_COLUMN]
            target_feature_train_df = target_feature_train_df.replace(-1,0)
            # testing dataframe: same treatment.
            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN],axis = 1)
            target_feature_test_df = test_df[TARGET_COLUMN]
            target_feature_test_df = target_feature_test_df.replace(-1,0)

            # Fit the imputer on the training features only, then apply to both
            # splits (avoids test-set leakage).
            preprocessor = self.get_data_transformer_object()
            preprocessor_obj = preprocessor.fit(input_feature_train_df)
            logging.info("Preprocessor object created and fitted on training data")

            transformed_input_train_feature = preprocessor_obj.transform(input_feature_train_df)
            transformed_input_test_feature = preprocessor_obj.transform(input_feature_test_df)

            # combining transformed input features with target feature
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            # save numpy array data
            save_np_array(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_np_array(self.data_transformation_config.transformed_test_file_path,array = test_arr )
            save_object(self.data_transformation_config.transformed_object_file_path,preprocessor_obj)
            # Extra copy for the serving path (app.py loads final_model/preprocessor.pkl).
            save_object("final_model/preprocessor.pkl", preprocessor_obj)

            # preparing artifacts
            Data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )
            return Data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
src/components/data_validation.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.entity.artifact_entity import DataIngestionArtifact,DataValidationArtifact
2
+ from src.entity.config_entity import Data_validation_config
3
+ from src.exception.exception import NetworkSecurityException
4
+ from src.constant.training_pipeline import SCHEMA_FILE_PATH
5
+ from src.logging.logger import logging
6
+ from scipy.stats import ks_2samp
7
+ import pandas as pd
8
+ import os, sys
9
+ from src.utils.main_utils.utils import read_yaml_file, write_yaml_file
10
+
11
class DataValidation:
    """Validates ingested train/test CSVs against the schema and checks for drift.

    Writes validated copies of the data plus a YAML drift report, and returns a
    DataValidationArtifact describing the outcome.
    """

    def __init__(self, data_ingestion_artifact: "DataIngestionArtifact", data_validation_config: "Data_validation_config"):
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            # Schema loaded once; its top-level key count is the expected column count.
            self._schema_config = read_yaml_file(file_path=SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame, wrapping any failure in the project exception."""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True when the frame has exactly as many columns as the schema."""
        try:
            number_of_columns = len(self._schema_config)
            logging.info(f"Required number of columns: {number_of_columns}")
            logging.info(f"Data frame has columns: {len(dataframe.columns)}")
            return len(dataframe.columns) == number_of_columns
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def detect_drift(self, base_df, current_df, threshold=0.05) -> bool:
        """KS-test every column of current_df against base_df.

        Writes a per-column YAML report and returns False when any column drifts
        (i.e. its p-value falls below `threshold`).
        """
        try:
            status = True
            report = {}
            for column in base_df:
                d1 = base_df[column]
                d2 = current_df[column]
                ks_result = ks_2samp(d1, d2)
                # Low p-value -> distributions differ -> drift detected.
                is_found = ks_result.pvalue < threshold
                if is_found:
                    status = False
                report.update({column: {
                    "p_value": float(ks_result.pvalue),
                    "drift_status": bool(is_found)
                }})
            drift_report_file_path = self.data_validation_config.drift_report_file_path
            # Create directory for the report before writing it.
            dir_path = os.path.dirname(drift_report_file_path)
            os.makedirs(dir_path, exist_ok=True)
            write_yaml_file(file_path=drift_report_file_path, content=report)
            return status
        except Exception as e:
            # BUG FIX: chain the original exception for a usable traceback.
            raise NetworkSecurityException(e, sys) from e

    def intiate_data_validation(self) -> "DataValidationArtifact":
        """Run schema and drift checks, persist validated copies, return the artifact.

        NOTE: method name kept as `intiate_...` (sic) for backward compatibility
        with existing callers.
        """
        try:
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            # Read the data from the train and test CSVs.
            train_df = DataValidation.read_data(train_file_path)
            test_df = DataValidation.read_data(test_file_path)

            # BUG FIX: schema failures were assigned to an unused local and then
            # silently discarded; they now affect the status and are logged.
            schema_ok = True
            if not self.validate_number_of_columns(dataframe=train_df):
                schema_ok = False
                logging.warning(f"{train_file_path} does not match schema")
            if not self.validate_number_of_columns(dataframe=test_df):
                schema_ok = False
                logging.warning(f"{test_file_path} does not match schema")

            # Check data drift between train and test splits.
            drift_ok = self.detect_drift(base_df=train_df, current_df=test_df)

            dir_path = os.path.dirname(self.data_validation_config.valid_train_file_path)
            os.makedirs(dir_path, exist_ok=True)

            train_df.to_csv(self.data_validation_config.valid_train_file_path, index=False, header=True)
            test_df.to_csv(self.data_validation_config.valid_test_file_path, index=False, header=True)

            data_validation_artifacts = DataValidationArtifact(
                validation_status=schema_ok and drift_ok,
                # BUG FIX: the artifact previously pointed at the raw ingestion
                # files; the validated copies written just above are the right ones.
                valid_train_file_path=self.data_validation_config.valid_train_file_path,
                valid_test_file_path=self.data_validation_config.valid_test_file_path,
                invalid_train_file_path=None,
                invalid_test_file_path=None,
                drift_report_file_path=self.data_validation_config.drift_report_file_path
            )
            return data_validation_artifacts

        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
src/components/model_trainer.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ from src.utils.ml_utils.model.estimator import NetworkSecurityModel
3
+ from src.exception.exception import NetworkSecurityException
4
+ from src.logging.logger import logging
5
+ from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
6
+ from src.entity.config_entity import Model_trainer_config
7
+ from src.utils.main_utils.utils import save_object, load_object
8
+ from src.utils.main_utils.utils import load_numpy_array_data, evaluate_models
9
+ from src.utils.ml_utils.metric.classification_metric import classification_score
10
+ from sklearn.neighbors import KNeighborsClassifier
11
+ from sklearn.tree import DecisionTreeClassifier
12
+ from sklearn.ensemble import (
13
+ RandomForestClassifier,
14
+ AdaBoostClassifier,
15
+ GradientBoostingClassifier,
16
+ )
17
+ from sklearn.linear_model import LogisticRegression
18
+ from sklearn.metrics import r2_score
19
+ import mlflow
20
+ import dagshub
21
+ dagshub.init(repo_owner='kshitijk146', repo_name='MLOPS_project_network_Security_system', mlflow=True)
22
class ModelTrainer:
    """Grid-searches several classifiers, selects the best by reported score,
    logs its metrics to MLflow, and persists the final model artifacts.
    """

    def __init__(self, model_trainer_config: "Model_trainer_config", data_transformation_artifact: "DataTransformationArtifact"):
        try:
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def track_mlflow(self, best_model, classificationmetric):
        """Log the model plus its f1/precision/recall to the active MLflow tracking server."""
        with mlflow.start_run():
            f1_score = classificationmetric.f1_score
            precision_score = classificationmetric.precision_score
            recall_score = classificationmetric.recall_score

            mlflow.log_metric("f1_score", f1_score)
            mlflow.log_metric("precision_score", precision_score)
            mlflow.log_metric("recall_score", recall_score)
            mlflow.sklearn.log_model(best_model, "model")

    def train_model(self, x_train, y_train, x_test, y_test):
        """Evaluate candidate models, persist the best one, and return a ModelTrainerArtifact."""
        # Candidate estimators and their grid-search spaces.
        models = {
            "KNN": KNeighborsClassifier(),
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(verbose=True),
            "AdaBoost": AdaBoostClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(verbose=True),
            "logistic regression": LogisticRegression(verbose=True)
        }
        params = {
            "KNN": {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean']
            },
            "Decision Tree": {
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            },
            "Random Forest": {
                'n_estimators': [50, 100],
                'max_depth': [None, 5],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
                'max_features': ['sqrt']
            },
            "AdaBoost": {
                'n_estimators': [50, 100],
                'learning_rate': [0.1, 1.0],
                # 'algorithm': ['SAMME.R']
            },
            "Gradient Boosting": {
                'n_estimators': [50, 100],
                'learning_rate': [0.1],
                'max_depth': [3, 5],
                'min_samples_split': [2],
                'min_samples_leaf': [1],
                'max_features': ['sqrt']
            },
            "logistic regression": {
                'C': [1.0, 10.0],
                'penalty': ['l2'],
                'solver': ['liblinear']
            }
        }

        model_report: dict = evaluate_models(
            x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, models=models, params=params)

        # Best score, then the model name that produced it.
        best_model_score = max(sorted(model_report.values()))
        best_model_name = list(model_report.keys())[
            list(model_report.values()).index(best_model_score)
        ]
        logging.info(f"best model name: {best_model_name}")
        best_model = models[best_model_name]

        y_train_pred = best_model.predict(x_train)
        classification_train_metric = classification_score(y_true=y_train, y_pred=y_train_pred)

        # Track training metrics in MLflow.
        self.track_mlflow(best_model, classification_train_metric)

        y_test_pred = best_model.predict(x_test)
        classification_test_metric = classification_score(y_true=y_test, y_pred=y_test_pred)

        preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)
        model_dir_path = os.path.dirname(self.model_trainer_config.trained_model_file_path)
        os.makedirs(model_dir_path, exist_ok=True)

        # Bundle preprocessor + estimator so inference gets both in one object.
        network_model = NetworkSecurityModel(preprocessing_object=preprocessor, trained_model_object=best_model)
        save_object(self.model_trainer_config.trained_model_file_path, obj=network_model)
        save_object("final_model/model.pkl", best_model)

        model_trainer_artifact = ModelTrainerArtifact(trained_model_file_path=self.model_trainer_config.trained_model_file_path, train_metric_artifact=classification_train_metric, test_metric_artifact=classification_test_metric)
        logging.info(f"Model trainer artifact: {model_trainer_artifact}")
        return model_trainer_artifact

    def initiate_model_trainer(self) -> "ModelTrainerArtifact":
        """Load transformed arrays, split features/target, train, and return the artifact."""
        try:
            train_file_path = self.data_transformation_artifact.transformed_train_file_path
            test_file_path = self.data_transformation_artifact.transformed_test_file_path

            # Loading training array and testing array.
            train_array = load_numpy_array_data(train_file_path)
            test_array = load_numpy_array_data(test_file_path)
            # The target is stored as the last column of each array.
            x_train, y_train, x_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            model_trainer_artifact = self.train_model(x_train, y_train, x_test=x_test, y_test=y_test)
            # BUG FIX: the artifact was computed but never returned — callers
            # previously always received None from this method.
            return model_trainer_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
src/constant/__init__.py ADDED
File without changes
src/constant/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (167 Bytes). View file
 
src/constant/training_pipeline/__init__.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import numpy as np

"""
Common constant variable
"""
TARGET_COLUMN = "Result"
PIPELINE_NAME: str = "NetworkSecurity"
ARTIFACT_DIR: str = "Artifacts"
FILE_NAME: str = "phisingData.csv"

TRAIN_FILE_NAME: str = "train.csv"
TEST_FILE_NAME: str = "test.csv"
SCHEMA_FILE_PATH = os.path.join("data_schema", "schema.yaml")

SAVED_MODEL_DIR = os.path.join("saved_models")
MODEL_FILE_NAME = "model.pkl"

"""
Data ingestion variable
"""
DATA_INGESTION_COLLECTION_NAME: str = "phising_data"
DATA_INGESTION_DATBASE_NANE: str = "Network_data"
DATA_INGESTION_DIR_NAME: str = "data_ingestion"
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
DATA_INGESTION_INGESTED_DIR: str = "ingested"
DATA_INGESTION_TRAIN_TEST_SPLIT_RATION: float = 0.2

# Backward-compatible, correctly-spelled aliases for the misspelled names
# above (the originals are kept because other modules reference them).
DATA_INGESTION_DATABASE_NAME: str = DATA_INGESTION_DATBASE_NANE
DATA_INGESTION_TRAIN_TEST_SPLIT_RATIO: float = DATA_INGESTION_TRAIN_TEST_SPLIT_RATION

"""
Data validation related constants start with DATA_VALIDATION VAR NAME
"""
DATA_VALIDATION_DIR_NAMR: str = "data_validation"
DATA_VALIDATION_VALID_DIR: str = "validated"
DATA_VALIDATION_INVALID_DIR: str = "invalid"
DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"

# Backward-compatible, correctly-spelled alias (original kept for callers).
DATA_VALIDATION_DIR_NAME: str = DATA_VALIDATION_DIR_NAMR

"""
Data transformation related constants start with DATA_TRANSFORMATION VAR NAME
"""
DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
DATA_TRANSFORMATION_TRANSFORMED_DIR_NAME: str = "transformed"
DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR: str = "transformed_object"
PREPROCESSING_OBJECT_FILE_NAME: str = "preprocessing.pkl"
# Parameters for the KNN imputer used in data transformation.
DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {
    "missing_values": np.nan,
    "n_neighbors": 3,
    "weights": "uniform"
}

"""
Model trainer related constants start with MODEL_TRAINER VAR NAME
"""
MODEL_TRAINER_DIR_NAME: str = "model_trainer"
MODEL_TRAINER_MODEL_DIR: str = "trained_model"
MODEL_TRAINER_MODEL_NAME: str = "model.pkl"
MODEL_TRAINER_EXPECTED_SCORE: float = 0.6
MODEL_TRAINER_OVERFITTING_UNDERFITTING_THRESHOLD: float = 0.05

TRAINING_BUCKET_NAME = "networksecuritymodelbucket"
src/constant/training_pipeline/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
src/data/__init__.py ADDED
File without changes
src/data/sqlite_manager.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import pandas as pd
3
+ import os
4
+ from datetime import datetime
5
+ from src.exception.exception import NetworkSecurityException
6
+ from src.logging.logger import logging
7
+ import sys
8
+
9
class PhishingDataManager:
    """SQLite-backed store for phishing samples plus training-run metadata.

    Supports incremental data collection: new rows arrive untrained
    (used_in_training = 0) and are flipped to trained after a training run.
    """

    def __init__(self, db_path="data/phishing_data.db"):
        """Open (creating if needed) the SQLite database for phishing data."""
        try:
            self.db_path = db_path
            parent_dir = os.path.dirname(db_path)
            # BUG FIX: os.makedirs("") raises FileNotFoundError, which broke
            # bare filenames and ":memory:" databases; only create a directory
            # when the path actually has one.
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            # check_same_thread=False allows use from other threads; callers
            # are responsible for serializing access.
            self.conn = sqlite3.connect(db_path, check_same_thread=False)
            self._create_tables()
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def _create_tables(self):
        """Create phishing data table and metadata table"""
        try:
            cursor = self.conn.cursor()

            # Main data table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS phishing_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    having_IP_Address INTEGER,
                    URL_Length INTEGER,
                    Shortining_Service INTEGER,
                    having_At_Symbol INTEGER,
                    double_slash_redirecting INTEGER,
                    Prefix_Suffix INTEGER,
                    having_Sub_Domain INTEGER,
                    SSLfinal_State INTEGER,
                    Domain_registeration_length INTEGER,
                    Favicon INTEGER,
                    port INTEGER,
                    HTTPS_token INTEGER,
                    Request_URL INTEGER,
                    URL_of_Anchor INTEGER,
                    Links_in_tags INTEGER,
                    SFH INTEGER,
                    Submitting_to_email INTEGER,
                    Abnormal_URL INTEGER,
                    Redirect INTEGER,
                    on_mouseover INTEGER,
                    RightClick INTEGER,
                    popUpWidnow INTEGER,
                    Iframe INTEGER,
                    age_of_domain INTEGER,
                    DNSRecord INTEGER,
                    web_traffic INTEGER,
                    Page_Rank INTEGER,
                    Google_Index INTEGER,
                    Links_pointing_to_page INTEGER,
                    Statistical_report INTEGER,
                    Result INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    used_in_training BOOLEAN DEFAULT 0
                )
            """)

            # Training metadata table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS training_metadata (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    training_timestamp TIMESTAMP,
                    data_count INTEGER,
                    model_accuracy REAL,
                    model_version TEXT,
                    artifact_path TEXT
                )
            """)

            self.conn.commit()
            logging.info("Database tables created successfully")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def insert_data_from_csv(self, csv_path):
        """Bulk insert from CSV (initial load).

        NOTE(review): despite the original comment, this does NOT deduplicate —
        re-running it appends the same rows again.
        """
        try:
            df = pd.read_csv(csv_path)
            df.replace({"na": None}, inplace=True)

            df.to_sql('phishing_data', self.conn, if_exists='append', index=False)
            logging.info(f"Inserted {len(df)} records from CSV")
            return len(df)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def add_new_samples(self, data_dict_list):
        """Add new phishing samples incrementally (list of column->value dicts)."""
        try:
            df = pd.DataFrame(data_dict_list)
            df.to_sql('phishing_data', self.conn, if_exists='append', index=False)
            logging.info(f"Added {len(df)} new samples")
            return len(df)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_training_data(self, include_new_only=False):
        """Fetch data for training; metadata columns are stripped."""
        try:
            if include_new_only:
                # Only get data not used in training yet
                query = "SELECT * FROM phishing_data WHERE used_in_training = 0"
            else:
                # Get all data
                query = "SELECT * FROM phishing_data"

            df = pd.read_sql_query(query, self.conn)

            # Drop bookkeeping columns so only features + target remain.
            df = df.drop(['id', 'created_at', 'used_in_training'], axis=1, errors='ignore')

            logging.info(f"Fetched {len(df)} records for training")
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def mark_data_as_trained(self):
        """Mark all currently-untrained rows as used in training."""
        try:
            cursor = self.conn.cursor()
            cursor.execute("UPDATE phishing_data SET used_in_training = 1 WHERE used_in_training = 0")
            self.conn.commit()
            logging.info("Marked data as trained")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_new_data_count(self):
        """Count untrained samples"""
        try:
            cursor = self.conn.cursor()
            result = cursor.execute("SELECT COUNT(*) FROM phishing_data WHERE used_in_training = 0").fetchone()
            return result[0]
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def log_training_run(self, data_count, accuracy, version, artifact_path):
        """Log training metadata for one run."""
        try:
            cursor = self.conn.cursor()
            cursor.execute("""
                INSERT INTO training_metadata (training_timestamp, data_count, model_accuracy, model_version, artifact_path)
                VALUES (?, ?, ?, ?, ?)
            """, (datetime.now(), data_count, accuracy, version, artifact_path))
            self.conn.commit()
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def should_retrain(self, threshold=100):
        """Return True when at least `threshold` untrained samples exist."""
        new_count = self.get_new_data_count()
        return new_count >= threshold

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()
src/entity/__init__.py ADDED
File without changes
src/entity/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (165 Bytes). View file
 
src/entity/__pycache__/artifact_entity.cpython-310.pyc ADDED
Binary file (1.56 kB). View file
 
src/entity/__pycache__/config_entity.cpython-310.pyc ADDED
Binary file (3.71 kB). View file
 
src/entity/artifact_entity.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass

@dataclass
class DataIngestionArtifact:
    """Output of the data-ingestion stage: where the train/test CSVs landed."""
    train_file_path: str
    test_file_path: str

@dataclass
class DataValidationArtifact:
    """Outcome of schema/drift validation plus the file locations involved."""
    validation_status: bool
    valid_train_file_path: str
    valid_test_file_path: str
    invalid_train_file_path: str
    invalid_test_file_path: str
    drift_report_file_path: str


@dataclass
class DataTransformationArtifact:
    """Locations of the fitted preprocessor object and transformed train/test arrays."""
    transformed_object_file_path: str
    transformed_train_file_path: str
    transformed_test_file_path: str

@dataclass
class ClassificationMetricArtifact:
    """Bundle of classification metrics computed for one dataset split."""
    f1_score: float
    precision_score: float
    recall_score: float

@dataclass
class ModelTrainerArtifact:
    """Trained-model location plus the metric bundles for train and test splits."""
    trained_model_file_path: str
    train_metric_artifact: ClassificationMetricArtifact
    test_metric_artifact: ClassificationMetricArtifact
src/entity/config_entity.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import os
3
+ from src.constant import training_pipeline
4
+
5
+
6
class TrainingPipelineConfig:
    """Top-level run configuration: pipeline name and timestamped artifact root."""

    def __init__(self, timestamp=None):
        # BUG FIX: the original default `timestamp=datetime.now()` is evaluated
        # ONCE at class-definition time, so every run in the same process would
        # share one timestamp. Use a None sentinel and take "now" per call.
        if timestamp is None:
            timestamp = datetime.now()
        timestamp = timestamp.strftime("%m_%d_%Y_%H_%M_%S")
        self.pipeline_name = training_pipeline.PIPELINE_NAME
        self.artifact_name = training_pipeline.ARTIFACT_DIR
        # All stage artifacts for this run live under Artifacts/<timestamp>.
        self.artifact_dir = os.path.join(self.artifact_name, timestamp)
        self.model_dir = os.path.join("final_model")
        self.timestamp: str = timestamp
14
+
15
class Data_ingestion_config:
    """Filesystem layout and database settings for the data-ingestion stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        tp = training_pipeline
        stage_dir = os.path.join(
            training_pipeline_config.artifact_dir, tp.DATA_INGESTION_DIR_NAME
        )
        ingested_dir = os.path.join(stage_dir, tp.DATA_INGESTION_INGESTED_DIR)

        self.data_ingestion_dir = stage_dir
        self.feature_store_file_path = os.path.join(
            stage_dir, tp.DATA_INGESTION_FEATURE_STORE_DIR, tp.FILE_NAME
        )
        self.train_file_path = os.path.join(ingested_dir, tp.TRAIN_FILE_NAME)
        self.test_file_path = os.path.join(ingested_dir, tp.TEST_FILE_NAME)
        self.database_name = tp.DATA_INGESTION_DATBASE_NANE
        self.collection_name = tp.DATA_INGESTION_COLLECTION_NAME
        self.train_test_split_ratio = tp.DATA_INGESTION_TRAIN_TEST_SPLIT_RATION
24
+
25
class Data_validation_config:
    """Resolves valid/invalid data directories and the drift-report path."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        tp = training_pipeline
        stage_dir = os.path.join(training_pipeline_config.artifact_dir, tp.DATA_VALIDATION_DIR_NAMR)
        valid_dir = os.path.join(stage_dir, tp.DATA_VALIDATION_VALID_DIR)
        invalid_dir = os.path.join(stage_dir, tp.DATA_VALIDATION_INVALID_DIR)

        self.data_validation_dir: str = stage_dir
        self.valid_data_dir: str = valid_dir
        self.invalid_data_dir: str = invalid_dir
        self.valid_train_file_path: str = os.path.join(valid_dir, tp.TRAIN_FILE_NAME)
        self.valid_test_file_path: str = os.path.join(valid_dir, tp.TEST_FILE_NAME)
        self.invalid_train_file_path: str = os.path.join(invalid_dir, tp.TRAIN_FILE_NAME)
        self.invalid_test_file_path: str = os.path.join(invalid_dir, tp.TEST_FILE_NAME)
        self.drift_report_file_path: str = os.path.join(
            stage_dir, tp.DATA_VALIDATION_DRIFT_REPORT_DIR, tp.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME
        )
35
+
36
+
37
class Data_transformation_config:
    """Resolves transformed-array and preprocessor-object output paths."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        tp = training_pipeline
        stage_dir = os.path.join(training_pipeline_config.artifact_dir, tp.DATA_TRANSFORMATION_DIR_NAME)
        transformed_dir = os.path.join(stage_dir, tp.DATA_TRANSFORMATION_TRANSFORMED_DIR_NAME)

        self.data_transformation_dir: str = stage_dir
        # Transformed splits are saved as numpy arrays, so swap the csv extension.
        self.transformed_train_file_path: str = os.path.join(
            transformed_dir, tp.TRAIN_FILE_NAME.replace("csv", "npy")
        )
        self.transformed_test_file_path: str = os.path.join(
            transformed_dir, tp.TEST_FILE_NAME.replace("csv", "npy")
        )
        self.transformed_object_file_path: str = os.path.join(
            stage_dir, tp.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR, tp.PREPROCESSING_OBJECT_FILE_NAME
        )
49
+
50
class Model_trainer_config:
    """Resolves the trained-model path and training acceptance thresholds."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        tp = training_pipeline
        self.model_trainer_dir: str = os.path.join(
            training_pipeline_config.artifact_dir, tp.MODEL_TRAINER_DIR_NAME
        )
        self.trained_model_file_path: str = os.path.join(
            self.model_trainer_dir, tp.MODEL_TRAINER_MODEL_DIR, tp.MODEL_TRAINER_MODEL_NAME
        )
        self.expected_accuracy: float = tp.MODEL_TRAINER_EXPECTED_SCORE
        self.overfitting_underfitting_threshold = tp.MODEL_TRAINER_OVERFITTING_UNDERFITTING_THRESHOLD
60
+
src/exception/__init__.py ADDED
File without changes
src/exception/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (168 Bytes). View file
 
src/exception/__pycache__/exception.cpython-310.pyc ADDED
Binary file (977 Bytes). View file
 
src/exception/exception.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from src.logging import logger
3
+
4
class NetworkSecurityException(Exception):
    """Project-wide exception that records the file and line where the wrapped
    error occurred (taken from the currently-handled exception's traceback)."""

    def __init__(self, error_message, error_details=sys):
        # BUG FIX: call Exception.__init__ so str()/args/pickling behave normally.
        super().__init__(error_message)
        self.error_message = error_message
        _, _, exc_tb = error_details.exc_info()
        # BUG FIX: sys.exc_info() returns (None, None, None) when no exception
        # is being handled; the original crashed with AttributeError here.
        if exc_tb is not None:
            self.line_number = exc_tb.tb_lineno
            self.file_name = exc_tb.tb_frame.f_code.co_filename
        else:
            self.line_number = None
            self.file_name = None

    def __str__(self):
        return f"Error occurred in python script name [{self.file_name}] line number [{self.line_number}] error message [{self.error_message}]"
src/logging/__init__.py ADDED
File without changes
src/logging/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes). View file
 
src/logging/__pycache__/logger.cpython-310.pyc ADDED
Binary file (568 Bytes). View file
 
src/logging/logger.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
from datetime import datetime

# One log file per process start, e.g. "10_31_2025_12_00_00.log".
LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# BUG FIX: the directory was previously built as join(cwd, "logs", LOG_FILE),
# which created a *directory named after the log file* and nested the real log
# inside it (logs/<stamp>.log/<stamp>.log). The log directory is just "logs".
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)
LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[%(asctime)s] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO
)