Spaces:
Sleeping
Sleeping
Commit ·
a21e473
0
Parent(s):
Clean deployment without binary files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +35 -0
- .github/workflows/main.yml +92 -0
- .gitignore +14 -0
- .python-version +1 -0
- Dockerfile +31 -0
- README.md +10 -0
- app.py +75 -0
- data_schema/schema.yaml +65 -0
- data_to_mongo.py +73 -0
- load_data_to_sqlite.py +15 -0
- main.py +41 -0
- pyproject.toml +20 -0
- requirements.txt +14 -0
- setup.py +26 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/cloud/__init__.py +0 -0
- src/cloud/__pycache__/__init__.cpython-310.pyc +0 -0
- src/cloud/__pycache__/s3_syncer.cpython-310.pyc +0 -0
- src/cloud/s3_syncer.py +45 -0
- src/components/__init__.py +0 -0
- src/components/__pycache__/__init__.cpython-310.pyc +0 -0
- src/components/__pycache__/data_ingestion.cpython-310.pyc +0 -0
- src/components/__pycache__/data_transformation.cpython-310.pyc +0 -0
- src/components/__pycache__/data_validation.cpython-310.pyc +0 -0
- src/components/__pycache__/model_trainer.cpython-310.pyc +0 -0
- src/components/data_ingestion.py +80 -0
- src/components/data_transformation.py +93 -0
- src/components/data_validation.py +104 -0
- src/components/model_trainer.py +140 -0
- src/constant/__init__.py +0 -0
- src/constant/__pycache__/__init__.cpython-310.pyc +0 -0
- src/constant/training_pipeline/__init__.py +64 -0
- src/constant/training_pipeline/__pycache__/__init__.cpython-310.pyc +0 -0
- src/data/__init__.py +0 -0
- src/data/sqlite_manager.py +162 -0
- src/entity/__init__.py +0 -0
- src/entity/__pycache__/__init__.cpython-310.pyc +0 -0
- src/entity/__pycache__/artifact_entity.cpython-310.pyc +0 -0
- src/entity/__pycache__/config_entity.cpython-310.pyc +0 -0
- src/entity/artifact_entity.py +34 -0
- src/entity/config_entity.py +60 -0
- src/exception/__init__.py +0 -0
- src/exception/__pycache__/__init__.cpython-310.pyc +0 -0
- src/exception/__pycache__/exception.cpython-310.pyc +0 -0
- src/exception/exception.py +12 -0
- src/logging/__init__.py +0 -0
- src/logging/__pycache__/__init__.cpython-310.pyc +0 -0
- src/logging/__pycache__/logger.cpython-310.pyc +0 -0
- src/logging/logger.py +14 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/main.yml
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: workflow
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches:
|
| 6 |
+
- main
|
| 7 |
+
paths-ignore:
|
| 8 |
+
- 'README.md'
|
| 9 |
+
|
| 10 |
+
jobs:
|
| 11 |
+
integration:
|
| 12 |
+
name: Continuous Integration
|
| 13 |
+
runs-on: ubuntu-latest
|
| 14 |
+
steps:
|
| 15 |
+
- name: Checkout Code
|
| 16 |
+
uses: actions/checkout@v3
|
| 17 |
+
|
| 18 |
+
- name: Lint code
|
| 19 |
+
run: echo "Linting repository"
|
| 20 |
+
|
| 21 |
+
- name: Run unit tests
|
| 22 |
+
run: echo "Running unit tests"
|
| 23 |
+
|
| 24 |
+
build-and-push-ecr-image:
|
| 25 |
+
name: Continuous Delivery
|
| 26 |
+
needs: integration
|
| 27 |
+
runs-on: ubuntu-latest
|
| 28 |
+
steps:
|
| 29 |
+
- name: Checkout Code
|
| 30 |
+
uses: actions/checkout@v3
|
| 31 |
+
|
| 32 |
+
- name: Install Utilities
|
| 33 |
+
run: |
|
| 34 |
+
sudo apt-get update
|
| 35 |
+
sudo apt-get install -y jq unzip
|
| 36 |
+
- name: Configure AWS credentials
|
| 37 |
+
uses: aws-actions/configure-aws-credentials@v1
|
| 38 |
+
with:
|
| 39 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
| 40 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
| 41 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
| 42 |
+
|
| 43 |
+
- name: Login to Amazon ECR
|
| 44 |
+
id: login-ecr
|
| 45 |
+
uses: aws-actions/amazon-ecr-login@v1
|
| 46 |
+
|
| 47 |
+
- name: Build, tag, and push image to ECR
|
| 48 |
+
id: build-image
|
| 49 |
+
env:
|
| 50 |
+
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
|
| 51 |
+
ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY_NAME }}
|
| 52 |
+
IMAGE_TAG: latest
|
| 53 |
+
run: |
|
| 54 |
+
echo "ECR_REGISTRY: $ECR_REGISTRY"
|
| 55 |
+
echo "ECR_REPOSITORY: $ECR_REPOSITORY"
|
| 56 |
+
echo "IMAGE_TAG: $IMAGE_TAG"
|
| 57 |
+
docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
|
| 58 |
+
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
|
| 59 |
+
echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
|
| 60 |
+
Continuous-Deployment:
|
| 61 |
+
needs: build-and-push-ecr-image
|
| 62 |
+
runs-on: self-hosted
|
| 63 |
+
steps:
|
| 64 |
+
- name: Checkout
|
| 65 |
+
uses: actions/checkout@v3
|
| 66 |
+
|
| 67 |
+
- name: Configure AWS credentials
|
| 68 |
+
uses: aws-actions/configure-aws-credentials@v1
|
| 69 |
+
with:
|
| 70 |
+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
| 71 |
+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
| 72 |
+
aws-region: ${{ secrets.AWS_REGION }}
|
| 73 |
+
|
| 74 |
+
- name: Login to Amazon ECR
|
| 75 |
+
id: login-ecr
|
| 76 |
+
uses: aws-actions/amazon-ecr-login@v1
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
- name: Pull latest images
|
| 80 |
+
run: |
|
| 81 |
+
docker pull ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
|
| 82 |
+
|
| 83 |
+
#- name: Stop and remove container if running
|
| 84 |
+
# run: |
|
| 85 |
+
# docker ps -q --filter "name=networksecurity" | grep -q . && docker stop networksecurity && docker rm -fv networksecurity
|
| 86 |
+
|
| 87 |
+
- name: Run Docker Image to serve users
|
| 88 |
+
run: |
|
| 89 |
+
docker run -d -p 8080:8080 --ipc="host" --name=networksecurity -e 'AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }}' -e 'AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }}' -e 'AWS_REGION=${{ secrets.AWS_REGION }}' ${{secrets.AWS_ECR_LOGIN_URI}}/${{ secrets.ECR_REPOSITORY_NAME }}:latest
|
| 90 |
+
- name: Clean previous images and containers
|
| 91 |
+
run: |
|
| 92 |
+
docker system prune -f
|
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
.venv/
|
| 3 |
+
pycache/
|
| 4 |
+
__pycache__/
|
| 5 |
+
.ipynb_checkpoints
|
| 6 |
+
.vscode/
|
| 7 |
+
.DS_Store
|
| 8 |
+
*.pyc
|
| 9 |
+
logs/
|
| 10 |
+
*.log
|
| 11 |
+
Artifacts/
|
| 12 |
+
s3_sync_folder/
|
| 13 |
+
final_model/
|
| 14 |
+
data/
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.13
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# for aws
|
| 2 |
+
# FROM python:3.10-slim-buster
|
| 3 |
+
# WORKDIR /app
|
| 4 |
+
# COPY . /app
|
| 5 |
+
# RUN apt update -y && apt install awscli -y
|
| 6 |
+
# RUN apt-get update && pip install -r requirements.txt
|
| 7 |
+
# CMD ["python3", "app.py"]
|
| 8 |
+
|
| 9 |
+
FROM python:3.13-slim
|
| 10 |
+
|
| 11 |
+
RUN useradd -m -u 1000 user
|
| 12 |
+
USER user
|
| 13 |
+
ENV PATH="/home/user/.local/bin:$PATH"
|
| 14 |
+
|
| 15 |
+
WORKDIR /app
|
| 16 |
+
|
| 17 |
+
# Copy requirements
|
| 18 |
+
COPY --chown=user ./requirements.txt requirements.txt
|
| 19 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy all application files
|
| 22 |
+
COPY --chown=user . /app
|
| 23 |
+
|
| 24 |
+
# Create necessary directories
|
| 25 |
+
RUN mkdir -p /app/data /app/final_model /app/templates
|
| 26 |
+
|
| 27 |
+
# Expose port 7860 (HF Space requirement)
|
| 28 |
+
EXPOSE 7860
|
| 29 |
+
|
| 30 |
+
# Run the application
|
| 31 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: NSS
|
| 3 |
+
emoji: 🏃
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os,sys
|
| 2 |
+
import certifi
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from src.exception.exception import NetworkSecurityException
|
| 5 |
+
from src.logging.logger import logging
|
| 6 |
+
from src.pipeline.training_pipeline import Trainingpipeline
|
| 7 |
+
from fastapi import FastAPI, File, UploadFile, Request
|
| 8 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from uvicorn import run as app_run
|
| 10 |
+
from fastapi.responses import Response
|
| 11 |
+
from starlette.responses import RedirectResponse
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
|
| 14 |
+
ca = certifi.where()
|
| 15 |
+
load_dotenv()
|
| 16 |
+
mongo_db_uri = os.getenv("MONGO_DB_URI")
|
| 17 |
+
|
| 18 |
+
from src.constant.training_pipeline import DATA_INGESTION_COLLECTION_NAME
|
| 19 |
+
from src.constant.training_pipeline import DATA_INGESTION_DATBASE_NANE
|
| 20 |
+
from src.utils.main_utils.utils import load_object
|
| 21 |
+
# import pymongo
|
| 22 |
+
|
| 23 |
+
# client = pymongo.MongoClient(mongo_db_uri,tlsCAFile=ca)
|
| 24 |
+
# database = client[DATA_INGESTION_DATBASE_NANE]
|
| 25 |
+
# collection = database[DATA_INGESTION_COLLECTION_NAME]
|
| 26 |
+
from fastapi.templating import Jinja2Templates
|
| 27 |
+
templates = Jinja2Templates(directory="./templates")
|
| 28 |
+
app = FastAPI()
|
| 29 |
+
orgin = ["*"]
|
| 30 |
+
|
| 31 |
+
app.add_middleware(
|
| 32 |
+
CORSMiddleware,
|
| 33 |
+
allow_origins=orgin,
|
| 34 |
+
allow_credentials=True,
|
| 35 |
+
allow_methods=["*"],
|
| 36 |
+
allow_headers=["*"],
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
# @app.get("/", tags = ["authentication"])
|
| 40 |
+
# async def index():
|
| 41 |
+
# return RedirectResponse(url="/docs")
|
| 42 |
+
|
| 43 |
+
@app.get("/train")
|
| 44 |
+
async def training_route():
|
| 45 |
+
try:
|
| 46 |
+
training_pipeline = Trainingpipeline()
|
| 47 |
+
training_pipeline.run_pipeline()
|
| 48 |
+
return Response("Training successfull !!")
|
| 49 |
+
except Exception as e:
|
| 50 |
+
raise NetworkSecurityException(e, sys)
|
| 51 |
+
|
| 52 |
+
@app.post("/predict") # predict route
|
| 53 |
+
async def predict_route(request: Request, file: UploadFile =File(...)):
|
| 54 |
+
try:
|
| 55 |
+
df = pd.read_csv(file.file)
|
| 56 |
+
# Remove target column if it exists
|
| 57 |
+
if 'Result' in df.columns:
|
| 58 |
+
df = df.drop(columns=['Result'])
|
| 59 |
+
preprocessor = load_object(file_path = "final_model/preprocessor.pkl")
|
| 60 |
+
model = load_object(file_path= "final_model/model.pkl")
|
| 61 |
+
NSmodel = NetworkSecurityModel(preprocessing_object= preprocessor, trained_model_object= model)
|
| 62 |
+
print(df.iloc[0])
|
| 63 |
+
y_pred = NSmodel.predict(df)
|
| 64 |
+
print(y_pred)
|
| 65 |
+
df['predicted_column'] = y_pred
|
| 66 |
+
print(df['predicted_column'])
|
| 67 |
+
df.to_csv("final_model/predicted.csv")
|
| 68 |
+
table_html = df.to_html(classes = 'table table-striped')
|
| 69 |
+
return templates.TemplateResponse("table.html", {"request": request, "table": table_html})
|
| 70 |
+
|
| 71 |
+
except Exception as e:
|
| 72 |
+
raise NetworkSecurityException(e, sys)
|
| 73 |
+
if __name__ == "__main__":
|
| 74 |
+
app_run(app, host="0.0.0.0", port=8080)
|
| 75 |
+
|
data_schema/schema.yaml
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
columns:
|
| 2 |
+
- having_IP_Address: int64
|
| 3 |
+
- URL_Length: int64
|
| 4 |
+
- Shortining_Service: int64
|
| 5 |
+
- having_At_Symbol: int64
|
| 6 |
+
- double_slash_redirecting: int64
|
| 7 |
+
- Prefix_Suffix: int64
|
| 8 |
+
- having_Sub_Domain: int64
|
| 9 |
+
- SSLfinal_State: int64
|
| 10 |
+
- Domain_registeration_length: int64
|
| 11 |
+
- Favicon: int64
|
| 12 |
+
- port: int64
|
| 13 |
+
- HTTPS_token: int64
|
| 14 |
+
- Request_URL: int64
|
| 15 |
+
- URL_of_Anchor: int64
|
| 16 |
+
- Links_in_tags: int64
|
| 17 |
+
- SFH: int64
|
| 18 |
+
- Submitting_to_email: int64
|
| 19 |
+
- Abnormal_URL: int64
|
| 20 |
+
- Redirect: int64
|
| 21 |
+
- on_mouseover: int64
|
| 22 |
+
- RightClick: int64
|
| 23 |
+
- popUpWidnow: int64
|
| 24 |
+
- Iframe: int64
|
| 25 |
+
- age_of_domain: int64
|
| 26 |
+
- DNSRecord: int64
|
| 27 |
+
- web_traffic: int64
|
| 28 |
+
- Page_Rank: int64
|
| 29 |
+
- Google_Index: int64
|
| 30 |
+
- Links_pointing_to_page: int64
|
| 31 |
+
- Statistical_report: int64
|
| 32 |
+
- Result: int64
|
| 33 |
+
|
| 34 |
+
numerical_columns:
|
| 35 |
+
- having_IP_Address
|
| 36 |
+
- URL_Length
|
| 37 |
+
- Shortining_Service
|
| 38 |
+
- having_At_Symbol
|
| 39 |
+
- double_slash_redirecting
|
| 40 |
+
- Prefix_Suffix
|
| 41 |
+
- having_Sub_Domain
|
| 42 |
+
- SSLfinal_State
|
| 43 |
+
- Domain_registeration_length
|
| 44 |
+
- Favicon
|
| 45 |
+
- port
|
| 46 |
+
- HTTPS_token
|
| 47 |
+
- Request_URL
|
| 48 |
+
- URL_of_Anchor
|
| 49 |
+
- Links_in_tags
|
| 50 |
+
- SFH
|
| 51 |
+
- Submitting_to_email
|
| 52 |
+
- Abnormal_URL
|
| 53 |
+
- Redirect
|
| 54 |
+
- on_mouseover
|
| 55 |
+
- RightClick
|
| 56 |
+
- popUpWidnow
|
| 57 |
+
- Iframe
|
| 58 |
+
- age_of_domain
|
| 59 |
+
- DNSRecord
|
| 60 |
+
- web_traffic
|
| 61 |
+
- Page_Rank
|
| 62 |
+
- Google_Index
|
| 63 |
+
- Links_pointing_to_page
|
| 64 |
+
- Statistical_report
|
| 65 |
+
- Result
|
data_to_mongo.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Test code
|
| 2 |
+
# import os
|
| 3 |
+
# from pymongo.mongo_client import MongoClient
|
| 4 |
+
# from dotenv import load_dotenv
|
| 5 |
+
# load_dotenv()
|
| 6 |
+
# uri = os.getenv("MONGODB_URL")
|
| 7 |
+
|
| 8 |
+
# # Create a new client and connect to the server
|
| 9 |
+
# client = MongoClient(uri)
|
| 10 |
+
|
| 11 |
+
# # Send a ping to confirm a successful connection
|
| 12 |
+
# try:
|
| 13 |
+
# client.admin.command('ping')
|
| 14 |
+
# print("Pinged your deployment. You successfully connected to MongoDB!")
|
| 15 |
+
# except Exception as e:
|
| 16 |
+
# print(e)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import json
|
| 22 |
+
import certifi
|
| 23 |
+
import pandas as pd
|
| 24 |
+
import numpy as np
|
| 25 |
+
# import pymongo
|
| 26 |
+
from src.exception.exception import NetworkSecurityException
|
| 27 |
+
from src.logging.logger import logging
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
MONGODB_URL = os.getenv("MONGODB_URL")
|
| 32 |
+
print(MONGODB_URL)
|
| 33 |
+
ca = certifi.where() # ca= certified authority
|
| 34 |
+
|
| 35 |
+
class NetworkDataExtract():
|
| 36 |
+
def __init__(self):
|
| 37 |
+
try:
|
| 38 |
+
pass
|
| 39 |
+
except Exception as e:
|
| 40 |
+
raise NetworkSecurityException(e, sys)
|
| 41 |
+
def csv_to_json_converter(self, file_path):
|
| 42 |
+
try:
|
| 43 |
+
data = pd.read_csv(file_path)
|
| 44 |
+
data.reset_index(drop=True, inplace=True)
|
| 45 |
+
records = list(json.loads(data.T.to_json()).values())
|
| 46 |
+
return records
|
| 47 |
+
except Exception as e:
|
| 48 |
+
raise NetworkSecurityException(e, sys)
|
| 49 |
+
|
| 50 |
+
def insert_data_to_mongodb(self, records, database, collection):
|
| 51 |
+
try:
|
| 52 |
+
self.database = database
|
| 53 |
+
self.collection = collection
|
| 54 |
+
self.records = records
|
| 55 |
+
|
| 56 |
+
self.mongo_client = pymongo.MongoClient(MONGODB_URL)
|
| 57 |
+
self.database = self.mongo_client[self.database]
|
| 58 |
+
|
| 59 |
+
self.collection = self.database[self.collection ]
|
| 60 |
+
self.collection.insert_many(self.records)
|
| 61 |
+
return(len(self.records))
|
| 62 |
+
except Exception as e:
|
| 63 |
+
raise NetworkSecurityException(e, sys)
|
| 64 |
+
|
| 65 |
+
if __name__ == "__main__":
|
| 66 |
+
FILE_PATH = "data\phisingData.csv"
|
| 67 |
+
DATABASE = "Network_data"
|
| 68 |
+
Collection = "phising_data"
|
| 69 |
+
obj = NetworkDataExtract()
|
| 70 |
+
records = obj.csv_to_json_converter(file_path=FILE_PATH)
|
| 71 |
+
print("records converted to json")
|
| 72 |
+
noOfRecords = obj.insert_data_to_mongodb(records,DATABASE, Collection)
|
| 73 |
+
print(noOfRecords)
|
load_data_to_sqlite.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.data.sqlite_manager import PhishingDataManager
|
| 2 |
+
|
| 3 |
+
if __name__ == "__main__":
|
| 4 |
+
FILE_PATH = "data/phisingData.csv"
|
| 5 |
+
|
| 6 |
+
print("Initializing SQLite database...")
|
| 7 |
+
db_manager = PhishingDataManager()
|
| 8 |
+
|
| 9 |
+
print("Loading data from CSV...")
|
| 10 |
+
count = db_manager.insert_data_from_csv(FILE_PATH)
|
| 11 |
+
|
| 12 |
+
print(f"✅ Successfully loaded {count} records into SQLite database!")
|
| 13 |
+
print(f"Database location: data/phishing_data.db")
|
| 14 |
+
|
| 15 |
+
db_manager.close()
|
main.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.components.data_ingestion import DataIngestion
|
| 2 |
+
from src.components.data_validation import DataValidation
|
| 3 |
+
from src.components.data_transformation import DataTransformation
|
| 4 |
+
from src.exception.exception import NetworkSecurityException
|
| 5 |
+
from src.logging.logger import logging
|
| 6 |
+
from src.entity.config_entity import Data_ingestion_config, TrainingPipelineConfig, Data_validation_config, Data_transformation_config, Model_trainer_config
|
| 7 |
+
from src.components.model_trainer import ModelTrainer
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
|
| 12 |
+
try:
|
| 13 |
+
traingning_pipeline_config = TrainingPipelineConfig()
|
| 14 |
+
data_ingestion_config = Data_ingestion_config(traingning_pipeline_config)
|
| 15 |
+
Data_ingestion = DataIngestion(data_ingestion_config)
|
| 16 |
+
logging.info("Data ingestion started")
|
| 17 |
+
data_ingestion_artifacts = Data_ingestion.initiate_data_ingestion()
|
| 18 |
+
logging.info("Data ingestion completed")
|
| 19 |
+
print("Data ingestion completed")
|
| 20 |
+
|
| 21 |
+
data_validation_config = Data_validation_config(traingning_pipeline_config)
|
| 22 |
+
Data_validation = DataValidation(data_ingestion_artifacts, data_validation_config)
|
| 23 |
+
logging.info("Data validation started")
|
| 24 |
+
data_validation_artifacts = Data_validation.intiate_data_validation()
|
| 25 |
+
logging.info("Data validation completed")
|
| 26 |
+
print(data_validation_artifacts)
|
| 27 |
+
|
| 28 |
+
data_transformation_config = Data_transformation_config(traingning_pipeline_config)
|
| 29 |
+
logging.info("data Transformation started")
|
| 30 |
+
data_transformation = DataTransformation(data_validation_artifacts, data_transformation_config)
|
| 31 |
+
data_transformation_artifact = data_transformation.initiate_data_transformation()
|
| 32 |
+
print(data_transformation_artifact)
|
| 33 |
+
logging.info("data Transformation completed")
|
| 34 |
+
|
| 35 |
+
logging.info("Model training started")
|
| 36 |
+
model_trainer_config = Model_trainer_config(traingning_pipeline_config)
|
| 37 |
+
model_trainer = ModelTrainer(model_trainer_config=model_trainer_config, data_transformation_artifact=data_transformation_artifact)
|
| 38 |
+
model_trainer_artifact = model_trainer.initiate_model_trainer()
|
| 39 |
+
logging.info("Model training completed")
|
| 40 |
+
except Exception as e:
|
| 41 |
+
raise NetworkSecurityException(e, sys)
|
pyproject.toml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "NetworkSecuritySystemMLProject"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Add your description here"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.13"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"certifi>=2026.1.4",
|
| 9 |
+
"dagshub>=0.6.4",
|
| 10 |
+
"dill>=0.4.0",
|
| 11 |
+
"fastapi>=0.128.0",
|
| 12 |
+
"mlflow>=3.8.1",
|
| 13 |
+
"numpy>=2.4.1",
|
| 14 |
+
"pandas>=2.3.3",
|
| 15 |
+
"pyaml>=25.7.0",
|
| 16 |
+
"pymongo[srv]==3.6",
|
| 17 |
+
"python-dotenv>=1.2.1",
|
| 18 |
+
"python-multipart>=0.0.21",
|
| 19 |
+
"uvicorn>=0.40.0",
|
| 20 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-dotenv
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
pymongo[srv]==3.6
|
| 5 |
+
pymongo
|
| 6 |
+
certifi
|
| 7 |
+
dill
|
| 8 |
+
mlflow
|
| 9 |
+
pyaml
|
| 10 |
+
dagshub
|
| 11 |
+
fastapi
|
| 12 |
+
uvicorn
|
| 13 |
+
python-multipart
|
| 14 |
+
# -e .
|
setup.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import find_packages, setup
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
def get_requirements()-> List[str]:
|
| 5 |
+
"""This function is going to return list of requirements
|
| 6 |
+
"""
|
| 7 |
+
requirement_list:List[str]=[]
|
| 8 |
+
try:
|
| 9 |
+
with open('requirements.txt', 'r') as file:
|
| 10 |
+
lines= file.readlines()
|
| 11 |
+
for line in lines:
|
| 12 |
+
requirement=line.strip()
|
| 13 |
+
if requirement and requirement!= '-e .': # ignore empty line and -e .
|
| 14 |
+
requirement_list.append(requirement)
|
| 15 |
+
|
| 16 |
+
except FileNotFoundError:
|
| 17 |
+
print("Error: requirements.txt file not found.")
|
| 18 |
+
|
| 19 |
+
setup(
|
| 20 |
+
name="Network_Security_system",
|
| 21 |
+
version="0.0.1",
|
| 22 |
+
author="Kshitij",
|
| 23 |
+
author_email="kshitijk146@gmail.com",
|
| 24 |
+
packages=find_packages(),
|
| 25 |
+
install_requires=get_requirements()
|
| 26 |
+
)
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (158 Bytes). View file
|
|
|
src/cloud/__init__.py
ADDED
|
File without changes
|
src/cloud/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
src/cloud/__pycache__/s3_syncer.cpython-310.pyc
ADDED
|
Binary file (721 Bytes). View file
|
|
|
src/cloud/s3_syncer.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
class s3sync:
|
| 4 |
+
def __init__(self, Local_sync_folder = "s3_sync_folder"):
|
| 5 |
+
self.Local_sync_folder = Local_sync_folder
|
| 6 |
+
if not os.path.exists(self.Local_sync_folder):
|
| 7 |
+
os.makedirs(self.Local_sync_folder, exist_ok=True)
|
| 8 |
+
|
| 9 |
+
def sync_folder_to_s3(self, folder, aws_bucket_url):
|
| 10 |
+
# command = (
|
| 11 |
+
# f"aws s3 sync {folder} {aws_bucket_url}"
|
| 12 |
+
# )
|
| 13 |
+
# os.system(command)
|
| 14 |
+
try:
|
| 15 |
+
dest_path= aws_bucket_url.replace("s3://","").replace(aws_bucket_url.split("/")[0] + "/", "")
|
| 16 |
+
destination = os.path.join(self.Local_sync_folder, dest_path)
|
| 17 |
+
|
| 18 |
+
# copy folder
|
| 19 |
+
if os.path.exists(destination):
|
| 20 |
+
shutil.rmtree(destination)
|
| 21 |
+
shutil.copytree(folder, destination)
|
| 22 |
+
print(f"Synced {folder} to {destination}")
|
| 23 |
+
except Exception as e:
|
| 24 |
+
print(f"Error syncing folder to S3: {e}")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def sync_folder_from_s3(self, folder, aws_bucket_url):
|
| 29 |
+
# command = (
|
| 30 |
+
# f"aws s3 sync {aws_bucket_url} {folder}"
|
| 31 |
+
# )
|
| 32 |
+
# os.system(command)
|
| 33 |
+
|
| 34 |
+
try:
|
| 35 |
+
source_path = aws_bucket_url.replace("s3://","").replace(aws_bucket_url.split("/")[0] + "/", "")
|
| 36 |
+
source = os.path.join(self.Local_sync_folder, source_path)
|
| 37 |
+
|
| 38 |
+
if os.path.exists(folder):
|
| 39 |
+
shutil.rmtree(folder)
|
| 40 |
+
shutil.copytree(source, folder)
|
| 41 |
+
print(f"Synced {source} to {folder}")
|
| 42 |
+
except Exception as e:
|
| 43 |
+
print(f"Error syncing folder from S3: {e}")
|
| 44 |
+
|
| 45 |
+
|
src/components/__init__.py
ADDED
|
File without changes
|
src/components/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (169 Bytes). View file
|
|
|
src/components/__pycache__/data_ingestion.cpython-310.pyc
ADDED
|
Binary file (3.31 kB). View file
|
|
|
src/components/__pycache__/data_transformation.cpython-310.pyc
ADDED
|
Binary file (3.64 kB). View file
|
|
|
src/components/__pycache__/data_validation.cpython-310.pyc
ADDED
|
Binary file (3.47 kB). View file
|
|
|
src/components/__pycache__/model_trainer.cpython-310.pyc
ADDED
|
Binary file (4.67 kB). View file
|
|
|
src/components/data_ingestion.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.exception.exception import NetworkSecurityException
|
| 2 |
+
from src.logging.logger import logging
|
| 3 |
+
from src.entity.config_entity import Data_ingestion_config
|
| 4 |
+
from src.entity.artifact_entity import DataIngestionArtifact
|
| 5 |
+
import os, sys
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from typing import List
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
# import pymongo
|
| 11 |
+
import numpy as np
|
| 12 |
+
from src.data.sqlite_manager import PhishingDataManager
|
| 13 |
+
|
| 14 |
+
load_dotenv()
|
| 15 |
+
MONGODB_URL = os.getenv("MONGODB_URL")
|
| 16 |
+
|
| 17 |
+
class DataIngestion:
|
| 18 |
+
def __init__(self, data_ingestion_config: Data_ingestion_config):
|
| 19 |
+
try:
|
| 20 |
+
self.data_ingestion_config = data_ingestion_config
|
| 21 |
+
self.db_manager = PhishingDataManager()
|
| 22 |
+
except Exception as e:
|
| 23 |
+
raise NetworkSecurityException(e, sys)
|
| 24 |
+
|
| 25 |
+
def export_collection_as_dataframe(self):
|
| 26 |
+
"""Export data from SQLite instead of MongoDB"""
|
| 27 |
+
try:
|
| 28 |
+
# Get all training data
|
| 29 |
+
df = self.db_manager.get_training_data(include_new_only=False)
|
| 30 |
+
return df
|
| 31 |
+
except Exception as e:
|
| 32 |
+
raise NetworkSecurityException(e, sys)
|
| 33 |
+
|
| 34 |
+
def move_data_into_feature_store(self, dataframe: pd.DataFrame):
|
| 35 |
+
try:
|
| 36 |
+
feature_store_file = self.data_ingestion_config.feature_store_file_path
|
| 37 |
+
dir_path = os.path.dirname(feature_store_file)
|
| 38 |
+
os.makedirs(dir_path, exist_ok=True)
|
| 39 |
+
dataframe.to_csv(feature_store_file, index=False, header=True)
|
| 40 |
+
return dataframe
|
| 41 |
+
except Exception as e:
|
| 42 |
+
raise NetworkSecurityException(e, sys)
|
| 43 |
+
|
| 44 |
+
def data_train_test_split(self,dataframe:pd.DataFrame):
|
| 45 |
+
try:
|
| 46 |
+
train_set, test_set = train_test_split(
|
| 47 |
+
dataframe, test_size=self.data_ingestion_config.train_test_split_ratio
|
| 48 |
+
)
|
| 49 |
+
logging.info("Trained test spltting done on dataframe")
|
| 50 |
+
dir_path = os.path.dirname(self.data_ingestion_config.train_file_path)
|
| 51 |
+
os.makedirs(dir_path, exist_ok=True)
|
| 52 |
+
logging.info("Exporting train and test file path")
|
| 53 |
+
train_set.to_csv(
|
| 54 |
+
self.data_ingestion_config.train_file_path, index = False, header = True
|
| 55 |
+
)
|
| 56 |
+
test_set.to_csv(
|
| 57 |
+
self.data_ingestion_config.test_file_path, index = False, header = True
|
| 58 |
+
)
|
| 59 |
+
logging.info("Exported train and test file path.")
|
| 60 |
+
|
| 61 |
+
except Exception as e:
|
| 62 |
+
raise NetworkSecurityException(e, sys)
|
| 63 |
+
|
| 64 |
+
def initiate_data_ingestion(self):
    """Run the full ingestion flow and return the resulting artifact.

    Steps: export all data from SQLite, write the feature store CSV, produce
    the train/test split, then flag the exported rows as consumed.
    """
    try:
        exported_df = self.export_collection_as_dataframe()
        exported_df = self.move_data_into_feature_store(exported_df)
        self.data_train_test_split(exported_df)

        # Flag every exported row so incremental loads can skip it next time.
        self.db_manager.mark_data_as_trained()

        return DataIngestionArtifact(
            train_file_path=self.data_ingestion_config.train_file_path,
            test_file_path=self.data_ingestion_config.test_file_path,
        )
    except Exception as e:
        raise NetworkSecurityException(e, sys)
|
| 80 |
+
|
src/components/data_transformation.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.impute import KNNImputer
|
| 6 |
+
from sklearn.pipeline import Pipeline
|
| 7 |
+
from src.constant.training_pipeline import TARGET_COLUMN
|
| 8 |
+
from src.constant.training_pipeline import DATA_TRANSFORMATION_IMPUTER_PARAMS
|
| 9 |
+
from src.entity.artifact_entity import (
|
| 10 |
+
DataTransformationArtifact,
|
| 11 |
+
DataValidationArtifact,
|
| 12 |
+
)
|
| 13 |
+
from src.exception.exception import NetworkSecurityException
|
| 14 |
+
from src.logging.logger import logging
|
| 15 |
+
from src.utils.main_utils.utils import save_np_array, save_object
|
| 16 |
+
from src.entity.config_entity import Data_transformation_config
|
| 17 |
+
class DataTransformation:
    """Imputes missing values with KNN and emits train/test numpy arrays.

    Reads the validated train/test CSVs, fits a KNNImputer on the training
    split only, remaps target labels from -1 to 0, and saves the transformed
    arrays plus the fitted preprocessor object.
    """

    def __init__(self, data_validation_artifact: DataValidationArtifact, data_transformation_config: Data_transformation_config):
        """Store the validation artifact (inputs) and transformation config (outputs)."""
        try:
            self.data_validation_artifact: DataValidationArtifact = data_validation_artifact
            self.data_transformation_config: Data_transformation_config = data_transformation_config
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame."""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_data_transformer_object(self) -> Pipeline:
        """Build the preprocessing pipeline.

        Initialises a KNNImputer with the parameters declared in
        training_pipeline (DATA_TRANSFORMATION_IMPUTER_PARAMS) and wraps it
        as the only step of an sklearn Pipeline.

        Returns:
            Pipeline: an unfitted pipeline whose single step is the imputer.
        """
        # Fixed typos in the original log messages ("methof", "intialise").
        logging.info("Entered get_data_transformation_object method of transformation class")
        try:
            knn_imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info(f"initialised knn imputer with {DATA_TRANSFORMATION_IMPUTER_PARAMS}")
            return Pipeline(steps=[("imputer", knn_imputer)])
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the full transformation and return the artifact of output paths.

        Raises:
            NetworkSecurityException: On any read/fit/save failure.
        """
        try:
            logging.info("Started data transformation!")
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            # Split features from target; remap the -1 label to 0 so the
            # target becomes a {0, 1} binary column.
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_train_df = train_df[TARGET_COLUMN].replace(-1, 0)

            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_test_df = test_df[TARGET_COLUMN].replace(-1, 0)

            # Fit the imputer on the training features only, then apply it to
            # both splits to avoid leaking test statistics into training.
            preprocessor = self.get_data_transformer_object()
            preprocessor_obj = preprocessor.fit(input_feature_train_df)
            logging.info("Preprocessor object created and fitted on training data")

            transformed_input_train_feature = preprocessor_obj.transform(input_feature_train_df)
            transformed_input_test_feature = preprocessor_obj.transform(input_feature_test_df)

            # Append the target as the last column of each array; the model
            # trainer slices it back off with array[:, -1].
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            # Persist arrays and the fitted preprocessor (artifact dir plus
            # the stable final_model/ copy used for serving).
            save_np_array(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_np_array(self.data_transformation_config.transformed_test_file_path, array=test_arr)
            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor_obj)
            save_object("final_model/preprocessor.pkl", preprocessor_obj)

            # Renamed from the original PascalCase local, which shadowed the
            # DataTransformationArtifact class name pattern.
            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )
            return data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
|
| 93 |
+
|
src/components/data_validation.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.entity.artifact_entity import DataIngestionArtifact,DataValidationArtifact
|
| 2 |
+
from src.entity.config_entity import Data_validation_config
|
| 3 |
+
from src.exception.exception import NetworkSecurityException
|
| 4 |
+
from src.constant.training_pipeline import SCHEMA_FILE_PATH
|
| 5 |
+
from src.logging.logger import logging
|
| 6 |
+
from scipy.stats import ks_2samp
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import os, sys
|
| 9 |
+
from src.utils.main_utils.utils import read_yaml_file, write_yaml_file
|
| 10 |
+
|
| 11 |
+
class DataValidation:
    """Validates ingested train/test CSVs against the schema and checks drift."""

    def __init__(self, data_ingestion_artifact: DataIngestionArtifact, data_validation_config: Data_validation_config):
        """Load the column schema and store input/output locations."""
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_validation_config = data_validation_config
            self._schema_config = read_yaml_file(file_path=SCHEMA_FILE_PATH)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame."""
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
        """Return True when the dataframe has exactly as many columns as the schema."""
        try:
            number_of_columns = len(self._schema_config)
            logging.info(f"Required number of columns: {number_of_columns}")
            logging.info(f"Data frame has columns: {len(dataframe.columns)}")
            return len(dataframe.columns) == number_of_columns
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def detect_drift(self, base_df, current_df, threshold=0.05) -> bool:
        """Run a per-column Kolmogorov-Smirnov test between two dataframes.

        Writes a YAML drift report and returns False as soon as any column's
        p-value falls below `threshold` (drift detected), True otherwise.
        """
        try:
            status = True
            report = {}
            for column in base_df:
                d1 = base_df[column]
                d2 = current_df[column]
                is_sample_dist = ks_2samp(d1, d2)
                # Small p-value => distributions differ => drift found.
                if threshold <= is_sample_dist.pvalue:
                    is_found = False
                else:
                    is_found = True
                    status = False
                report.update({column: {
                    "p_value": float(is_sample_dist.pvalue),
                    "drift_status": is_found
                }})
            drift_report_file_path = self.data_validation_config.drift_report_file_path
            os.makedirs(os.path.dirname(drift_report_file_path), exist_ok=True)
            write_yaml_file(file_path=drift_report_file_path, content=report)
            return status
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    # NOTE: the method name keeps its original (misspelled) form because the
    # pipeline invokes `intiate_data_validation`; renaming would break callers.
    def intiate_data_validation(self) -> DataValidationArtifact:
        """Validate column counts, check drift, and write validated CSVs.

        Returns:
            DataValidationArtifact: drift status plus the validated file paths.
        """
        try:
            train_file_path = self.data_ingestion_artifact.train_file_path
            test_file_path = self.data_ingestion_artifact.test_file_path

            train_df = DataValidation.read_data(train_file_path)
            test_df = DataValidation.read_data(test_file_path)

            # BUG FIX: the original built error messages here but never used
            # them, so schema mismatches were silently dropped. Log them.
            if not self.validate_number_of_columns(dataframe=train_df):
                logging.error(f"{train_file_path} does not match schema")
            if not self.validate_number_of_columns(dataframe=test_df):
                logging.error(f"{test_file_path} does not match schema")

            # Drift between the two splits determines the reported status.
            status = self.detect_drift(base_df=train_df, current_df=test_df)

            dir_path = os.path.dirname(self.data_validation_config.valid_train_file_path)
            os.makedirs(dir_path, exist_ok=True)

            train_df.to_csv(self.data_validation_config.valid_train_file_path, index=False, header=True)
            test_df.to_csv(self.data_validation_config.valid_test_file_path, index=False, header=True)

            # BUG FIX: the artifact previously pointed at the raw ingestion
            # CSVs even though the validated copies were written above.
            data_validation_artifacts = DataValidationArtifact(
                validation_status=status,
                valid_train_file_path=self.data_validation_config.valid_train_file_path,
                valid_test_file_path=self.data_validation_config.valid_test_file_path,
                invalid_train_file_path=None,
                invalid_test_file_path=None,
                drift_report_file_path=self.data_validation_config.drift_report_file_path
            )
            return data_validation_artifacts

        except Exception as e:
            raise NetworkSecurityException(e, sys)
|
src/components/model_trainer.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, sys
|
| 2 |
+
from src.utils.ml_utils.model.estimator import NetworkSecurityModel
|
| 3 |
+
from src.exception.exception import NetworkSecurityException
|
| 4 |
+
from src.logging.logger import logging
|
| 5 |
+
from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact
|
| 6 |
+
from src.entity.config_entity import Model_trainer_config
|
| 7 |
+
from src.utils.main_utils.utils import save_object, load_object
|
| 8 |
+
from src.utils.main_utils.utils import load_numpy_array_data, evaluate_models
|
| 9 |
+
from src.utils.ml_utils.metric.classification_metric import classification_score
|
| 10 |
+
from sklearn.neighbors import KNeighborsClassifier
|
| 11 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 12 |
+
from sklearn.ensemble import (
|
| 13 |
+
RandomForestClassifier,
|
| 14 |
+
AdaBoostClassifier,
|
| 15 |
+
GradientBoostingClassifier,
|
| 16 |
+
)
|
| 17 |
+
from sklearn.linear_model import LogisticRegression
|
| 18 |
+
from sklearn.metrics import r2_score
|
| 19 |
+
import mlflow
|
| 20 |
+
import dagshub
|
| 21 |
+
# NOTE(review): runs at import time — connects MLflow tracking to the DagsHub
# repo for every importer of this module, and requires network access; confirm
# this side effect is intended outside training runs.
dagshub.init(repo_owner='kshitijk146', repo_name='MLOPS_project_network_Security_system', mlflow=True)
|
| 22 |
+
class ModelTrainer:
    """Selects and trains the best classifier over a fixed candidate grid.

    Evaluates several sklearn classifiers via `evaluate_models`, keeps the
    highest-scoring one, logs its metrics to MLflow/DagsHub, and persists the
    model (bundled with its preprocessor) for serving.
    """

    def __init__(self, model_trainer_config: Model_trainer_config, data_transformation_artifact: DataTransformationArtifact):
        """Store the trainer config (output paths) and transformed-data artifact (input paths)."""
        try:
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def track_mlflow(self, best_model, classificationmetric):
        """Log f1/precision/recall and the fitted model to the active MLflow run."""
        with mlflow.start_run():
            f1_score = classificationmetric.f1_score
            precision_score = classificationmetric.precision_score
            recall_score = classificationmetric.recall_score

            mlflow.log_metric("f1_score", f1_score)
            mlflow.log_metric("precision_score", precision_score)
            mlflow.log_metric("recall_score", recall_score)
            mlflow.sklearn.log_model(best_model, "model")

    def train_model(self, x_train, y_train, x_test, y_test):
        """Fit every candidate model, keep the best, log metrics, save artifacts.

        Returns:
            ModelTrainerArtifact: saved-model path and train/test metrics.
        """
        models = {
            "KNN": KNeighborsClassifier(),
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(verbose=True),
            "AdaBoost": AdaBoostClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(verbose=True),
            "logistic regression": LogisticRegression(verbose=True)
        }
        params = {
            "KNN": {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'metric': ['euclidean']
            },
            "Decision Tree": {
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 5, 10],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            },
            "Random Forest": {
                'n_estimators': [50, 100],
                'max_depth': [None, 5],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
                'max_features': ['sqrt']
            },
            "AdaBoost": {
                'n_estimators': [50, 100],
                'learning_rate': [0.1, 1.0],
                # 'algorithm': ['SAMME.R']
            },
            "Gradient Boosting": {
                'n_estimators': [50, 100],
                'learning_rate': [0.1],
                'max_depth': [3, 5],
                'min_samples_split': [2],
                'min_samples_leaf': [1],
                'max_features': ['sqrt']
            },
            "logistic regression": {
                'C': [1.0, 10.0],
                'penalty': ['l2'],
                'solver': ['liblinear']
            }
        }

        model_report: dict = evaluate_models(
            x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test, models=models, params=params)

        # Best score from the report, and the model name that produced it.
        best_model_score = max(sorted(model_report.values()))
        best_model_name = list(model_report.keys())[
            list(model_report.values()).index(best_model_score)
        ]
        logging.info(f"best model name: {best_model_name}")
        # NOTE(review): assumes evaluate_models fits each entry of `models`
        # in place, so this instance is already trained — confirm.
        best_model = models[best_model_name]
        y_train_pred = best_model.predict(x_train)
        classification_train_metric = classification_score(y_true=y_train, y_pred=y_train_pred)

        # Record the training-set metrics in MLflow.
        self.track_mlflow(best_model, classification_train_metric)

        y_test_pred = best_model.predict(x_test)
        classification_test_metric = classification_score(y_true=y_test, y_pred=y_test_pred)

        # Bundle the fitted preprocessor with the model so serving needs only
        # one object; also keep a plain copy under final_model/.
        preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)
        model_dir_path = os.path.dirname(self.model_trainer_config.trained_model_file_path)
        os.makedirs(model_dir_path, exist_ok=True)

        network_model = NetworkSecurityModel(preprocessing_object=preprocessor, trained_model_object=best_model)
        save_object(self.model_trainer_config.trained_model_file_path, obj=network_model)
        save_object("final_model/model.pkl", best_model)

        model_trainer_artifact = ModelTrainerArtifact(
            trained_model_file_path=self.model_trainer_config.trained_model_file_path,
            train_metric_artifact=classification_train_metric,
            test_metric_artifact=classification_test_metric,
        )
        logging.info(f"Model trainer artifact: {model_trainer_artifact}")
        return model_trainer_artifact

    def initiate_model_trainer(self) -> ModelTrainerArtifact:
        """Load the transformed arrays, train, and return the trainer artifact."""
        try:
            train_file_path = self.data_transformation_artifact.transformed_train_file_path
            test_file_path = self.data_transformation_artifact.transformed_test_file_path

            train_array = load_numpy_array_data(train_file_path)
            test_array = load_numpy_array_data(test_file_path)
            # Last column of each array is the target label.
            x_train, y_train, x_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            # BUG FIX: the original assigned train_model's result to a local
            # and fell off the end, so this method always returned None
            # despite its `-> ModelTrainerArtifact` annotation.
            return self.train_model(x_train, y_train, x_test=x_test, y_test=y_test)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
|
src/constant/__init__.py
ADDED
|
File without changes
|
src/constant/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (167 Bytes). View file
|
|
|
src/constant/training_pipeline/__init__.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import sys
import numpy as np

"""
Common constant variable
"""
# Label column of the phishing dataset (values -1/1; remapped to 0/1 downstream).
TARGET_COLUMN = "Result"
PIPELINE_NAME: str= "NetworkSecurity"
# Every run writes its outputs under Artifacts/<timestamp>/.
ARTIFACT_DIR: str = "Artifacts"
FILE_NAME: str = "phisingData.csv"

TRAIN_FILE_NAME: str = "train.csv"
TEST_FILE_NAME: str = "test.csv"
SCHEMA_FILE_PATH = os.path.join("data_schema", "schema.yaml")

SAVED_MODEL_DIR = os.path.join("saved_models")
MODEL_FILE_NAME = "model.pkl"
"""
Data ingestion variable
"""
# NOTE: several names below are misspelled ("phising", "DATBASE_NANE",
# "RATION") but are referenced by config_entity.py — renaming would break
# callers, so they are kept as-is and only documented here.
DATA_INGESTION_COLLECTION_NAME: str= "phising_data"
DATA_INGESTION_DATBASE_NANE: str= "Network_data"
DATA_INGESTION_DIR_NAME:str = "data_ingestion"
DATA_INGESTION_FEATURE_STORE_DIR: str = "feature_store"
DATA_INGESTION_INGESTED_DIR: str = "ingested"
# Fraction of rows held out for the test split.
DATA_INGESTION_TRAIN_TEST_SPLIT_RATION: float = 0.2


"""
Data validation realated constant start with DATA_VALIDATION VAR NAME

"""
# NOTE: "DIR_NAMR" misspelling kept — referenced by Data_validation_config.
DATA_VALIDATION_DIR_NAMR:str = "data_validation"
DATA_VALIDATION_VALID_DIR: str = "validated"
DATA_VALIDATION_INVALID_DIR: str = "invalid"
DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"


"""
Data transformation realated constant start with DATA_TRANSFORMATION VAR NAME
"""
DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
DATA_TRANSFORMATION_TRANSFORMED_DIR_NAME: str = "transformed"
DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR:str = "transformed_object"
PREPROCESSING_OBJECT_FILE_NAME:str = "preprocessing.pkl"
# using knn imputer
# Keyword arguments passed straight to sklearn's KNNImputer.
DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {
    "missing_values": np.nan,
    "n_neighbors" : 3,
    "weights" : "uniform"
}

"""
Model trainer realated constant start with DATA_TRANSFORMATION VAR NAME
"""
MODEL_TRAINER_DIR_NAME: str = "model_trainer"
MODEL_TRAINER_MODEL_DIR:str = "trained_model"
MODEL_TRAINER_MODEL_NAME:str = "model.pkl"
MODEL_TRAINER_EXPECTED_SCORE: float = 0.6
MODEL_TRAINER_OVERFITTING_UNDERFITTING_THRESHOLD: float = 0.05

# S3 bucket name used by the cloud sync step.
TRAINING_BUCKET_NAME = "networksecuritymodelbucket"
|
src/constant/training_pipeline/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (2.18 kB). View file
|
|
|
src/data/__init__.py
ADDED
|
File without changes
|
src/data/sqlite_manager.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import os
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from src.exception.exception import NetworkSecurityException
|
| 6 |
+
from src.logging.logger import logging
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
class PhishingDataManager:
    """SQLite-backed store for phishing samples and training-run metadata.

    Tracks which rows have already been consumed by training
    (`used_in_training`) so the pipeline can decide when enough new data has
    arrived to retrain. Usable as a context manager (closes the connection).
    """

    def __init__(self, db_path="data/phishing_data.db"):
        """Open (or create) the SQLite database and ensure tables exist.

        Args:
            db_path: Database file location; parent directories are created.

        Raises:
            NetworkSecurityException: If the database cannot be opened.
        """
        try:
            self.db_path = db_path
            # BUG FIX: the original called makedirs(dirname(db_path))
            # unconditionally, which raises when db_path has no directory
            # component (dirname == "").
            parent_dir = os.path.dirname(db_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            # check_same_thread=False: presumably the connection is shared
            # across app worker threads — confirm callers serialize writes.
            self.conn = sqlite3.connect(db_path, check_same_thread=False)
            self._create_tables()
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def __enter__(self):
        """Support `with PhishingDataManager() as mgr:` usage."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the connection when leaving a `with` block."""
        self.close()
        return False

    def _create_tables(self):
        """Create the phishing-data table and the training-metadata table."""
        try:
            cursor = self.conn.cursor()

            # Main data table: one INTEGER column per dataset feature, plus
            # bookkeeping columns (created_at, used_in_training).
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS phishing_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    having_IP_Address INTEGER,
                    URL_Length INTEGER,
                    Shortining_Service INTEGER,
                    having_At_Symbol INTEGER,
                    double_slash_redirecting INTEGER,
                    Prefix_Suffix INTEGER,
                    having_Sub_Domain INTEGER,
                    SSLfinal_State INTEGER,
                    Domain_registeration_length INTEGER,
                    Favicon INTEGER,
                    port INTEGER,
                    HTTPS_token INTEGER,
                    Request_URL INTEGER,
                    URL_of_Anchor INTEGER,
                    Links_in_tags INTEGER,
                    SFH INTEGER,
                    Submitting_to_email INTEGER,
                    Abnormal_URL INTEGER,
                    Redirect INTEGER,
                    on_mouseover INTEGER,
                    RightClick INTEGER,
                    popUpWidnow INTEGER,
                    Iframe INTEGER,
                    age_of_domain INTEGER,
                    DNSRecord INTEGER,
                    web_traffic INTEGER,
                    Page_Rank INTEGER,
                    Google_Index INTEGER,
                    Links_pointing_to_page INTEGER,
                    Statistical_report INTEGER,
                    Result INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    used_in_training BOOLEAN DEFAULT 0
                )
            """)

            # Training metadata table: one row per completed training run.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS training_metadata (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    training_timestamp TIMESTAMP,
                    data_count INTEGER,
                    model_accuracy REAL,
                    model_version TEXT,
                    artifact_path TEXT
                )
            """)

            self.conn.commit()
            logging.info("Database tables created successfully")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def insert_data_from_csv(self, csv_path):
        """Bulk insert from CSV (initial load).

        Returns:
            int: number of rows inserted.
        """
        try:
            df = pd.read_csv(csv_path)
            df.replace({"na": None}, inplace=True)

            # NOTE: rows are appended as-is; no duplicate detection is
            # performed (the original comment claimed otherwise).
            df.to_sql('phishing_data', self.conn, if_exists='append', index=False)
            logging.info(f"Inserted {len(df)} records from CSV")
            return len(df)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def add_new_samples(self, data_dict_list):
        """Append new phishing samples (list of column->value dicts).

        Returns:
            int: number of rows inserted.
        """
        try:
            df = pd.DataFrame(data_dict_list)
            df.to_sql('phishing_data', self.conn, if_exists='append', index=False)
            logging.info(f"Added {len(df)} new samples")
            return len(df)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_training_data(self, include_new_only=False):
        """Fetch data for training as a DataFrame, minus bookkeeping columns.

        Args:
            include_new_only: If True, return only rows not yet used in
                training; otherwise return every row.
        """
        try:
            if include_new_only:
                query = "SELECT * FROM phishing_data WHERE used_in_training = 0"
            else:
                query = "SELECT * FROM phishing_data"

            df = pd.read_sql_query(query, self.conn)

            # Bookkeeping columns are not model features.
            df = df.drop(['id', 'created_at', 'used_in_training'], axis=1, errors='ignore')

            logging.info(f"Fetched {len(df)} records for training")
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def mark_data_as_trained(self):
        """Flag every currently-untrained row as used in training."""
        try:
            cursor = self.conn.cursor()
            cursor.execute("UPDATE phishing_data SET used_in_training = 1 WHERE used_in_training = 0")
            self.conn.commit()
            logging.info("Marked data as trained")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_new_data_count(self):
        """Return the number of rows not yet used in training."""
        try:
            cursor = self.conn.cursor()
            result = cursor.execute("SELECT COUNT(*) FROM phishing_data WHERE used_in_training = 0").fetchone()
            return result[0]
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def log_training_run(self, data_count, accuracy, version, artifact_path):
        """Record one training run's metadata."""
        try:
            cursor = self.conn.cursor()
            cursor.execute("""
                INSERT INTO training_metadata (training_timestamp, data_count, model_accuracy, model_version, artifact_path)
                VALUES (?, ?, ?, ?, ?)
            """, (datetime.now(), data_count, accuracy, version, artifact_path))
            self.conn.commit()
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def should_retrain(self, threshold=100):
        """Return True when at least `threshold` untrained rows have accumulated."""
        new_count = self.get_new_data_count()
        return new_count >= threshold

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()
|
src/entity/__init__.py
ADDED
|
File without changes
|
src/entity/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (165 Bytes). View file
|
|
|
src/entity/__pycache__/artifact_entity.cpython-310.pyc
ADDED
|
Binary file (1.56 kB). View file
|
|
|
src/entity/__pycache__/config_entity.cpython-310.pyc
ADDED
|
Binary file (3.71 kB). View file
|
|
|
src/entity/artifact_entity.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
@dataclass
class DataIngestionArtifact:
    """Output of the ingestion stage: locations of the split CSVs."""
    train_file_path: str
    test_file_path: str
|
| 7 |
+
|
| 8 |
+
@dataclass
class DataValidationArtifact:
    """Output of the validation stage: drift status and validated/invalid paths."""
    validation_status: bool
    valid_train_file_path: str
    valid_test_file_path: str
    # Invalid paths may be None when no rows were rejected.
    invalid_train_file_path: str
    invalid_test_file_path: str
    drift_report_file_path: str
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass
class DataTransformationArtifact:
    """Output of the transformation stage: preprocessor and numpy array paths."""
    transformed_object_file_path: str
    transformed_train_file_path: str
    transformed_test_file_path: str
|
| 23 |
+
|
| 24 |
+
@dataclass
class ClassificationMetricArtifact:
    """Classification metrics for one dataset split."""
    f1_score: float
    precision_score: float
    recall_score: float
|
| 29 |
+
|
| 30 |
+
@dataclass
class ModelTrainerArtifact:
    """Output of the model-trainer stage: saved model plus split metrics."""
    trained_model_file_path: str
    train_metric_artifact: ClassificationMetricArtifact
    test_metric_artifact: ClassificationMetricArtifact
|
src/entity/config_entity.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
import os
from src.constant import training_pipeline


class TrainingPipelineConfig:
    """Top-level configuration shared by every pipeline stage.

    Derives a per-run artifact directory named after the run timestamp,
    e.g. ``Artifacts/06_01_2025_12_30_00``.
    """

    def __init__(self, timestamp=None):
        # BUG FIX: the original signature was ``timestamp=datetime.now()``.
        # Default arguments are evaluated once at definition time, so every
        # pipeline constructed without an explicit timestamp shared the same
        # stamp (and therefore the same artifact directory).  A ``None``
        # sentinel defers the clock read to construction time.
        if timestamp is None:
            timestamp = datetime.now()
        stamp = timestamp.strftime("%m_%d_%Y_%H_%M_%S")
        self.pipeline_name = training_pipeline.PIPELINE_NAME
        self.artifact_name = training_pipeline.ARTIFACT_DIR
        # Every stage nests its outputs under this run-specific directory.
        self.artifact_dir = os.path.join(self.artifact_name, stamp)
        self.model_dir = os.path.join("final_model")
        self.timestamp: str = stamp
|
| 14 |
+
|
| 15 |
+
class Data_ingestion_config:
    """Paths and parameters for the data-ingestion stage.

    Everything lives under the run's artifact directory so that each
    pipeline run is fully isolated from previous runs.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        ingestion_root = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_INGESTION_DIR_NAME,
        )
        self.data_ingestion_dir = ingestion_root
        # Raw dump of everything pulled from the source collection.
        self.feature_store_file_path = os.path.join(
            ingestion_root,
            training_pipeline.DATA_INGESTION_FEATURE_STORE_DIR,
            training_pipeline.FILE_NAME,
        )
        self.train_file_path = os.path.join(
            ingestion_root,
            training_pipeline.DATA_INGESTION_INGESTED_DIR,
            training_pipeline.TRAIN_FILE_NAME,
        )
        self.test_file_path = os.path.join(
            ingestion_root,
            training_pipeline.DATA_INGESTION_INGESTED_DIR,
            training_pipeline.TEST_FILE_NAME,
        )
        # NOTE(review): the constant names below carry typos
        # ("DATBASE_NANE", "...RATION") that originate in
        # src.constant.training_pipeline and must be referenced as-is.
        self.database_name = training_pipeline.DATA_INGESTION_DATBASE_NANE
        self.collection_name = training_pipeline.DATA_INGESTION_COLLECTION_NAME
        self.train_test_split_ratio = training_pipeline.DATA_INGESTION_TRAIN_TEST_SPLIT_RATION
|
| 24 |
+
|
| 25 |
+
class Data_validation_config:
    """Paths for the data-validation stage.

    Validation sorts the ingested splits into ``validated``/``invalid``
    directories and writes a drift report alongside them.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        # NOTE(review): "DIR_NAMR" is a typo defined in
        # src.constant.training_pipeline; it must be referenced as-is.
        validation_root = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_VALIDATION_DIR_NAMR,
        )
        self.data_validation_dir: str = validation_root
        self.valid_data_dir: str = os.path.join(
            validation_root, training_pipeline.DATA_VALIDATION_VALID_DIR
        )
        self.invalid_data_dir: str = os.path.join(
            validation_root, training_pipeline.DATA_VALIDATION_INVALID_DIR
        )
        self.valid_train_file_path: str = os.path.join(
            self.valid_data_dir, training_pipeline.TRAIN_FILE_NAME
        )
        self.valid_test_file_path: str = os.path.join(
            self.valid_data_dir, training_pipeline.TEST_FILE_NAME
        )
        self.invalid_train_file_path: str = os.path.join(
            self.invalid_data_dir, training_pipeline.TRAIN_FILE_NAME
        )
        self.invalid_test_file_path: str = os.path.join(
            self.invalid_data_dir, training_pipeline.TEST_FILE_NAME
        )
        self.drift_report_file_path: str = os.path.join(
            validation_root,
            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
            training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
        )
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class Data_transformation_config:
    """Paths for the data-transformation stage.

    Transformed splits are stored as ``.npy`` arrays; the fitted
    preprocessing object is serialized next to them.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        transformation_root = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_TRANSFORMATION_DIR_NAME,
        )
        self.data_transformation_dir: str = transformation_root

        transformed_dir = os.path.join(
            transformation_root,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DIR_NAME,
        )
        # The csv file names are reused with their extension swapped to npy.
        self.transformed_train_file_path: str = os.path.join(
            transformed_dir,
            training_pipeline.TRAIN_FILE_NAME.replace("csv", "npy"),
        )
        self.transformed_test_file_path: str = os.path.join(
            transformed_dir,
            training_pipeline.TEST_FILE_NAME.replace("csv", "npy"),
        )
        self.transformed_object_file_path: str = os.path.join(
            transformation_root,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR,
            training_pipeline.PREPROCESSING_OBJECT_FILE_NAME,
        )
|
| 49 |
+
|
| 50 |
+
class Model_trainer_config:
    """Paths and thresholds for the model-training stage."""

    def __init__(self, training_pipeline_config: TrainingPipelineConfig):
        trainer_root = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.MODEL_TRAINER_DIR_NAME,
        )
        self.model_trainer_dir: str = trainer_root
        self.trained_model_file_path: str = os.path.join(
            trainer_root,
            training_pipeline.MODEL_TRAINER_MODEL_DIR,
            training_pipeline.MODEL_TRAINER_MODEL_NAME,
        )
        # Minimum score a candidate model must reach to be accepted.
        self.expected_accuracy: float = training_pipeline.MODEL_TRAINER_EXPECTED_SCORE
        # Maximum allowed gap between train and test scores.
        self.overfitting_underfitting_threshold = (
            training_pipeline.MODEL_TRAINER_OVERFITTING_UNDERFITTING_THRESHOLD
        )
|
| 60 |
+
|
src/exception/__init__.py
ADDED
|
File without changes
|
src/exception/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (168 Bytes). View file
|
|
|
src/exception/__pycache__/exception.cpython-310.pyc
ADDED
|
Binary file (977 Bytes). View file
|
|
|
src/exception/exception.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from src.logging import logger
|
| 3 |
+
|
| 4 |
+
class NetworkSecurityException(Exception):
    """Project-wide exception that records where the original error occurred.

    Parameters
    ----------
    error_message : the message (or original exception) being wrapped.
    error_details : the ``sys`` module, used to pull the active traceback
        via ``sys.exc_info()`` (kept for backward compatibility with callers
        that pass ``sys``).
    """

    def __init__(self, error_message, error_details: sys):
        # Let the base Exception carry the message so str()/args behave
        # normally for generic handlers (the original skipped this call).
        super().__init__(error_message)
        self.error_message = error_message
        _, _, exc_tb = error_details.exc_info()
        # BUG FIX: sys.exc_info() returns (None, None, None) when no
        # exception is being handled, and the original then crashed with
        # AttributeError on ``exc_tb.tb_lineno``.  Fall back to placeholders.
        if exc_tb is not None:
            self.line_number = exc_tb.tb_lineno
            self.file_name = exc_tb.tb_frame.f_code.co_filename
        else:
            self.line_number = None
            self.file_name = "<unknown>"

    def __str__(self):
        return f"Error occurred in python script name [{self.file_name}] line number [{self.line_number}] error message [{self.error_message}]"
|
src/logging/__init__.py
ADDED
|
File without changes
|
src/logging/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (166 Bytes). View file
|
|
|
src/logging/__pycache__/logger.cpython-310.pyc
ADDED
|
Binary file (568 Bytes). View file
|
|
|
src/logging/logger.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import os
from datetime import datetime

# Timestamped file name for this run's log, e.g. "06_01_2025_12_30_00.log".
LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# BUG FIX: the original joined LOG_FILE into logs_path and then ran
# os.makedirs on it, creating a *directory* named "<timestamp>.log" and
# writing the log to "logs/<ts>.log/<ts>.log".  Create only the "logs"
# directory and place the file directly inside it.
logs_path = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_path, exist_ok=True)
LOG_FILE_PATH = os.path.join(logs_path, LOG_FILE)

logging.basicConfig(
    filename=LOG_FILE_PATH,
    format="[%(asctime)s] %(lineno)d %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|