# Inder-26
# Fix data ingestion path, update README images, and enable reload
# 2d7183c
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
import os
import sys
import numpy as np
import pandas as pd
from typing import List
import pymongo
from sklearn.model_selection import train_test_split
## Configuration of the data ingestion config
from networksecurity.entity.config_entity import DataIngestionConfig
from networksecurity.entity.artifact_entity import DataIngestionArtifact
from dotenv import load_dotenv
# Load variables via python-dotenv before reading the environment, then pick
# up the MongoDB connection string (None when the variable is not set).
load_dotenv()
MONGO_DB_URL = os.environ.get("MONGO_DB_URL")
class DataIngestion:
    """Fetch the raw dataset, persist it, and split it into train/test CSVs.

    The configuration object passed to the constructor must expose the
    attributes read by the methods below: ``database_name``,
    ``collection_name``, ``feature_store_file_path``,
    ``train_test_split_ratio``, ``training_file_path`` and
    ``testing_file_path``.
    """

    def __init__(self, data_ingestion_config: "DataIngestionConfig"):
        """Keep a reference to the ingestion configuration."""
        self.data_ingestion_config = data_ingestion_config

    @staticmethod
    def _ensure_parent_dir(file_path: str) -> None:
        """Create the directory that will contain ``file_path`` if needed."""
        dir_path = os.path.dirname(file_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when there is one.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

    def _read_from_mongodb(self) -> pd.DataFrame:
        """Load the configured MongoDB collection into a DataFrame."""
        database_name = self.data_ingestion_config.database_name
        collection_name = self.data_ingestion_config.collection_name
        self.mongo_client = pymongo.MongoClient(MONGO_DB_URL)
        try:
            collection = self.mongo_client[database_name][collection_name]
            # Materialise the cursor before the client is closed.
            df = pd.DataFrame(list(collection.find()))
        finally:
            # Release sockets and monitor threads even when the query fails.
            self.mongo_client.close()
        if "_id" in df.columns:
            df = df.drop(columns=["_id"])
        return df.replace({"na": np.nan})

    def _read_fallback_csv(self) -> pd.DataFrame:
        """Load ``Network_data/phisingData.csv`` located relative to this file."""
        # This module lives in networksecurity/components/, so the project
        # root is two directories above the directory containing this file.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(os.path.dirname(current_dir))
        csv_path = os.path.join(root_dir, "Network_data", "phisingData.csv")
        logging.info(f"MongoDB unavailable. Attempting to load CSV from: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
        except FileNotFoundError as e:
            raise NetworkSecurityException(
                f"Sample CSV not found at {csv_path}. CWD: {os.getcwd()}", sys
            ) from e
        except Exception as e:
            # Parse/permission errors are wrapped like every other failure.
            raise NetworkSecurityException(e, sys) from e
        logging.info(f" Loaded {len(df)} rows from CSV")
        # Apply the same "na" -> NaN normalisation as the MongoDB path so the
        # returned frame does not depend on which source was used.
        return df.replace({"na": np.nan})

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """
        Read data from mongodb OR fallback to CSV if MongoDB unavailable

        Returns:
            The dataset with the ``_id`` column removed (MongoDB path) and
            ``"na"`` values replaced by NaN.

        Raises:
            NetworkSecurityException: if the MongoDB read fails and the
                fallback CSV is missing or cannot be read.
        """
        try:
            return self._read_from_mongodb()
        except Exception as e:
            logging.info(f"MongoDB unavailable, using sample CSV: {str(e)}")
            return self._read_fallback_csv()

    def export_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Write ``dataframe`` to the configured feature-store CSV path.

        Args:
            dataframe: data to persist (written with header, without index).

        Returns:
            The same ``dataframe`` object that was passed in.

        Raises:
            NetworkSecurityException: wrapping any error raised while
                creating the directory or writing the file.
        """
        try:
            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            self._ensure_parent_dir(feature_store_file_path)
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def split_data_as_train_test(self, dataframe: pd.DataFrame):
        """Split ``dataframe`` and write the train and test CSV files.

        The test fraction is ``train_test_split_ratio`` from the config; the
        output locations are ``training_file_path`` and ``testing_file_path``.

        Raises:
            NetworkSecurityException: wrapping any error raised while
                splitting or writing.
        """
        try:
            config = self.data_ingestion_config
            train_set, test_set = train_test_split(
                dataframe, test_size=config.train_test_split_ratio
            )
            logging.info("Performed train test split on the dataframe")
            logging.info("Exited split_data_as_train_test method of Data_Ingestion class")
            # The two files may live in different directories; create both.
            self._ensure_parent_dir(config.training_file_path)
            self._ensure_parent_dir(config.testing_file_path)
            logging.info("Exporting train and test file path")
            train_set.to_csv(config.training_file_path, index=False, header=True)
            test_set.to_csv(config.testing_file_path, index=False, header=True)
            logging.info("Exported train and test file path")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self) -> "DataIngestionArtifact":
        """Run the full ingestion: load, store, split.

        Returns:
            A ``DataIngestionArtifact`` built from the configured training
            and testing file paths.

        Raises:
            NetworkSecurityException: wrapping any error from the steps above.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.export_data_into_feature_store(dataframe)
            self.split_data_as_train_test(dataframe)
            return DataIngestionArtifact(
                training_file_path=self.data_ingestion_config.training_file_path,
                testing_file_path=self.data_ingestion_config.testing_file_path,
            )
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e