import os
import sys
from typing import List

import numpy as np
import pandas as pd
import pymongo
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

from networksecurity.entity.artifact_entity import DataIngestionArtifact
# Configuration of the data ingestion step
from networksecurity.entity.config_entity import DataIngestionConfig
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging

# Load variables from a local .env file (if any) before reading MONGO_DB_URL.
load_dotenv()

MONGO_DB_URL = os.getenv("MONGO_DB_URL")


class DataIngestion:
    """Pull the raw dataset, persist it, and split it into train/test CSVs.

    The dataset is read from MongoDB; if that fails for any reason the
    bundled ``Network_data/phisingData.csv`` file is used instead.
    """

    def __init__(self, data_ingestion_config: DataIngestionConfig):
        """Store the ingestion configuration.

        Args:
            data_ingestion_config: Object exposing ``database_name``,
                ``collection_name``, ``feature_store_file_path``,
                ``training_file_path``, ``testing_file_path`` and
                ``train_test_split_ratio``.
        """
        try:
            self.data_ingestion_config = data_ingestion_config
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def _read_from_mongodb(self) -> pd.DataFrame:
        """Return the configured MongoDB collection as a DataFrame."""
        database_name = self.data_ingestion_config.database_name
        collection_name = self.data_ingestion_config.collection_name

        client = pymongo.MongoClient(MONGO_DB_URL)
        # The attribute is still assigned so existing readers of
        # ``self.mongo_client`` keep working; the client itself is closed
        # once the documents have been fetched.
        self.mongo_client = client
        try:
            collection = client[database_name][collection_name]
            df = pd.DataFrame(list(collection.find()))
        finally:
            client.close()

        # MongoDB's internal primary key is not a feature.
        if "_id" in df.columns:
            df = df.drop(columns=["_id"])
        return df.replace({"na": np.nan})

    def _read_from_sample_csv(self) -> pd.DataFrame:
        """Return the bundled sample CSV as a DataFrame."""
        csv_path = None
        try:
            # This file lives in networksecurity/components/, so the project
            # root is two directories above the directory containing it.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            root_dir = os.path.dirname(os.path.dirname(current_dir))
            csv_path = os.path.join(root_dir, "Network_data", "phisingData.csv")

            logging.info(
                f"MongoDB unavailable. Attempting to load CSV from: {csv_path}"
            )
            df = pd.read_csv(csv_path)
            logging.info(f" Loaded {len(df)} rows from CSV")
            # Apply the same "na" normalisation as the MongoDB path so that
            # downstream steps see identical data regardless of the source.
            return df.replace({"na": np.nan})
        except FileNotFoundError as e:
            raise NetworkSecurityException(
                f"Sample CSV not found at {csv_path}. CWD: {os.getcwd()}", sys
            ) from e
        except Exception as e:
            # Any other read/parse failure is wrapped like everywhere else
            # in this class.
            raise NetworkSecurityException(e, sys) from e

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Read the dataset from MongoDB, falling back to the sample CSV.

        Returns:
            The dataset with the ``_id`` column removed (MongoDB path) and
            the string ``"na"`` replaced by ``NaN``.

        Raises:
            NetworkSecurityException: If MongoDB is unavailable and the
                sample CSV cannot be read either.
        """
        try:
            return self._read_from_mongodb()
        except Exception as e:
            # Deliberate best-effort: any MongoDB failure triggers the
            # CSV fallback instead of aborting the pipeline.
            logging.info(f"MongoDB unavailable, using sample CSV: {str(e)}")
        return self._read_from_sample_csv()

    def export_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Write ``dataframe`` to the feature-store CSV and return it unchanged.

        Raises:
            NetworkSecurityException: If the directory or file cannot be
                written.
        """
        try:
            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            # Create the containing folder if it does not exist yet.
            dir_path = os.path.dirname(feature_store_file_path)
            os.makedirs(dir_path, exist_ok=True)
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def split_data_as_train_test(self, dataframe: pd.DataFrame) -> None:
        """Split ``dataframe`` and write the train and test CSV files.

        The test fraction is ``train_test_split_ratio`` from the config.

        Raises:
            NetworkSecurityException: If splitting or writing fails.
        """
        try:
            train_set, test_set = train_test_split(
                dataframe,
                test_size=self.data_ingestion_config.train_test_split_ratio,
            )
            logging.info("Performed train test split on the dataframe")

            training_file_path = self.data_ingestion_config.training_file_path
            testing_file_path = self.data_ingestion_config.testing_file_path
            # The two files are not guaranteed to share a directory, so make
            # sure both parents exist before writing.
            for file_path in (training_file_path, testing_file_path):
                os.makedirs(os.path.dirname(file_path), exist_ok=True)

            logging.info("Exporting train and test file path")
            train_set.to_csv(training_file_path, index=False, header=True)
            test_set.to_csv(testing_file_path, index=False, header=True)
            logging.info("Exported train and test file path")

            logging.info(
                "Exited split_data_as_train_test method of Data_Ingestion class"
            )
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self) -> DataIngestionArtifact:
        """Run the full ingestion step: load, persist, split.

        Returns:
            A ``DataIngestionArtifact`` holding the train and test file paths.

        Raises:
            NetworkSecurityException: If any stage fails.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.export_data_into_feature_store(dataframe)
            self.split_data_as_train_test(dataframe)
            dataingestionartifact = DataIngestionArtifact(
                training_file_path=self.data_ingestion_config.training_file_path,
                testing_file_path=self.data_ingestion_config.testing_file_path,
            )
            return dataingestionartifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e