# NOTE: residue from the Hugging Face Spaces file viewer (not part of the module):
#   Spaces: Runtime error
#   File size: 4,702 Bytes
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
import os
import sys
import numpy as np
import pandas as pd
from typing import List
import pymongo
from sklearn.model_selection import train_test_split
## Config and artifact entities for the data ingestion stage
from networksecurity.entity.config_entity import DataIngestionConfig
from networksecurity.entity.artifact_entity import DataIngestionArtifact
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment so the connection string below can be kept out of the code.
load_dotenv()
# MongoDB connection string; os.getenv returns None when MONGO_DB_URL is unset.
MONGO_DB_URL = os.getenv("MONGO_DB_URL")
class DataIngestion:
    """Load the raw dataset, persist it to the feature store and split it.

    The dataset is read from MongoDB; when that fails for any reason the
    bundled sample CSV is used instead. All settings are read from
    ``data_ingestion_config``, which must expose ``database_name``,
    ``collection_name``, ``feature_store_file_path``,
    ``train_test_split_ratio``, ``training_file_path`` and
    ``testing_file_path``.
    """

    def __init__(self, data_ingestion_config: "DataIngestionConfig"):
        """Store the ingestion configuration for later use by the pipeline steps."""
        self.data_ingestion_config = data_ingestion_config

    @staticmethod
    def _ensure_parent_dir(file_path) -> None:
        """Create the parent directory of ``file_path`` if it has one."""
        parent_dir = os.path.dirname(file_path)
        # A bare filename has an empty dirname and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    @staticmethod
    def _normalize_missing(df: pd.DataFrame) -> pd.DataFrame:
        """Replace the literal string "na" with NaN, in place, and return ``df``."""
        df.replace({"na": np.nan}, inplace=True)
        return df

    def _read_collection_from_mongo(self) -> pd.DataFrame:
        """Read the configured MongoDB collection into a DataFrame."""
        database_name = self.data_ingestion_config.database_name
        collection_name = self.data_ingestion_config.collection_name
        # Kept as an attribute because the original code exposed it this way.
        self.mongo_client = pymongo.MongoClient(MONGO_DB_URL)
        try:
            collection = self.mongo_client[database_name][collection_name]
            df = pd.DataFrame(list(collection.find()))
        finally:
            # Always release the connection, even when the query fails.
            self.mongo_client.close()
        # "_id" is MongoDB's document key, not a feature column.
        if "_id" in df.columns:
            df = df.drop(columns=["_id"])
        return self._normalize_missing(df)

    def _read_fallback_csv(self) -> pd.DataFrame:
        """Read the bundled sample CSV used when MongoDB cannot be read.

        Raises:
            NetworkSecurityException: if the CSV is missing or cannot be parsed.
        """
        # This file lives in networksecurity/components/, so the project root
        # is two directories above the directory containing it.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.dirname(os.path.dirname(current_dir))
        csv_path = os.path.join(root_dir, "Network_data", "phisingData.csv")
        logging.info(f"MongoDB unavailable. Attempting to load CSV from: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
        except FileNotFoundError as e:
            raise NetworkSecurityException(
                f"Sample CSV not found at {csv_path}. CWD: {os.getcwd()}", sys
            ) from e
        except Exception as e:
            # Parse/permission errors are wrapped like every other failure here.
            raise NetworkSecurityException(e, sys) from e
        logging.info(f" Loaded {len(df)} rows from CSV")
        # Apply the same missing-value convention as the MongoDB path.
        return self._normalize_missing(df)

    def export_collection_as_dataframe(self) -> pd.DataFrame:
        """Return the raw dataset as a DataFrame.

        Reads from MongoDB first; on any failure falls back to the sample CSV.

        Raises:
            NetworkSecurityException: if the fallback CSV cannot be loaded.
        """
        try:
            return self._read_collection_from_mongo()
        except Exception as e:
            # Deliberate best-effort: any MongoDB problem triggers the fallback.
            # Uses the module-level project logger (no local re-import).
            logging.info(f"MongoDB unavailable, using sample CSV: {str(e)}")
        return self._read_fallback_csv()

    def export_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Write ``dataframe`` to the feature-store CSV and return it unchanged.

        Raises:
            NetworkSecurityException: if the directory or file cannot be written.
        """
        try:
            feature_store_file_path = self.data_ingestion_config.feature_store_file_path
            self._ensure_parent_dir(feature_store_file_path)
            dataframe.to_csv(feature_store_file_path, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def split_data_as_train_test(self, dataframe: pd.DataFrame) -> None:
        """Split ``dataframe`` and write the train and test CSV files.

        The test fraction is ``train_test_split_ratio`` from the config.

        Raises:
            NetworkSecurityException: if splitting or writing fails.
        """
        try:
            train_set, test_set = train_test_split(
                dataframe, test_size=self.data_ingestion_config.train_test_split_ratio
            )
            logging.info("Performed train test split on the dataframe")

            training_file_path = self.data_ingestion_config.training_file_path
            testing_file_path = self.data_ingestion_config.testing_file_path
            # The two files are configured independently, so make sure both
            # target directories exist rather than only the training one.
            self._ensure_parent_dir(training_file_path)
            self._ensure_parent_dir(testing_file_path)

            logging.info("Exporting train and test file path")
            train_set.to_csv(training_file_path, index=False, header=True)
            test_set.to_csv(testing_file_path, index=False, header=True)
            logging.info("Exported train and test file path")
            logging.info("Exited split_data_as_train_test method of Data_Ingestion class")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self) -> "DataIngestionArtifact":
        """Run load -> feature store -> split and return the resulting artifact.

        Returns:
            A DataIngestionArtifact holding the training and testing file paths.

        Raises:
            NetworkSecurityException: if any step fails.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.export_data_into_feature_store(dataframe)
            self.split_data_as_train_test(dataframe)
            return DataIngestionArtifact(
                training_file_path=self.data_ingestion_config.training_file_path,
                testing_file_path=self.data_ingestion_config.testing_file_path,
            )
        except NetworkSecurityException:
            # Already wrapped by the failing step; re-wrapping would bury
            # the original message and location.
            raise
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e