Spaces:
Sleeping
Sleeping
File size: 3,233 Bytes
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.entity.config_entity import Data_ingestion_config
from src.entity.artifact_entity import DataIngestionArtifact
import os, sys
import pandas as pd
from typing import List
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
# import pymongo
import numpy as np
from src.data.sqlite_manager import PhishingDataManager
# Load variables from a .env file (if one is found) into the process environment
# before any configuration is read below.
load_dotenv()
# Value of the MONGODB_URL environment variable, or None when it is not set.
# NOTE(review): nothing in the visible part of this module references this
# constant (ingestion reads through the SQLite data manager) — confirm whether
# it is still needed.
MONGODB_URL = os.getenv("MONGODB_URL")
class DataIngestion:
    """Export training data to CSV artefacts for the rest of the pipeline.

    The ingestion flow is: fetch the training rows through
    ``PhishingDataManager``, write the full frame to the feature store file,
    split it into train/test CSV files, then ask the data manager to mark the
    data as trained.
    """

    def __init__(self, data_ingestion_config: "Data_ingestion_config"):
        """Keep the configuration and create the data manager.

        Args:
            data_ingestion_config: Configuration object providing
                ``feature_store_file_path``, ``train_file_path``,
                ``test_file_path`` and ``train_test_split_ratio``.

        Raises:
            NetworkSecurityException: If the data manager cannot be created.
        """
        try:
            self.data_ingestion_config = data_ingestion_config
            self.db_manager = PhishingDataManager()
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def _ensure_parent_dir(file_path: str) -> None:
        """Create the directory that will hold *file_path*, if it names one."""
        dir_path = os.path.dirname(file_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

    def export_collection_as_dataframe(self):
        """Export data from SQLite instead of MongoDB.

        Returns:
            Whatever ``get_training_data(include_new_only=False)`` returns;
            presumably a ``pandas.DataFrame`` of all training rows, since the
            result is later written with ``to_csv``.

        Raises:
            NetworkSecurityException: If the data manager call fails.
        """
        try:
            # include_new_only=False: fetch all training data, not only new rows.
            return self.db_manager.get_training_data(include_new_only=False)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def move_data_into_feature_store(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Write *dataframe* to the configured feature store CSV file.

        Args:
            dataframe: Frame to persist (header row, no index column).

        Returns:
            The same ``dataframe`` object that was passed in.

        Raises:
            NetworkSecurityException: If the directory or file cannot be written.
        """
        try:
            feature_store_file = self.data_ingestion_config.feature_store_file_path
            self._ensure_parent_dir(feature_store_file)
            dataframe.to_csv(feature_store_file, index=False, header=True)
            return dataframe
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def data_train_test_split(self, dataframe: pd.DataFrame):
        """Split *dataframe* and write the train and test CSV files.

        The configured ``train_test_split_ratio`` is passed to
        ``train_test_split`` as ``test_size``.

        Args:
            dataframe: Frame to split.

        Raises:
            NetworkSecurityException: If splitting or writing fails.
        """
        try:
            config = self.data_ingestion_config
            train_set, test_set = train_test_split(
                dataframe, test_size=config.train_test_split_ratio
            )
            logging.info("Trained test spltting done on dataframe")

            # The two files may be configured in different directories, so
            # make sure each target directory exists before writing.
            self._ensure_parent_dir(config.train_file_path)
            self._ensure_parent_dir(config.test_file_path)

            logging.info("Exporting train and test file path")
            train_set.to_csv(config.train_file_path, index=False, header=True)
            test_set.to_csv(config.test_file_path, index=False, header=True)
            logging.info("Exported train and test file path.")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_ingestion(self):
        """Run the full ingestion flow and describe its outputs.

        Returns:
            A ``DataIngestionArtifact`` carrying the configured train and test
            file paths.

        Raises:
            NetworkSecurityException: If any ingestion step fails.
        """
        try:
            dataframe = self.export_collection_as_dataframe()
            dataframe = self.move_data_into_feature_store(dataframe)
            self.data_train_test_split(dataframe)

            # Mark data as used only after the CSV files were written.
            self.db_manager.mark_data_as_trained()

            return DataIngestionArtifact(
                train_file_path=self.data_ingestion_config.train_file_path,
                test_file_path=self.data_ingestion_config.test_file_path,
            )
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e
|