import sqlite3 import pandas as pd import os from datetime import datetime from src.exception.exception import NetworkSecurityException from src.logging.logger import logging import sys class PhishingDataManager: def __init__(self, db_path="data/phishing_data.db"): """Initialize SQLite database for phishing data""" try: self.db_path = db_path os.makedirs(os.path.dirname(db_path), exist_ok=True) self.conn = sqlite3.connect(db_path, check_same_thread=False) self._create_tables() except Exception as e: raise NetworkSecurityException(e, sys) from e def _create_tables(self): """Create phishing data table and metadata table""" try: cursor = self.conn.cursor() # Main data table cursor.execute(""" CREATE TABLE IF NOT EXISTS phishing_data ( id INTEGER PRIMARY KEY AUTOINCREMENT, having_IP_Address INTEGER, URL_Length INTEGER, Shortining_Service INTEGER, having_At_Symbol INTEGER, double_slash_redirecting INTEGER, Prefix_Suffix INTEGER, having_Sub_Domain INTEGER, SSLfinal_State INTEGER, Domain_registeration_length INTEGER, Favicon INTEGER, port INTEGER, HTTPS_token INTEGER, Request_URL INTEGER, URL_of_Anchor INTEGER, Links_in_tags INTEGER, SFH INTEGER, Submitting_to_email INTEGER, Abnormal_URL INTEGER, Redirect INTEGER, on_mouseover INTEGER, RightClick INTEGER, popUpWidnow INTEGER, Iframe INTEGER, age_of_domain INTEGER, DNSRecord INTEGER, web_traffic INTEGER, Page_Rank INTEGER, Google_Index INTEGER, Links_pointing_to_page INTEGER, Statistical_report INTEGER, Result INTEGER, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, used_in_training BOOLEAN DEFAULT 0 ) """) # Training metadata table cursor.execute(""" CREATE TABLE IF NOT EXISTS training_metadata ( id INTEGER PRIMARY KEY AUTOINCREMENT, training_timestamp TIMESTAMP, data_count INTEGER, model_accuracy REAL, model_version TEXT, artifact_path TEXT ) """) self.conn.commit() logging.info("Database tables created successfully") except Exception as e: raise NetworkSecurityException(e, sys) from e def insert_data_from_csv(self, csv_path): """Bulk insert from CSV (initial load)""" try: df = pd.read_csv(csv_path) df.replace({"na": None}, inplace=True) # Insert only new records (avoid duplicates) df.to_sql('phishing_data', self.conn, if_exists='append', index=False) logging.info(f"Inserted {len(df)} records from CSV") return len(df) except Exception as e: raise NetworkSecurityException(e, sys) from e def add_new_samples(self, data_dict_list): """Add new phishing samples incrementally""" try: df = pd.DataFrame(data_dict_list) df.to_sql('phishing_data', self.conn, if_exists='append', index=False) logging.info(f"Added {len(df)} new samples") return len(df) except Exception as e: raise NetworkSecurityException(e, sys) from e def get_training_data(self, include_new_only=False): """Fetch data for training""" try: if include_new_only: # Only get data not used in training yet query = "SELECT * FROM phishing_data WHERE used_in_training = 0" else: # Get all data query = "SELECT * FROM phishing_data" df = pd.read_sql_query(query, self.conn) # Drop metadata columns df = df.drop(['id', 'created_at', 'used_in_training'], axis=1, errors='ignore') logging.info(f"Fetched {len(df)} records for training") return df except Exception as e: raise NetworkSecurityException(e, sys) from e def mark_data_as_trained(self): """Mark all data as used in training""" try: cursor = self.conn.cursor() cursor.execute("UPDATE phishing_data SET used_in_training = 1 WHERE used_in_training = 0") self.conn.commit() logging.info("Marked data as trained") except Exception as e: raise NetworkSecurityException(e, sys) from e def get_new_data_count(self): """Count untrained samples""" try: cursor = self.conn.cursor() result = cursor.execute("SELECT COUNT(*) FROM phishing_data WHERE used_in_training = 0").fetchone() return result[0] except Exception as e: raise NetworkSecurityException(e, sys) from e def log_training_run(self, data_count, accuracy, version, artifact_path): """Log training metadata""" try: cursor = self.conn.cursor() cursor.execute(""" INSERT INTO training_metadata (training_timestamp, data_count, model_accuracy, model_version, artifact_path) VALUES (?, ?, ?, ?, ?) """, (datetime.now(), data_count, accuracy, version, artifact_path)) self.conn.commit() except Exception as e: raise NetworkSecurityException(e, sys) from e def should_retrain(self, threshold=100): """Check if retraining is needed based on new data""" new_count = self.get_new_data_count() return new_count >= threshold def close(self): self.conn.close()