File size: 6,444 Bytes
a21e473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import sqlite3
import pandas as pd
import os
from datetime import datetime
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
import sys

class PhishingDataManager:
    """SQLite-backed store for phishing feature rows and training-run metadata.

    The manager owns a single ``sqlite3`` connection (opened with
    ``check_same_thread=False``) and two tables:

    * ``phishing_data`` -- one row per sample, plus the bookkeeping columns
      ``id``, ``created_at`` and ``used_in_training``.
    * ``training_metadata`` -- one row per logged training run.

    Instances can be used as context managers; the connection is closed on
    exit.  Every failure is re-raised as ``NetworkSecurityException``.
    """

    def __init__(self, db_path="data/phishing_data.db"):
        """Open (creating if necessary) the database and ensure tables exist.

        Args:
            db_path: Location of the SQLite file.  A bare file name or
                ``":memory:"`` is accepted; a parent directory is only
                created when the path actually has one.

        Raises:
            NetworkSecurityException: If the directory cannot be created,
                the database cannot be opened, or table creation fails.
        """
        self.conn = None
        try:
            self.db_path = db_path
            # os.path.dirname() returns "" for paths without a directory
            # component, and os.makedirs("") raises FileNotFoundError.
            parent_dir = os.path.dirname(db_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            self.conn = sqlite3.connect(db_path, check_same_thread=False)
            self._create_tables()
        except Exception as e:
            # Do not leak an open handle when initialisation fails half-way.
            if self.conn is not None:
                self.conn.close()
            raise NetworkSecurityException(e, sys) from e

    def __enter__(self):
        """Return the manager itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving a ``with`` block.

        Exceptions raised inside the block are not suppressed.
        """
        self.close()

    def _create_tables(self):
        """Create the ``phishing_data`` and ``training_metadata`` tables.

        Both statements use ``IF NOT EXISTS`` so calling this on an existing
        database is harmless.

        Raises:
            NetworkSecurityException: If either statement fails.
        """
        try:
            # The connection context manager commits on success and rolls
            # back if an exception escapes the block.
            with self.conn:
                # Main data table
                self.conn.execute("""
                    CREATE TABLE IF NOT EXISTS phishing_data (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        having_IP_Address INTEGER,
                        URL_Length INTEGER,
                        Shortining_Service INTEGER,
                        having_At_Symbol INTEGER,
                        double_slash_redirecting INTEGER,
                        Prefix_Suffix INTEGER,
                        having_Sub_Domain INTEGER,
                        SSLfinal_State INTEGER,
                        Domain_registeration_length INTEGER,
                        Favicon INTEGER,
                        port INTEGER,
                        HTTPS_token INTEGER,
                        Request_URL INTEGER,
                        URL_of_Anchor INTEGER,
                        Links_in_tags INTEGER,
                        SFH INTEGER,
                        Submitting_to_email INTEGER,
                        Abnormal_URL INTEGER,
                        Redirect INTEGER,
                        on_mouseover INTEGER,
                        RightClick INTEGER,
                        popUpWidnow INTEGER,
                        Iframe INTEGER,
                        age_of_domain INTEGER,
                        DNSRecord INTEGER,
                        web_traffic INTEGER,
                        Page_Rank INTEGER,
                        Google_Index INTEGER,
                        Links_pointing_to_page INTEGER,
                        Statistical_report INTEGER,
                        Result INTEGER,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        used_in_training BOOLEAN DEFAULT 0
                    )
                """)

                # Training metadata table
                self.conn.execute("""
                    CREATE TABLE IF NOT EXISTS training_metadata (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        training_timestamp TIMESTAMP,
                        data_count INTEGER,
                        model_accuracy REAL,
                        model_version TEXT,
                        artifact_path TEXT
                    )
                """)

            logging.info("Database tables created successfully")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def insert_data_from_csv(self, csv_path):
        """Bulk-append every row of a CSV file to ``phishing_data``.

        Cells holding the literal string ``"na"`` are replaced with ``None``
        before insertion.

        Args:
            csv_path: Path of the CSV file to load.

        Returns:
            The number of rows read from the CSV and appended.

        Raises:
            NetworkSecurityException: If reading or inserting fails.
        """
        try:
            df = pd.read_csv(csv_path)
            df.replace({"na": None}, inplace=True)

            # NOTE: rows are appended unconditionally -- no de-duplication is
            # performed, so loading the same CSV twice stores its rows twice.
            df.to_sql('phishing_data', self.conn, if_exists='append', index=False)
            logging.info(f"Inserted {len(df)} records from CSV")
            return len(df)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def add_new_samples(self, data_dict_list):
        """Append new samples to ``phishing_data``.

        Args:
            data_dict_list: Records to insert, in any form accepted by
                ``pd.DataFrame(...)`` (typically a list of dicts keyed by
                column name).

        Returns:
            The number of rows appended (``0`` for empty input).

        Raises:
            NetworkSecurityException: If building the frame or inserting
                fails.
        """
        try:
            df = pd.DataFrame(data_dict_list)
            if df.empty:
                # Nothing to write; skip the database round-trip entirely.
                logging.info("Added 0 new samples")
                return 0
            df.to_sql('phishing_data', self.conn, if_exists='append', index=False)
            logging.info(f"Added {len(df)} new samples")
            return len(df)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_training_data(self, include_new_only=False):
        """Fetch rows from ``phishing_data`` as a DataFrame.

        Args:
            include_new_only: When true, return only rows whose
                ``used_in_training`` flag is ``0``; otherwise return all rows.

        Returns:
            A DataFrame without the bookkeeping columns ``id``,
            ``created_at`` and ``used_in_training``.

        Raises:
            NetworkSecurityException: If the query fails.
        """
        try:
            query = "SELECT * FROM phishing_data"
            if include_new_only:
                # Only get data not used in training yet
                query += " WHERE used_in_training = 0"

            df = pd.read_sql_query(query, self.conn)

            # Drop metadata columns
            df = df.drop(['id', 'created_at', 'used_in_training'], axis=1, errors='ignore')

            logging.info(f"Fetched {len(df)} records for training")
            return df
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def mark_data_as_trained(self):
        """Set ``used_in_training = 1`` on every row where it is currently ``0``.

        NOTE: the update is not tied to a previously fetched training set;
        any row inserted after ``get_training_data`` was called but before
        this method runs is flagged as trained as well.

        Raises:
            NetworkSecurityException: If the update fails.
        """
        try:
            # Commit on success, roll back on failure.
            with self.conn:
                self.conn.execute(
                    "UPDATE phishing_data SET used_in_training = 1 WHERE used_in_training = 0"
                )
            logging.info("Marked data as trained")
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_new_data_count(self):
        """Return the number of rows whose ``used_in_training`` flag is ``0``.

        Raises:
            NetworkSecurityException: If the query fails.
        """
        try:
            row = self.conn.execute(
                "SELECT COUNT(*) FROM phishing_data WHERE used_in_training = 0"
            ).fetchone()
            return row[0]
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def log_training_run(self, data_count, accuracy, version, artifact_path):
        """Insert one row describing a training run into ``training_metadata``.

        Args:
            data_count: Value stored in ``data_count``.
            accuracy: Value stored in ``model_accuracy``.
            version: Value stored in ``model_version``.
            artifact_path: Value stored in ``artifact_path``.

        Raises:
            NetworkSecurityException: If the insert fails.
        """
        try:
            # Bind the timestamp as text rather than a datetime object: the
            # implicit sqlite3 datetime adapter is deprecated since Python
            # 3.12.  isoformat(sep=" ") is the same text that adapter wrote,
            # so the stored format does not change.
            timestamp = datetime.now().isoformat(sep=" ")
            with self.conn:
                self.conn.execute("""
                    INSERT INTO training_metadata (training_timestamp, data_count, model_accuracy, model_version, artifact_path)
                    VALUES (?, ?, ?, ?, ?)
                """, (timestamp, data_count, accuracy, version, artifact_path))
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def should_retrain(self, threshold=100):
        """Report whether enough untrained rows have accumulated.

        Args:
            threshold: Minimum number of untrained rows that triggers a
                retrain.

        Returns:
            ``True`` when the untrained-row count is at least ``threshold``.

        Raises:
            NetworkSecurityException: If counting the rows fails.
        """
        return self.get_new_data_count() >= threshold

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()