Rasel Santillan committed on
Commit
3ab7d75
·
1 Parent(s): a1daef0
Files changed (3) hide show
  1. Dockerfile +34 -4
  2. model/email_feature_extractor.py +522 -190
  3. model/model.py +4 -5
Dockerfile CHANGED
@@ -34,17 +34,47 @@ ENV PATH="/home/user/.local/bin:$PATH"
34
  RUN pip install --user --no-cache-dir --upgrade pip && \
35
  pip install --user --no-cache-dir -r requirements.txt
36
 
 
 
 
 
37
  # Copy application code and model
38
  COPY --chown=user:user . .
39
 
40
- # Expose port 7860 (default for HuggingFace Spaces) and 8000 (standard FastAPI)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  EXPOSE 7860 8000
42
 
43
- # Health check
44
  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
45
- CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
46
 
47
  # Run the application
48
- # Use app.py which defaults to port 7860 (HuggingFace Spaces standard)
49
  CMD ["python", "app.py"]
50
 
 
34
  RUN pip install --user --no-cache-dir --upgrade pip && \
35
  pip install --user --no-cache-dir -r requirements.txt
36
 
37
+ # Download spaCy language model during build
38
+ # This is a required dependency - the application will not start without it
39
+ RUN python -m spacy download en_core_web_sm
40
+
41
  # Copy application code and model
42
  COPY --chown=user:user . .
43
 
44
+ # Download NLTK data during build
45
+ # These are required dependencies - the application will not start without them
46
+ RUN python -c "\
47
+ import nltk; \
48
+ nltk.download('punkt', quiet=True); \
49
+ nltk.download('punkt_tab', quiet=True); \
50
+ nltk.download('stopwords', quiet=True); \
51
+ nltk.download('averaged_perceptron_tagger', quiet=True); \
52
+ print('NLTK data downloaded successfully')"
53
+
54
+ # Verify all NLP resources are properly installed
55
+ RUN python -c "\
56
+ import nltk; \
57
+ from nltk.tokenize import word_tokenize; \
58
+ from nltk.corpus import stopwords; \
59
+ import spacy; \
60
+ from langdetect import detect_langs, DetectorFactory; \
61
+ DetectorFactory.seed = 0; \
62
+ nlp = spacy.load('en_core_web_sm'); \
63
+ print('Verification: NLTK tokenization:', word_tokenize('test')); \
64
+ print('Verification: NLTK stopwords:', len(stopwords.words('english')), 'words'); \
65
+ print('Verification: NLTK stopwords languages:', len(stopwords.fileids())); \
66
+ print('Verification: spaCy model loaded successfully'); \
67
+ print('Verification: langdetect:', detect_langs('This is a test.')); \
68
+ print('All NLP resources verified!')"
69
+
70
+ # Expose ports (7860 is default, 8000 for compatibility)
71
  EXPOSE 7860 8000
72
 
73
+ # Health check (uses port 7860 by default)
74
  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
75
+ CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1
76
 
77
  # Run the application
78
+ # Use app.py for HuggingFace Spaces compatibility, defaults to port 7860
79
  CMD ["python", "app.py"]
80
 
model/email_feature_extractor.py CHANGED
@@ -1,287 +1,619 @@
1
  """
2
- Spam Email Feature Extraction System
3
- Extracts 57 features from email content for spam detection.
4
 
5
- Features:
6
- - 48 word frequency features (word_freq_WORD)
7
- - 6 character frequency features (char_freq_CHAR)
8
- - 3 capital letter run length features
9
-
10
- Based on the UCI Spambase dataset feature definitions.
11
  """
12
 
13
  import re
14
  import logging
15
- from typing import Dict, Any, List
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
  # ============================================================================
22
- # Feature Definitions
23
  # ============================================================================
24
 
25
- # 48 words to track frequency for
26
- TRACKED_WORDS = [
27
- "make", "address", "all", "3d", "our", "over", "remove", "internet",
28
- "order", "mail", "receive", "will", "people", "report", "addresses",
29
- "free", "business", "email", "you", "credit", "your", "font", "000",
30
- "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
31
- "data", "415", "85", "technology", "1999", "parts", "pm", "direct",
32
- "cs", "meeting", "original", "project", "re", "edu", "table", "conference"
33
- ]
34
 
35
- # 6 characters to track frequency for
36
- TRACKED_CHARS = [';', '(', '[', '!', '$', '#']
37
 
38
- # ============================================================================
39
- # Helper Functions
40
- # ============================================================================
41
 
42
- def extract_words(text: str) -> List[str]:
43
- """
44
- Extract words from text.
45
- A "word" is any string of alphanumeric characters bounded by
46
- non-alphanumeric characters or end-of-string.
47
 
48
- Args:
49
- text: Email content
50
 
51
- Returns:
52
- List[str]: List of words (lowercase)
53
  """
54
- # Split by non-alphanumeric characters
55
- words = re.findall(r'[a-zA-Z0-9]+', text.lower())
56
- return words
57
 
58
-
59
- def count_total_words(text: str) -> int:
60
  """
61
- Count total number of words in the email.
 
 
62
 
63
- Args:
64
- text: Email content
65
 
66
- Returns:
67
- int: Total word count
68
- """
69
- words = extract_words(text)
70
- return len(words)
71
 
 
 
 
72
 
73
- def calculate_word_frequency(text: str, word: str) -> float:
74
  """
75
- Calculate percentage of words in the email that match the given word.
 
76
 
77
- Formula: 100 * (number of times WORD appears) / (total number of words)
 
 
 
 
 
 
78
 
79
  Args:
80
- text: Email content
81
- word: Word to search for (case-insensitive)
82
 
83
  Returns:
84
- float: Percentage [0, 100]
85
- """
86
- words = extract_words(text)
87
- total_words = len(words)
88
 
89
- if total_words == 0:
90
- return 0.0
 
91
 
92
- word_lower = word.lower()
93
- word_count = sum(1 for w in words if w == word_lower)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- return 100.0 * word_count / total_words
96
 
 
 
 
97
 
98
- def calculate_char_frequency(text: str, char: str) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  """
100
- Calculate percentage of characters in the email that match the given character.
101
 
102
- Formula: 100 * (number of CHAR occurrences) / (total characters in email)
 
103
 
104
  Args:
105
- text: Email content
106
- char: Character to search for
107
 
108
  Returns:
109
- float: Percentage [0, 100]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  """
111
- total_chars = len(text)
112
 
113
- if total_chars == 0:
114
- return 0.0
 
 
115
 
116
- char_count = text.count(char)
 
117
 
118
- return 100.0 * char_count / total_chars
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- def calculate_capital_run_length_average(text: str) -> float:
 
122
  """
123
- Calculate average length of uninterrupted sequences of capital letters.
124
 
125
  Args:
126
  text: Email content
127
 
128
  Returns:
129
- float: Average run length (minimum 1.0)
130
  """
131
- # Find all sequences of capital letters
132
- capital_sequences = re.findall(r'[A-Z]+', text)
 
 
 
 
133
 
134
- if not capital_sequences:
135
- return 1.0
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- total_length = sum(len(seq) for seq in capital_sequences)
138
- avg_length = total_length / len(capital_sequences)
139
 
140
- return max(1.0, avg_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
 
143
- def calculate_capital_run_length_longest(text: str) -> int:
144
  """
145
- Calculate length of longest uninterrupted sequence of capital letters.
 
 
 
146
 
147
  Args:
148
- text: Email content
 
149
 
150
  Returns:
151
- int: Longest run length (minimum 1)
152
- """
153
- # Find all sequences of capital letters
154
- capital_sequences = re.findall(r'[A-Z]+', text)
155
 
156
- if not capital_sequences:
157
- return 1
 
 
 
 
 
158
 
159
- longest = max(len(seq) for seq in capital_sequences)
 
160
 
161
- return max(1, longest)
 
162
 
163
 
164
- def calculate_capital_run_length_total(text: str) -> int:
165
  """
166
- Calculate total number of capital letters in the email.
167
- This is the sum of length of uninterrupted sequences of capital letters.
168
 
169
  Args:
170
- text: Email content
171
 
172
  Returns:
173
- int: Total capital letters (minimum 1)
174
  """
175
- # Count all capital letters
176
- capital_count = sum(1 for c in text if c.isupper())
177
-
178
- return max(1, capital_count)
179
 
180
 
181
  # ============================================================================
182
- # Main Feature Extraction Function
183
  # ============================================================================
184
 
185
- def extract_features(email_text: str) -> Dict[str, Any]:
186
  """
187
- Extract all 57 spam detection features from email content.
 
188
 
189
- Features are returned in exact order as specified:
190
- 1-48: word_freq_* (48 features)
191
- 49-54: char_freq_* (6 features)
192
- 55-57: capital_run_length_* (3 features)
193
 
194
- Based on UCI Spambase dataset feature definitions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  Args:
197
- email_text: Raw email content as string
 
198
 
199
  Returns:
200
- dict: Dictionary containing all 57 features with exact column names in order
201
-
202
- Raises:
203
- ValueError: If email_text is empty or not a string
204
  """
205
  # Handle empty or None input
206
  if not email_text or not isinstance(email_text, str):
207
  raise ValueError("Email text must be a non-empty string")
208
 
209
- logger.info(f"Extracting features from email (length: {len(email_text)} chars)")
210
-
211
- # Initialize ordered features dictionary
212
- features = {}
213
-
214
- # ========================================================================
215
- # 48 Word Frequency Features (in exact order)
216
- # ========================================================================
217
- features["word_freq_make"] = calculate_word_frequency(email_text, "make")
218
- features["word_freq_address"] = calculate_word_frequency(email_text, "address")
219
- features["word_freq_all"] = calculate_word_frequency(email_text, "all")
220
- features["word_freq_3d"] = calculate_word_frequency(email_text, "3d")
221
- features["word_freq_our"] = calculate_word_frequency(email_text, "our")
222
- features["word_freq_over"] = calculate_word_frequency(email_text, "over")
223
- features["word_freq_remove"] = calculate_word_frequency(email_text, "remove")
224
- features["word_freq_internet"] = calculate_word_frequency(email_text, "internet")
225
- features["word_freq_order"] = calculate_word_frequency(email_text, "order")
226
- features["word_freq_mail"] = calculate_word_frequency(email_text, "mail")
227
- features["word_freq_receive"] = calculate_word_frequency(email_text, "receive")
228
- features["word_freq_will"] = calculate_word_frequency(email_text, "will")
229
- features["word_freq_people"] = calculate_word_frequency(email_text, "people")
230
- features["word_freq_report"] = calculate_word_frequency(email_text, "report")
231
- features["word_freq_addresses"] = calculate_word_frequency(email_text, "addresses")
232
- features["word_freq_free"] = calculate_word_frequency(email_text, "free")
233
- features["word_freq_business"] = calculate_word_frequency(email_text, "business")
234
- features["word_freq_email"] = calculate_word_frequency(email_text, "email")
235
- features["word_freq_you"] = calculate_word_frequency(email_text, "you")
236
- features["word_freq_credit"] = calculate_word_frequency(email_text, "credit")
237
- features["word_freq_your"] = calculate_word_frequency(email_text, "your")
238
- features["word_freq_font"] = calculate_word_frequency(email_text, "font")
239
- features["word_freq_000"] = calculate_word_frequency(email_text, "000")
240
- features["word_freq_money"] = calculate_word_frequency(email_text, "money")
241
- features["word_freq_hp"] = calculate_word_frequency(email_text, "hp")
242
- features["word_freq_hpl"] = calculate_word_frequency(email_text, "hpl")
243
- features["word_freq_george"] = calculate_word_frequency(email_text, "george")
244
- features["word_freq_650"] = calculate_word_frequency(email_text, "650")
245
- features["word_freq_lab"] = calculate_word_frequency(email_text, "lab")
246
- features["word_freq_labs"] = calculate_word_frequency(email_text, "labs")
247
- features["word_freq_telnet"] = calculate_word_frequency(email_text, "telnet")
248
- features["word_freq_857"] = calculate_word_frequency(email_text, "857")
249
- features["word_freq_data"] = calculate_word_frequency(email_text, "data")
250
- features["word_freq_415"] = calculate_word_frequency(email_text, "415")
251
- features["word_freq_85"] = calculate_word_frequency(email_text, "85")
252
- features["word_freq_technology"] = calculate_word_frequency(email_text, "technology")
253
- features["word_freq_1999"] = calculate_word_frequency(email_text, "1999")
254
- features["word_freq_parts"] = calculate_word_frequency(email_text, "parts")
255
- features["word_freq_pm"] = calculate_word_frequency(email_text, "pm")
256
- features["word_freq_direct"] = calculate_word_frequency(email_text, "direct")
257
- features["word_freq_cs"] = calculate_word_frequency(email_text, "cs")
258
- features["word_freq_meeting"] = calculate_word_frequency(email_text, "meeting")
259
- features["word_freq_original"] = calculate_word_frequency(email_text, "original")
260
- features["word_freq_project"] = calculate_word_frequency(email_text, "project")
261
- features["word_freq_re"] = calculate_word_frequency(email_text, "re")
262
- features["word_freq_edu"] = calculate_word_frequency(email_text, "edu")
263
- features["word_freq_table"] = calculate_word_frequency(email_text, "table")
264
- features["word_freq_conference"] = calculate_word_frequency(email_text, "conference")
265
-
266
- # ========================================================================
267
- # 6 Character Frequency Features (in exact order)
268
- # ========================================================================
269
- features["char_freq__semicolon"] = calculate_char_frequency(email_text, ";")
270
- features["char_freq__openparen"] = calculate_char_frequency(email_text, "(")
271
- features["char_freq__openbracket"] = calculate_char_frequency(email_text, "[")
272
- features["char_freq__exclaim"] = calculate_char_frequency(email_text, "!")
273
- features["char_freq__dollar"] = calculate_char_frequency(email_text, "$")
274
- features["char_freq__hash"] = calculate_char_frequency(email_text, "#")
275
-
276
- # ========================================================================
277
- # 3 Capital Letter Run Length Features (in exact order)
278
- # ========================================================================
279
- features["capital_run_length_average"] = calculate_capital_run_length_average(email_text)
280
- features["capital_run_length_longest"] = calculate_capital_run_length_longest(email_text)
281
- features["capital_run_length_total"] = calculate_capital_run_length_total(email_text)
282
-
283
- logger.info(f"✓ Successfully extracted all 57 features from email")
284
- logger.debug(f"Features: {features}")
285
 
286
  return features
287
 
 
1
  """
2
+ Email Feature Extraction System for Phishing Detection
3
+ Extracts 21 specific features from email content using professional NLP libraries.
4
 
5
+ Enhanced with:
6
+ - NLTK for tokenization and stopwords
7
+ - spaCy for advanced linguistic analysis
8
+ - TextBlob for sentiment analysis
 
 
9
  """
10
 
11
  import re
12
  import logging
13
+ from typing import Dict, Any, List, Set
14
+ import numpy as np
15
+ import unicodedata
16
+
17
+ # NLP Libraries
18
+ import nltk
19
+ from nltk.tokenize import word_tokenize
20
+ from nltk.corpus import stopwords
21
+ import spacy
22
+ from textblob import TextBlob
23
+ from langdetect import detect_langs, LangDetectException
24
+ from langdetect import DetectorFactory
25
+
26
+ # Ensure consistent language detection results
27
+ DetectorFactory.seed = 0
28
 
29
  # Configure logging
30
  logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
32
 
33
  # ============================================================================
34
+ # NLP Resources Initialization
35
  # ============================================================================
36
 
37
+ def verify_nltk_resources():
38
+ """
39
+ Verify that required NLTK resources are available.
40
+ Raises an error if any required resource is missing.
41
+ """
42
+ # Verify punkt tokenizer
43
+ nltk.data.find('tokenizers/punkt')
44
+ nltk.data.find('tokenizers/punkt_tab')
 
45
 
46
+ # Verify stopwords corpus
47
+ nltk.data.find('corpora/stopwords')
48
 
49
+ # Verify POS tagger
50
+ nltk.data.find('taggers/averaged_perceptron_tagger')
 
51
 
52
+ logger.info("✓ NLTK resources verified")
 
 
 
 
53
 
 
 
54
 
55
+ def load_spacy_model():
 
56
  """
57
+ Load spaCy language model.
58
+ Raises an error if the model is not installed.
 
59
 
60
+ Returns:
61
+ spacy.Language: Loaded spaCy model
62
  """
63
+ nlp = spacy.load("en_core_web_sm")
64
+ logger.info("✓ spaCy model 'en_core_web_sm' loaded successfully")
65
+ return nlp
66
 
 
 
67
 
68
+ # Initialize NLP resources on module load - will fail fast if not available
69
+ verify_nltk_resources()
70
+ _spacy_nlp = load_spacy_model()
 
 
71
 
72
+ # ============================================================================
73
+ # Text Preprocessing and Normalization
74
+ # ============================================================================
75
 
76
+ def preprocess_email_text(text: str) -> str:
77
  """
78
+ Preprocess and normalize raw email text to handle multi-line input,
79
+ special characters, and formatting issues.
80
 
81
+ This function:
82
+ 1. Handles None/empty input gracefully
83
+ 2. Normalizes Unicode characters (e.g., smart quotes, special dashes)
84
+ 3. Preserves URLs and email addresses (important phishing indicators)
85
+ 4. Normalizes line breaks and whitespace
86
+ 5. Removes excessive whitespace while preserving single spaces
87
+ 6. Preserves semantic content and phishing indicators
88
 
89
  Args:
90
+ text: Raw email text (may contain line breaks, tabs, special formatting)
 
91
 
92
  Returns:
93
+ str: Cleaned and normalized text ready for feature extraction
 
 
 
94
 
95
+ Examples:
96
+ >>> preprocess_email_text("Hello\\n\\nWorld \\t Test")
97
+ 'Hello World Test'
98
 
99
+ >>> preprocess_email_text("Your account\\r\\nhas been\\tsuspended")
100
+ 'Your account has been suspended'
101
+ """
102
+ # Handle None or empty input
103
+ if not text:
104
+ logger.debug("Empty text provided to preprocessor")
105
+ return ""
106
+
107
+ # Ensure text is a string
108
+ if not isinstance(text, str):
109
+ logger.warning(f"Non-string input to preprocessor: {type(text)}")
110
+ text = str(text)
111
+
112
+ # Step 1: Normalize Unicode characters
113
+ # This handles smart quotes, special dashes, accented characters, etc.
114
+ # NFKC normalization: compatibility decomposition followed by canonical composition
115
+ text = unicodedata.normalize('NFKC', text)
116
+
117
+ # Step 2: Normalize line breaks
118
+ # Convert all line break variations to single space
119
+ # This handles: \r\n (Windows), \n (Unix), \r (old Mac)
120
+ text = re.sub(r'\r\n|\r|\n', ' ', text)
121
+
122
+ # Step 3: Normalize tabs to spaces
123
+ text = text.replace('\t', ' ')
124
+
125
+ # Step 4: Remove zero-width characters and other invisible Unicode
126
+ # These can be used in obfuscation attempts
127
+ text = re.sub(r'[\u200b-\u200f\u202a-\u202e\ufeff]', '', text)
128
+
129
+ # Step 5: Normalize multiple spaces to single space
130
+ # This handles excessive whitespace while preserving word boundaries
131
+ text = re.sub(r'\s+', ' ', text)
132
+
133
+ # Step 6: Remove leading/trailing whitespace
134
+ text = text.strip()
135
+
136
+ # Step 7: Normalize common HTML entities if present
137
+ # Some emails may contain HTML entities
138
+ html_entities = {
139
+ '&nbsp;': ' ',
140
+ '&amp;': '&',
141
+ '&lt;': '<',
142
+ '&gt;': '>',
143
+ '&quot;': '"',
144
+ '&#39;': "'",
145
+ '&apos;': "'",
146
+ }
147
+ for entity, replacement in html_entities.items():
148
+ text = text.replace(entity, replacement)
149
+
150
+ # Step 8: Remove excessive punctuation repetition (e.g., "!!!!!!" -> "!")
151
+ # But preserve single instances as they may be phishing indicators
152
+ text = re.sub(r'([!?.]){3,}', r'\1\1', text)
153
+
154
+ logger.debug(f"Preprocessed text: {len(text)} chars (original: {len(text)} chars)")
155
+
156
+ return text
157
 
 
158
 
159
+ # ============================================================================
160
+ # Function Words and Keywords
161
+ # ============================================================================
162
 
163
+ # Mapping from langdetect ISO 639-1 codes to NLTK stopwords language names
164
+ # langdetect supports 55 languages, NLTK stopwords supports 32 languages
165
+ LANGDETECT_TO_NLTK_MAP = {
166
+ 'ar': 'arabic',
167
+ 'az': 'azerbaijani',
168
+ 'eu': 'basque', # Basque
169
+ 'be': 'belarusian', # Belarusian (added in newer NLTK)
170
+ 'bn': 'bengali',
171
+ 'ca': 'catalan',
172
+ 'zh-cn': 'chinese',
173
+ 'zh-tw': 'chinese', # Map Traditional Chinese to same stopwords
174
+ 'da': 'danish',
175
+ 'nl': 'dutch',
176
+ 'en': 'english',
177
+ 'fi': 'finnish',
178
+ 'fr': 'french',
179
+ 'de': 'german',
180
+ 'el': 'greek',
181
+ 'he': 'hebrew',
182
+ 'hi': 'hinglish', # Hindi (mapped to hinglish which is Hindi-English mix)
183
+ 'hu': 'hungarian',
184
+ 'id': 'indonesian',
185
+ 'it': 'italian',
186
+ 'kk': 'kazakh',
187
+ 'ne': 'nepali',
188
+ 'no': 'norwegian',
189
+ 'pt': 'portuguese',
190
+ 'ro': 'romanian',
191
+ 'ru': 'russian',
192
+ 'sl': 'slovene',
193
+ 'es': 'spanish',
194
+ 'sv': 'swedish',
195
+ 'tg': 'tajik',
196
+ 'ta': 'tamil',
197
+ 'tl': 'tagalog', # Filipino
198
+ 'tr': 'turkish',
199
+ 'sq': 'albanian', # Albanian
200
+ }
201
+
202
+ # Get set of all NLTK stopwords languages for validation
203
+ NLTK_STOPWORDS_LANGUAGES = set(stopwords.fileids())
204
+
205
+ # Minimum confidence threshold for language detection (0.0 to 1.0)
206
+ LANGUAGE_DETECTION_THRESHOLD = 0.1
207
+
208
+
209
+ class LanguageDetectionError(Exception):
210
+ """Raised when language detection fails."""
211
+ pass
212
+
213
+
214
+ class UnsupportedLanguageError(Exception):
215
+ """Raised when a detected language is not supported by NLTK stopwords."""
216
+ pass
217
+
218
+
219
+ def detect_languages(text: str) -> List[str]:
220
  """
221
+ Detect language(s) present in the text.
222
 
223
+ Uses langdetect library to identify one or more languages in the text.
224
+ Returns all languages that meet the confidence threshold.
225
 
226
  Args:
227
+ text: The text to analyze for language detection
 
228
 
229
  Returns:
230
+ List[str]: List of detected NLTK language names (e.g., ['english', 'spanish'])
231
+
232
+ Raises:
233
+ LanguageDetectionError: If language detection fails
234
+ UnsupportedLanguageError: If a detected language is not supported by NLTK stopwords
235
+ """
236
+ if not text or not text.strip():
237
+ raise LanguageDetectionError("Cannot detect language from empty text")
238
+
239
+ # Detect languages with probabilities
240
+ detected = detect_langs(text)
241
+
242
+ if not detected:
243
+ raise LanguageDetectionError("Language detection returned no results")
244
+
245
+ # Filter by confidence threshold and map to NLTK language names
246
+ nltk_languages = []
247
+ unsupported_languages = []
248
+
249
+ for lang_prob in detected:
250
+ lang_code = str(lang_prob.lang)
251
+ probability = lang_prob.prob
252
+
253
+ # Skip low-confidence detections
254
+ if probability < LANGUAGE_DETECTION_THRESHOLD:
255
+ continue
256
+
257
+ # Map langdetect code to NLTK language name
258
+ if lang_code in LANGDETECT_TO_NLTK_MAP:
259
+ nltk_lang = LANGDETECT_TO_NLTK_MAP[lang_code]
260
+
261
+ # Verify the NLTK language is actually available
262
+ if nltk_lang in NLTK_STOPWORDS_LANGUAGES:
263
+ if nltk_lang not in nltk_languages:
264
+ nltk_languages.append(nltk_lang)
265
+ logger.debug(f"Detected language: {lang_code} -> {nltk_lang} (confidence: {probability:.2f})")
266
+ else:
267
+ # Language is in our map but not in NLTK
268
+ unsupported_languages.append((lang_code, nltk_lang, probability))
269
+ else:
270
+ # Language is not in our map at all
271
+ unsupported_languages.append((lang_code, None, probability))
272
+
273
+ # If we have unsupported languages with high confidence and no supported alternatives
274
+ if unsupported_languages and not nltk_languages:
275
+ unsupported_msgs = []
276
+ for lang_code, nltk_lang, prob in unsupported_languages:
277
+ if nltk_lang:
278
+ unsupported_msgs.append(f"{lang_code} (mapped to '{nltk_lang}' but not available in NLTK, confidence: {prob:.2f})")
279
+ else:
280
+ unsupported_msgs.append(f"{lang_code} (no NLTK mapping available, confidence: {prob:.2f})")
281
+ raise UnsupportedLanguageError(
282
+ f"Detected language(s) not supported by NLTK stopwords: {', '.join(unsupported_msgs)}"
283
+ )
284
+
285
+ if not nltk_languages:
286
+ raise LanguageDetectionError(
287
+ f"No languages detected with sufficient confidence (threshold: {LANGUAGE_DETECTION_THRESHOLD})"
288
+ )
289
+
290
+ return nltk_languages
291
+
292
+
293
+ def get_function_words(text: str) -> Set[str]:
294
  """
295
+ Get comprehensive set of function words (stopwords) based on detected language(s).
296
 
297
+ This function:
298
+ 1. Analyzes the email text to detect the language(s) present
299
+ 2. Returns stopwords for the detected language(s)
300
+ 3. For mixed-language emails, returns combined stopwords from all detected languages
301
 
302
+ Args:
303
+ text: The email text to analyze for language detection
304
 
305
+ Returns:
306
+ Set[str]: Set of function words (stopwords) for the detected language(s)
307
 
308
+ Raises:
309
+ LanguageDetectionError: If language detection fails
310
+ UnsupportedLanguageError: If a detected language is not supported by NLTK stopwords
311
+ LangDetectException: If langdetect encounters an internal error
312
+ """
313
+ # Detect language(s) in the text
314
+ detected_languages = detect_languages(text)
315
+
316
+ # Collect stopwords from all detected languages
317
+ function_words = set()
318
+ for language in detected_languages:
319
+ lang_stopwords = set(stopwords.words(language))
320
+ function_words.update(lang_stopwords)
321
+ logger.debug(f"Loaded {len(lang_stopwords)} stopwords for '{language}'")
322
+
323
+ # Add additional common function words for English if English is detected
324
+ if 'english' in detected_languages:
325
+ additional_words = {
326
+ 'shall', 'might', 'must', 'ought', 'need', 'dare',
327
+ 'used', 'having', 'being', 'does', 'did', 'done',
328
+ 'may', 'should', 'would', 'could', 'can', 'will',
329
+ }
330
+ function_words.update(additional_words)
331
+
332
+ logger.info(f"Loaded {len(function_words)} function words for languages: {detected_languages}")
333
+
334
+ return function_words
335
+
336
+ # Phishing-related keywords (case-insensitive)
337
+ PHISHING_KEYWORDS = {
338
+ 'account': r'\baccount\b',
339
+ 'access': r'\baccess\b',
340
+ 'bank': r'\bbank\b',
341
+ 'credit': r'\bcredit\b',
342
+ 'click': r'\bclick\b',
343
+ 'identity': r'\bidentity\b',
344
+ 'inconvenience': r'\binconvenience\b',
345
+ 'information': r'\binformation\b',
346
+ 'limited': r'\blimited\b',
347
+ 'minutes': r'\bminutes?\b',
348
+ 'password': r'\bpassword\b',
349
+ 'recently': r'\brecently\b',
350
+ 'risk': r'\brisk\b',
351
+ 'social': r'\bsocial\b',
352
+ 'security': r'\bsecurity\b',
353
+ 'service': r'\bservice\b',
354
+ 'suspended': r'\bsuspended\b',
355
+ }
356
 
357
+
358
+ def extract_words(text: str) -> List[str]:
359
  """
360
+ Extract words from text using NLTK tokenization.
361
 
362
  Args:
363
  text: Email content
364
 
365
  Returns:
366
+ list: List of words (lowercase, alphabetic only)
367
  """
368
+ # Use NLTK's word tokenizer for better accuracy
369
+ tokens = word_tokenize(text.lower())
370
+ # Filter to keep only alphabetic words
371
+ words = [word for word in tokens if word.isalpha()]
372
+ return words
373
+
374
 
375
+ def count_keyword_occurrences(text: str, keyword: str, pattern: str) -> int:
376
+ """
377
+ Count occurrences of a specific keyword in text.
378
+
379
+ Args:
380
+ text: Email content
381
+ keyword: Keyword name (for logging)
382
+ pattern: Regex pattern to match
383
+
384
+ Returns:
385
+ int: Count of keyword occurrences
386
+ """
387
+ matches = re.findall(pattern, text.lower())
388
+ return len(matches)
389
 
 
 
390
 
391
+ def calculate_vocabulary_richness(words: list, total_chars: int) -> float:
392
+ """
393
+ Calculate vocabulary richness as W/C (number of words / total characters).
394
+
395
+ Args:
396
+ words: List of words
397
+ total_chars: Total number of characters
398
+
399
+ Returns:
400
+ float: Vocabulary richness ratio
401
+ """
402
+ if total_chars == 0:
403
+ return 0.0
404
+
405
+ num_words = len(words)
406
+ return num_words / total_chars
407
 
408
 
409
+ def calculate_function_word_ratio(words: list, text: str) -> float:
410
  """
411
+ Calculate the ratio of function words to total words (Function words/W).
412
+
413
+ Uses language detection to determine which stopwords to use for calculating
414
+ the function word ratio. Supports multi-language emails.
415
 
416
  Args:
417
+ words: List of words (lowercase, alphabetic only)
418
+ text: Original email text (used for language detection)
419
 
420
  Returns:
421
+ float: Function word ratio
 
 
 
422
 
423
+ Raises:
424
+ LanguageDetectionError: If language detection fails
425
+ UnsupportedLanguageError: If a detected language is not supported
426
+ LangDetectException: If langdetect encounters an internal error
427
+ """
428
+ if len(words) == 0:
429
+ return 0.0
430
 
431
+ # Get function words based on detected language(s)
432
+ function_words = get_function_words(text)
433
 
434
+ function_word_count = sum(1 for word in words if word in function_words)
435
+ return function_word_count / len(words)
436
 
437
 
438
+ def count_unique_words(words: List[str]) -> int:
439
  """
440
+ Count the number of unique words in the text.
 
441
 
442
  Args:
443
+ words: List of words
444
 
445
  Returns:
446
+ int: Number of unique words
447
  """
448
+ return len(set(words))
 
 
 
449
 
450
 
451
  # ============================================================================
452
+ # Advanced NLP Features (Optional Enhancement)
453
  # ============================================================================
454
 
455
def extract_advanced_nlp_features(text: str) -> Dict[str, Any]:
    """
    Extract advanced NLP features using spaCy and TextBlob.
    These features provide additional insights but are not part of the core 21 features.

    Args:
        text: Email content

    Returns:
        dict: Dictionary of advanced features (sentiment scores, named-entity
        tallies, and part-of-speech ratios)
    """
    # Sentiment scores from TextBlob
    sentiment = TextBlob(text).sentiment

    # Run the spaCy pipeline; input is capped at 1M chars for performance
    doc = _spacy_nlp(text[:1000000])

    # Named Entity Recognition: tally entities by label
    entity_labels = [ent.label_ for ent in doc.ents]
    financial_entities = sum(
        label in ('MONEY', 'PERCENT', 'CARDINAL') for label in entity_labels
    )
    person_entities = entity_labels.count('PERSON')
    org_entities = entity_labels.count('ORG')

    # Part-of-speech ratios; all remain 0.0 when the doc has no tokens
    pos_noun_ratio = pos_verb_ratio = pos_adj_ratio = 0.0
    total_tokens = len(doc)
    if total_tokens > 0:
        pos_tags = [token.pos_ for token in doc]
        pos_noun_ratio = pos_tags.count('NOUN') / total_tokens
        pos_verb_ratio = pos_tags.count('VERB') / total_tokens
        pos_adj_ratio = pos_tags.count('ADJ') / total_tokens

    advanced_features = {
        'sentiment_polarity': sentiment.polarity,
        'sentiment_subjectivity': sentiment.subjectivity,
        'named_entities_count': len(entity_labels),
        'financial_entities': financial_entities,
        'person_entities': person_entities,
        'org_entities': org_entities,
        'pos_noun_ratio': pos_noun_ratio,
        'pos_verb_ratio': pos_verb_ratio,
        'pos_adj_ratio': pos_adj_ratio,
    }

    logger.debug(f"Advanced NLP features extracted: {advanced_features}")

    return advanced_features
520
+
521
+
522
def extract_features(email_text: str, include_advanced: bool = False) -> Dict[str, Any]:
    """
    Extract all 21 features from email content using enhanced NLP libraries.

    Features extracted (in exact order):
    1. Total Number of Characters C
    2. Vocabulary richness W/C
    3-19. Keyword counts (Account, Access, Bank, Credit, Click, Identity,
          Inconvenience, Information, Limited, Minutes, Password, Recently,
          Risk, Social, Security, Service, Suspended)
    20. Total number of Function words/W
    21. Unique Words

    Enhanced with:
    - Automatic text preprocessing and normalization (handles multi-line input)
    - NLTK word tokenization (more accurate than regex)
    - NLTK stopwords for function word detection (more comprehensive)
    - Optional spaCy analysis for advanced features

    Args:
        email_text: Raw email content as string (can be multi-line with formatting)
        include_advanced: If True, include advanced NLP features (not used by model)

    Returns:
        dict: Dictionary containing all 21 features with exact column names
        (plus an '_advanced' entry if include_advanced=True)

    Raises:
        ValueError: If email_text is empty or not a string.
    """
    # Reject empty or non-string input up front
    if not email_text or not isinstance(email_text, str):
        raise ValueError("Email text must be a non-empty string")

    # PREPROCESSING: normalize the raw text — multi-line input, special
    # characters, and excessive whitespace are handled here
    raw_length = len(email_text)
    email_text = preprocess_email_text(email_text)
    if raw_length > 0:
        logger.debug(f"Text preprocessing: {raw_length} -> {len(email_text)} chars")

    # Feature 1: Total Number of Characters C
    total_chars = len(email_text)

    # Tokenize once (NLTK-based) and reuse for the word-level features
    words = extract_words(email_text)

    # Feature 2: Vocabulary richness W/C
    vocab_richness = calculate_vocabulary_richness(words, total_chars)

    # Features 3-19: keyword occurrence counts, keyed by the dataset's
    # capitalized column names
    keyword_counts = {
        keyword.capitalize(): count_keyword_occurrences(email_text, keyword, pattern)
        for keyword, pattern in PHISHING_KEYWORDS.items()
    }

    # Feature 20: function-word ratio (language-aware NLTK stopwords)
    function_word_ratio = calculate_function_word_ratio(words, email_text)

    # Feature 21: distinct word count
    unique_words = count_unique_words(words)

    # Assemble the feature dict in the exact column order the model expects
    features: Dict[str, Any] = {
        'Total Number of Characters C': total_chars,
        'Vocabulary richness W/C': vocab_richness,
    }
    for column in ('Account', 'Access', 'Bank', 'Credit', 'Click', 'Identity',
                   'Inconvenience', 'Information', 'Limited', 'Minutes',
                   'Password', 'Recently', 'Risk', 'Social', 'Security',
                   'Service', 'Suspended'):
        features[column] = keyword_counts[column]
    features['Total number of Function words/W'] = function_word_ratio
    features['Unique Words'] = unique_words

    logger.info(f"✓ Successfully extracted all 21 features from email (length: {total_chars} chars, words: {len(words)})")
    logger.debug(f"Core features: {features}")

    # Optionally attach the advanced NLP features under a private key
    if include_advanced:
        advanced = extract_advanced_nlp_features(email_text)
        features['_advanced'] = advanced
        logger.debug(f"Advanced features: {advanced}")

    return features
619
 
model/model.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
- Model loading and prediction module for spam email detection.
3
- Uses 57 features based on the UCI Spambase dataset.
4
  """
5
 
6
  import logging
@@ -171,9 +170,9 @@ def predict_email(email_text: str) -> Dict[str, Any]:
171
  logger.info(f"Extracting features from email (length: {len(email_text)} chars)")
172
  features_dict = extract_features(email_text)
173
 
174
- # Check if feature extraction returned valid features (57 features expected)
175
- if len(features_dict) != 57:
176
- logger.warning(f"Feature extraction returned {len(features_dict)} features, expected 57")
177
 
178
  # Make prediction
179
  logger.info("Making prediction...")
 
1
  """
2
+ Model loading and prediction module for phishing email detection.
 
3
  """
4
 
5
  import logging
 
170
  logger.info(f"Extracting features from email (length: {len(email_text)} chars)")
171
  features_dict = extract_features(email_text)
172
 
173
+ # Check if feature extraction returned valid features
174
+ if features_dict.get('Total Number of Characters C', 0) == 0 and len(email_text) > 0:
175
+ logger.warning(f"Feature extraction may have failed for email")
176
 
177
  # Make prediction
178
  logger.info("Making prediction...")