Rasel Santillan committed on
Commit
28329ff
·
1 Parent(s): 6a33159
Files changed (1) hide show
  1. model/email_feature_extractor.py +32 -6
model/email_feature_extractor.py CHANGED
@@ -336,10 +336,22 @@ def get_function_words(text: str) -> Set[str]:
336
  # Phishing-related keywords (case-insensitive)
337
  PHISHING_KEYWORDS = {
338
  'account': r'\baccount\b',
 
339
  'bank': r'\bbank\b',
 
 
 
 
340
  'information': r'\binformation\b',
 
 
 
 
341
  'risk': r'\brisk\b',
 
342
  'security': r'\bsecurity\b',
 
 
343
  }
344
 
345
 
@@ -509,14 +521,16 @@ def extract_advanced_nlp_features(text: str) -> Dict[str, Any]:
509
 
510
  def extract_features(email_text: str, include_advanced: bool = False) -> Dict[str, Any]:
511
  """
512
- Extract all 9 features from email content using enhanced NLP libraries.
513
 
514
  Features extracted (in exact order):
515
  1. Total Number of Characters C
516
  2. Vocabulary richness W/C
517
- 3-7. Keyword counts (Account, Bank, Information, Risk, Security)
518
- 8. Total number of Function words/W
519
- 9. Unique Words
 
 
520
 
521
  Enhanced with:
522
  - Automatic text preprocessing and normalization (handles multi-line input)
@@ -529,7 +543,7 @@ def extract_features(email_text: str, include_advanced: bool = False) -> Dict[st
529
  include_advanced: If True, include advanced NLP features (not used by model)
530
 
531
  Returns:
532
- dict: Dictionary containing all 9 features with exact column names
533
  (plus optional advanced features if include_advanced=True)
534
  """
535
  # Handle empty or None input
@@ -572,15 +586,27 @@ def extract_features(email_text: str, include_advanced: bool = False) -> Dict[st
572
  'Total Number of Characters C': total_chars,
573
  'Vocabulary richness W/C': vocab_richness,
574
  'Account': keyword_counts['Account'],
 
575
  'Bank': keyword_counts['Bank'],
 
 
 
 
576
  'Information': keyword_counts['Information'],
 
 
 
 
577
  'Risk': keyword_counts['Risk'],
 
578
  'Security': keyword_counts['Security'],
 
 
579
  'Total number of Function words/W': function_word_ratio,
580
  'Unique Words': unique_words,
581
  }
582
 
583
- logger.info(f"✓ Successfully extracted all 9 features from email (length: {total_chars} chars, words: {len(words)})")
584
  logger.debug(f"Core features: {features}")
585
 
586
  # Optionally include advanced NLP features
 
336
  # Phishing-related keywords (case-insensitive)
337
  PHISHING_KEYWORDS = {
338
  'account': r'\baccount\b',
339
+ 'access': r'\baccess\b',
340
  'bank': r'\bbank\b',
341
+ 'credit': r'\bcredit\b',
342
+ 'click': r'\bclick\b',
343
+ 'identity': r'\bidentity\b',
344
+ 'inconvenience': r'\binconvenience\b',
345
  'information': r'\binformation\b',
346
+ 'limited': r'\blimited\b',
347
+ 'minutes': r'\bminutes?\b',
348
+ 'password': r'\bpassword\b',
349
+ 'recently': r'\brecently\b',
350
  'risk': r'\brisk\b',
351
+ 'social': r'\bsocial\b',
352
  'security': r'\bsecurity\b',
353
+ 'service': r'\bservice\b',
354
+ 'suspended': r'\bsuspended\b',
355
  }
356
 
357
 
 
521
 
522
  def extract_features(email_text: str, include_advanced: bool = False) -> Dict[str, Any]:
523
  """
524
+ Extract all 21 features from email content using enhanced NLP libraries.
525
 
526
  Features extracted (in exact order):
527
  1. Total Number of Characters C
528
  2. Vocabulary richness W/C
529
+ 3-19. Keyword counts (Account, Access, Bank, Credit, Click, Identity,
530
+ Inconvenience, Information, Limited, Minutes, Password, Recently,
531
+ Risk, Social, Security, Service, Suspended)
532
+ 20. Total number of Function words/W
533
+ 21. Unique Words
534
 
535
  Enhanced with:
536
  - Automatic text preprocessing and normalization (handles multi-line input)
 
543
  include_advanced: If True, include advanced NLP features (not used by model)
544
 
545
  Returns:
546
+ dict: Dictionary containing all 21 features with exact column names
547
  (plus optional advanced features if include_advanced=True)
548
  """
549
  # Handle empty or None input
 
586
  'Total Number of Characters C': total_chars,
587
  'Vocabulary richness W/C': vocab_richness,
588
  'Account': keyword_counts['Account'],
589
+ 'Access': keyword_counts['Access'],
590
  'Bank': keyword_counts['Bank'],
591
+ 'Credit': keyword_counts['Credit'],
592
+ 'Click': keyword_counts['Click'],
593
+ 'Identity': keyword_counts['Identity'],
594
+ 'Inconvenience': keyword_counts['Inconvenience'],
595
  'Information': keyword_counts['Information'],
596
+ 'Limited': keyword_counts['Limited'],
597
+ 'Minutes': keyword_counts['Minutes'],
598
+ 'Password': keyword_counts['Password'],
599
+ 'Recently': keyword_counts['Recently'],
600
  'Risk': keyword_counts['Risk'],
601
+ 'Social': keyword_counts['Social'],
602
  'Security': keyword_counts['Security'],
603
+ 'Service': keyword_counts['Service'],
604
+ 'Suspended': keyword_counts['Suspended'],
605
  'Total number of Function words/W': function_word_ratio,
606
  'Unique Words': unique_words,
607
  }
608
 
609
+ logger.info(f"✓ Successfully extracted all 21 features from email (length: {total_chars} chars, words: {len(words)})")
610
  logger.debug(f"Core features: {features}")
611
 
612
  # Optionally include advanced NLP features