Spaces:
Sleeping
Sleeping
Rasel Santillan committed on
Commit ·
28329ff
1
Parent(s): 6a33159
Update
Browse files
model/email_feature_extractor.py
CHANGED
|
@@ -336,10 +336,22 @@ def get_function_words(text: str) -> Set[str]:
|
|
| 336 |
# Phishing-related keywords (case-insensitive)
|
| 337 |
PHISHING_KEYWORDS = {
|
| 338 |
'account': r'\baccount\b',
|
|
|
|
| 339 |
'bank': r'\bbank\b',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
'information': r'\binformation\b',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
'risk': r'\brisk\b',
|
|
|
|
| 342 |
'security': r'\bsecurity\b',
|
|
|
|
|
|
|
| 343 |
}
|
| 344 |
|
| 345 |
|
|
@@ -509,14 +521,16 @@ def extract_advanced_nlp_features(text: str) -> Dict[str, Any]:
|
|
| 509 |
|
| 510 |
def extract_features(email_text: str, include_advanced: bool = False) -> Dict[str, Any]:
|
| 511 |
"""
|
| 512 |
-
Extract all 9 features from email content using enhanced NLP libraries.
|
| 513 |
|
| 514 |
Features extracted (in exact order):
|
| 515 |
1. Total Number of Characters C
|
| 516 |
2. Vocabulary richness W/C
|
| 517 |
-
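3-7. Keyword counts (Account, Bank, Information, Risk, Security)
|
| 518 |
-
8. Total number of Function words/W
|
| 519 |
-
9. Unique Words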
|
|
|
|
|
|
|
| 520 |
|
| 521 |
Enhanced with:
|
| 522 |
- Automatic text preprocessing and normalization (handles multi-line input)
|
|
@@ -529,7 +543,7 @@ def extract_features(email_text: str, include_advanced: bool = False) -> Dict[st
|
|
| 529 |
include_advanced: If True, include advanced NLP features (not used by model)
|
| 530 |
|
| 531 |
Returns:
|
| 532 |
-
dict: Dictionary containing all 9 features with exact column names
|
| 533 |
(plus optional advanced features if include_advanced=True)
|
| 534 |
"""
|
| 535 |
# Handle empty or None input
|
|
@@ -572,15 +586,27 @@ def extract_features(email_text: str, include_advanced: bool = False) -> Dict[st
|
|
| 572 |
'Total Number of Characters C': total_chars,
|
| 573 |
'Vocabulary richness W/C': vocab_richness,
|
| 574 |
'Account': keyword_counts['Account'],
|
|
|
|
| 575 |
'Bank': keyword_counts['Bank'],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
'Information': keyword_counts['Information'],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
'Risk': keyword_counts['Risk'],
|
|
|
|
| 578 |
'Security': keyword_counts['Security'],
|
|
|
|
|
|
|
| 579 |
'Total number of Function words/W': function_word_ratio,
|
| 580 |
'Unique Words': unique_words,
|
| 581 |
}
|
| 582 |
|
| 583 |
-
logger.info(f"✓ Successfully extracted all 9 features from email (length: {total_chars} chars, words: {len(words)})")
|
| 584 |
logger.debug(f"Core features: {features}")
|
| 585 |
|
| 586 |
# Optionally include advanced NLP features
|
|
|
|
| 336 |
# Phishing-related keywords (case-insensitive)
|
| 337 |
PHISHING_KEYWORDS = {
|
| 338 |
'account': r'\baccount\b',
|
| 339 |
+
'access': r'\baccess\b',
|
| 340 |
'bank': r'\bbank\b',
|
| 341 |
+
'credit': r'\bcredit\b',
|
| 342 |
+
'click': r'\bclick\b',
|
| 343 |
+
'identity': r'\bidentity\b',
|
| 344 |
+
'inconvenience': r'\binconvenience\b',
|
| 345 |
'information': r'\binformation\b',
|
| 346 |
+
'limited': r'\blimited\b',
|
| 347 |
+
'minutes': r'\bminutes?\b',
|
| 348 |
+
'password': r'\bpassword\b',
|
| 349 |
+
'recently': r'\brecently\b',
|
| 350 |
'risk': r'\brisk\b',
|
| 351 |
+
'social': r'\bsocial\b',
|
| 352 |
'security': r'\bsecurity\b',
|
| 353 |
+
'service': r'\bservice\b',
|
| 354 |
+
'suspended': r'\bsuspended\b',
|
| 355 |
}
|
| 356 |
|
| 357 |
|
|
|
|
| 521 |
|
| 522 |
def extract_features(email_text: str, include_advanced: bool = False) -> Dict[str, Any]:
|
| 523 |
"""
|
| 524 |
+
Extract all 21 features from email content using enhanced NLP libraries.
|
| 525 |
|
| 526 |
Features extracted (in exact order):
|
| 527 |
1. Total Number of Characters C
|
| 528 |
2. Vocabulary richness W/C
|
| 529 |
+
3-19. Keyword counts (Account, Access, Bank, Credit, Click, Identity,
|
| 530 |
+
Inconvenience, Information, Limited, Minutes, Password, Recently,
|
| 531 |
+
Risk, Social, Security, Service, Suspended)
|
| 532 |
+
20. Total number of Function words/W
|
| 533 |
+
21. Unique Words
|
| 534 |
|
| 535 |
Enhanced with:
|
| 536 |
- Automatic text preprocessing and normalization (handles multi-line input)
|
|
|
|
| 543 |
include_advanced: If True, include advanced NLP features (not used by model)
|
| 544 |
|
| 545 |
Returns:
|
| 546 |
+
dict: Dictionary containing all 21 features with exact column names
|
| 547 |
(plus optional advanced features if include_advanced=True)
|
| 548 |
"""
|
| 549 |
# Handle empty or None input
|
|
|
|
| 586 |
'Total Number of Characters C': total_chars,
|
| 587 |
'Vocabulary richness W/C': vocab_richness,
|
| 588 |
'Account': keyword_counts['Account'],
|
| 589 |
+
'Access': keyword_counts['Access'],
|
| 590 |
'Bank': keyword_counts['Bank'],
|
| 591 |
+
'Credit': keyword_counts['Credit'],
|
| 592 |
+
'Click': keyword_counts['Click'],
|
| 593 |
+
'Identity': keyword_counts['Identity'],
|
| 594 |
+
'Inconvenience': keyword_counts['Inconvenience'],
|
| 595 |
'Information': keyword_counts['Information'],
|
| 596 |
+
'Limited': keyword_counts['Limited'],
|
| 597 |
+
'Minutes': keyword_counts['Minutes'],
|
| 598 |
+
'Password': keyword_counts['Password'],
|
| 599 |
+
'Recently': keyword_counts['Recently'],
|
| 600 |
'Risk': keyword_counts['Risk'],
|
| 601 |
+
'Social': keyword_counts['Social'],
|
| 602 |
'Security': keyword_counts['Security'],
|
| 603 |
+
'Service': keyword_counts['Service'],
|
| 604 |
+
'Suspended': keyword_counts['Suspended'],
|
| 605 |
'Total number of Function words/W': function_word_ratio,
|
| 606 |
'Unique Words': unique_words,
|
| 607 |
}
|
| 608 |
|
| 609 |
+
logger.info(f"✓ Successfully extracted all 21 features from email (length: {total_chars} chars, words: {len(words)})")
|
| 610 |
logger.debug(f"Core features: {features}")
|
| 611 |
|
| 612 |
# Optionally include advanced NLP features
|