Rasel Santillan committed on
Commit
3ab7d75
·
1 Parent(s): a1daef0
Files changed (3) hide show
  1. Dockerfile +34 -4
  2. model/email_feature_extractor.py +522 -190
  3. model/model.py +4 -5
Dockerfile CHANGED
@@ -34,17 +34,47 @@ ENV PATH="/home/user/.local/bin:$PATH"
34
  RUN pip install --user --no-cache-dir --upgrade pip && \
35
  pip install --user --no-cache-dir -r requirements.txt
36
 
 
 
 
 
37
  # Copy application code and model
38
  COPY --chown=user:user . .
39
 
40
- # Expose port 7860 (default for HuggingFace Spaces) and 8000 (standard FastAPI)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  EXPOSE 7860 8000
42
 
43
- # Health check
44
  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
45
- CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
46
 
47
  # Run the application
48
- # Use app.py which defaults to port 7860 (HuggingFace Spaces standard)
49
  CMD ["python", "app.py"]
50
 
 
34
  RUN pip install --user --no-cache-dir --upgrade pip && \
35
  pip install --user --no-cache-dir -r requirements.txt
36
 
37
+ # Download spaCy language model during build
38
+ # This is a required dependency - the application will not start without it
39
+ RUN python -m spacy download en_core_web_sm
40
+
41
  # Copy application code and model
42
  COPY --chown=user:user . .
43
 
44
+ # Download NLTK data during build
45
+ # These are required dependencies - the application will not start without them
46
+ RUN python -c "\
47
+ import nltk; \
48
+ nltk.download('punkt', quiet=True); \
49
+ nltk.download('punkt_tab', quiet=True); \
50
+ nltk.download('stopwords', quiet=True); \
51
+ nltk.download('averaged_perceptron_tagger', quiet=True); \
52
+ print('NLTK data downloaded successfully')"
53
+
54
+ # Verify all NLP resources are properly installed
55
+ RUN python -c "\
56
+ import nltk; \
57
+ from nltk.tokenize import word_tokenize; \
58
+ from nltk.corpus import stopwords; \
59
+ import spacy; \
60
+ from langdetect import detect_langs, DetectorFactory; \
61
+ DetectorFactory.seed = 0; \
62
+ nlp = spacy.load('en_core_web_sm'); \
63
+ print('Verification: NLTK tokenization:', word_tokenize('test')); \
64
+ print('Verification: NLTK stopwords:', len(stopwords.words('english')), 'words'); \
65
+ print('Verification: NLTK stopwords languages:', len(stopwords.fileids())); \
66
+ print('Verification: spaCy model loaded successfully'); \
67
+ print('Verification: langdetect:', detect_langs('This is a test.')); \
68
+ print('All NLP resources verified!')"
69
+
70
+ # Expose ports (7860 is default, 8000 for compatibility)
71
  EXPOSE 7860 8000
72
 
73
+ # Health check (uses port 7860 by default)
74
  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
75
+ CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1
76
 
77
  # Run the application
78
+ # Use app.py for HuggingFace Spaces compatibility, defaults to port 7860
79
  CMD ["python", "app.py"]
80
 
model/email_feature_extractor.py CHANGED
@@ -1,287 +1,619 @@
1
  """
2
- Spam Email Feature Extraction System
3
- Extracts 57 features from email content for spam detection.
4
 
5
- Features:
6
- - 48 word frequency features (word_freq_WORD)
7
- - 6 character frequency features (char_freq_CHAR)
8
- - 3 capital letter run length features
9
-
10
- Based on the UCI Spambase dataset feature definitions.
11
  """
12
 
13
  import re
14
  import logging
15
- from typing import Dict, Any, List
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
  # ============================================================================
22
- # Feature Definitions
23
  # ============================================================================
24
 
25
- # 48 words to track frequency for
26
- TRACKED_WORDS = [
27
- "make", "address", "all", "3d", "our", "over", "remove", "internet",
28
- "order", "mail", "receive", "will", "people", "report", "addresses",
29
- "free", "business", "email", "you", "credit", "your", "font", "000",
30
- "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
31
- "data", "415", "85", "technology", "1999", "parts", "pm", "direct",
32
- "cs", "meeting", "original", "project", "re", "edu", "table", "conference"
33
- ]
34
 
35
- # 6 characters to track frequency for
36
- TRACKED_CHARS = [';', '(', '[', '!', '$', '#']
37
 
38
- # ============================================================================
39
- # Helper Functions
40
- # ============================================================================
41
 
42
- def extract_words(text: str) -> List[str]:
43
- """
44
- Extract words from text.
45
- A "word" is any string of alphanumeric characters bounded by
46
- non-alphanumeric characters or end-of-string.
47
 
48
- Args:
49
- text: Email content
50
 
51
- Returns:
52
- List[str]: List of words (lowercase)
53
  """
54
- # Split by non-alphanumeric characters
55
- words = re.findall(r'[a-zA-Z0-9]+', text.lower())
56
- return words
57
 
58
-
59
- def count_total_words(text: str) -> int:
60
  """
61
- Count total number of words in the email.
 
 
62
 
63
- Args:
64
- text: Email content
65
 
66
- Returns:
67
- int: Total word count
68
- """
69
- words = extract_words(text)
70
- return len(words)
71
 
 
 
 
72
 
73
- def calculate_word_frequency(text: str, word: str) -> float:
74
  """
75
- Calculate percentage of words in the email that match the given word.
 
76
 
77
- Formula: 100 * (number of times WORD appears) / (total number of words)
 
 
 
 
 
 
78
 
79
  Args:
80
- text: Email content
81
- word: Word to search for (case-insensitive)
82
 
83
  Returns:
84
- float: Percentage [0, 100]
85
- """
86
- words = extract_words(text)
87
- total_words = len(words)
88
 
89
- if total_words == 0:
90
- return 0.0
 
91
 
92
- word_lower = word.lower()
93
- word_count = sum(1 for w in words if w == word_lower)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- return 100.0 * word_count / total_words
96
 
 
 
 
97
 
98
- def calculate_char_frequency(text: str, char: str) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  """
100
- Calculate percentage of characters in the email that match the given character.
101
 
102
- Formula: 100 * (number of CHAR occurrences) / (total characters in email)
 
103
 
104
  Args:
105
- text: Email content
106
- char: Character to search for
107
 
108
  Returns:
109
- float: Percentage [0, 100]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  """
111
- total_chars = len(text)
112
 
113
- if total_chars == 0:
114
- return 0.0
 
 
115
 
116
- char_count = text.count(char)
 
117
 
118
- return 100.0 * char_count / total_chars
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- def calculate_capital_run_length_average(text: str) -> float:
 
122
  """
123
- Calculate average length of uninterrupted sequences of capital letters.
124
 
125
  Args:
126
  text: Email content
127
 
128
  Returns:
129
- float: Average run length (minimum 1.0)
130
  """
131
- # Find all sequences of capital letters
132
- capital_sequences = re.findall(r'[A-Z]+', text)
 
 
 
 
133
 
134
- if not capital_sequences:
135
- return 1.0
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- total_length = sum(len(seq) for seq in capital_sequences)
138
- avg_length = total_length / len(capital_sequences)
139
 
140
- return max(1.0, avg_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
 
143
- def calculate_capital_run_length_longest(text: str) -> int:
144
  """
145
- Calculate length of longest uninterrupted sequence of capital letters.
 
 
 
146
 
147
  Args:
148
- text: Email content
 
149
 
150
  Returns:
151
- int: Longest run length (minimum 1)
152
- """
153
- # Find all sequences of capital letters
154
- capital_sequences = re.findall(r'[A-Z]+', text)
155
 
156
- if not capital_sequences:
157
- return 1
 
 
 
 
 
158
 
159
- longest = max(len(seq) for seq in capital_sequences)
 
160
 
161
- return max(1, longest)
 
162
 
163
 
164
- def calculate_capital_run_length_total(text: str) -> int:
165
  """
166
- Calculate total number of capital letters in the email.
167
- This is the sum of length of uninterrupted sequences of capital letters.
168
 
169
  Args:
170
- text: Email content
171
 
172
  Returns:
173
- int: Total capital letters (minimum 1)
174
  """
175
- # Count all capital letters
176
- capital_count = sum(1 for c in text if c.isupper())
177
-
178
- return max(1, capital_count)
179
 
180
 
181
  # ============================================================================
182
- # Main Feature Extraction Function
183
  # ============================================================================
184
 
185
- def extract_features(email_text: str) -> Dict[str, Any]:
186
  """
187
- Extract all 57 spam detection features from email content.
 
188
 
189
- Features are returned in exact order as specified:
190
- 1-48: word_freq_* (48 features)
191
- 49-54: char_freq_* (6 features)
192
- 55-57: capital_run_length_* (3 features)
193
 
194
- Based on UCI Spambase dataset feature definitions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  Args:
197
- email_text: Raw email content as string
 
198
 
199
  Returns:
200
- dict: Dictionary containing all 57 features with exact column names in order
201
-
202
- Raises:
203
- ValueError: If email_text is empty or not a string
204
  """
205
  # Handle empty or None input
206
  if not email_text or not isinstance(email_text, str):
207
  raise ValueError("Email text must be a non-empty string")
208
 
209
- logger.info(f"Extracting features from email (length: {len(email_text)} chars)")
210
-
211
- # Initialize ordered features dictionary
212
- features = {}
213
-
214
- # ========================================================================
215
- # 48 Word Frequency Features (in exact order)
216
- # ========================================================================
217
- features["word_freq_make"] = calculate_word_frequency(email_text, "make")
218
- features["word_freq_address"] = calculate_word_frequency(email_text, "address")
219
- features["word_freq_all"] = calculate_word_frequency(email_text, "all")
220
- features["word_freq_3d"] = calculate_word_frequency(email_text, "3d")
221
- features["word_freq_our"] = calculate_word_frequency(email_text, "our")
222
- features["word_freq_over"] = calculate_word_frequency(email_text, "over")
223
- features["word_freq_remove"] = calculate_word_frequency(email_text, "remove")
224
- features["word_freq_internet"] = calculate_word_frequency(email_text, "internet")
225
- features["word_freq_order"] = calculate_word_frequency(email_text, "order")
226
- features["word_freq_mail"] = calculate_word_frequency(email_text, "mail")
227
- features["word_freq_receive"] = calculate_word_frequency(email_text, "receive")
228
- features["word_freq_will"] = calculate_word_frequency(email_text, "will")
229
- features["word_freq_people"] = calculate_word_frequency(email_text, "people")
230
- features["word_freq_report"] = calculate_word_frequency(email_text, "report")
231
- features["word_freq_addresses"] = calculate_word_frequency(email_text, "addresses")
232
- features["word_freq_free"] = calculate_word_frequency(email_text, "free")
233
- features["word_freq_business"] = calculate_word_frequency(email_text, "business")
234
- features["word_freq_email"] = calculate_word_frequency(email_text, "email")
235
- features["word_freq_you"] = calculate_word_frequency(email_text, "you")
236
- features["word_freq_credit"] = calculate_word_frequency(email_text, "credit")
237
- features["word_freq_your"] = calculate_word_frequency(email_text, "your")
238
- features["word_freq_font"] = calculate_word_frequency(email_text, "font")
239
- features["word_freq_000"] = calculate_word_frequency(email_text, "000")
240
- features["word_freq_money"] = calculate_word_frequency(email_text, "money")
241
- features["word_freq_hp"] = calculate_word_frequency(email_text, "hp")
242
- features["word_freq_hpl"] = calculate_word_frequency(email_text, "hpl")
243
- features["word_freq_george"] = calculate_word_frequency(email_text, "george")
244
- features["word_freq_650"] = calculate_word_frequency(email_text, "650")
245
- features["word_freq_lab"] = calculate_word_frequency(email_text, "lab")
246
- features["word_freq_labs"] = calculate_word_frequency(email_text, "labs")
247
- features["word_freq_telnet"] = calculate_word_frequency(email_text, "telnet")
248
- features["word_freq_857"] = calculate_word_frequency(email_text, "857")
249
- features["word_freq_data"] = calculate_word_frequency(email_text, "data")
250
- features["word_freq_415"] = calculate_word_frequency(email_text, "415")
251
- features["word_freq_85"] = calculate_word_frequency(email_text, "85")
252
- features["word_freq_technology"] = calculate_word_frequency(email_text, "technology")
253
- features["word_freq_1999"] = calculate_word_frequency(email_text, "1999")
254
- features["word_freq_parts"] = calculate_word_frequency(email_text, "parts")
255
- features["word_freq_pm"] = calculate_word_frequency(email_text, "pm")
256
- features["word_freq_direct"] = calculate_word_frequency(email_text, "direct")
257
- features["word_freq_cs"] = calculate_word_frequency(email_text, "cs")
258
- features["word_freq_meeting"] = calculate_word_frequency(email_text, "meeting")
259
- features["word_freq_original"] = calculate_word_frequency(email_text, "original")
260
- features["word_freq_project"] = calculate_word_frequency(email_text, "project")
261
- features["word_freq_re"] = calculate_word_frequency(email_text, "re")
262
- features["word_freq_edu"] = calculate_word_frequency(email_text, "edu")
263
- features["word_freq_table"] = calculate_word_frequency(email_text, "table")
264
- features["word_freq_conference"] = calculate_word_frequency(email_text, "conference")
265
-
266
- # ========================================================================
267
- # 6 Character Frequency Features (in exact order)
268
- # ========================================================================
269
- features["char_freq__semicolon"] = calculate_char_frequency(email_text, ";")
270
- features["char_freq__openparen"] = calculate_char_frequency(email_text, "(")
271
- features["char_freq__openbracket"] = calculate_char_frequency(email_text, "[")
272
- features["char_freq__exclaim"] = calculate_char_frequency(email_text, "!")
273
- features["char_freq__dollar"] = calculate_char_frequency(email_text, "$")
274
- features["char_freq__hash"] = calculate_char_frequency(email_text, "#")
275
-
276
- # ========================================================================
277
- # 3 Capital Letter Run Length Features (in exact order)
278
- # ========================================================================
279
- features["capital_run_length_average"] = calculate_capital_run_length_average(email_text)
280
- features["capital_run_length_longest"] = calculate_capital_run_length_longest(email_text)
281
- features["capital_run_length_total"] = calculate_capital_run_length_total(email_text)
282
-
283
- logger.info(f"✓ Successfully extracted all 57 features from email")
284
- logger.debug(f"Features: {features}")
285
 
286
  return features
287
 
 
1
  """
2
+ Email Feature Extraction System for Phishing Detection
3
+ Extracts 21 specific features from email content using professional NLP libraries.
4
 
5
+ Enhanced with:
6
+ - NLTK for tokenization and stopwords
7
+ - spaCy for advanced linguistic analysis
8
+ - TextBlob for sentiment analysis
 
 
9
  """
10
 
11
  import re
12
  import logging
13
+ from typing import Dict, Any, List, Set
14
+ import numpy as np
15
+ import unicodedata
16
+
17
+ # NLP Libraries
18
+ import nltk
19
+ from nltk.tokenize import word_tokenize
20
+ from nltk.corpus import stopwords
21
+ import spacy
22
+ from textblob import TextBlob
23
+ from langdetect import detect_langs, LangDetectException
24
+ from langdetect import DetectorFactory
25
+
26
+ # Ensure consistent language detection results
27
+ DetectorFactory.seed = 0
28
 
29
  # Configure logging
30
  logging.basicConfig(level=logging.INFO)
31
  logger = logging.getLogger(__name__)
32
 
33
  # ============================================================================
34
+ # NLP Resources Initialization
35
  # ============================================================================
36
 
37
+ def verify_nltk_resources():
38
+ """
39
+ Verify that required NLTK resources are available.
40
+ Raises an error if any required resource is missing.
41
+ """
42
+ # Verify punkt tokenizer
43
+ nltk.data.find('tokenizers/punkt')
44
+ nltk.data.find('tokenizers/punkt_tab')
 
45
 
46
+ # Verify stopwords corpus
47
+ nltk.data.find('corpora/stopwords')
48
 
49
+ # Verify POS tagger
50
+ nltk.data.find('taggers/averaged_perceptron_tagger')
 
51
 
52
+ logger.info("✓ NLTK resources verified")
 
 
 
 
53
 
 
 
54
 
55
+ def load_spacy_model():
 
56
  """
57
+ Load spaCy language model.
58
+ Raises an error if the model is not installed.
 
59
 
60
+ Returns:
61
+ spacy.Language: Loaded spaCy model
62
  """
63
+ nlp = spacy.load("en_core_web_sm")
64
+ logger.info("✓ spaCy model 'en_core_web_sm' loaded successfully")
65
+ return nlp
66
 
 
 
67
 
68
+ # Initialize NLP resources on module load - will fail fast if not available
69
+ verify_nltk_resources()
70
+ _spacy_nlp = load_spacy_model()
 
 
71
 
72
+ # ============================================================================
73
+ # Text Preprocessing and Normalization
74
+ # ============================================================================
75
 
76
+ def preprocess_email_text(text: str) -> str:
77
  """
78
+ Preprocess and normalize raw email text to handle multi-line input,
79
+ special characters, and formatting issues.
80
 
81
+ This function:
82
+ 1. Handles None/empty input gracefully
83
+ 2. Normalizes Unicode characters (e.g., smart quotes, special dashes)
84
+ 3. Preserves URLs and email addresses (important phishing indicators)
85
+ 4. Normalizes line breaks and whitespace
86
+ 5. Removes excessive whitespace while preserving single spaces
87
+ 6. Preserves semantic content and phishing indicators
88
 
89
  Args:
90
+ text: Raw email text (may contain line breaks, tabs, special formatting)
 
91
 
92
  Returns:
93
+ str: Cleaned and normalized text ready for feature extraction
 
 
 
94
 
95
+ Examples:
96
+ >>> preprocess_email_text("Hello\\n\\nWorld \\t Test")
97
+ 'Hello World Test'
98
 
99
+ >>> preprocess_email_text("Your account\\r\\nhas been\\tsuspended")
100
+ 'Your account has been suspended'
101
+ """
102
+ # Handle None or empty input
103
+ if not text:
104
+ logger.debug("Empty text provided to preprocessor")
105
+ return ""
106
+
107
+ # Ensure text is a string
108
+ if not isinstance(text, str):
109
+ logger.warning(f"Non-string input to preprocessor: {type(text)}")
110
+ text = str(text)
111
+
112
+ # Step 1: Normalize Unicode characters
113
+ # This handles smart quotes, special dashes, accented characters, etc.
114
+ # NFKC normalization: compatibility decomposition followed by canonical composition
115
+ text = unicodedata.normalize('NFKC', text)
116
+
117
+ # Step 2: Normalize line breaks
118
+ # Convert all line break variations to single space
119
+ # This handles: \r\n (Windows), \n (Unix), \r (old Mac)
120
+ text = re.sub(r'\r\n|\r|\n', ' ', text)
121
+
122
+ # Step 3: Normalize tabs to spaces
123
+ text = text.replace('\t', ' ')
124
+
125
+ # Step 4: Remove zero-width characters and other invisible Unicode
126
+ # These can be used in obfuscation attempts
127
+ text = re.sub(r'[\u200b-\u200f\u202a-\u202e\ufeff]', '', text)
128
+
129
+ # Step 5: Normalize multiple spaces to single space
130
+ # This handles excessive whitespace while preserving word boundaries
131
+ text = re.sub(r'\s+', ' ', text)
132
+
133
+ # Step 6: Remove leading/trailing whitespace
134
+ text = text.strip()
135
+
136
+ # Step 7: Normalize common HTML entities if present
137
+ # Some emails may contain HTML entities
138
+ html_entities = {
139
+ '&nbsp;': ' ',
140
+ '&amp;': '&',
141
+ '&lt;': '<',
142
+ '&gt;': '>',
143
+ '&quot;': '"',
144
+ '&#39;': "'",
145
+ '&apos;': "'",
146
+ }
147
+ for entity, replacement in html_entities.items():
148
+ text = text.replace(entity, replacement)
149
+
150
+ # Step 8: Remove excessive punctuation repetition (e.g., "!!!!!!" -> "!")
151
+ # But preserve single instances as they may be phishing indicators
152
+ text = re.sub(r'([!?.]){3,}', r'\1\1', text)
153
+
154
+ logger.debug(f"Preprocessed text: {len(text)} chars (original: {len(text)} chars)")
155
+
156
+ return text
157
 
 
158
 
159
+ # ============================================================================
160
+ # Function Words and Keywords
161
+ # ============================================================================
162
 
163
+ # Mapping from langdetect ISO 639-1 codes to NLTK stopwords language names
164
+ # langdetect supports 55 languages, NLTK stopwords supports 32 languages
165
+ LANGDETECT_TO_NLTK_MAP = {
166
+ 'ar': 'arabic',
167
+ 'az': 'azerbaijani',
168
+ 'eu': 'basque', # Basque
169
+ 'be': 'belarusian', # Belarusian (added in newer NLTK)
170
+ 'bn': 'bengali',
171
+ 'ca': 'catalan',
172
+ 'zh-cn': 'chinese',
173
+ 'zh-tw': 'chinese', # Map Traditional Chinese to same stopwords
174
+ 'da': 'danish',
175
+ 'nl': 'dutch',
176
+ 'en': 'english',
177
+ 'fi': 'finnish',
178
+ 'fr': 'french',
179
+ 'de': 'german',
180
+ 'el': 'greek',
181
+ 'he': 'hebrew',
182
+ 'hi': 'hinglish', # Hindi (mapped to hinglish which is Hindi-English mix)
183
+ 'hu': 'hungarian',
184
+ 'id': 'indonesian',
185
+ 'it': 'italian',
186
+ 'kk': 'kazakh',
187
+ 'ne': 'nepali',
188
+ 'no': 'norwegian',
189
+ 'pt': 'portuguese',
190
+ 'ro': 'romanian',
191
+ 'ru': 'russian',
192
+ 'sl': 'slovene',
193
+ 'es': 'spanish',
194
+ 'sv': 'swedish',
195
+ 'tg': 'tajik',
196
+ 'ta': 'tamil',
197
+ 'tl': 'tagalog', # Filipino
198
+ 'tr': 'turkish',
199
+ 'sq': 'albanian', # Albanian
200
+ }
201
+
202
+ # Get set of all NLTK stopwords languages for validation
203
+ NLTK_STOPWORDS_LANGUAGES = set(stopwords.fileids())
204
+
205
+ # Minimum confidence threshold for language detection (0.0 to 1.0)
206
+ LANGUAGE_DETECTION_THRESHOLD = 0.1
207
+
208
+
209
+ class LanguageDetectionError(Exception):
210
+ """Raised when language detection fails."""
211
+ pass
212
+
213
+
214
+ class UnsupportedLanguageError(Exception):
215
+ """Raised when a detected language is not supported by NLTK stopwords."""
216
+ pass
217
+
218
+
219
+ def detect_languages(text: str) -> List[str]:
220
  """
221
+ Detect language(s) present in the text.
222
 
223
+ Uses langdetect library to identify one or more languages in the text.
224
+ Returns all languages that meet the confidence threshold.
225
 
226
  Args:
227
+ text: The text to analyze for language detection
 
228
 
229
  Returns:
230
+ List[str]: List of detected NLTK language names (e.g., ['english', 'spanish'])
231
+
232
+ Raises:
233
+ LanguageDetectionError: If language detection fails
234
+ UnsupportedLanguageError: If a detected language is not supported by NLTK stopwords
235
+ """
236
+ if not text or not text.strip():
237
+ raise LanguageDetectionError("Cannot detect language from empty text")
238
+
239
+ # Detect languages with probabilities
240
+ detected = detect_langs(text)
241
+
242
+ if not detected:
243
+ raise LanguageDetectionError("Language detection returned no results")
244
+
245
+ # Filter by confidence threshold and map to NLTK language names
246
+ nltk_languages = []
247
+ unsupported_languages = []
248
+
249
+ for lang_prob in detected:
250
+ lang_code = str(lang_prob.lang)
251
+ probability = lang_prob.prob
252
+
253
+ # Skip low-confidence detections
254
+ if probability < LANGUAGE_DETECTION_THRESHOLD:
255
+ continue
256
+
257
+ # Map langdetect code to NLTK language name
258
+ if lang_code in LANGDETECT_TO_NLTK_MAP:
259
+ nltk_lang = LANGDETECT_TO_NLTK_MAP[lang_code]
260
+
261
+ # Verify the NLTK language is actually available
262
+ if nltk_lang in NLTK_STOPWORDS_LANGUAGES:
263
+ if nltk_lang not in nltk_languages:
264
+ nltk_languages.append(nltk_lang)
265
+ logger.debug(f"Detected language: {lang_code} -> {nltk_lang} (confidence: {probability:.2f})")
266
+ else:
267
+ # Language is in our map but not in NLTK
268
+ unsupported_languages.append((lang_code, nltk_lang, probability))
269
+ else:
270
+ # Language is not in our map at all
271
+ unsupported_languages.append((lang_code, None, probability))
272
+
273
+ # If we have unsupported languages with high confidence and no supported alternatives
274
+ if unsupported_languages and not nltk_languages:
275
+ unsupported_msgs = []
276
+ for lang_code, nltk_lang, prob in unsupported_languages:
277
+ if nltk_lang:
278
+ unsupported_msgs.append(f"{lang_code} (mapped to '{nltk_lang}' but not available in NLTK, confidence: {prob:.2f})")
279
+ else:
280
+ unsupported_msgs.append(f"{lang_code} (no NLTK mapping available, confidence: {prob:.2f})")
281
+ raise UnsupportedLanguageError(
282
+ f"Detected language(s) not supported by NLTK stopwords: {', '.join(unsupported_msgs)}"
283
+ )
284
+
285
+ if not nltk_languages:
286
+ raise LanguageDetectionError(
287
+ f"No languages detected with sufficient confidence (threshold: {LANGUAGE_DETECTION_THRESHOLD})"
288
+ )
289
+
290
+ return nltk_languages
291
+
292
+
293
+ def get_function_words(text: str) -> Set[str]:
294
  """
295
+ Get comprehensive set of function words (stopwords) based on detected language(s).
296
 
297
+ This function:
298
+ 1. Analyzes the email text to detect the language(s) present
299
+ 2. Returns stopwords for the detected language(s)
300
+ 3. For mixed-language emails, returns combined stopwords from all detected languages
301
 
302
+ Args:
303
+ text: The email text to analyze for language detection
304
 
305
+ Returns:
306
+ Set[str]: Set of function words (stopwords) for the detected language(s)
307
 
308
+ Raises:
309
+ LanguageDetectionError: If language detection fails
310
+ UnsupportedLanguageError: If a detected language is not supported by NLTK stopwords
311
+ LangDetectException: If langdetect encounters an internal error
312
+ """
313
+ # Detect language(s) in the text
314
+ detected_languages = detect_languages(text)
315
+
316
+ # Collect stopwords from all detected languages
317
+ function_words = set()
318
+ for language in detected_languages:
319
+ lang_stopwords = set(stopwords.words(language))
320
+ function_words.update(lang_stopwords)
321
+ logger.debug(f"Loaded {len(lang_stopwords)} stopwords for '{language}'")
322
+
323
+ # Add additional common function words for English if English is detected
324
+ if 'english' in detected_languages:
325
+ additional_words = {
326
+ 'shall', 'might', 'must', 'ought', 'need', 'dare',
327
+ 'used', 'having', 'being', 'does', 'did', 'done',
328
+ 'may', 'should', 'would', 'could', 'can', 'will',
329
+ }
330
+ function_words.update(additional_words)
331
+
332
+ logger.info(f"Loaded {len(function_words)} function words for languages: {detected_languages}")
333
+
334
+ return function_words
335
+
336
+ # Phishing-related keywords (case-insensitive)
337
+ PHISHING_KEYWORDS = {
338
+ 'account': r'\baccount\b',
339
+ 'access': r'\baccess\b',
340
+ 'bank': r'\bbank\b',
341
+ 'credit': r'\bcredit\b',
342
+ 'click': r'\bclick\b',
343
+ 'identity': r'\bidentity\b',
344
+ 'inconvenience': r'\binconvenience\b',
345
+ 'information': r'\binformation\b',
346
+ 'limited': r'\blimited\b',
347
+ 'minutes': r'\bminutes?\b',
348
+ 'password': r'\bpassword\b',
349
+ 'recently': r'\brecently\b',
350
+ 'risk': r'\brisk\b',
351
+ 'social': r'\bsocial\b',
352
+ 'security': r'\bsecurity\b',
353
+ 'service': r'\bservice\b',
354
+ 'suspended': r'\bsuspended\b',
355
+ }
356
 
357
+
358
+ def extract_words(text: str) -> List[str]:
359
  """
360
+ Extract words from text using NLTK tokenization.
361
 
362
  Args:
363
  text: Email content
364
 
365
  Returns:
366
+ list: List of words (lowercase, alphabetic only)
367
  """
368
+ # Use NLTK's word tokenizer for better accuracy
369
+ tokens = word_tokenize(text.lower())
370
+ # Filter to keep only alphabetic words
371
+ words = [word for word in tokens if word.isalpha()]
372
+ return words
373
+
374
 
375
+ def count_keyword_occurrences(text: str, keyword: str, pattern: str) -> int:
376
+ """
377
+ Count occurrences of a specific keyword in text.
378
+
379
+ Args:
380
+ text: Email content
381
+ keyword: Keyword name (for logging)
382
+ pattern: Regex pattern to match
383
+
384
+ Returns:
385
+ int: Count of keyword occurrences
386
+ """
387
+ matches = re.findall(pattern, text.lower())
388
+ return len(matches)
389
 
 
 
390
 
391
+ def calculate_vocabulary_richness(words: list, total_chars: int) -> float:
392
+ """
393
+ Calculate vocabulary richness as W/C (number of words / total characters).
394
+
395
+ Args:
396
+ words: List of words
397
+ total_chars: Total number of characters
398
+
399
+ Returns:
400
+ float: Vocabulary richness ratio
401
+ """
402
+ if total_chars == 0:
403
+ return 0.0
404
+
405
+ num_words = len(words)
406
+ return num_words / total_chars
407
 
408
 
409
+ def calculate_function_word_ratio(words: list, text: str) -> float:
410
  """
411
+ Calculate the ratio of function words to total words (Function words/W).
412
+
413
+ Uses language detection to determine which stopwords to use for calculating
414
+ the function word ratio. Supports multi-language emails.
415
 
416
  Args:
417
+ words: List of words (lowercase, alphabetic only)
418
+ text: Original email text (used for language detection)
419
 
420
  Returns:
421
+ float: Function word ratio
 
 
 
422
 
423
+ Raises:
424
+ LanguageDetectionError: If language detection fails
425
+ UnsupportedLanguageError: If a detected language is not supported
426
+ LangDetectException: If langdetect encounters an internal error
427
+ """
428
+ if len(words) == 0:
429
+ return 0.0
430
 
431
+ # Get function words based on detected language(s)
432
+ function_words = get_function_words(text)
433
 
434
+ function_word_count = sum(1 for word in words if word in function_words)
435
+ return function_word_count / len(words)
436
 
437
 
438
+ def count_unique_words(words: List[str]) -> int:
439
  """
440
+ Count the number of unique words in the text.
 
441
 
442
  Args:
443
+ words: List of words
444
 
445
  Returns:
446
+ int: Number of unique words
447
  """
448
+ return len(set(words))
 
 
 
449
 
450
 
451
  # ============================================================================
452
+ # Advanced NLP Features (Optional Enhancement)
453
  # ============================================================================
454
 
455
def extract_advanced_nlp_features(text: str) -> Dict[str, Any]:
    """
    Extract advanced NLP features using spaCy and TextBlob.
    These features provide additional insights but are not part of the core 21 features.

    Args:
        text: Email content

    Returns:
        dict: Dictionary of advanced features (sentiment scores, named-entity
        tallies, and part-of-speech ratios)
    """
    # Sentiment scores from TextBlob
    sentiment = TextBlob(text).sentiment

    # Run the spaCy pipeline; input is capped at 1M chars for performance
    doc = _spacy_nlp(text[:1000000])

    # Named Entity Recognition: tally entities by label
    entity_labels = [ent.label_ for ent in doc.ents]
    financial_entities = sum(
        label in ('MONEY', 'PERCENT', 'CARDINAL') for label in entity_labels
    )
    person_entities = entity_labels.count('PERSON')
    org_entities = entity_labels.count('ORG')

    # Part-of-speech ratios; all remain 0.0 when the doc has no tokens
    pos_noun_ratio = pos_verb_ratio = pos_adj_ratio = 0.0
    total_tokens = len(doc)
    if total_tokens > 0:
        pos_tags = [token.pos_ for token in doc]
        pos_noun_ratio = pos_tags.count('NOUN') / total_tokens
        pos_verb_ratio = pos_tags.count('VERB') / total_tokens
        pos_adj_ratio = pos_tags.count('ADJ') / total_tokens

    advanced_features = {
        'sentiment_polarity': sentiment.polarity,
        'sentiment_subjectivity': sentiment.subjectivity,
        'named_entities_count': len(entity_labels),
        'financial_entities': financial_entities,
        'person_entities': person_entities,
        'org_entities': org_entities,
        'pos_noun_ratio': pos_noun_ratio,
        'pos_verb_ratio': pos_verb_ratio,
        'pos_adj_ratio': pos_adj_ratio,
    }

    logger.debug(f"Advanced NLP features extracted: {advanced_features}")

    return advanced_features
520
+
521
+
522
def extract_features(email_text: str, include_advanced: bool = False) -> Dict[str, Any]:
    """
    Extract all 21 features from email content using enhanced NLP libraries.

    Features extracted (in exact order):
    1. Total Number of Characters C
    2. Vocabulary richness W/C
    3-19. Keyword counts (Account, Access, Bank, Credit, Click, Identity,
          Inconvenience, Information, Limited, Minutes, Password, Recently,
          Risk, Social, Security, Service, Suspended)
    20. Total number of Function words/W
    21. Unique Words

    Enhanced with:
    - Automatic text preprocessing and normalization (handles multi-line input)
    - NLTK word tokenization (more accurate than regex)
    - NLTK stopwords for function word detection (more comprehensive)
    - Optional spaCy analysis for advanced features

    Args:
        email_text: Raw email content as string (can be multi-line with formatting)
        include_advanced: If True, include advanced NLP features (not used by model)

    Returns:
        dict: Dictionary containing all 21 features with exact column names
        (plus an '_advanced' entry if include_advanced=True)

    Raises:
        ValueError: If email_text is empty or not a string.
    """
    # Reject empty or non-string input up front
    if not email_text or not isinstance(email_text, str):
        raise ValueError("Email text must be a non-empty string")

    # PREPROCESSING: normalize the raw text — multi-line input, special
    # characters, and excessive whitespace are handled here
    raw_length = len(email_text)
    email_text = preprocess_email_text(email_text)
    if raw_length > 0:
        logger.debug(f"Text preprocessing: {raw_length} -> {len(email_text)} chars")

    # Feature 1: Total Number of Characters C
    total_chars = len(email_text)

    # Tokenize once (NLTK-based) and reuse for the word-level features
    words = extract_words(email_text)

    # Feature 2: Vocabulary richness W/C
    vocab_richness = calculate_vocabulary_richness(words, total_chars)

    # Features 3-19: keyword occurrence counts, keyed by the dataset's
    # capitalized column names
    keyword_counts = {
        keyword.capitalize(): count_keyword_occurrences(email_text, keyword, pattern)
        for keyword, pattern in PHISHING_KEYWORDS.items()
    }

    # Feature 20: function-word ratio (language-aware NLTK stopwords)
    function_word_ratio = calculate_function_word_ratio(words, email_text)

    # Feature 21: distinct word count
    unique_words = count_unique_words(words)

    # Assemble the feature dict in the exact column order the model expects
    features: Dict[str, Any] = {
        'Total Number of Characters C': total_chars,
        'Vocabulary richness W/C': vocab_richness,
    }
    for column in ('Account', 'Access', 'Bank', 'Credit', 'Click', 'Identity',
                   'Inconvenience', 'Information', 'Limited', 'Minutes',
                   'Password', 'Recently', 'Risk', 'Social', 'Security',
                   'Service', 'Suspended'):
        features[column] = keyword_counts[column]
    features['Total number of Function words/W'] = function_word_ratio
    features['Unique Words'] = unique_words

    logger.info(f"✓ Successfully extracted all 21 features from email (length: {total_chars} chars, words: {len(words)})")
    logger.debug(f"Core features: {features}")

    # Optionally attach the advanced NLP features under a private key
    if include_advanced:
        advanced = extract_advanced_nlp_features(email_text)
        features['_advanced'] = advanced
        logger.debug(f"Advanced features: {advanced}")

    return features
619
 
model/model.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
- Model loading and prediction module for spam email detection.
3
- Uses 57 features based on the UCI Spambase dataset.
4
  """
5
 
6
  import logging
@@ -171,9 +170,9 @@ def predict_email(email_text: str) -> Dict[str, Any]:
171
  logger.info(f"Extracting features from email (length: {len(email_text)} chars)")
172
  features_dict = extract_features(email_text)
173
 
174
- # Check if feature extraction returned valid features (57 features expected)
175
- if len(features_dict) != 57:
176
- logger.warning(f"Feature extraction returned {len(features_dict)} features, expected 57")
177
 
178
  # Make prediction
179
  logger.info("Making prediction...")
 
1
  """
2
+ Model loading and prediction module for phishing email detection.
 
3
  """
4
 
5
  import logging
 
170
  logger.info(f"Extracting features from email (length: {len(email_text)} chars)")
171
  features_dict = extract_features(email_text)
172
 
173
+ # Check if feature extraction returned valid features
174
+ if features_dict.get('Total Number of Characters C', 0) == 0 and len(email_text) > 0:
175
+ logger.warning(f"Feature extraction may have failed for email")
176
 
177
  # Make prediction
178
  logger.info("Making prediction...")