email-pii-classifier / feature_extractor.py
nitinprajwal's picture
Update feature_extractor.py
1a7032d verified
# feature_extractor.py - Advanced Text Feature Extractor for Email Classification
"""
Advanced text feature extractor for email classification.
This module contains the AdvancedTextFeatureExtractor class that extracts
linguistic and semantic features from email text.
"""
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class AdvancedTextFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Advanced feature extractor for email text analysis.

    Converts every input text into a fixed-length vector of 13 hand-crafted
    numeric features. Columns appear in this order:

        0  word_count         whitespace-separated tokens
        1  char_count         total characters
        2  sentence_count     non-blank segments after splitting on '.'
        3  urgency_score      hits for ``urgency_keywords``
        4  positive_score     hits for ``sentiment_positive``
        5  negative_score     hits for ``sentiment_negative``
        6  request_score      hits for ``request_keywords``
        7  technical_score    hits for ``technical_keywords``
        8  has_subject        1 if 'subject:' occurs in the text, else 0
        9  question_count     number of '?'
        10 exclamation_count  number of '!'
        11 caps_ratio         uppercase characters / total characters
        12 time_score         hits for the time-related keywords

    The transformer is stateless: ``fit`` learns nothing.
    """

    # Defined once at class level instead of being rebuilt for every text
    # inside the transform loop.
    _TIME_KEYWORDS = ('today', 'tomorrow', 'yesterday', 'now', 'soon', 'later')

    # Width of the feature matrix; must match the vector assembled in
    # ``_extract_features``.
    _N_FEATURES = 13

    def __init__(self):
        self.urgency_keywords = ['urgent', 'asap', 'immediately', 'emergency', 'critical', 'priority']
        self.sentiment_positive = ['good', 'great', 'excellent', 'please', 'thank', 'appreciate']
        self.sentiment_negative = ['bad', 'terrible', 'awful', 'problem', 'issue', 'error', 'fail']
        self.request_keywords = ['request', 'need', 'want', 'could', 'would', 'please', 'help']
        self.technical_keywords = ['server', 'database', 'api', 'system', 'code', 'bug', 'fix']

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """
        Build the feature matrix for an iterable of texts.

        Parameters
        ----------
        X : iterable
            The texts to featurize. ``None`` and float NaN entries are
            treated as empty text; other non-``str`` entries are converted
            with ``str()``.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(n_texts, 13)``. An empty input yields an
            array of shape ``(0, 13)``.
        """
        rows = [self._extract_features(self._coerce_text(text)) for text in X]
        if not rows:
            # np.array([]) would be 1-D with shape (0,); keep the result 2-D
            # so the column count is the same regardless of input size.
            return np.empty((0, self._N_FEATURES), dtype=float)
        # caps_ratio is always a float, so the array was already float64;
        # the dtype is only spelled out here.
        return np.array(rows, dtype=float)

    @staticmethod
    def _coerce_text(text):
        """Return ``text`` as a ``str``, mapping missing values to ``''``."""
        if text is None:
            return ''
        # NaN is the only float that is not equal to itself.
        if isinstance(text, float) and text != text:
            return ''
        return text if isinstance(text, str) else str(text)

    @staticmethod
    def _keyword_hits(text_lower, keywords):
        """
        Sum the occurrences of each keyword in the lower-cased text.

        NOTE: ``str.count`` matches substrings, so 'now' is also counted
        inside 'know'. This is left as-is on purpose so that feature values
        stay identical to what was produced before.
        """
        return sum(text_lower.count(word) for word in keywords)

    def _extract_features(self, text):
        """Compute the 13-element feature vector for a single ``str``."""
        text_lower = text.lower()
        hits = self._keyword_hits

        # Basic text statistics
        word_count = len(text.split())
        char_count = len(text)
        sentence_count = len([s for s in text.split('.') if s.strip()])

        # Keyword-based indicators
        urgency_score = hits(text_lower, self.urgency_keywords)
        positive_score = hits(text_lower, self.sentiment_positive)
        negative_score = hits(text_lower, self.sentiment_negative)
        request_score = hits(text_lower, self.request_keywords)
        technical_score = hits(text_lower, self.technical_keywords)
        time_score = hits(text_lower, self._TIME_KEYWORDS)

        # Subject line analysis
        has_subject = 1 if 'subject:' in text_lower else 0

        # Punctuation indicators
        question_count = text.count('?')
        exclamation_count = text.count('!')

        # Capitalization pattern; max(..., 1) avoids dividing by zero on
        # empty text.
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)

        return [
            word_count, char_count, sentence_count,
            urgency_score, positive_score, negative_score,
            request_score, technical_score, has_subject,
            question_count, exclamation_count, caps_ratio, time_score,
        ]