Spaces:

nitinprajwal
/

email-pii-classifier

Sleeping

App Files Files Community

email-pii-classifier / feature_extractor.py

nitinprajwal

Update feature_extractor.py

1a7032d verified 11 months ago

raw

history blame contribute delete

3.15 kB

	# feature_extractor.py - Advanced Text Feature Extractor for Email Classification
	"""
	Advanced text feature extractor for email classification.
	This module contains the AdvancedTextFeatureExtractor class that extracts
	linguistic and semantic features from email text.
	"""

	import numpy as np
	from sklearn.base import BaseEstimator, TransformerMixin

	class AdvancedTextFeatureExtractor(BaseEstimator, TransformerMixin):
	    """Turn raw email text into a fixed-width vector of 13 numeric features.

	    The transformer is stateless: ``fit`` learns nothing and ``transform``
	    only inspects each text. Columns of the output, in order:

	    0.  word_count         whitespace-separated tokens
	    1.  char_count         length of the text
	    2.  sentence_count     non-blank pieces after splitting on '.'
	    3.  urgency_score      hits from ``urgency_keywords``
	    4.  positive_score     hits from ``sentiment_positive``
	    5.  negative_score     hits from ``sentiment_negative``
	    6.  request_score      hits from ``request_keywords``
	    7.  technical_score    hits from ``technical_keywords``
	    8.  has_subject        1 if 'subject:' occurs (case-insensitive), else 0
	    9.  question_count     number of '?' characters
	    10. exclamation_count  number of '!' characters
	    11. caps_ratio         uppercase characters / total characters
	    12. time_score         hits from the time-related keyword list

	    Keyword "hits" are case-insensitive *substring* occurrences
	    (``str.count``), so 'now' is also counted inside 'know'. This is kept
	    as-is so that feature values for existing inputs do not change.
	    """

	    # Kept at class level (not set in __init__) so that instances restored
	    # without running __init__, e.g. via unpickling, can still resolve them.
	    _TIME_KEYWORDS = ('today', 'tomorrow', 'yesterday', 'now', 'soon', 'later')
	    _N_FEATURES = 13

	    def __init__(self):
	        self.urgency_keywords = ['urgent', 'asap', 'immediately', 'emergency', 'critical', 'priority']
	        self.sentiment_positive = ['good', 'great', 'excellent', 'please', 'thank', 'appreciate']
	        self.sentiment_negative = ['bad', 'terrible', 'awful', 'problem', 'issue', 'error', 'fail']
	        self.request_keywords = ['request', 'need', 'want', 'could', 'would', 'please', 'help']
	        self.technical_keywords = ['server', 'database', 'api', 'system', 'code', 'bug', 'fix']

	    def fit(self, X, y=None):
	        """Do nothing and return ``self``; present for pipeline compatibility."""
	        return self

	    @staticmethod
	    def _keyword_hits(text_lower, keywords):
	        """Return the total substring occurrences of ``keywords`` in ``text_lower``."""
	        return sum(text_lower.count(word) for word in keywords)

	    def _extract_row(self, text):
	        """Build the 13-element feature list for a single document."""
	        # A missing document is scored like an empty one instead of crashing
	        # on ``None.lower()``.
	        if text is None:
	            text = ''
	        text_lower = text.lower()

	        # Basic text statistics
	        word_count = len(text.split())
	        char_count = len(text)
	        sentence_count = len([s for s in text.split('.') if s.strip()])

	        # Keyword-driven scores
	        urgency_score = self._keyword_hits(text_lower, self.urgency_keywords)
	        positive_score = self._keyword_hits(text_lower, self.sentiment_positive)
	        negative_score = self._keyword_hits(text_lower, self.sentiment_negative)
	        request_score = self._keyword_hits(text_lower, self.request_keywords)
	        technical_score = self._keyword_hits(text_lower, self.technical_keywords)
	        time_score = self._keyword_hits(text_lower, self._TIME_KEYWORDS)

	        # Structural and punctuation indicators
	        has_subject = 1 if 'subject:' in text_lower else 0
	        question_count = text.count('?')
	        exclamation_count = text.count('!')

	        # max(..., 1) avoids a division by zero on empty text
	        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)

	        return [
	            word_count, char_count, sentence_count,
	            urgency_score, positive_score, negative_score,
	            request_score, technical_score, has_subject,
	            question_count, exclamation_count, caps_ratio, time_score,
	        ]

	    def transform(self, X):
	        """Extract features for every document in ``X``.

	        Parameters
	        ----------
	        X : iterable of str
	            One email text per element. ``None`` elements are treated as
	            empty strings.

	        Returns
	        -------
	        numpy.ndarray
	            Float array of shape ``(n_samples, 13)``. An empty ``X`` yields
	            shape ``(0, 13)`` rather than a 1-D empty array, so the result
	            is always two-dimensional.
	        """
	        rows = [self._extract_row(text) for text in X]
	        # reshape pins the column count even when there are no rows
	        return np.array(rows, dtype=float).reshape(-1, self._N_FEATURES)