jentegeo commited on
Commit
e1992da
·
verified ·
1 Parent(s): b81a1d3

Upload 8 files

Browse files
Files changed (8) hide show
  1. README.md +0 -3
  2. app.py +81 -0
  3. email_classifier.joblib +3 -0
  4. models.py +133 -0
  5. requirements.txt +12 -0
  6. test.py +4 -0
  7. train_model.py +21 -0
  8. utils.py +133 -0
README.md CHANGED
@@ -1,3 +0,0 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict
from models import EmailClassifier
from utils import PIIDetector
import joblib
import os

app = FastAPI(
    title="Email Classification and PII Masking API",
    description="API for classifying support emails and masking PII information",
    version="1.0.0"
)

# Components shared by all requests: a regex-based PII detector and the
# trained email classifier (loaded from disk below).
pii_detector = PIIDetector()
email_classifier = EmailClassifier()

# Fail fast at startup if the model artifact is missing. Chain the original
# exception (`from e`) so logs show the root cause, not just the wrapper.
try:
    email_classifier.load_model("email_classifier.joblib")
except Exception as e:
    print("Model loading failed:", e)
    raise RuntimeError("Pre-trained model not found. Please train it using train_model.py") from e
24
+
25
+
26
class EmailRequest(BaseModel):
    """Request payload for /classify_email: the raw email text."""
    email_body: str
28
+
29
class MaskedEntity(BaseModel):
    """One PII entity detected in the email."""
    position: List[int]  # [start, end] character offsets in the original text
    classification: str  # PII type label, e.g. "email" or "phone_number"
    entity: str  # the original (unmasked) text span
33
+
34
class EmailResponse(BaseModel):
    """Response payload: original text, masking details, and predicted category."""
    input_email_body: str
    list_of_masked_entities: List[MaskedEntity]
    masked_email: str
    category_of_the_email: str
39
+
40
@app.post("/classify_email", response_model=EmailResponse)
async def classify_email(request: EmailRequest):
    """
    Classify an email and mask the PII it contains.

    Args:
        request: EmailRequest containing the email body

    Returns:
        EmailResponse with classification and PII masking information

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        # Step 1: Detect PII in the raw email text
        email_text = request.email_body
        detected_entities = pii_detector.detect_pii(email_text)

        # Step 2: Mask the PII before classification so the model never
        # sees raw personal data
        masked_email, masked_entities = pii_detector.mask_pii(email_text, detected_entities)

        # Step 3: Classify using the masked text
        category = email_classifier.predict(masked_email)

        # FastAPI validates this dict against EmailResponse
        return {
            "input_email_body": email_text,
            "list_of_masked_entities": masked_entities,
            "masked_email": masked_email,
            "category_of_the_email": category
        }
    except Exception as e:
        # Chain the original exception so server logs retain the root cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
73
+
74
@app.get("/health")
async def health_check():
    """Liveness probe: returns a static payload confirming the API is up."""
    return {"status": "healthy"}
78
+
79
if __name__ == "__main__":
    # Run the ASGI app with uvicorn when this file is executed directly.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)
email_classifier.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9ed8331348cd41157590442a442dacfad9221129defd3c40a907755fcc4d149
3
+ size 116355553
models.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
import os
import re
import joblib
import nltk

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
#from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Fetch required NLTK corpora at import time (no-op when already cached);
# quiet=True stops the download progress noise on every import.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
24
+
25
+
26
class EmailClassifier:
    """
    Support-email classifier: TF-IDF features over preprocessed text feeding
    a RandomForest, with random oversampling to balance minority classes.
    """

    def __init__(self):
        self.model = None        # fitted pipeline, set by train() or load_model()
        self.vectorizer = None   # unused; retained for backward compatibility
        self.classes = None      # label set, populated once a model exists
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text: str) -> str:
        """Normalize raw email text for vectorization.

        Lowercases, strips email addresses and most punctuation, removes
        stopwords (keeping negations, which carry signal), and lemmatizes.
        """
        text = text.lower()

        # Remove email addresses
        text = re.sub(r'\S+@\S+', ' ', text)

        # Keep alphanumerics, dots, underscores, hyphens (useful in tech terms)
        text = re.sub(r'[^a-zA-Z0-9\s._-]', ' ', text)

        tokens = text.split()

        # Custom stopwords: remove common words but retain negations
        custom_stop_words = self.stop_words - {'no', 'not', 'nor', 'against', 'aren', "aren't", 'isn', "isn't"}

        # Lemmatize and drop stopwords / single characters
        tokens = [
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if word not in custom_stop_words and len(word) > 1
        ]

        return ' '.join(tokens)

    def train(self, X, y, use_grid_search=False):
        """Fit the classification pipeline.

        Args:
            X: iterable of raw email strings.
            y: iterable of class labels aligned with X.
            use_grid_search: when True, tune hyperparameters with GridSearchCV.
        """
        print("Preprocessing data...")
        X_processed = [self.preprocess(text) for text in X]

        print("Oversampling minority classes...")
        ros = RandomOverSampler(random_state=42)
        # RandomOverSampler expects a 2-D feature matrix, hence the reshape.
        X_resampled, y_resampled = ros.fit_resample(np.array(X_processed).reshape(-1, 1), y)
        X_resampled = X_resampled.ravel()  # flatten back to a 1-D text array

        print("Initializing pipeline...")
        pipeline = ImbPipeline([
            ('tfidf', TfidfVectorizer(
                stop_words='english',
                max_features=15000,
                ngram_range=(1, 3),
                sublinear_tf=True
            )),
            ('clf', RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=42))
        ])

        if use_grid_search:
            print("Running Grid Search...")
            # BUGFIX: the previous grid searched clf__alpha / clf__penalty,
            # which are SGDClassifier parameters; RandomForestClassifier does
            # not accept them, so GridSearchCV crashed. Search forest
            # hyperparameters instead.
            params = {
                'clf__n_estimators': [100, 200],
                'clf__max_depth': [None, 20, 40]
            }
            grid = GridSearchCV(pipeline, param_grid=params, scoring='f1_weighted', cv=5, verbose=2)
            grid.fit(X_resampled, y_resampled)
            self.model = grid.best_estimator_
            print("Best Params:", grid.best_params_)
        else:
            print("Fitting model...")
            pipeline.fit(X_resampled, y_resampled)
            self.model = pipeline

        print("Model trained.")
        self.classes = self.model.named_steps['clf'].classes_

    def predict(self, text: str) -> str:
        """Classify a single email and return its predicted label.

        Raises:
            ValueError: if no model has been trained or loaded.
        """
        if not self.model:
            raise ValueError("Model not trained or loaded")
        processed_text = self.preprocess(text)
        return self.model.predict([processed_text])[0]

    def save_model(self, model_path: str):
        """Persist the fitted pipeline to disk with joblib."""
        if not self.model:
            raise ValueError("Model not trained")
        joblib.dump(self.model, model_path)

    def load_model(self, model_path: str):
        """Load a previously saved pipeline and restore the class labels."""
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found at {model_path}")
        self.model = joblib.load(model_path)
        self.classes = self.model.named_steps['clf'].classes_

    @staticmethod
    def load_data_from_csv(csv_path: str, text_col: str = "email", label_col: str = "type"):
        """Read the dataset and drop rows missing either text or label."""
        df = pd.read_csv(csv_path)
        return df[[text_col, label_col]].dropna()

    def train_from_csv(self, csv_path: str, text_col: str = "email", label_col: str = "type", use_grid_search=False):
        """Train from a CSV file, report held-out metrics, and save the model."""
        df = self.load_data_from_csv(csv_path, text_col, label_col)
        # Stratify so the test split preserves the class distribution.
        X_train, X_test, y_train, y_test = train_test_split(
            df[text_col], df[label_col], test_size=0.2, random_state=42, stratify=df[label_col]
        )
        self.train(X_train, y_train, use_grid_search=use_grid_search)
        X_test_processed = [self.preprocess(text) for text in X_test]
        y_pred = self.model.predict(X_test_processed)
        print(classification_report(y_test, y_pred))
        self.save_model("email_classifier.joblib")
        print("Model trained and saved to email_classifier.joblib")
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.68.0
2
+ uvicorn>=0.15.0
3
+ pydantic>=1.8.0
4
+ scikit-learn>=0.24.0
5
+ pandas>=1.2.0
6
+ numpy>=1.20.0
7
+ joblib>=1.0.0
8
+ python-dateutil>=2.8.0
9
+ nltk
10
+ imbalanced-learn
11
+ huggingface_hub
12
+ scikit-learn
test.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
from models import EmailClassifier

# Smoke test: train the classifier end to end from the bundled dataset.
# Guarded so merely importing this module does not kick off a full
# (long-running) training job.
if __name__ == "__main__":
    clf = EmailClassifier()
    clf.train_from_csv("data/combined_emails_with_natural_pii.csv", use_grid_search=True)
train_model.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from models import EmailClassifier
import argparse

def main():
    """Parse CLI arguments and train the email classification model."""
    parser = argparse.ArgumentParser(description="Train email classification model")
    parser.add_argument("--csv_path", type=str, required=True, help="Path to CSV dataset")
    parser.add_argument("--text_col", type=str, default="email", help="Name of text column")
    parser.add_argument("--label_col", type=str, default="type", help="Name of label column")
    # New (backward-compatible) flag: EmailClassifier.train_from_csv already
    # supports grid search, but the CLI had no way to enable it.
    parser.add_argument("--use_grid_search", action="store_true",
                        help="Tune hyperparameters with GridSearchCV before the final fit")

    args = parser.parse_args()

    # Train, evaluate on a held-out split, and save the fitted model
    # (train_from_csv writes email_classifier.joblib).
    classifier = EmailClassifier()
    classifier.train_from_csv(
        csv_path=args.csv_path,
        text_col=args.text_col,
        label_col=args.label_col,
        use_grid_search=args.use_grid_search
    )

if __name__ == "__main__":
    main()
utils.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict, Tuple
3
+ from datetime import datetime
4
+
5
class PIIDetector:
    """
    Class for detecting and masking Personally Identifiable Information (PII) in text.

    Detection is regex-based; card numbers and dates get extra semantic
    validation (Luhn checksum, date parsing) to reduce false positives.
    """

    def __init__(self):
        # Compile regex patterns once; reused for every call.
        self.patterns = {
            "full_name": re.compile(r'\b([A-Z][a-z]+(\s[A-Z][a-z]+)+)\b'),
            "email": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
            "phone_number": re.compile(r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'),
            "dob": re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}, \d{4})\b'),
            "aadhar_num": re.compile(r'\b\d{4}[ -]?\d{4}[ -]?\d{4}\b'),
            "credit_debit_no": re.compile(r'\b(?:\d[ -]*?){13,16}\b'),
            "cvv_no": re.compile(r'\b\d{3,4}\b'),
            "expiry_no": re.compile(r'\b(0[1-9]|1[0-2])[-/]\d{2}\b')
        }

    def detect_pii(self, text: str) -> List[Dict]:
        """
        Detect all PII entities in the given text.

        Args:
            text: Input text to scan for PII

        Returns:
            List of dicts with keys "position" ([start, end] offsets),
            "classification" (PII type), and "entity" (matched text),
            sorted by start offset (longer match first on ties).
        """
        entities = []

        for entity_type, pattern in self.patterns.items():
            for match in pattern.finditer(text):
                start, end = match.span()
                entity_value = match.group()

                # Extra validation to cut false positives for specific types
                if entity_type == "credit_debit_no" and not self._validate_luhn(entity_value):
                    continue
                if entity_type == "dob" and not self._validate_date(entity_value):
                    continue

                entities.append({
                    "position": [start, end],
                    "classification": entity_type,
                    "entity": entity_value
                })

        # Sort by start; on ties prefer the longer match so e.g. a full card
        # number wins over a 4-digit "cvv" hit inside it.
        entities.sort(key=lambda x: (x["position"][0], -x["position"][1]))
        return entities

    def mask_pii(self, text: str, entities: List[Dict]) -> Tuple[str, List[Dict]]:
        """
        Mask detected PII entities in the text.

        BUGFIX: entities whose spans overlap (e.g. a phone-number match inside
        a credit-card digit run) previously produced corrupted output because
        both replacements were applied. Now only the first (left-most, longest)
        entity of an overlapping group is masked; overlapped ones are skipped.

        Args:
            text: Original text containing PII
            entities: List of detected PII entities

        Returns:
            Tuple of (masked_text, list_of_entities_actually_masked)
        """
        masked_text = text
        offset = 0
        masked_entities = []
        last_end = -1  # end offset (original coordinates) of the last masked span

        # Sort defensively; the offset bookkeeping requires left-to-right order.
        for entity in sorted(entities, key=lambda e: (e["position"][0], -e["position"][1])):
            start, end = entity["position"]

            # Skip anything overlapping a span that was already masked
            if start < last_end:
                continue

            entity_type = entity["classification"]
            masked_token = f"[{entity_type}]"

            # Adjust positions for length changes from earlier replacements
            adj_start = start + offset
            adj_end = end + offset
            masked_text = masked_text[:adj_start] + masked_token + masked_text[adj_end:]
            offset += len(masked_token) - (end - start)
            last_end = end

            masked_entities.append({
                "position": [start, end],
                "classification": entity_type,
                "entity": entity["entity"]
            })

        return masked_text, masked_entities

    def _validate_luhn(self, card_number: str) -> bool:
        """Validate a candidate card number using the Luhn checksum.

        BUGFIX: the previous implementation doubled digits counting from the
        LEFT, which is only correct for even-length numbers; Luhn doubles
        every second digit counting from the RIGHT, so odd-length numbers
        (e.g. 15-digit Amex) were wrongly rejected.
        """
        # Remove separators; only digits take part in the checksum
        card_number = re.sub(r'[^0-9]', '', card_number)

        if not 13 <= len(card_number) <= 19:
            return False

        total = 0
        for i, ch in enumerate(reversed(card_number)):
            digit = int(ch)
            if i % 2 == 1:  # every second digit from the right
                digit *= 2
                if digit > 9:
                    digit -= 9
            total += digit

        return total % 10 == 0

    def _validate_date(self, date_str: str) -> bool:
        """Return True if date_str parses as one of the supported DOB formats."""
        # Try each supported format; strptime raises ValueError on mismatch.
        for fmt in ('%m/%d/%Y', '%m-%d-%Y', '%d/%m/%Y', '%d-%m-%Y',
                    '%b %d, %Y', '%B %d, %Y'):
            try:
                datetime.strptime(date_str, fmt)
                return True
            except ValueError:
                continue
        return False