# -*- coding: utf-8 -*-
"""api.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1_vvePPVuXHIon25W71PD4kt1Z5J2sKgh
"""
# api.py — FastAPI email classification service (exported from Colab).
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Any
import joblib
import cloudpickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import logging
# Your custom utils
from utils import mask_pii # Keep this if you have utils.py uploaded
# Logging: basicConfig at INFO, module-named logger used throughout the file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NLTK setup: corpora are fetched into a writable temp directory at import
# time (network side effect on every cold start), and that directory is
# appended to NLTK's search path so the stopwords corpus and
# WordNetLemmatizer used below can resolve their data.
nltk_data_dir = "/tmp/nltk_data"
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.data.path.append(nltk_data_dir)
# Model loading helper
def load_pickle_file(file_path):
    """Deserialize a pickled artifact from *file_path*.

    Tries joblib first; if the pickle references a module that is not
    importable (ModuleNotFoundError), retries with cloudpickle, which can
    restore objects serialized by value.
    """
    try:
        loaded = joblib.load(file_path)
    except ModuleNotFoundError:
        with open(file_path, 'rb') as handle:
            loaded = cloudpickle.load(handle)
    return loaded
# Load the serialized artifacts once at import time; the API cannot serve
# requests without them, so any failure aborts startup.
try:
    model = load_pickle_file('rf_classifier_v3.pkl')
    vectorizer = load_pickle_file('vectorizer.pkl')
    label_encoder = load_pickle_file('label_encoder.pkl')
except Exception as e:
    # Lazy %-style args avoid formatting when the record is filtered.
    logger.critical("Model loading failed: %s", e)
    # exit() comes from the `site` module and is intended for interactive
    # sessions only; raise SystemExit directly so the abort is reliable
    # in any deployment context, and chain the cause for debuggability.
    raise SystemExit(1) from e
# Text-normalization resources shared by clean_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# FastAPI application instance served by uvicorn/ASGI.
app = FastAPI(title="Email Classification API")
class EmailRequest(BaseModel):
    """Request payload for /predict: the raw email text to classify."""
    email_body: str
class EmailResponse(BaseModel):
    """Response payload for /predict."""
    # Email text with PII replaced by mask tokens (output of mask_pii).
    masked_email: str
    # Category label decoded by the label encoder.
    predicted_category: str
    # PII entities found by mask_pii; structure defined in utils.mask_pii —
    # presumably {position, classification, entity} dicts, confirm there.
    pii_entities: List[Dict[str, Any]]
def clean_text(text):
    """Normalize email text for vectorization.

    Flattens line breaks, strips the literal 'Subject:' marker
    (case-sensitive, before lowercasing — matching the training pipeline),
    keeps lowercase letters and spaces only, collapses whitespace, then
    lemmatizes the surviving non-stopword tokens.
    """
    flattened = re.sub(r'\n|\r', ' ', text)
    no_subject = re.sub(r'Subject:', '', flattened)
    letters_only = re.sub(r'[^a-z\s]', '', no_subject.lower())
    normalized = re.sub(r'\s+', ' ', letters_only)
    kept = [
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in stop_words
    ]
    return ' '.join(kept)
def classify_email(text: str):
    """Return the predicted category label for (already masked) email text."""
    features = vectorizer.transform([clean_text(text)])
    prediction = model.predict(features)
    # inverse_transform maps the numeric class back to its string label.
    return label_encoder.inverse_transform(prediction)[0]
@app.get("/")
async def root():
    """Liveness/health-check endpoint."""
    return {"message": "API is running."}
@app.post("/predict", response_model=EmailResponse)
async def predict(request: EmailRequest):
    """Mask PII in the submitted email body, then classify the masked text.

    Raises HTTP 400 when the body is empty or whitespace-only.
    """
    body = request.email_body
    if not body.strip():
        raise HTTPException(status_code=400, detail="Email body is required.")
    # Mask first so the classifier never sees raw PII.
    masked_text, pii_entities = mask_pii(body)
    return EmailResponse(
        masked_email=masked_text,
        predicted_category=classify_email(masked_text),
        pii_entities=pii_entities,
    )