"""Email Classification API (api.py).

Automatically generated by Colab.
Original notebook:
https://colab.research.google.com/drive/1_vvePPVuXHIon25W71PD4kt1Z5J2sKgh
"""
| | from fastapi import FastAPI, HTTPException |
| | from pydantic import BaseModel |
| | from typing import List, Dict, Any |
| | import joblib |
| | import cloudpickle |
| | import re |
| | import nltk |
| | from nltk.corpus import stopwords |
| | from nltk.stem import WordNetLemmatizer |
| | import os |
| | import logging |
| |
|
| | |
| | from utils import mask_pii |
| |
|
| | |
# Basic INFO-level logging and a module-level logger for this service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download the NLTK corpora required by clean_text() into a writable
# directory (/tmp works even when the app directory is read-only, as on
# many container deployments), then register it on NLTK's search path.
nltk_data_dir = "/tmp/nltk_data"
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.data.path.append(nltk_data_dir)
| |
|
| | |
def load_pickle_file(file_path):
    """Load a pickled artifact, preferring joblib.

    Falls back to cloudpickle when joblib cannot import a module
    referenced inside the pickle (common when the artifact was saved
    from a notebook/__main__ context, where cloudpickle embeds the
    needed definitions).

    Args:
        file_path: Path to the pickle file on disk.

    Returns:
        The unpickled object.
    """
    try:
        return joblib.load(file_path)
    except ModuleNotFoundError as err:
        # Fix: the fallback used to fire silently, hiding artifact /
        # environment mismatches. Log it so operators can see why
        # cloudpickle was needed.
        logger.warning(
            "joblib could not load %s (%s); retrying with cloudpickle",
            file_path, err,
        )
        with open(file_path, 'rb') as f:
            return cloudpickle.load(f)
| |
|
# Load the serialized model artifacts once, at import time.
# NOTE(review): unpickling executes arbitrary code — only ship artifacts
# produced by a trusted training pipeline alongside this service.
try:
    model = load_pickle_file('rf_classifier_v3.pkl')
    vectorizer = load_pickle_file('vectorizer.pkl')
    label_encoder = load_pickle_file('label_encoder.pkl')
except Exception as e:  # top-level boundary: any load failure is fatal
    # Lazy %-style args avoid formatting work unless the record is emitted.
    logger.critical("Model loading failed: %s", e)
    # Fix: exit() is a site-module convenience intended for interactive
    # use and is not guaranteed to exist in every runtime; raising
    # SystemExit(1) is the reliable equivalent.
    raise SystemExit(1)
| |
|
# Preprocessing resources shared by clean_text(): the English stopword
# set and a WordNet lemmatizer (both backed by the NLTK corpora
# downloaded at startup).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# FastAPI application instance serving the classification endpoints.
app = FastAPI(title="Email Classification API")
| |
|
class EmailRequest(BaseModel):
    """Request payload for /predict: the raw email text to classify."""
    email_body: str
| |
|
class EmailResponse(BaseModel):
    """Response payload for /predict."""
    # Email body with PII spans replaced by placeholder tokens.
    masked_email: str
    # Category label produced by the classifier.
    predicted_category: str
    # PII entities found by mask_pii (structure defined in utils.mask_pii).
    pii_entities: List[Dict[str, Any]]
| |
|
def clean_text(text):
    """Normalize raw email text for the vectorizer/classifier pipeline.

    Steps: flatten newlines, lowercase, strip 'subject:' headers, keep
    only letters and spaces, collapse whitespace, then drop English
    stopwords and lemmatize the remaining tokens.

    Args:
        text: Raw (already PII-masked) email body.

    Returns:
        A single space-joined string of cleaned, lemmatized tokens.
    """
    text = re.sub(r'[\n\r]', ' ', text)
    text = text.lower()
    # Fix: the original stripped the exact-case 'Subject:' *before*
    # lowercasing, so a lowercase 'subject:' header survived as a
    # spurious 'subject' token. Lowercasing first makes the strip
    # effectively case-insensitive.
    text = re.sub(r'subject:', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)
| |
|
def classify_email(text: str):
    """Return the predicted category label for an email body.

    The text is cleaned, vectorized with the module-level vectorizer,
    scored by the model, and decoded back to its string label.
    """
    normalized = clean_text(text)
    features = vectorizer.transform([normalized])
    encoded_prediction = model.predict(features)
    return label_encoder.inverse_transform(encoded_prediction)[0]
| |
|
@app.get("/")
async def root():
    """Liveness probe: confirms the API process is up and serving."""
    return dict(message="API is running.")
| |
|
@app.post("/predict", response_model=EmailResponse)
async def predict(request: EmailRequest):
    """Mask PII in the submitted email, classify it, and return both.

    Raises:
        HTTPException: 400 when the email body is empty or whitespace.
    """
    body = request.email_body
    if not body.strip():
        raise HTTPException(status_code=400, detail="Email body is required.")

    # Classification runs on the masked text so raw PII never reaches
    # the model pipeline.
    masked_text, pii_entities = mask_pii(body)
    category = classify_email(masked_text)

    return EmailResponse(
        masked_email=masked_text,
        pii_entities=pii_entities,
        predicted_category=category,
    )
| |
|