nitinprajwal's picture
Update utils.py
4701ad2 verified
"""
Utility functions for the Email Classification and PII Masking application.
This module provides common helper functions that can be used across different
parts of the project, such as data loading, preprocessing, or other shared tasks.
It aims to promote code reusability and organization.
"""
import pandas as pd
from typing import Union
def load_data(file_path: str) -> Union[pd.DataFrame, None]:
    """
    Read a CSV file into a pandas DataFrame and check its required columns.

    Args:
        file_path (str): Path to the CSV file, passed to ``pd.read_csv`` as-is.

    Returns:
        Union[pd.DataFrame, None]: The loaded DataFrame when the file could be
            read and contains both an 'email' and a 'type' column; None in
            every other case (missing file, empty file, missing column, or
            any other exception raised while loading).

    Note:
        Failures are reported with ``print`` and signalled through the None
        return value; exceptions raised while loading are caught here rather
        than propagated to the caller.
    """
    required_columns = ("email", "type")
    loaded = None  # Stays None unless the file is read and passes validation.
    try:
        frame = pd.read_csv(file_path)
        # Both columns must be present before the frame is handed back.
        if all(column in frame.columns for column in required_columns):
            print(f"Successfully loaded data from {file_path}. DataFrame shape: {frame.shape}")
            loaded = frame
        else:
            print(f"Error: CSV file at {file_path} must contain 'email' and 'type' columns.")
    except FileNotFoundError:
        print(f"Error: The data file was not found at the specified path: {file_path}")
    except pd.errors.EmptyDataError:
        print(f"Error: The data file at {file_path} is empty and cannot be processed.")
    except Exception as err:  # Any other pandas or general failure during loading.
        print(f"An unexpected error occurred while loading data from {file_path}: {err}")
    return loaded
if __name__ == "__main__":
    # Demonstration entry point: runs only when this file is executed directly
    # (e.g. `python utils.py`), never when it is imported by another module.

    # --- Example: loading the email dataset --- #
    # The CSV named below is looked up relative to the current working
    # directory; point DATASET_PATH elsewhere if the file lives in another
    # location.
    DATASET_PATH = 'combined_emails_with_natural_pii.csv'
    emails_df = load_data(DATASET_PATH)

    # load_data returns None on any failure and has already printed the reason.
    if emails_df is not None:
        row_count = len(emails_df)
        print(f"Successfully loaded {row_count} emails for example usage.")

        print("First 5 rows:")
        preview = emails_df.head()
        print(preview)

        print("\nEmail categories distribution:")
        category_counts = emails_df['type'].value_counts()
        print(category_counts)