# Spaces:
# Sleeping
# Sleeping
"""
Utility functions for the Email Classification and PII Masking application.

This module provides common helper functions that can be used across different
parts of the project, such as data loading, preprocessing, or other shared tasks.
It aims to promote code reusability and organization.
"""
from typing import Union

import pandas as pd
def load_data(file_path: str) -> Union[pd.DataFrame, None]:
    """
    Load a CSV file into a pandas DataFrame and validate its columns.

    Args:
        file_path (str): The absolute or relative path to the CSV file.

    Returns:
        Union[pd.DataFrame, None]: The loaded DataFrame when the file was read
        and contains both the 'email' and 'type' columns. None when the file
        is not found, is empty, fails to load for any other reason, or lacks
        one of the required columns.

    Note:
        Load and validation failures are reported by printing a message to
        the console and returning None rather than by raising an exception.
    """
    # Only the read itself is guarded, so each handler maps to a load failure.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The data file was not found at the specified path: {file_path}")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: The data file at {file_path} is empty and cannot be processed.")
        return None
    except Exception as e:
        # Deliberate catch-all: callers rely on None (not an exception) for
        # any other pandas or general failure during file loading.
        print(f"An unexpected error occurred while loading data from {file_path}: {e}")
        return None

    # Basic validation: both columns are needed for training the email
    # classifier and for processing emails.
    if 'email' not in df.columns or 'type' not in df.columns:
        print(f"Error: CSV file at {file_path} must contain 'email' and 'type' columns.")
        return None

    print(f"Successfully loaded data from {file_path}. DataFrame shape: {df.shape}")
    return df
if __name__ == "__main__":
    # Example usage of this module. This block runs only when the file is
    # executed directly (e.g., `python utils.py`), not when it is imported.

    # --- Example: Loading email data --- #
    # DATASET_PATH is a relative path, so the CSV must be reachable from the
    # directory the script is run from; update it to the correct location
    # otherwise. The dataset is assumed to be for demonstration or initial
    # model training preparation.
    DATASET_PATH = 'combined_emails_with_natural_pii.csv'
    email_data = load_data(DATASET_PATH)

    # load_data returns None on any failure and has already printed the reason.
    if email_data is not None:
        print(f"Successfully loaded {len(email_data)} emails for example usage.")
        print("First 5 rows:")
        print(email_data.head())
        print("\nEmail categories distribution:")
        print(email_data['type'].value_counts())