Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from pathlib import Path | |
| from config.settings import DATASET_1_PATH, DATASET_2_PATH | |
| import functools | |
| import sys | |
# Ensure UTF-8 encoding for print statements.
# Guarded lookups: sys.stdout may be None or a replacement stream that has no
# ``encoding`` attribute or no ``reconfigure()`` method; in those cases the
# stream is left untouched instead of failing at import time.
if getattr(sys.stdout, "encoding", None) != 'utf-8' and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')
class DataLoader:
    """Handles loading and validation of health datasets.

    Every method is a static method: none of them reads instance or class
    state, so they can be called on the class (``DataLoader.load_datasets()``)
    or on an instance.
    """

    @staticmethod
    def load_datasets():
        """Load both health datasets from disk, validate and enrich them.

        Reads the CSV files at ``DATASET_1_PATH`` and ``DATASET_2_PATH``,
        checks that both contain the required key column, and adds the
        derived features to the first dataset.

        No caching is performed here: every call re-reads both files.

        Returns:
            tuple(df1, df2): the first dataset (with derived features) and
            the second dataset as read.

        Raises:
            FileNotFoundError: if either dataset file does not exist.
            ValueError: if either dataset lacks the required key column.
            Exception: any other error raised while reading the CSV files is
                printed and then re-raised unchanged.
        """
        try:
            print(f"Loading datasets from {DATASET_1_PATH} and {DATASET_2_PATH}...")
            if not Path(DATASET_1_PATH).exists() or not Path(DATASET_2_PATH).exists():
                raise FileNotFoundError("One or both dataset files are missing.")

            df1 = pd.read_csv(DATASET_1_PATH)
            df2 = pd.read_csv(DATASET_2_PATH)

            # Basic validation
            DataLoader._validate_structure(df1, df2)

            # Feature Engineering (Mandatory 2a)
            df1 = DataLoader._feature_engineering(df1)

            print(f"[OK] Datasets loaded successfully: DF1({len(df1)}), DF2({len(df2)})")
            return df1, df2
        except Exception as e:
            print(f"[ERROR] Error loading datasets: {e}")
            # Bare ``raise`` re-raises the active exception with its original
            # traceback intact.
            raise

    @staticmethod
    def _feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
        """Add derived features to the dataset.

        When a ``BMI`` column is present, a categorical ``BMI_Category``
        column is added in place with these half-open ranges:

            Underweight: 0 <= BMI < 18.5
            Normal:      18.5 <= BMI < 25
            Overweight:  25 <= BMI < 30
            Obese:       BMI >= 30

        Values outside these ranges (negative or missing BMI) become NaN.

        Args:
            df: dataset to enrich; it is modified in place.

        Returns:
            The same DataFrame object that was passed in.
        """
        if 'BMI' in df.columns:
            # Categorize BMI according to standard ranges.
            # right=False makes each bin include its lower edge, so the
            # boundary values 18.5 / 25 / 30 fall into the higher category;
            # the open-ended last bin keeps very large values as 'Obese'
            # instead of NaN.
            bins = [0, 18.5, 25, 30, float('inf')]
            labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
            df['BMI_Category'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False)
        return df

    @staticmethod
    def _validate_structure(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
        """Validate that the necessary columns exist.

        Args:
            df1: first dataset.
            df2: second dataset.

        Raises:
            ValueError: if ``Patient_Number`` is missing from either dataset;
                the first dataset is checked first.
        """
        required_key = "Patient_Number"
        if required_key not in df1.columns:
            raise ValueError(f"Dataset 1 missing required key: {required_key}")
        if required_key not in df2.columns:
            raise ValueError(f"Dataset 2 missing required key: {required_key}")