File size: 2,186 Bytes
96638b2
 
 
 
ee9f8cb
 
 
 
 
96638b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee9f8cb
96638b2
 
 
ee9f8cb
96638b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
from pathlib import Path
from config.settings import DATASET_1_PATH, DATASET_2_PATH
import functools
import sys

# Ensure UTF-8 encoding for print statements.
# The encoding name is compared case-insensitively ("UTF-8" == "utf-8"), and
# the call is skipped when sys.stdout is None or is a replacement stream that
# does not provide reconfigure(), so importing this module never fails here.
_stdout_encoding = (getattr(sys.stdout, "encoding", None) or "").lower()
if _stdout_encoding not in ("utf-8", "utf8") and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')

class DataLoader:
    """Handles loading and validation of health datasets.

    Every method is a static method; the class serves as a namespace.
    """

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def load_datasets():
        """
        Loads both health datasets with caching.

        Reads the CSV files at DATASET_1_PATH and DATASET_2_PATH, checks that
        both contain the required key column, and adds derived features to
        the first dataset.

        The result is memoised by ``functools.lru_cache``: after the first
        successful call, later calls return the *same* DataFrame objects
        without re-reading the files. Copy a frame before mutating it in
        place, or call ``DataLoader.load_datasets.cache_clear()`` to force a
        reload. Failed calls are not cached.

        Returns: tuple(df1, df2)

        Raises:
            FileNotFoundError: if either dataset file does not exist.
            ValueError: if either dataset lacks the required key column.
            Exception: any error raised by ``pd.read_csv`` is logged to
                stdout and re-raised unchanged.
        """
        try:
            print(f"Loading datasets from {DATASET_1_PATH} and {DATASET_2_PATH}...")

            if not Path(DATASET_1_PATH).exists() or not Path(DATASET_2_PATH).exists():
                raise FileNotFoundError("One or both dataset files are missing.")

            df1 = pd.read_csv(DATASET_1_PATH)
            df2 = pd.read_csv(DATASET_2_PATH)

            # Basic validation
            DataLoader._validate_structure(df1, df2)

            # Feature Engineering (Mandatory 2a)
            df1 = DataLoader._feature_engineering(df1)

            print(f"[OK] Datasets loaded successfully: DF1({len(df1)}), DF2({len(df2)})")
            return df1, df2

        except Exception as e:
            print(f"[ERROR] Error loading datasets: {e}")
            # Bare raise re-raises the active exception with its traceback intact.
            raise

    @staticmethod
    def _feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
        """Adds derived features to the dataset.

        When a ``BMI`` column is present, a categorical ``BMI_Category``
        column is added using these half-open ranges:

            [0, 18.5)   -> Underweight
            [18.5, 25)  -> Normal
            [25, 30)    -> Overweight
            [30, inf)   -> Obese

        Values outside every range (negative or missing BMI) become NaN.

        The frame is modified in place and the same object is returned.
        Frames without a ``BMI`` column are returned untouched.
        """
        if 'BMI' in df.columns:
            # Categorize BMI according to standard ranges.
            # right=False makes each bin include its lower edge, so a BMI of
            # exactly 18.5 / 25 / 30 falls into the higher category; the
            # open-ended top bin keeps very large BMI values from becoming NaN.
            bins = [0, 18.5, 25, 30, float("inf")]
            labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
            df['BMI_Category'] = pd.cut(df['BMI'], bins=bins, labels=labels, right=False)
        return df

    @staticmethod
    def _validate_structure(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
        """Validates that necessary columns exist.

        Raises:
            ValueError: if ``Patient_Number`` is missing from either frame;
                the first dataset is checked first.
        """
        required_key = "Patient_Number"
        if required_key not in df1.columns:
            raise ValueError(f"Dataset 1 missing required key: {required_key}")
        if required_key not in df2.columns:
            raise ValueError(f"Dataset 2 missing required key: {required_key}")