File size: 4,769 Bytes
75b9644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""Project-wide constants and feature configuration for credit risk."""

from dataclasses import dataclass
from pathlib import Path

# Third ancestor of this file's resolved path (parents[0] is the directory
# that contains this file); every other path below is derived from it.
PROJECT_ROOT = Path(__file__).resolve().parents[2]

# Data locations under <root>/data.
DATA_RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "german_credit.csv")
DATA_PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Output locations directly under the project root.
MODEL_DIR = PROJECT_ROOT.joinpath("model")
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")

# Column/label string constants shared across the project.
TARGET_COLUMN = "Creditability"
NOT_SELECTED_LABEL = "Not selected"


@dataclass(frozen=True)
class FeatureOption:
    """Pairing of one UI choice with the one-hot column it maps to.

    Instances are immutable (frozen dataclass), so they compare by value
    and are hashable.
    """

    # Text of the option as offered in the UI.
    label: str
    # Name of the one-hot encoded destination column for this option.
    column: str


@dataclass(frozen=True)
class FeatureGroup:
    """A named set of options shared by the app and the preprocessing logic.

    Instances are immutable (frozen dataclass).
    """

    # Name of the group (presumably what the app displays; confirm with the UI code).
    name: str
    # Column this group is derived from (presumably in the raw dataset; confirm).
    source_column: str
    # Options belonging to this group, in declaration order.
    options: tuple[FeatureOption, ...]

    @property
    def labels(self) -> list[str]:
        """Return a new list with the label of every option, in order."""
        return [choice.label for choice in self.options]

    def column_from_label(self, label: str | None) -> str | None:
        """Return the column of the first option whose label equals ``label``.

        Returns ``None`` when ``label`` is ``None`` or when no option in this
        group carries that label.
        """
        if label is None:
            return None
        matching_columns = (
            choice.column for choice in self.options if choice.label == label
        )
        # First match wins; fall back to None when nothing matches.
        return next(matching_columns, None)


# Single declaration that drives both the app controls and the final model
# input schema. Each option names the one-hot column it maps to; note that
# several groups use a display name whose casing/wording differs from the
# source column name.
FEATURE_GROUPS: tuple[FeatureGroup, ...] = (
    FeatureGroup(
        name="Account Balance",
        source_column="Account Balance",
        options=(
            FeatureOption(label="No account", column="Account Balance_1"),
            FeatureOption(label="No balance", column="Account Balance_2"),
            FeatureOption(label="Some balance", column="Account Balance_3"),
        ),
    ),
    FeatureGroup(
        name="Payment Status of Previous Credit",
        source_column="Payment Status of Previous Credit",
        options=(
            FeatureOption(
                label="Some problems",
                column="Payment Status of Previous Credit_1",
            ),
            FeatureOption(
                label="No problems in this bank",
                column="Payment Status of Previous Credit_3",
            ),
        ),
    ),
    FeatureGroup(
        name="Purpose",
        source_column="Purpose",
        options=(
            FeatureOption(label="New car", column="Purpose_1"),
            FeatureOption(label="Other", column="Purpose_4"),
        ),
    ),
    FeatureGroup(
        name="Value Savings/Stocks",
        source_column="Value Savings/Stocks",
        options=(
            FeatureOption(label="No savings", column="Value Savings/Stocks_1"),
            FeatureOption(
                label="DM between [100, 1000]",
                column="Value Savings/Stocks_3",
            ),
            FeatureOption(label="DM >= 1000", column="Value Savings/Stocks_5"),
        ),
    ),
    FeatureGroup(
        name="Length of Current Employment",
        source_column="Length of current employment",
        options=(
            FeatureOption(
                label="Below 1 year (or unemployed)",
                column="Length of current employment_1",
            ),
            FeatureOption(
                label="Between 4 and 7 years",
                column="Length of current employment_4",
            ),
        ),
    ),
    FeatureGroup(
        name="Instalment Per Cent",
        source_column="Instalment per cent",
        options=(
            FeatureOption(label="Smaller than 20%", column="Instalment per cent_4"),
        ),
    ),
    FeatureGroup(
        name="Guarantors",
        source_column="Guarantors",
        options=(
            FeatureOption(label="No guarantors", column="Guarantors_1"),
        ),
    ),
    FeatureGroup(
        name="Duration in Current Address",
        source_column="Duration in Current address",
        options=(
            FeatureOption(
                label="Less than a year",
                column="Duration in Current address_1",
            ),
            FeatureOption(
                label="Between 1 and 4 years",
                column="Duration in Current address_2",
            ),
        ),
    ),
    FeatureGroup(
        name="Most Valuable Available Asset",
        source_column="Most valuable available asset",
        options=(
            FeatureOption(
                label="Not available / no assets",
                column="Most valuable available asset_1",
            ),
            FeatureOption(
                label="Ownership of house or land",
                column="Most valuable available asset_4",
            ),
        ),
    ),
    FeatureGroup(
        name="Concurrent Credits",
        source_column="Concurrent Credits",
        options=(
            FeatureOption(
                label="No further running credits",
                column="Concurrent Credits_3",
            ),
        ),
    ),
    FeatureGroup(
        name="Type of Apartment",
        source_column="Type of apartment",
        options=(
            FeatureOption(label="Free apartment", column="Type of apartment_1"),
        ),
    ),
    FeatureGroup(
        name="Number of Credits at this Bank",
        source_column="No of Credits at this Bank",
        options=(
            FeatureOption(label="One credit", column="No of Credits at this Bank_1"),
        ),
    ),
    FeatureGroup(
        name="Occupation",
        source_column="Occupation",
        options=(
            FeatureOption(
                label="Unemployed or unskilled with no permanent",
                column="Occupation_1",
            ),
        ),
    ),
)


# Flat list of every option column, derived from FEATURE_GROUPS in declaration
# order (groups first, then options within each group). Deriving it from the
# ordered tuples keeps the input column order deterministic for
# training/inference.
SELECTED_FEATURES: list[str] = [
    feature_option.column
    for feature_group in FEATURE_GROUPS
    for feature_option in feature_group.options
]