File size: 14,931 Bytes
070061f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
import logging
import json
import random
import re
import os
import pandas as pd
import shap
import sklearn
import pickle
from constraints import *
from nlu import NLU
import json
from answer import Answers

# Import natural conversation enhancer
# Optional dependency: the natural-conversation enhancer. When the module is
# not importable, a pass-through stand-in with the same call signature is
# defined so callers never have to special-case its absence.
try:
    from natural_conversation import enhance_response
except ImportError:
    NATURAL_CONVERSATION_AVAILABLE = False

    def enhance_response(response, context=None, response_type="explanation"):
        """Fallback enhancer: return ``response`` unchanged.

        ``context`` and ``response_type`` are accepted only for signature
        compatibility and are ignored.
        """
        return response
else:
    NATURAL_CONVERSATION_AVAILABLE = True

class Agent:
    """Conversational XAI agent built around a classifier for the Adult dataset.

    On construction the agent sets up an NLU model, loads the Adult data and
    loads (or trains and persists) a random-forest classifier.

    Two conversational entry points exist:

    * ``handle_user_input`` extracts and validates the required features from
      free text, then routes the question to an XAI method.
    * ``handle_explanation_request`` routes a question straight to an XAI
      method, with a basic fallback explanation and optional
      natural-conversation enhancement.

    NOTE(review): this class used to define ``handle_user_input`` twice; the
    later definition silently replaced the earlier one, leaving the earlier
    one (and ``_generate_basic_explanation``) unreachable. The earlier
    definition is kept under the distinct name ``handle_explanation_request``
    and ``handle_user_input`` keeps the behavior callers actually got at
    runtime. Confirm which entry point each caller should use.
    """

    # Phrases a user may use to name each required feature.
    # NOTE(review): 'married' and 'single' look like values rather than
    # feature names, so "I am married and ..." still yields a bogus
    # marital_status value; kept as in the original list - confirm intent.
    _FEATURE_SYNONYMS = {
        'age': ['age', 'years old'],
        'workclass': ['workclass', 'work type', 'job type'],
        'education': ['education', 'degree'],
        'education_num': ['education num', 'education number', 'years of education'],
        'marital_status': ['marital status', 'married', 'single', 'relationship status'],
        'occupation': ['occupation', 'job', 'profession'],
        'relationship': ['relationship'],
        'race': ['race', 'ethnicity'],
        'sex': ['sex', 'gender'],
        'capital_gain': ['capital gain', 'gain'],
        'capital_loss': ['capital loss', 'loss'],
        'hours_per_week': ['hours per week', 'weekly hours', 'work hours'],
        'native_country': ['native country', 'country', 'nationality'],
    }

    def __init__(self, nlu_model=None):
        """Initialise state, resolve the NLU model, load data and the model.

        Args:
            nlu_model: Optional ready-made NLU object. When omitted, an
                ``NLU`` is built from ``nlu_config.json`` next to this file
                if that file exists, otherwise with ``NLU()`` defaults.
        """
        # Core state
        self.dataset = "adult"
        self.current_instance = None
        self.clf = None
        self.predicted_class = None
        self.mode = None
        self.data = {"X": None, "y": None, "features": None, "classes": None}

        # NLU setup: prefer provided model, else use config, else default
        config_path = os.path.join(os.path.dirname(__file__), 'nlu_config.json')
        if nlu_model is not None:
            self.nlu_model = nlu_model
        elif os.path.exists(config_path):
            with open(config_path, 'r') as f:
                nlu_config = json.load(f)
            self.nlu_model = NLU(
                model_type=nlu_config.get('model_type', 'sentence_transformers'),
                model_path=nlu_config.get('model_path'),
            )
        else:
            self.nlu_model = NLU()

        # UI/state helpers
        self.list_node = []
        self.clf_display = None
        self.l_exist_classes = None
        self.l_exist_features = None
        self.l_instances = None
        self.df_display_instance = None
        self.current_feature = None
        self.preprocessor = None

        # Feature requirements for user input flows
        self.required_features = [
            'age', 'workclass', 'education', 'education_num', 'marital_status',
            'occupation', 'relationship', 'race', 'sex', 'capital_gain',
            'capital_loss', 'hours_per_week', 'native_country'
        ]
        self.user_features = {}

        # Load data and train model (sets self.clf and self.clf_display)
        self.load_adult_dataset()
        self.train_model()

    def load_adult_dataset(self):
        """Read ``../data/adult.data`` and ``../dataset_info/adult.json``.

        Populates ``self.data`` with the feature frame (``X_display``), the
        ``income`` column (``y_display``), the parsed JSON (``info``), the
        class labels, the feature names and an empty ``map``.
        """
        base_dir = os.path.dirname(__file__)
        data_path = os.path.join(base_dir, '..', 'data', 'adult.data')
        info_path = os.path.join(base_dir, '..', 'dataset_info', 'adult.json')
        columns = [
            'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
            'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
            'hours_per_week', 'native_country', 'income'
        ]
        frame = pd.read_csv(data_path, names=columns, skipinitialspace=True)
        # Split the target off the feature frame.
        self.data['y_display'] = frame.pop('income')
        self.data['X_display'] = frame
        with open(info_path, 'r') as f:
            self.data['info'] = json.load(f)
        self.data['classes'] = ['<=50K', '>50K']
        self.data['features'] = frame.columns.tolist()
        # Same list object under a second key, as before.
        self.data['feature_names'] = self.data['features']
        self.data['map'] = {}

    def train_model(self):
        """Load the persisted classifier, or train and persist a new one.

        The model lives at ``../models/RandomForest.pkl`` relative to this
        file. If the file exists but cannot be unpickled, a new model is
        trained and written over it. Sets ``self.clf`` and
        ``self.clf_display`` to the same object.
        """
        # Ensure model directory exists
        model_dir = os.path.join(os.path.dirname(__file__), '..', 'models')
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, 'RandomForest.pkl')

        if os.path.exists(model_path):
            try:
                # NOTE(security): unpickling executes arbitrary code; this
                # file must only ever come from a trusted location.
                with open(model_path, 'rb') as f:
                    self.clf = pickle.load(f)
            except Exception as e:
                logging.getLogger(__name__).warning(
                    "Failed to load existing model (%s). Retraining...", e
                )
                # NOTE(review): the recovery path has always used 200 trees
                # while a first-time fit uses 100; preserved here - confirm
                # which value is intended.
                self.clf = self._fit_and_save_model(model_path, n_estimators=200)
        else:
            self.clf = self._fit_and_save_model(model_path, n_estimators=100)
        self.clf_display = self.clf

    def _fit_and_save_model(self, model_path, n_estimators):
        """Fit a random forest on the preprocessed data and pickle it."""
        from preprocessing import preprocess_adult
        from sklearn.ensemble import RandomForestClassifier

        df = pd.concat([self.data['X_display'], self.data['y_display']], axis=1)
        df_clean = preprocess_adult(df)
        X = df_clean.drop('income', axis=1)
        y = df_clean['income']
        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
        clf.fit(X, y)
        # Persist the trained model for faster subsequent runs
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        return clf

    @staticmethod
    def _format_suggestions(suggestions):
        """Render suggestions as a numbered list, one per line, starting at 1."""
        return "\n".join(f"{idx}. {q}" for idx, q in enumerate(suggestions, 1))

    def handle_explanation_request(self, user_input):
        """Answer an explanation question without collecting features first.

        This is the body of the first of the two former ``handle_user_input``
        definitions (the one that was shadowed). It classifies the intent,
        routes to an XAI method, falls back to ``_generate_basic_explanation``
        on ``ImportError``, and passes the text through ``enhance_response``
        when the natural-conversation module is available.

        Args:
            user_input: The user's question.

        Returns:
            The explanation text, a suggestions/rephrase prompt, or an
            apology string containing the error text if anything raised.
        """
        try:
            intent_result, _confidence, suggestions = self.nlu_model.classify_intent(user_input)
            from constraints import SUGGEST_SIMILAR_QUESTIONS_MSG, REPHRASE_QUESTION_MSG

            if isinstance(intent_result, dict) and 'intent' in intent_result:
                # Ensure we have a current instance for explanation
                if self.current_instance is None:
                    self.select_random_instance()

                try:
                    from xai_methods import route_to_xai_method
                    explanation_result = route_to_xai_method(self, intent_result)
                except ImportError:
                    # Routing (or something it imports) is unavailable.
                    explanation = self._generate_basic_explanation(intent_result)
                    context = {
                        'explanation_type': 'basic',
                        'user_question': user_input,
                        'confidence': 0.5
                    }
                else:
                    explanation = explanation_result.get(
                        'explanation', 'Sorry, I could not generate an explanation.'
                    )
                    context = {
                        'explanation_type': intent_result.get('intent', 'general'),
                        'user_question': user_input,
                        'confidence': intent_result.get('confidence', 0)
                    }

                if NATURAL_CONVERSATION_AVAILABLE:
                    return enhance_response(explanation, context, "explanation")
                return explanation

            if intent_result == 'unknown' and suggestions:
                return SUGGEST_SIMILAR_QUESTIONS_MSG.format(
                    suggestions=self._format_suggestions(suggestions)
                )
            return REPHRASE_QUESTION_MSG

        except Exception as e:
            # Deliberate best-effort boundary: the user always gets a reply.
            logging.getLogger(__name__).exception("Failed to handle explanation request")
            return f"I'm having trouble processing that question. Could you try asking it differently? Error: {str(e)}"

    def _generate_basic_explanation(self, intent_result):
        """Generate basic explanation when XAI methods are not available.

        Args:
            intent_result: Accepted for call compatibility; not used.

        Returns:
            A short text listing the predicted class and whichever of a
            fixed set of key features are present in the current instance,
            or a prompt when no instance/prediction is available.
        """
        if self.current_instance is None or self.predicted_class is None:
            return "I need a specific instance to explain. Please make sure a prediction has been made."

        # Highlight some key features
        key_features = ['age', 'education', 'hours_per_week', 'occupation', 'marital_status']
        factor_lines = [
            f"• {feature.replace('_', ' ').title()}: {self.current_instance[feature]}\n"
            for feature in key_features
            if feature in self.current_instance
        ]
        return "".join([
            f"Based on your profile, the decision was: {self.predicted_class}\n\n",
            "Key factors in this decision include:\n",
            *factor_lines,
            "\nThis is a simplified explanation. For more detailed analysis, specific XAI methods would provide deeper insights.",
        ])

    def select_random_instance(self):
        """Select a random instance from the dataset for explanation.

        Sets ``df_display_instance`` (one-row frame) and ``current_instance``
        (that row as a dict); when ``clf_display`` is set, also stores its
        prediction in ``predicted_class``. Does nothing when no data is
        loaded.
        """
        frame = self.data.get('X_display')
        if frame is None or len(frame) == 0:
            return

        random_idx = random.randint(0, len(frame) - 1)
        self.df_display_instance = frame.iloc[[random_idx]]
        self.current_instance = self.df_display_instance.iloc[0].to_dict()

        # Make prediction for this instance
        # NOTE(review): the model is fitted on preprocess_adult() output but
        # is given the raw display row here - confirm the two are compatible.
        if self.clf_display is not None:
            self.predicted_class = self.clf_display.predict(self.df_display_instance)[0]

    def get_visualization(self, viz_type, instance_df=None):
        """Route advanced visualization requests to the Answers class.

        Args:
            viz_type: 'shap_advanced' or 'dtreeviz'.
            instance_df: DataFrame for the instance to visualize.

        Returns:
            Whatever ``Answers.answer`` returns for ``viz_type``.
        """
        answers = Answers(
            list_node=self.list_node,
            clf=self.clf,
            clf_display=self.clf_display,
            current_instance=self.current_instance,
            question=None,
            l_exist_classes=self.l_exist_classes,
            l_exist_features=self.l_exist_features,
            l_instances=self.l_instances,
            data=self.data,
            df_display_instance=self.df_display_instance,
            predicted_class=self.predicted_class,
            preprocessor=self.preprocessor
        )
        return answers.answer(viz_type, instance_df=instance_df)

    @classmethod
    def _extract_user_features(cls, user_input):
        """Return ``{feature: raw value}`` for "<synonym>[:=] <value>" phrases.

        Synonyms are matched as whole words (so 'age' no longer matches inside
        'average') and longest-first, and a match is ignored when it overlaps
        text already attributed to another synonym (so 'relationship status:
        X' is not also read as relationship='status'). The first accepted
        match per feature wins.
        """
        candidates = sorted(
            (
                (synonym, feature)
                for feature, synonyms in cls._FEATURE_SYNONYMS.items()
                for synonym in synonyms
            ),
            key=lambda pair: len(pair[0]),
            reverse=True,
        )
        found = {}
        claimed = []  # (start, end) spans already attributed to a feature
        for synonym, feature in candidates:
            if feature in found:
                continue
            pattern = rf"\b{re.escape(synonym)}\b\s*[:=]?\s*([\w\-\+]+)"
            for match in re.finditer(pattern, user_input, re.IGNORECASE):
                start, end = match.span()
                if any(start < c_end and c_start < end for c_start, c_end in claimed):
                    continue
                found[feature] = match.group(1)
                claimed.append((start, end))
                break
        return found

    @staticmethod
    def _numeric_bounds(info, feature):
        """Return ``(low, high)`` for ``feature``; ``None`` for a missing bound.

        Assumes ``info['feature_ranges'][feature]`` is a two-item numeric
        sequence - TODO confirm against dataset_info/adult.json. Anything
        else is treated as "no bounds".
        """
        bounds = info.get('feature_ranges', {}).get(feature)
        try:
            low, high = bounds
        except (TypeError, ValueError):
            return None, None
        return low, high

    def _validate_user_features(self):
        """Validate stored features against the dataset metadata.

        Returns:
            ``None`` when every stored required feature is valid; otherwise
            the re-prompt message for the first invalid feature, which is
            also removed from ``self.user_features``.
        """
        import math
        from constraints import REPEAT_NUM_FEATURES, REPEAT_CAT_FEATURES

        info = self.data.get('info', {})
        num_features = info.get('num_features', [])
        cat_features = info.get('cat_features', [])

        for feature in self.required_features:
            value = self.user_features.get(feature)
            if value is None:
                continue

            # Numeric validation
            if feature in num_features:
                try:
                    number = float(value)
                except (TypeError, ValueError):
                    number = None
                # float() accepts "nan"/"inf"; neither is a usable value.
                if number is None or not math.isfinite(number):
                    self.user_features.pop(feature, None)
                    return REPEAT_NUM_FEATURES.format("valid number")

                low, high = self._numeric_bounds(info, feature)
                too_low = low is not None and number < low
                too_high = high is not None and number > high
                if too_low or too_high:
                    self.user_features.pop(feature, None)
                    if low is not None and high is not None:
                        allowed = f"{low}-{high}"
                    elif low is not None:
                        allowed = f">={low}"
                    else:
                        allowed = f"<={high}"
                    return REPEAT_NUM_FEATURES.format(allowed)

            # Categorical validation (case-insensitive; stores canonical form)
            if feature in cat_features:
                valid = info.get('feature_values', {}).get(feature, [])
                if valid:
                    canonical = {str(v).lower(): v for v in valid}
                    matched = canonical.get(str(value).lower())
                    if matched is None:
                        self.user_features.pop(feature, None)
                        return REPEAT_CAT_FEATURES.format(", ".join(valid))
                    self.user_features[feature] = matched
        return None

    def handle_user_input(self, user_input, instance_df=None):
        """Collect features from ``user_input``, validate them, then explain.

        Flow:
            1. Extract feature/value pairs from the text and merge them into
               ``self.user_features``; if any required feature is still
               missing, return a prompt for the first missing one.
            2. Validate the stored values against ``self.data['info']``;
               on the first invalid value, drop it and return a re-prompt.
            3. Classify the intent and route it: advanced visualizations go
               to ``get_visualization``, other intents to
               ``xai_methods.route_to_xai_method``.

        Args:
            user_input: Free-text message from the user.
            instance_df: Optional DataFrame passed through to
                ``get_visualization`` for visualization intents.

        Returns:
            A prompt, an explanation, or the visualization result.
        """
        # Step 1: Refined feature extraction using regex and synonyms
        self.user_features.update(self._extract_user_features(user_input))

        # Check for missing features
        from constraints import CLARIFY_FEATURE_MSG
        missing = [f for f in self.required_features if f not in self.user_features]
        if missing:
            return CLARIFY_FEATURE_MSG.format(feature=missing[0].replace('_', ' '))

        # Step 2: Robust validation using adult dataset metadata
        problem = self._validate_user_features()
        if problem is not None:
            return problem

        # Step 3: Intent classification and XAI routing using enhanced NLU
        intent_result, _confidence, suggestions = self.nlu_model.classify_intent(user_input)
        from constraints import SUGGEST_SIMILAR_QUESTIONS_MSG, REPHRASE_QUESTION_MSG

        # Route to appropriate XAI method based on intent
        if isinstance(intent_result, dict) and 'intent' in intent_result:
            if self.current_instance is None:
                self.select_random_instance()
            # Advanced visualization intents
            if intent_result['intent'] in ('shap_advanced', 'dtreeviz'):
                return self.get_visualization(intent_result['intent'], instance_df)
            # Standard explanation routing; imported only when actually
            # needed so the other paths work without xai_methods.
            from xai_methods import route_to_xai_method
            explanation_result = route_to_xai_method(self, intent_result)
            return explanation_result.get('explanation', 'Sorry, I could not generate an explanation.')
        if intent_result == 'unknown' and suggestions:
            return SUGGEST_SIMILAR_QUESTIONS_MSG.format(
                suggestions=self._format_suggestions(suggestions)
            )
        return REPHRASE_QUESTION_MSG