# Source: Hugging Face Space "Suvh/hicxai-condition-2" (Space status: Sleeping)
# File size: 14,931 Bytes
# Revision: 070061f

import logging
import json
import random
import re
import os
import pandas as pd
import shap
import sklearn
import pickle
from constraints import *
from nlu import NLU
import json
from answer import Answers

# Import natural conversation enhancer
# The enhancer is an optional dependency: if `natural_conversation` cannot be
# imported, NATURAL_CONVERSATION_AVAILABLE is set to False and a stand-in
# `enhance_response` is defined so the name always exists in this module.
try:
    from natural_conversation import enhance_response
    NATURAL_CONVERSATION_AVAILABLE = True
except ImportError:
    NATURAL_CONVERSATION_AVAILABLE = False
    def enhance_response(response, context=None, response_type="explanation"):
        """Fallback used when the enhancer is missing: return *response* unchanged."""
        return response

class Agent:
    """Explanation agent for the Adult income dataset.

    On construction the agent loads the dataset, loads (or trains and saves) a
    random forest classifier and prepares an NLU model.  ``handle_user_input``
    then collects the user's feature values from free text, validates them
    against the dataset metadata and routes explanation questions to the
    matching XAI method.
    """

    # Phrases a user may type for each model feature.  Matching is
    # case-insensitive and restricted to whole words (see _extract_features).
    _FEATURE_SYNONYMS = {
        'age': ['age', 'years old'],
        'workclass': ['workclass', 'work type', 'job type'],
        'education': ['education', 'degree'],
        'education_num': ['education num', 'education number', 'years of education'],
        'marital_status': ['marital status', 'married', 'single', 'relationship status'],
        'occupation': ['occupation', 'job', 'profession'],
        'relationship': ['relationship'],
        'race': ['race', 'ethnicity'],
        'sex': ['sex', 'gender'],
        'capital_gain': ['capital gain', 'gain'],
        'capital_loss': ['capital loss', 'loss'],
        'hours_per_week': ['hours per week', 'weekly hours', 'work hours'],
        'native_country': ['native country', 'country', 'nationality']
    }

    # Intents answered by get_visualization() instead of the text explainers.
    _VISUALIZATION_INTENTS = ('shap_advanced', 'dtreeviz')

    def __init__(self, nlu_model=None):
        """Set up state, the NLU model, the dataset and the classifier.

        Args:
            nlu_model: Ready-to-use NLU object.  When ``None`` an ``NLU`` is
                built from ``nlu_config.json`` next to this file if that file
                exists, otherwise ``NLU()`` is created with its defaults.
        """
        # Core state
        self.dataset = "adult"
        self.current_instance = None
        self.clf = None
        self.predicted_class = None
        self.mode = None
        self.data = {"X": None, "y": None, "features": None, "classes": None}

        # NLU setup: prefer provided model, else use config, else default
        config_path = os.path.join(os.path.dirname(__file__), 'nlu_config.json')
        if nlu_model is not None:
            self.nlu_model = nlu_model
        elif os.path.exists(config_path):
            with open(config_path, 'r') as f:
                nlu_config = json.load(f)
            self.nlu_model = NLU(
                model_type=nlu_config.get('model_type', 'sentence_transformers'),
                model_path=nlu_config.get('model_path'),
            )
        else:
            self.nlu_model = NLU()

        # UI/state helpers
        self.list_node = []
        self.clf_display = None
        self.l_exist_classes = None
        self.l_exist_features = None
        self.l_instances = None
        self.df_display_instance = None
        self.current_feature = None
        self.preprocessor = None

        # Feature requirements for user input flows
        self.required_features = [
            'age', 'workclass', 'education', 'education_num', 'marital_status',
            'occupation', 'relationship', 'race', 'sex', 'capital_gain',
            'capital_loss', 'hours_per_week', 'native_country'
        ]
        self.user_features = {}

        # Load data and train model (sets self.clf and self.clf_display)
        self.load_adult_dataset()
        self.train_model()

    def load_adult_dataset(self):
        """Read ``adult.data`` and ``adult.json`` into ``self.data``.

        Populates the keys ``X_display`` (all columns except ``income``),
        ``y_display`` (the ``income`` column), ``info`` (parsed JSON
        metadata), ``classes``, ``features``, ``feature_names`` and ``map``.
        """
        data_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'adult.data')
        info_path = os.path.join(os.path.dirname(__file__), '..', 'dataset_info', 'adult.json')
        columns = [
            'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
            'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
            'hours_per_week', 'native_country', 'income'
        ]
        self.data['X_display'] = pd.read_csv(data_path, names=columns, skipinitialspace=True)
        self.data['y_display'] = self.data['X_display']['income']
        self.data['X_display'].drop(['income'], axis=1, inplace=True)
        with open(info_path, 'r') as f:
            self.data['info'] = json.load(f)
        self.data['classes'] = ['<=50K', '>50K']
        self.data['features'] = self.data['X_display'].columns.tolist()
        # Deliberately the same list object as 'features', as before.
        self.data['feature_names'] = self.data['features']
        self.data['map'] = {}

    def train_model(self):
        """Load the pickled classifier if possible, otherwise train and save one.

        Sets ``self.clf`` and ``self.clf_display`` to the same estimator.  The
        model file lives at ``../models/RandomForest.pkl`` relative to this
        file; the directory is created when missing.
        """
        # Ensure model directory exists
        model_dir = os.path.join(os.path.dirname(__file__), '..', 'models')
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, 'RandomForest.pkl')

        if os.path.exists(model_path):
            try:
                # NOTE: unpickling runs arbitrary code from the file; this is
                # only safe because the file is produced by this application.
                with open(model_path, 'rb') as f:
                    self.clf = pickle.load(f)
            except Exception as e:
                print(f"⚠️ Failed to load existing model ({e}). Retraining...")
                # NOTE(review): the recovery path has always used 200 trees
                # while a first-time fit uses 100 -- confirm which is intended.
                self.clf = self._fit_and_save_model(model_path, n_estimators=200)
        else:
            self.clf = self._fit_and_save_model(model_path, n_estimators=100)
        self.clf_display = self.clf

    def _fit_and_save_model(self, model_path, n_estimators):
        """Fit a random forest on the preprocessed dataset and pickle it to *model_path*."""
        from preprocessing import preprocess_adult
        from sklearn.ensemble import RandomForestClassifier

        df = pd.concat([self.data['X_display'], self.data['y_display']], axis=1)
        df_clean = preprocess_adult(df)
        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
        clf.fit(df_clean.drop('income', axis=1), df_clean['income'])
        # Persist the trained model for faster subsequent runs
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        return clf

    def handle_user_input(self, user_input, instance_df=None):
        """Process one user message and return the reply.

        The message goes through three stages:

        1. Feature values mentioned in the text are stored in
           ``self.user_features``; if a required feature is still missing the
           reply asks for the first missing one.
        2. Stored values are validated against ``self.data['info']``; an
           invalid value is discarded and the reply asks for it again.
        3. The NLU model classifies the message and the matching explanation
           or visualisation is returned.

        Args:
            user_input: The raw message text.
            instance_df: Optional DataFrame forwarded to ``get_visualization``
                for the ``shap_advanced`` / ``dtreeviz`` intents.

        Returns:
            The reply: a string for text answers, or whatever
            ``get_visualization`` returns for visualisation intents.
        """
        # Step 1: feature extraction; newly mentioned values replace stored ones
        self.user_features.update(self._extract_features(user_input))

        from constraints import CLARIFY_FEATURE_MSG
        missing = [f for f in self.required_features if f not in self.user_features]
        if missing:
            return CLARIFY_FEATURE_MSG.format(feature=missing[0].replace('_', ' '))

        # Step 2: validation using adult dataset metadata
        problem = self._validate_user_features()
        if problem is not None:
            return problem

        # Step 3: intent classification and XAI routing
        intent_result, _confidence, suggestions = self.nlu_model.classify_intent(user_input)
        from constraints import SUGGEST_SIMILAR_QUESTIONS_MSG, REPHRASE_QUESTION_MSG

        if isinstance(intent_result, dict) and 'intent' in intent_result:
            return self._explain_intent(user_input, intent_result, instance_df)
        if intent_result == 'unknown' and suggestions:
            suggestions_str = "\n".join(f"{idx}. {q}" for idx, q in enumerate(suggestions, 1))
            return SUGGEST_SIMILAR_QUESTIONS_MSG.format(suggestions=suggestions_str)
        return REPHRASE_QUESTION_MSG

    def _extract_features(self, user_input):
        """Return ``{feature: raw_value}`` for each feature phrase found in *user_input*."""
        # Longest phrases are tried first so that e.g. "education num" is not
        # mistaken for "education" followed by the value "num".
        candidates = sorted(
            ((syn, feature)
             for feature, synonyms in self._FEATURE_SYNONYMS.items()
             for syn in synonyms),
            key=lambda pair: len(pair[0]),
            reverse=True,
        )
        found = {}
        remaining = user_input
        for syn, feature in candidates:
            if feature in found:
                continue
            # \b keeps "age" from matching inside words such as "marriage".
            pattern = rf"\b{re.escape(syn)}\b[:=]?\s*([\w\-\+]+)"
            match = re.search(pattern, remaining, re.IGNORECASE)
            if match:
                found[feature] = match.group(1)
                # Blank out the consumed span with NULs (neither \w nor \s)
                # so a shorter synonym cannot match inside or across it.
                remaining = (
                    remaining[:match.start()]
                    + "\0" * (match.end() - match.start())
                    + remaining[match.end():]
                )
        return found

    def _validate_user_features(self):
        """Drop the first invalid stored feature and return its re-prompt, else ``None``."""
        from constraints import REPEAT_NUM_FEATURES, REPEAT_CAT_FEATURES
        info = self.data.get('info', {})
        num_features = info.get('num_features', [])
        cat_features = info.get('cat_features', [])
        for feature in self.required_features:
            value = self.user_features.get(feature)
            if value is None:
                continue
            # Numeric validation
            if feature in num_features:
                try:
                    val = float(value)
                    minv, maxv = info.get('feature_ranges', {}).get(feature, (None, None))
                    out_of_range = minv is not None and (val < minv or val > maxv)
                except (TypeError, ValueError):
                    del self.user_features[feature]
                    return REPEAT_NUM_FEATURES.format("valid number")
                if out_of_range:
                    del self.user_features[feature]
                    return REPEAT_NUM_FEATURES.format(f"{minv}-{maxv}")
            # Categorical validation
            if feature in cat_features:
                valid = info.get('feature_values', {}).get(feature, [])
                if valid and value not in valid:
                    del self.user_features[feature]
                    return REPEAT_CAT_FEATURES.format(", ".join(valid))
        return None

    def _explain_intent(self, user_input, intent_result, instance_df):
        """Answer a recognised intent with a visualisation or a text explanation."""
        # Ensure we have a current instance for explanation
        if self.current_instance is None:
            self.select_random_instance()

        intent = intent_result['intent']
        if intent in self._VISUALIZATION_INTENTS:
            return self.get_visualization(intent, instance_df)

        try:
            from xai_methods import route_to_xai_method
        except ImportError:
            # Fallback if routing function not available
            explanation = self._generate_basic_explanation(intent_result)
            context = {
                'explanation_type': 'basic',
                'user_question': user_input,
                'confidence': 0.5
            }
        else:
            explanation_result = route_to_xai_method(self, intent_result)
            explanation = explanation_result.get(
                'explanation', 'Sorry, I could not generate an explanation.')
            context = {
                'explanation_type': intent_result.get('intent', 'general'),
                'user_question': user_input,
                'confidence': intent_result.get('confidence', 0)
            }

        # Enhance with natural conversation if available
        if NATURAL_CONVERSATION_AVAILABLE:
            return enhance_response(explanation, context, "explanation")
        return explanation

    def _generate_basic_explanation(self, intent_result):
        """Generate basic explanation when XAI methods are not available"""
        if self.current_instance is None or self.predicted_class is None:
            return "I need a specific instance to explain. Please make sure a prediction has been made."

        # Basic explanation based on the current instance
        explanation = f"Based on your profile, the decision was: {self.predicted_class}\n\n"
        explanation += "Key factors in this decision include:\n"

        # Highlight some key features
        key_features = ['age', 'education', 'hours_per_week', 'occupation', 'marital_status']
        for feature in key_features:
            if feature in self.current_instance:
                value = self.current_instance[feature]
                explanation += f"• {feature.replace('_', ' ').title()}: {value}\n"

        explanation += "\nThis is a simplified explanation. For more detailed analysis, specific XAI methods would provide deeper insights."
        return explanation

    def select_random_instance(self):
        """Select a random instance from the dataset for explanation"""
        if self.data.get('X_display') is not None and len(self.data['X_display']) > 0:
            random_idx = random.randint(0, len(self.data['X_display']) - 1)
            self.df_display_instance = self.data['X_display'].iloc[[random_idx]]
            self.current_instance = self.df_display_instance.iloc[0].to_dict()

            # Make prediction for this instance
            # NOTE(review): the row passed here is the raw display row, while
            # training in this class uses preprocess_adult() output -- confirm
            # the loaded estimator accepts unprocessed columns.
            if self.clf_display is not None:
                self.predicted_class = self.clf_display.predict(self.df_display_instance)[0]

    def get_visualization(self, viz_type, instance_df=None):
        """Route advanced visualization requests to the Answers class.

        Args:
            viz_type: 'shap_advanced' or 'dtreeviz'.
            instance_df: DataFrame for the instance to visualize.

        Returns:
            Whatever ``Answers.answer`` returns for *viz_type*.
        """
        answers = Answers(
            list_node=self.list_node,
            clf=self.clf,
            clf_display=self.clf_display,
            current_instance=self.current_instance,
            question=None,
            l_exist_classes=self.l_exist_classes,
            l_exist_features=self.l_exist_features,
            l_instances=self.l_instances,
            data=self.data,
            df_display_instance=self.df_display_instance,
            predicted_class=self.predicted_class,
            preprocessor=self.preprocessor
        )
        return answers.answer(viz_type, instance_df=instance_df)