File size: 12,505 Bytes
c2ea5ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python3
"""
Component Influence Analysis

This script analyzes the influence of knowledge graph components on perturbation scores
using the DataFrame created by the create_component_influence_dataframe function.
"""

import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging
from typing import Optional, Dict, List, Tuple, Any
import sys
from sklearn.linear_model import LinearRegression

# Import from the same directory
from .utils.dataframe_builder import create_component_influence_dataframe

# Configure logging for this module
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

def analyze_component_influence(df: pd.DataFrame, n_estimators: int = 100, 
                               random_state: int = 42) -> Tuple[Optional[RandomForestRegressor], Dict[str, float], List[str]]:
    """
    Analyzes the influence of components on perturbation scores.

    A linear model provides the signed effect estimates (magnitude and
    direction); a Random Forest is trained alongside it as a secondary model
    for metric comparison.

    Args:
        df: DataFrame with binary component features and a 'perturbation' column
        n_estimators: Number of trees in the Random Forest
        random_state: Random seed for reproducibility

    Returns:
        A tuple containing:
        - The trained RandomForestRegressor model (or None if training fails)
        - Dictionary of feature importances with sign (direction), ordered by
          descending absolute magnitude
        - List of feature columns used for training
    """
    # Feature columns are exactly the df columns prefixed with "entity_" or
    # "relation_"; no secondary membership filter is needed since the
    # candidates are drawn from df.columns itself.
    feature_cols = [col for col in df.columns if col.startswith(("entity_", "relation_"))]

    if not feature_cols:
        logger.error("No component features found in DataFrame. Column names should start with 'entity_' or 'relation_'.")
        return None, {}, []

    logger.info(f"Found {len(feature_cols)} feature columns for analysis")

    # Guard against a missing target column before indexing into df, so a
    # malformed DataFrame follows the same log-and-return error path as the
    # other validation failures instead of raising a KeyError.
    if 'perturbation' not in df.columns:
        logger.error("DataFrame has no 'perturbation' column. Cannot proceed with analysis.")
        return None, {}, []

    # Need at least two observations for any meaningful fit.
    if len(df) < 2:
        logger.error("Not enough data points for analysis (need at least 2 rows).")
        return None, {}, []

    # Prepare X and y
    X = df[feature_cols]
    y = df['perturbation']

    # A constant target carries no signal; report zero importance for all features.
    if y.std() == 0:
        logger.warning("Target variable 'perturbation' has no variance. Feature importance will be 0 for all features.")
        return None, {feature: 0.0 for feature in feature_cols}, feature_cols

    try:
        # 1. Random Forest: kept for evaluation metrics and as a backup model.
        rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        rf_model.fit(X, y)

        # 2. Linear model: its coefficients carry both magnitude and direction.
        linear_model = LinearRegression()
        linear_model.fit(X, y)

        # 3. Pair each feature with its coefficient, then order by |coefficient|.
        feature_importance = dict(zip(feature_cols, linear_model.coef_))
        feature_importance = dict(sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True))
        return rf_model, feature_importance, feature_cols

    except Exception as e:
        logger.error(f"Error during model training: {e}")
        return None, {feature: 0.0 for feature in feature_cols}, feature_cols

def print_feature_importance(feature_importance: Dict[str, float], top_n: int = 10) -> None:
    """
    Prints the top components ranked by absolute importance, with direction.

    Side effect: also writes the COMPLETE ranking (all components, not just
    the top_n shown) to 'component_influence_rankings.csv' in this module's
    directory.

    Args:
        feature_importance: Dictionary mapping feature names to signed importance values
        top_n: Number of top features to show
    """
    print(f"\nTop {min(top_n, len(feature_importance))} Components by Influence:")
    print("=" * 50)
    print(f"{'Rank':<5}{'Component':<30}{'Importance':<15}{'Direction':<10}")
    print("-" * 50)

    # Rank by magnitude; the sign only determines the reported direction.
    sorted_features = sorted(feature_importance.items(), key=lambda x: abs(x[1]), reverse=True)

    # Slicing already clamps to the list length, so no extra min() is needed.
    for i, (feature, importance) in enumerate(sorted_features[:top_n], 1):
        direction = "Positive" if importance >= 0 else "Negative"
        print(f"{i:<5}{feature:<30}{abs(importance):.6f}   {direction}")

    # Persist the full ranking for further analysis.
    output_path = os.path.join(os.path.dirname(__file__), 'component_influence_rankings.csv')
    pd.DataFrame({
        'Component': [item[0] for item in sorted_features],
        'Importance': [abs(item[1]) for item in sorted_features],
        'Direction': ["Positive" if item[1] >= 0 else "Negative" for item in sorted_features]
    }).to_csv(output_path, index=False)
    logger.info(f"Component rankings saved to {output_path}")

def evaluate_model(model: Optional[RandomForestRegressor], X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
    """
    Computes regression metrics (MSE, RMSE, R^2) for a trained model.

    Args:
        model: Trained RandomForestRegressor model (or None)
        X: Feature DataFrame
        y: Target series

    Returns:
        Dictionary with 'mse', 'rmse' and 'r2' entries
    """
    # Without a model we report zero error; a zero-variance target counts as
    # a perfect fit (r2 = 1.0), otherwise r2 defaults to 0.0.
    if model is None:
        r2_fallback = 1.0 if y.std() == 0 else 0.0
        return {'mse': 0.0, 'rmse': 0.0, 'r2': r2_fallback}

    try:
        predictions = model.predict(X)
        mse_value = mean_squared_error(y, predictions)
        return {
            'mse': mse_value,
            'rmse': np.sqrt(mse_value),
            'r2': r2_score(y, predictions),
        }
    except Exception as e:
        # Any prediction/metric failure degrades to all-zero metrics.
        logger.error(f"Error during model evaluation: {e}")
        return {'mse': 0.0, 'rmse': 0.0, 'r2': 0.0}

def identify_key_components(feature_importance: Dict[str, float], 
                           threshold: float = 0.01) -> List[str]:
    """
    Selects components whose absolute importance meets the threshold.

    Args:
        feature_importance: Dictionary mapping feature names to importance values
        threshold: Minimum absolute importance value to be considered a key component

    Returns:
        List of key component names, in dictionary iteration order
    """
    key_components = []
    for name, value in feature_importance.items():
        # Sign is irrelevant here: strong negative influence is still "key".
        if abs(value) >= threshold:
            key_components.append(name)
    return key_components

def print_component_groups(df: pd.DataFrame, feature_importance: Dict[str, float]) -> None:
    """
    Prints aggregated influence per component type (entities vs relations).

    Group magnitudes are sums of absolute importances; the Positive/Negative
    columns count how many components in each group push the score up or down.

    Args:
        df: Original DataFrame (not read here; kept for interface stability)
        feature_importance: Feature importance dictionary with signed values
    """
    if not feature_importance:
        print("\nNo feature importance values available for group analysis.")
        return

    # Split features by prefix into the two component groups.
    entity_features = [f for f in feature_importance if f.startswith('entity_')]
    relation_features = [f for f in feature_importance if f.startswith('relation_')]

    # Group magnitudes use absolute values so that opposing signs do not cancel.
    entity_importance = sum(abs(feature_importance[f]) for f in entity_features)
    relation_importance = sum(abs(feature_importance[f]) for f in relation_features)
    total_importance = sum(abs(value) for value in feature_importance.values())

    # Count positive and negative components per group.
    pos_entities = sum(1 for f in entity_features if feature_importance[f] > 0)
    neg_entities = sum(1 for f in entity_features if feature_importance[f] < 0)
    pos_relations = sum(1 for f in relation_features if feature_importance[f] > 0)
    neg_relations = sum(1 for f in relation_features if feature_importance[f] < 0)

    print("\nComponent Group Influence:")
    print("=" * 70)
    print(f"{'Group':<20}{'Abs Importance':<15}{'Percentage':<10}{'Positive':<10}{'Negative':<10}")
    print("-" * 70)

    if total_importance > 0:
        # This branch already guarantees a non-zero denominator, so the
        # percentages are computed directly (no redundant re-check needed).
        entity_percentage = entity_importance / total_importance * 100
        relation_percentage = relation_importance / total_importance * 100

        print(f"{'Entities':<20}{entity_importance:.6f}{'%.2f%%' % entity_percentage:<10}{pos_entities:<10}{neg_entities:<10}")
        print(f"{'Relations':<20}{relation_importance:.6f}{'%.2f%%' % relation_percentage:<10}{pos_relations:<10}{neg_relations:<10}")
    else:
        print("No importance values available for analysis.")

def main():
    """Main CLI entry point: build the DataFrame, run the analysis, report results.

    Command-line arguments:
        --input/-i: path to the knowledge graph JSON file (required)
        --output/-o: optional path for the annotated results CSV
    """
    import argparse

    parser = argparse.ArgumentParser(description='Analyze component influence on perturbation scores')
    parser.add_argument('--input', '-i', required=True, help='Path to the knowledge graph JSON file')
    parser.add_argument('--output', '-o', help='Path to save the output DataFrame (CSV format)')
    args = parser.parse_args()

    print("\n=== Component Influence Analysis ===")
    print(f"Input file: {args.input}")
    print(f"Output file: {args.output or 'Not specified'}")

    # Build the feature/target DataFrame from the knowledge graph file.
    print("\nCreating DataFrame from knowledge graph...")
    df = create_component_influence_dataframe(args.input)

    if df is None or df.empty:
        logger.error("Failed to create or empty DataFrame. Cannot proceed with analysis.")
        return

    # Summarize the DataFrame layout before running the analysis.
    print("\nDataFrame info:")
    print(f"Rows: {len(df)}")
    entity_features = [col for col in df.columns if col.startswith("entity_")]
    relation_features = [col for col in df.columns if col.startswith("relation_")]
    print(f"Entity features: {len(entity_features)}")
    print(f"Relation features: {len(relation_features)}")
    print(f"Other columns: {', '.join([col for col in df.columns if not (col.startswith('entity_') or col.startswith('relation_'))])}")

    # A constant target makes every importance value zero, so warn up front.
    if df['perturbation'].std() == 0:
        logger.warning("All perturbation scores are identical. This might lead to uninformative results.")
        # f-string instead of %-formatting, for consistency with this module
        print(f"\nWARNING: All perturbation scores are identical (value: {df['perturbation'].iloc[0]:.2f}). Results may not be meaningful.")
    else:
        print("\nPerturbation score distribution:")
        print(f"Min: {df['perturbation'].min():.2f}, Max: {df['perturbation'].max():.2f}")
        print(f"Mean: {df['perturbation'].mean():.2f}, Std: {df['perturbation'].std():.2f}")

    # Run analysis
    print("\nRunning component influence analysis...")
    model, feature_importance, feature_cols = analyze_component_influence(df)

    # Print feature importance (also writes the rankings CSV as a side effect).
    print_feature_importance(feature_importance)

    # Identify key components
    print("\nIdentifying key components...")
    key_components = identify_key_components(feature_importance)
    print(f"Identified {len(key_components)} key components (importance >= 0.01)")

    # Print component groups
    print("\nAnalyzing component groups...")
    print_component_groups(df, feature_importance)

    # Evaluate model
    print("\nEvaluating model performance...")
    metrics = evaluate_model(model, df[feature_cols], df['perturbation'])

    print("\nModel Evaluation Metrics:")
    print("=" * 50)
    for metric, value in metrics.items():
        print(f"{metric.upper()}: {value:.6f}")

    # Save the full DataFrame with per-feature importance columns attached.
    if args.output:
        result_df = df.copy()
        for feature, importance in feature_importance.items():
            result_df[f'importance_{feature}'] = importance
        result_df.to_csv(args.output)
        logger.info(f"Full analysis results saved to {args.output}")

    print("\nAnalysis complete. CSV files with detailed results have been saved.")

if __name__ == "__main__":
    main()