File size: 4,350 Bytes
2b7062a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Feature Importance - Pure Python Implementation
"""

import math
from typing import List, Dict
from collections import defaultdict
import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class FeatureImportance:
    """Score attributes by how well they help tell items apart.

    Every attribute receives a weighted sum of four heuristics computed
    over the item collection: distinct-value ratio ("variance"), Gini
    impurity ("uniqueness"), fraction of items defining the attribute
    ("coverage") and normalized Shannon entropy ("discrimination").

    Items are read exclusively through ``item.attributes.get(name)``; a
    value of ``None`` means "not defined for this item". List values are
    treated as multi-valued attributes by the uniqueness and discrimination
    metrics and are ignored by the variance metric.
    """

    def __init__(self):
        # attribute name -> combined importance score (filled by calculate_all)
        self.importance_scores = {}
        # attribute name -> distinct-value ratio (the "variance" component)
        self.variance_scores = {}

    def calculate_all(self, items: List, questions: List[Dict]) -> Dict[str, float]:
        """Compute and store the importance of every attribute in *questions*.

        Args:
            items: Objects exposing an ``attributes`` mapping.
            questions: Dicts that each carry an ``'attribute'`` key naming
                the attribute to score. Duplicate names are scored once.

        Returns:
            ``self.importance_scores`` (the same dict object that is stored
            on the instance), mapping attribute name to its score. Scores
            from earlier calls are kept unless overwritten.

        Raises:
            KeyError: If a question has no ``'attribute'`` key.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order. Iterating
        # a set here would make the insertion order of the score dicts (and so
        # the tie-breaking in get_top_features) differ from run to run.
        attributes = list(dict.fromkeys(q['attribute'] for q in questions))

        for attr in attributes:
            variance = self._calculate_variance(items, attr)
            uniqueness = self._calculate_uniqueness(items, attr)
            coverage = self._calculate_coverage(items, attr)
            discrimination = self._calculate_discrimination_power(items, attr)

            importance = (
                0.30 * variance +
                0.25 * uniqueness +
                0.25 * coverage +
                0.20 * discrimination
            )

            self.importance_scores[attr] = importance
            self.variance_scores[attr] = variance

        logger.info(f"Feature importance calculated for {len(attributes)} attributes")
        return self.importance_scores

    @staticmethod
    def _count_values(items: List, attribute: str) -> Dict[str, int]:
        """Count occurrences of each observed value, keyed by its string form.

        ``None`` values are skipped. A list value contributes one count per
        element (repeated elements are counted each time).
        """
        counts = defaultdict(int)
        for item in items:
            value = item.attributes.get(attribute)
            if value is None:
                continue
            members = value if isinstance(value, list) else [value]
            for member in members:
                counts[str(member)] += 1
        return counts

    def _calculate_variance(self, items: List, attribute: str) -> float:
        """Return the number of distinct scalar values divided by ``len(items)``.

        Despite the name this is a distinct-value ratio, not a statistical
        variance. ``None`` and list values are ignored when collecting the
        distinct values, but still count towards the denominator.

        Returns:
            A float in ``[0.0, 1.0]``; ``0.0`` when no scalar value exists.
        """
        values = []
        for item in items:
            value = item.attributes.get(attribute)
            if value is not None and not isinstance(value, list):
                values.append(value)

        if not values:
            return 0.0

        try:
            distinct = len(set(values))
        except TypeError:
            # Unhashable scalars (e.g. dicts) cannot go into a set; fall back
            # to comparing string forms, as the other metrics already do.
            distinct = len({str(value) for value in values})

        # values is non-empty here, so items is non-empty as well.
        return min(1.0, distinct / len(items))

    def _calculate_uniqueness(self, items: List, attribute: str) -> float:
        """Return the Gini impurity of the observed value distribution.

        Computed as ``1 - sum(p_i ** 2)`` where ``p_i`` is the share of each
        distinct value among all observed (non-``None``) values.

        Returns:
            ``0.0`` when no value is observed or all values are identical;
            approaches ``1.0`` as values become more evenly spread.
        """
        value_counts = self._count_values(items, attribute)
        total = sum(value_counts.values())

        if total == 0:
            return 0.0

        return 1.0 - sum((count / total) ** 2 for count in value_counts.values())

    def _calculate_coverage(self, items: List, attribute: str) -> float:
        """Return the fraction of items whose attribute value is not ``None``.

        Returns:
            A float in ``[0.0, 1.0]``; ``0.0`` for an empty item collection.
        """
        if not items:
            return 0.0

        defined_count = sum(
            1 for item in items
            if item.attributes.get(attribute) is not None
        )
        return defined_count / len(items)

    def _calculate_discrimination_power(self, items: List, attribute: str) -> float:
        """Return the Shannon entropy of the value distribution, normalized.

        Probabilities are taken over the observed values (``None`` skipped,
        list elements counted individually) so that they sum to one; the
        entropy is then divided by ``log2(number of distinct values)``.
        Missing values are deliberately not part of the denominator.

        Returns:
            A float in ``[0.0, 1.0]``: ``0.0`` when nothing is observed or
            only one distinct value exists, ``1.0`` for a perfectly even
            split across the distinct values.
        """
        value_counts = self._count_values(items, attribute)

        if not value_counts:
            return 0.0

        # Normalize by the number of observed values, not len(items):
        # otherwise the p's do not sum to 1 and the result can leave [0, 1].
        total = sum(value_counts.values())
        entropy = 0.0

        for count in value_counts.values():
            p = count / total  # count >= 1, so p > 0 and log2 is defined
            entropy -= p * math.log2(p)

        max_entropy = math.log2(len(value_counts)) if len(value_counts) > 1 else 1.0

        # Clamp to absorb floating-point rounding just outside the range.
        return max(0.0, min(1.0, entropy / max_entropy))

    def get_importance(self, attribute: str) -> float:
        """Return the stored importance of *attribute*, or ``0.5`` if unknown."""
        return self.importance_scores.get(attribute, 0.5)

    def get_top_features(self, n: int = 10) -> List[tuple]:
        """Return up to *n* ``(attribute, score)`` pairs, highest score first.

        Ties keep the insertion order of ``importance_scores``. A
        non-positive *n* yields an empty list.
        """
        if n <= 0:
            # A negative n would otherwise slice "all but the last |n|".
            return []

        sorted_features = sorted(
            self.importance_scores.items(),
            key=lambda pair: pair[1],
            reverse=True
        )
        return sorted_features[:n]