GeoAI-Backend / algorithms /feature_importance.py
Rafs-an09002's picture
sync: backend from GitHub Actions
2b7062a verified
"""
Feature Importance - Pure Python Implementation
"""
import math
from typing import List, Dict
from collections import defaultdict
import logging
logger = logging.getLogger(__name__)
class FeatureImportance:
"""Calculate feature importance using pure Python"""
def __init__(self):
self.importance_scores = {}
self.variance_scores = {}
def calculate_all(self, items: List, questions: List[Dict]) -> Dict[str, float]:
"""Calculate importance for all attributes"""
attributes = set(q['attribute'] for q in questions)
for attr in attributes:
variance = self._calculate_variance(items, attr)
uniqueness = self._calculate_uniqueness(items, attr)
coverage = self._calculate_coverage(items, attr)
discrimination = self._calculate_discrimination_power(items, attr)
importance = (
0.30 * variance +
0.25 * uniqueness +
0.25 * coverage +
0.20 * discrimination
)
self.importance_scores[attr] = importance
self.variance_scores[attr] = variance
logger.info(f"Feature importance calculated for {len(attributes)} attributes")
return self.importance_scores
def _calculate_variance(self, items: List, attribute: str) -> float:
"""Calculate variance"""
values = [
item.attributes.get(attribute) for item in items
if item.attributes.get(attribute) is not None
and not isinstance(item.attributes.get(attribute), list)
]
if not values:
return 0.0
unique_values = len(set(values))
total_items = len(items)
return min(1.0, unique_values / total_items) if total_items > 0 else 0.0
def _calculate_uniqueness(self, items: List, attribute: str) -> float:
"""Calculate uniqueness"""
value_counts = defaultdict(int)
total = 0
for item in items:
value = item.attributes.get(attribute)
if value is not None:
if isinstance(value, list):
for v in value:
value_counts[str(v)] += 1
total += 1
else:
value_counts[str(value)] += 1
total += 1
if total == 0:
return 0.0
gini = 1.0 - sum((count / total) ** 2 for count in value_counts.values())
return gini
def _calculate_coverage(self, items: List, attribute: str) -> float:
"""Calculate coverage"""
defined_count = sum(
1 for item in items
if item.attributes.get(attribute) is not None
)
return defined_count / len(items) if items else 0.0
def _calculate_discrimination_power(self, items: List, attribute: str) -> float:
"""Calculate discrimination power"""
value_groups = defaultdict(list)
for item in items:
value = item.attributes.get(attribute)
if value is not None:
if isinstance(value, list):
for v in value:
value_groups[str(v)].append(item)
else:
value_groups[str(value)].append(item)
if not value_groups:
return 0.0
total_items = len(items)
entropy = 0.0
for group in value_groups.values():
p = len(group) / total_items
if p > 0:
entropy -= p * math.log2(p)
max_entropy = math.log2(len(value_groups)) if len(value_groups) > 1 else 1.0
return entropy / max_entropy if max_entropy > 0 else 0.0
def get_importance(self, attribute: str) -> float:
"""Get importance score"""
return self.importance_scores.get(attribute, 0.5)
def get_top_features(self, n: int = 10) -> List[tuple]:
"""Get top N features"""
sorted_features = sorted(
self.importance_scores.items(),
key=lambda x: x[1],
reverse=True
)
return sorted_features[:n]