# File size: 1,686 bytes (revision 874b2d8)
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from typing import Tuple, List
def load_data(filepath: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    frame = pd.read_csv(filepath)
    return frame
def extract_features(df: pd.DataFrame, feature_cols: List[str]) -> np.ndarray:
    """
    Extract a 2D feature matrix from selected DataFrame columns.

    Args:
        df: Input DataFrame.
        feature_cols: Column names to use as features. Any iterable of
            names is accepted (list, tuple, pandas Index, ...); a single
            bare string is treated as one column name.

    Returns:
        2D NumPy array with one row per DataFrame row and one column per
        requested feature, in the order given.

    Raises:
        KeyError: If a requested column is not present in ``df``
            (raised by pandas during column selection).
    """
    if isinstance(feature_cols, str):
        # A bare string would select a Series and yield a 1D array;
        # wrap it so the result is always a matrix.
        columns = [feature_cols]
    else:
        # Materialise as a list: pandas interprets a tuple as a single
        # column key rather than as a collection of column names.
        columns = list(feature_cols)
    return df[columns].to_numpy()
def fit_kmeans(
    X: np.ndarray,
    n_clusters: int,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Cluster the rows of X with KMeans.

    Args:
        X: Feature matrix to cluster.
        n_clusters: How many clusters to form.
        random_state: Seed handed to KMeans for reproducibility.

    Returns:
        A pair ``(labels, centers)``: the result of ``fit_predict(X)``
        and the fitted model's ``cluster_centers_``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    assignments = model.fit_predict(X)
    centers = model.cluster_centers_
    return assignments, centers
def calculate_wcss(
    X: np.ndarray,
    max_clusters: int = 10,
    random_state: int = 42
) -> List[float]:
    """
    Compute within-cluster sum of squares (inertia) for k = 1..max_clusters.

    Args:
        X: Feature matrix.
        max_clusters: Largest number of clusters to evaluate (inclusive).
            Values below 1 produce an empty list.
        random_state: Seed passed to every KMeans fit. Defaults to 42,
            the value that was previously hard-coded, so existing callers
            get the same results.

    Returns:
        List of inertia values; element ``i`` corresponds to ``k = i + 1``.
    """
    # KMeans.fit returns the fitted estimator, so inertia_ can be read
    # directly off the fit call.
    return [
        KMeans(n_clusters=k, random_state=random_state).fit(X).inertia_
        for k in range(1, max_clusters + 1)
    ]