Spaces:
Running
Running
| """ | |
| SmartCertify ML β Math Utilities | |
| Linear algebra, statistics, and probability utilities. | |
| """ | |
| import numpy as np | |
| from scipy import stats | |
| from typing import List, Tuple, Optional | |
| # βββ Linear Algebra Utilities βββββββββββββββββββββββββββββββββ | |
| def cosine_similarity_vectors(a: np.ndarray, b: np.ndarray) -> float: | |
| """Compute cosine similarity between two vectors.""" | |
| norm_a = np.linalg.norm(a) | |
| norm_b = np.linalg.norm(b) | |
| if norm_a == 0 or norm_b == 0: | |
| return 0.0 | |
| return float(np.dot(a, b) / (norm_a * norm_b)) | |
| def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float: | |
| """Compute Euclidean distance between two vectors.""" | |
| return float(np.linalg.norm(a - b)) | |
| def matrix_rank(matrix: np.ndarray) -> int: | |
| """Compute rank of a matrix.""" | |
| return int(np.linalg.matrix_rank(matrix)) | |
| def compute_svd(matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: | |
| """Compute Singular Value Decomposition.""" | |
| U, S, Vt = np.linalg.svd(matrix, full_matrices=False) | |
| return U, S, Vt | |
| def normalize_vector(v: np.ndarray) -> np.ndarray: | |
| """L2-normalize a vector.""" | |
| norm = np.linalg.norm(v) | |
| if norm == 0: | |
| return v | |
| return v / norm | |
| # βββ Statistics Utilities βββββββββββββββββββββββββββββββββββββ | |
| def compute_confidence_interval( | |
| data: np.ndarray, confidence: float = 0.95 | |
| ) -> Tuple[float, float]: | |
| """Compute confidence interval for the mean of data.""" | |
| n = len(data) | |
| mean = np.mean(data) | |
| se = stats.sem(data) | |
| h = se * stats.t.ppf((1 + confidence) / 2, n - 1) | |
| return (float(mean - h), float(mean + h)) | |
| def compute_z_score(value: float, mean: float, std: float) -> float: | |
| """Compute z-score for a value given mean and standard deviation.""" | |
| if std == 0: | |
| return 0.0 | |
| return (value - mean) / std | |
| def compute_p_value(z_score: float, two_tailed: bool = True) -> float: | |
| """Compute p-value from z-score.""" | |
| p = 2 * (1 - stats.norm.cdf(abs(z_score))) if two_tailed else (1 - stats.norm.cdf(z_score)) | |
| return float(p) | |
| def ks_test(data: np.ndarray, distribution: str = "norm") -> Tuple[float, float]: | |
| """Kolmogorov-Smirnov test for distribution fit.""" | |
| statistic, p_value = stats.kstest(data, distribution) | |
| return float(statistic), float(p_value) | |
| def compute_entropy(probabilities: np.ndarray) -> float: | |
| """Compute Shannon entropy of a probability distribution.""" | |
| probabilities = probabilities[probabilities > 0] | |
| return float(-np.sum(probabilities * np.log2(probabilities))) | |
| def compute_kl_divergence(p: np.ndarray, q: np.ndarray) -> float: | |
| """Compute KL divergence D(P || Q).""" | |
| p = np.asarray(p, dtype=np.float64) | |
| q = np.asarray(q, dtype=np.float64) | |
| # Avoid division by zero | |
| mask = (p > 0) & (q > 0) | |
| return float(np.sum(p[mask] * np.log(p[mask] / q[mask]))) | |
| # βββ Probability Utilities ββββββββββββββββββββββββββββββββββββ | |
| def gaussian_probability(x: float, mean: float, std: float) -> float: | |
| """Compute probability density of x under Gaussian distribution.""" | |
| return float(stats.norm.pdf(x, loc=mean, scale=std)) | |
| def bayesian_update( | |
| prior: float, likelihood: float, evidence: float | |
| ) -> float: | |
| """Apply Bayes' theorem: P(H|E) = P(E|H) * P(H) / P(E).""" | |
| if evidence == 0: | |
| return 0.0 | |
| return (likelihood * prior) / evidence | |
| def softmax(x: np.ndarray) -> np.ndarray: | |
| """Compute softmax probabilities.""" | |
| e_x = np.exp(x - np.max(x)) | |
| return e_x / e_x.sum() | |
| # βββ Feature Analysis βββββββββββββββββββββββββββββββββββββββββ | |
| def compute_correlation_matrix(data: np.ndarray) -> np.ndarray: | |
| """Compute Pearson correlation matrix.""" | |
| return np.corrcoef(data, rowvar=False) | |
| def compute_mutual_information(x: np.ndarray, y: np.ndarray, bins: int = 20) -> float: | |
| """Compute mutual information between two variables.""" | |
| hist_2d, _, _ = np.histogram2d(x, y, bins=bins) | |
| pxy = hist_2d / hist_2d.sum() | |
| px = pxy.sum(axis=1) | |
| py = pxy.sum(axis=0) | |
| mi = 0.0 | |
| for i in range(bins): | |
| for j in range(bins): | |
| if pxy[i, j] > 0 and px[i] > 0 and py[j] > 0: | |
| mi += pxy[i, j] * np.log2(pxy[i, j] / (px[i] * py[j])) | |
| return mi | |