|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use crate::core::Point; |
|
|
|
|
|
|
|
|
#[derive(Debug, Clone)] |
|
|
pub struct SubspaceConfig { |
|
|
|
|
|
pub rank: usize, |
|
|
|
|
|
|
|
|
pub min_points_for_subspace: usize, |
|
|
|
|
|
|
|
|
pub subspace_weight: f32, |
|
|
|
|
|
|
|
|
|
|
|
pub incremental_covariance: bool, |
|
|
} |
|
|
|
|
|
impl Default for SubspaceConfig { |
|
|
fn default() -> Self { |
|
|
Self { |
|
|
rank: 3, |
|
|
min_points_for_subspace: 5, |
|
|
subspace_weight: 0.3, |
|
|
incremental_covariance: false, |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
impl SubspaceConfig { |
|
|
pub fn new() -> Self { |
|
|
Self::default() |
|
|
} |
|
|
|
|
|
pub fn with_rank(mut self, rank: usize) -> Self { |
|
|
self.rank = rank; |
|
|
self |
|
|
} |
|
|
|
|
|
pub fn with_subspace_weight(mut self, weight: f32) -> Self { |
|
|
self.subspace_weight = weight.clamp(0.0, 1.0); |
|
|
self |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Clone)] |
|
|
pub struct Subspace { |
|
|
|
|
|
pub centroid: Point, |
|
|
|
|
|
|
|
|
|
|
|
pub principal_directions: Vec<Point>, |
|
|
|
|
|
|
|
|
|
|
|
pub eigenvalues: Vec<f32>, |
|
|
|
|
|
|
|
|
pub point_count: usize, |
|
|
|
|
|
|
|
|
accumulated_sum: Vec<f32>, |
|
|
|
|
|
|
|
|
|
|
|
accumulated_outer_product: Vec<f32>, |
|
|
} |
|
|
|
|
|
impl Subspace { |
|
|
|
|
|
pub fn new(dimensionality: usize) -> Self { |
|
|
Self { |
|
|
centroid: Point::origin(dimensionality), |
|
|
principal_directions: Vec::new(), |
|
|
eigenvalues: Vec::new(), |
|
|
point_count: 0, |
|
|
accumulated_sum: vec![0.0; dimensionality], |
|
|
|
|
|
accumulated_outer_product: vec![0.0; dimensionality * (dimensionality + 1) / 2], |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
pub fn from_point(point: &Point) -> Self { |
|
|
Self { |
|
|
centroid: point.clone(), |
|
|
principal_directions: Vec::new(), |
|
|
eigenvalues: Vec::new(), |
|
|
point_count: 1, |
|
|
accumulated_sum: point.dims().to_vec(), |
|
|
accumulated_outer_product: Self::outer_product_upper(point.dims()), |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
pub fn dimensionality(&self) -> usize { |
|
|
self.centroid.dimensionality() |
|
|
} |
|
|
|
|
|
|
|
|
pub fn has_subspace(&self) -> bool { |
|
|
!self.principal_directions.is_empty() |
|
|
} |
|
|
|
|
|
|
|
|
pub fn rank(&self) -> usize { |
|
|
self.principal_directions.len() |
|
|
} |
|
|
|
|
|
|
|
|
fn outer_product_upper(v: &[f32]) -> Vec<f32> { |
|
|
let n = v.len(); |
|
|
let mut result = vec![0.0; n * (n + 1) / 2]; |
|
|
let mut idx = 0; |
|
|
for i in 0..n { |
|
|
for j in i..n { |
|
|
result[idx] = v[i] * v[j]; |
|
|
idx += 1; |
|
|
} |
|
|
} |
|
|
result |
|
|
} |
|
|
|
|
|
|
|
|
fn get_upper(&self, i: usize, j: usize) -> f32 { |
|
|
let (row, col) = if i <= j { (i, j) } else { (j, i) }; |
|
|
let n = self.dimensionality(); |
|
|
|
|
|
let idx = row * (2 * n - row - 1) / 2 + col; |
|
|
self.accumulated_outer_product[idx] |
|
|
} |
|
|
|
|
|
|
|
|
fn add_to_upper(&mut self, i: usize, j: usize, value: f32) { |
|
|
let (row, col) = if i <= j { (i, j) } else { (j, i) }; |
|
|
let n = self.dimensionality(); |
|
|
let idx = row * (2 * n - row - 1) / 2 + col; |
|
|
self.accumulated_outer_product[idx] += value; |
|
|
} |
|
|
|
|
|
|
|
|
pub fn add_point(&mut self, point: &Point) { |
|
|
let dims = point.dims(); |
|
|
|
|
|
|
|
|
for (i, &v) in dims.iter().enumerate() { |
|
|
self.accumulated_sum[i] += v; |
|
|
} |
|
|
|
|
|
|
|
|
for i in 0..dims.len() { |
|
|
for j in i..dims.len() { |
|
|
self.add_to_upper(i, j, dims[i] * dims[j]); |
|
|
} |
|
|
} |
|
|
|
|
|
self.point_count += 1; |
|
|
|
|
|
|
|
|
let n = self.point_count as f32; |
|
|
let centroid_dims: Vec<f32> = self.accumulated_sum.iter() |
|
|
.map(|&s| s / n) |
|
|
.collect(); |
|
|
self.centroid = Point::new(centroid_dims).normalize(); |
|
|
} |
|
|
|
|
|
|
|
|
fn compute_covariance(&self) -> Vec<Vec<f32>> { |
|
|
let n = self.dimensionality(); |
|
|
let count = self.point_count as f32; |
|
|
|
|
|
if count < 2.0 { |
|
|
return vec![vec![0.0; n]; n]; |
|
|
} |
|
|
|
|
|
|
|
|
let mean: Vec<f32> = self.accumulated_sum.iter() |
|
|
.map(|&s| s / count) |
|
|
.collect(); |
|
|
|
|
|
|
|
|
let mut cov = vec![vec![0.0; n]; n]; |
|
|
for i in 0..n { |
|
|
for j in i..n { |
|
|
let exx = self.get_upper(i, j) / count; |
|
|
let exex = mean[i] * mean[j]; |
|
|
let c = exx - exex; |
|
|
cov[i][j] = c; |
|
|
cov[j][i] = c; |
|
|
} |
|
|
} |
|
|
|
|
|
cov |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
pub fn recompute_subspace(&mut self, rank: usize) { |
|
|
if self.point_count < 3 { |
|
|
|
|
|
self.principal_directions.clear(); |
|
|
self.eigenvalues.clear(); |
|
|
return; |
|
|
} |
|
|
|
|
|
let cov = self.compute_covariance(); |
|
|
let n = self.dimensionality(); |
|
|
|
|
|
|
|
|
let mut directions = Vec::new(); |
|
|
let mut values = Vec::new(); |
|
|
let mut working_cov = cov.clone(); |
|
|
|
|
|
for _ in 0..rank.min(n) { |
|
|
|
|
|
let (eigval, eigvec) = self.power_iteration(&working_cov, 50); |
|
|
|
|
|
if eigval < 1e-8 { |
|
|
break; |
|
|
} |
|
|
|
|
|
values.push(eigval); |
|
|
directions.push(Point::new(eigvec.clone()).normalize()); |
|
|
|
|
|
|
|
|
for i in 0..n { |
|
|
for j in 0..n { |
|
|
working_cov[i][j] -= eigval * eigvec[i] * eigvec[j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
self.principal_directions = directions; |
|
|
self.eigenvalues = values; |
|
|
} |
|
|
|
|
|
|
|
|
fn power_iteration(&self, matrix: &[Vec<f32>], max_iters: usize) -> (f32, Vec<f32>) { |
|
|
let n = matrix.len(); |
|
|
|
|
|
|
|
|
let mut v: Vec<f32> = (0..n).map(|i| 1.0 + (i as f32) * 0.1).collect(); |
|
|
let mut norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt(); |
|
|
for x in &mut v { |
|
|
*x /= norm; |
|
|
} |
|
|
|
|
|
let mut eigenvalue = 0.0f32; |
|
|
|
|
|
for _ in 0..max_iters { |
|
|
|
|
|
let mut v_new = vec![0.0; n]; |
|
|
for i in 0..n { |
|
|
for j in 0..n { |
|
|
v_new[i] += matrix[i][j] * v[j]; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
eigenvalue = v_new.iter().zip(v.iter()).map(|(a, b)| a * b).sum(); |
|
|
|
|
|
|
|
|
norm = v_new.iter().map(|x| x * x).sum::<f32>().sqrt(); |
|
|
if norm < 1e-10 { |
|
|
return (0.0, vec![0.0; n]); |
|
|
} |
|
|
|
|
|
let converged = v.iter().zip(v_new.iter()) |
|
|
.map(|(a, b)| (a - b / norm).abs()) |
|
|
.sum::<f32>() < 1e-8; |
|
|
|
|
|
for i in 0..n { |
|
|
v[i] = v_new[i] / norm; |
|
|
} |
|
|
|
|
|
if converged { |
|
|
break; |
|
|
} |
|
|
} |
|
|
|
|
|
(eigenvalue.abs(), v) |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn subspace_similarity(a: &Subspace, b: &Subspace) -> f32 { |
|
|
|
|
|
if !a.has_subspace() || !b.has_subspace() { |
|
|
return centroid_similarity(&a.centroid, &b.centroid); |
|
|
} |
|
|
|
|
|
|
|
|
let rank_a = a.rank(); |
|
|
let rank_b = b.rank(); |
|
|
let k = rank_a.min(rank_b); |
|
|
|
|
|
if k == 0 { |
|
|
return centroid_similarity(&a.centroid, &b.centroid); |
|
|
} |
|
|
|
|
|
|
|
|
let mut m = vec![vec![0.0f32; rank_b]; rank_a]; |
|
|
for i in 0..rank_a { |
|
|
for j in 0..rank_b { |
|
|
let dot: f32 = a.principal_directions[i].dims().iter() |
|
|
.zip(b.principal_directions[j].dims().iter()) |
|
|
.map(|(x, y)| x * y) |
|
|
.sum(); |
|
|
m[i][j] = dot; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
let cos_angles = greedy_max_matching(&m, k); |
|
|
|
|
|
|
|
|
let similarity: f32 = cos_angles.iter() |
|
|
.map(|&c| c * c) |
|
|
.sum::<f32>() / k as f32; |
|
|
|
|
|
similarity |
|
|
} |
|
|
|
|
|
|
|
|
fn greedy_max_matching(m: &[Vec<f32>], k: usize) -> Vec<f32> { |
|
|
let rows = m.len(); |
|
|
let cols = if rows > 0 { m[0].len() } else { 0 }; |
|
|
|
|
|
let mut used_rows = vec![false; rows]; |
|
|
let mut used_cols = vec![false; cols]; |
|
|
let mut result = Vec::new(); |
|
|
|
|
|
for _ in 0..k { |
|
|
let mut best = (0, 0, 0.0f32); |
|
|
|
|
|
for i in 0..rows { |
|
|
if used_rows[i] { continue; } |
|
|
for j in 0..cols { |
|
|
if used_cols[j] { continue; } |
|
|
let val = m[i][j].abs(); |
|
|
if val > best.2 { |
|
|
best = (i, j, val); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
if best.2 > 0.0 { |
|
|
used_rows[best.0] = true; |
|
|
used_cols[best.1] = true; |
|
|
result.push(best.2); |
|
|
} else { |
|
|
break; |
|
|
} |
|
|
} |
|
|
|
|
|
result |
|
|
} |
|
|
|
|
|
|
|
|
fn centroid_similarity(a: &Point, b: &Point) -> f32 { |
|
|
let dot: f32 = a.dims().iter() |
|
|
.zip(b.dims().iter()) |
|
|
.map(|(x, y)| x * y) |
|
|
.sum(); |
|
|
dot.clamp(-1.0, 1.0) |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn combined_subspace_similarity( |
|
|
query: &Point, |
|
|
container: &Subspace, |
|
|
config: &SubspaceConfig, |
|
|
) -> f32 { |
|
|
let centroid_sim = centroid_similarity(query, &container.centroid); |
|
|
|
|
|
if !container.has_subspace() || config.subspace_weight < 1e-6 { |
|
|
return centroid_sim; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
let subspace_sim = query_subspace_alignment(query, container); |
|
|
|
|
|
|
|
|
let w = config.subspace_weight; |
|
|
(1.0 - w) * centroid_sim + w * subspace_sim |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn query_subspace_alignment(query: &Point, subspace: &Subspace) -> f32 { |
|
|
if !subspace.has_subspace() { |
|
|
return centroid_similarity(query, &subspace.centroid); |
|
|
} |
|
|
|
|
|
|
|
|
let centered: Vec<f32> = query.dims().iter() |
|
|
.zip(subspace.centroid.dims().iter()) |
|
|
.map(|(q, c)| q - c) |
|
|
.collect(); |
|
|
|
|
|
let centered_norm: f32 = centered.iter().map(|x| x * x).sum::<f32>().sqrt(); |
|
|
if centered_norm < 1e-10 { |
|
|
|
|
|
return 1.0; |
|
|
} |
|
|
|
|
|
|
|
|
let mut total_proj_sq = 0.0f32; |
|
|
for (dir, &eigenval) in subspace.principal_directions.iter().zip(subspace.eigenvalues.iter()) { |
|
|
let proj: f32 = centered.iter() |
|
|
.zip(dir.dims().iter()) |
|
|
.map(|(c, d)| c * d) |
|
|
.sum(); |
|
|
|
|
|
|
|
|
|
|
|
let weight = (eigenval / subspace.eigenvalues[0]).sqrt(); |
|
|
total_proj_sq += proj * proj * weight; |
|
|
} |
|
|
|
|
|
|
|
|
let alignment = (total_proj_sq / (centered_norm * centered_norm)).min(1.0); |
|
|
|
|
|
|
|
|
let centroid_sim = centroid_similarity(query, &subspace.centroid); |
|
|
|
|
|
|
|
|
(centroid_sim + alignment) / 2.0 |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn subspace_spread(subspace: &Subspace) -> f32 { |
|
|
if subspace.eigenvalues.is_empty() { |
|
|
return 0.0; |
|
|
} |
|
|
|
|
|
|
|
|
subspace.eigenvalues.iter().sum() |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn subspace_isotropy(subspace: &Subspace) -> f32 { |
|
|
if subspace.eigenvalues.len() < 2 { |
|
|
return 1.0; |
|
|
} |
|
|
|
|
|
|
|
|
let max = subspace.eigenvalues[0]; |
|
|
let min = *subspace.eigenvalues.last().unwrap(); |
|
|
|
|
|
if max < 1e-10 { |
|
|
return 1.0; |
|
|
} |
|
|
|
|
|
min / max |
|
|
} |
|
|
|
|
|
#[cfg(test)] |
|
|
mod tests { |
|
|
use super::*; |
|
|
|
|
|
fn make_point(v: Vec<f32>) -> Point { |
|
|
Point::new(v).normalize() |
|
|
} |
|
|
|
|
|
#[test] |
|
|
fn test_subspace_creation() { |
|
|
let mut subspace = Subspace::new(3); |
|
|
|
|
|
|
|
|
subspace.add_point(&make_point(vec![1.0, 0.0, 0.0])); |
|
|
subspace.add_point(&make_point(vec![0.9, 0.1, 0.0])); |
|
|
subspace.add_point(&make_point(vec![0.8, 0.2, 0.0])); |
|
|
subspace.add_point(&make_point(vec![0.7, 0.3, 0.1])); |
|
|
subspace.add_point(&make_point(vec![0.6, 0.4, 0.1])); |
|
|
|
|
|
assert_eq!(subspace.point_count, 5); |
|
|
|
|
|
|
|
|
subspace.recompute_subspace(2); |
|
|
|
|
|
assert!(subspace.has_subspace()); |
|
|
assert!(subspace.rank() > 0); |
|
|
assert!(!subspace.eigenvalues.is_empty()); |
|
|
|
|
|
println!("Centroid: {:?}", subspace.centroid.dims()); |
|
|
println!("Principal directions: {}", subspace.rank()); |
|
|
println!("Eigenvalues: {:?}", subspace.eigenvalues); |
|
|
} |
|
|
|
|
|
#[test] |
|
|
fn test_subspace_similarity() { |
|
|
let mut a = Subspace::new(3); |
|
|
let mut b = Subspace::new(3); |
|
|
|
|
|
|
|
|
for i in 0..10 { |
|
|
let x = 1.0 - i as f32 * 0.05; |
|
|
let y = i as f32 * 0.05; |
|
|
a.add_point(&make_point(vec![x, y, 0.0])); |
|
|
} |
|
|
|
|
|
|
|
|
for i in 0..10 { |
|
|
let x = 0.95 - i as f32 * 0.04; |
|
|
let y = i as f32 * 0.04 + 0.05; |
|
|
b.add_point(&make_point(vec![x, y, 0.1])); |
|
|
} |
|
|
|
|
|
a.recompute_subspace(2); |
|
|
b.recompute_subspace(2); |
|
|
|
|
|
let sim = subspace_similarity(&a, &b); |
|
|
println!("Similarity between similar subspaces: {:.3}", sim); |
|
|
assert!(sim > 0.5, "Similar subspaces should have high similarity"); |
|
|
|
|
|
|
|
|
let mut c = Subspace::new(3); |
|
|
for i in 0..10 { |
|
|
let z = 1.0 - i as f32 * 0.05; |
|
|
c.add_point(&make_point(vec![0.0, 0.1, z])); |
|
|
} |
|
|
c.recompute_subspace(2); |
|
|
|
|
|
let sim_ac = subspace_similarity(&a, &c); |
|
|
println!("Similarity between orthogonal subspaces: {:.3}", sim_ac); |
|
|
assert!(sim_ac < sim, "Orthogonal subspaces should have lower similarity"); |
|
|
} |
|
|
|
|
|
#[test] |
|
|
fn test_query_alignment() { |
|
|
let mut subspace = Subspace::new(3); |
|
|
|
|
|
|
|
|
for i in 0..20 { |
|
|
let x = 0.8 + (i % 3) as f32 * 0.1; |
|
|
let y = (i as f32 * 0.05) % 0.3; |
|
|
subspace.add_point(&make_point(vec![x, y, 0.05])); |
|
|
} |
|
|
subspace.recompute_subspace(2); |
|
|
|
|
|
|
|
|
let aligned_query = make_point(vec![0.9, 0.1, 0.0]); |
|
|
let aligned_score = query_subspace_alignment(&aligned_query, &subspace); |
|
|
|
|
|
|
|
|
let orthogonal_query = make_point(vec![0.0, 0.0, 1.0]); |
|
|
let orthogonal_score = query_subspace_alignment(&orthogonal_query, &subspace); |
|
|
|
|
|
println!("Aligned query score: {:.3}", aligned_score); |
|
|
println!("Orthogonal query score: {:.3}", orthogonal_score); |
|
|
|
|
|
assert!(aligned_score > orthogonal_score, |
|
|
"Aligned query should score higher than orthogonal query"); |
|
|
} |
|
|
|
|
|
#[test] |
|
|
fn test_spread_and_isotropy() { |
|
|
let mut tight = Subspace::new(3); |
|
|
let mut spread_out = Subspace::new(3); |
|
|
|
|
|
|
|
|
for _ in 0..20 { |
|
|
tight.add_point(&make_point(vec![0.9, 0.1, 0.05])); |
|
|
} |
|
|
|
|
|
|
|
|
for i in 0..20 { |
|
|
let angle = i as f32 * 0.3; |
|
|
spread_out.add_point(&make_point(vec![ |
|
|
angle.cos(), |
|
|
angle.sin(), |
|
|
0.1 |
|
|
])); |
|
|
} |
|
|
|
|
|
tight.recompute_subspace(3); |
|
|
spread_out.recompute_subspace(3); |
|
|
|
|
|
let tight_spread = subspace_spread(&tight); |
|
|
let wide_spread = subspace_spread(&spread_out); |
|
|
|
|
|
println!("Tight cluster spread: {:.6}", tight_spread); |
|
|
println!("Wide cluster spread: {:.6}", wide_spread); |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
|