# ==============================================================================
# VSA ENCODING LAYER
# Thermometer, Categorical, and Ordinal Encoders
# Mirrors: atom_factory.rs (ThermometerEncoder, CategoricalEncoder, OrdinalEncoder)
# ==============================================================================

# --- Abstract Encoder ---

"""
    VSAEncoder

Abstract supertype of every encoder in this file. Each concrete encoder provides an
`encode(enc, value, d)` method that returns an atom of dimension `d`.
"""
abstract type VSAEncoder end

# --- Thermometer Encoder ---
# Numeric values → cumulative atom superposition
# Close values share many levels → high similarity
# Distant values share few levels → low similarity

"""
    ThermometerEncoder(reg, field_name, min_val, max_val; levels=100)

Encoder for numeric values in the range `[min_val, max_val]`, discretized into
`levels` steps. The convenience constructor derives the registry sector name
`"thermo_<field_name>"` and converts both bounds to `Float64`.
"""
struct ThermometerEncoder <: VSAEncoder
    reg::VSARegistry
    sector::String        # Registry sector for level atoms
    field_name::String    # Field identifier
    min_val::Float64
    max_val::Float64
    levels::Int           # Number of discretization levels
end

function ThermometerEncoder(
    reg::VSARegistry, field_name::String, min_val, max_val; levels=100
)
    return ThermometerEncoder(
        reg,
        "thermo_$(field_name)",
        field_name,
        Float64(min_val),
        Float64(max_val),
        levels,
    )
end

# Number of thermometer levels that `value` activates (always at least 1).
# `value` is clamped into [min_val, max_val] and normalized to [0, 1]; when the range
# is not strictly positive the midpoint 0.5 is used instead. Shared by `encode` and
# `expected_similarity` so the two can never disagree on the level count.
function _active_levels(enc::ThermometerEncoder, value::Real)
    range_size = enc.max_val - enc.min_val
    normalized = if range_size > 0
        clamped = clamp(Float64(value), enc.min_val, enc.max_val)
        (clamped - enc.min_val) / range_size
    else
        0.5
    end
    return max(1, ceil(Int, normalized * enc.levels))
end

"""
    encode(enc::ThermometerEncoder, value::Real, d::Int)

Encode `value` as the sum of circularly shifted copies of the sector's `"base"`
element, one copy per active level (shift `i` for level `i`, taken modulo `d`).

# Arguments
- `enc`: encoder holding the registry, sector, range and level count.
- `value`: number to encode; values outside `[min_val, max_val]` are clamped.
- `d`: dimension requested from the registry and length of the result vector.

# Returns
`Atom(SingleData(v))` where `v` is a freshly allocated `Vector{Float32}` of length `d`.
"""
function encode(enc::ThermometerEncoder, value::Real, d::Int)
    num_active = _active_levels(enc, value)

    # The accumulator is the only buffer allocated here.
    res_vec = zeros(Float32, d)

    # NOTE(review): reads `base.data.vec`, i.e. assumes the registry returns a
    # SingleData-backed atom whose vector has length `d` — confirm against get_element.
    base = get_element(enc.reg, enc.sector, "base", d)
    b_vec = base.data.vec

    for level in 1:num_active
        # Shifts wrap modulo d, so levels beyond d reuse earlier shifts.
        s = mod(level, d)
        if s == 0
            # Zero shift: add the unshifted base through the shared bundling helper.
            bundle!(res_vec, b_vec)
        else
            # Accumulate the base rotated right by s: res[j + s] += base[j] (wrapping).
            @inbounds for j in 1:d
                target_idx = j + s
                if target_idx > d
                    target_idx -= d
                end
                res_vec[target_idx] += b_vec[j]
            end
        end
    end

    return Atom(SingleData(res_vec))
end

"""
    expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)

Return, as a `Float32`, the ratio between the smaller and the larger number of levels
activated by `v1` and `v2`. Identical level counts give `1.0f0`.
"""
function expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)
    levels1 = _active_levels(enc, v1)
    levels2 = _active_levels(enc, v2)
    shared = min(levels1, levels2)
    total = max(levels1, levels2)
    # Both counts are at least 1, so the division is always well defined.
    return Float32(shared / total)
end

# --- Categorical Encoder ---
# Discrete labels → orthogonal atoms from Registry
# Each category gets its own stable random atom

"""
    CategoricalEncoder(reg, field_name, categories)

Encoder for unordered discrete labels. The convenience constructor derives the
registry sector name `"cat_<field_name>"`. `categories` is stored but not consulted
by `encode`.
"""
struct CategoricalEncoder <: VSAEncoder
    reg::VSARegistry
    sector::String
    field_name::String
    categories::Vector{String}
end

function CategoricalEncoder(
    reg::VSARegistry, field_name::String, categories::Vector{String}
)
    return CategoricalEncoder(reg, "cat_$(field_name)", field_name, categories)
end

"""
    encode(enc::CategoricalEncoder, value::String, d::Int)

Return the registry element stored under `value` in the encoder's sector, at
dimension `d`. `value` is not checked against `enc.categories`.
"""
function encode(enc::CategoricalEncoder, value::String, d::Int)
    return get_element(enc.reg, enc.sector, value, d)
end

# --- Ordinal Encoder ---
# Ordered discrete values → indexed atoms with progressive similarity

"""
    OrdinalEncoder(reg, field_name, values)

Encoder for ordered discrete values. The convenience constructor derives the registry
sector name `"ord_<field_name>"`. `values` is stored but not consulted by `encode`.
"""
struct OrdinalEncoder <: VSAEncoder
    reg::VSARegistry
    sector::String
    field_name::String
    values::Vector{String}
end

function OrdinalEncoder(reg::VSARegistry, field_name::String, values::Vector{String})
    return OrdinalEncoder(reg, "ord_$(field_name)", field_name, values)
end

"""
    encode(enc::OrdinalEncoder, value::String, d::Int)

Return the registry element stored under `value` in the encoder's sector, at
dimension `d`.
"""
function encode(enc::OrdinalEncoder, value::String, d::Int)
    # NOTE(review): this is the same lookup as the categorical encoder; the position of
    # `value` within `enc.values` is not used, so no ordering is encoded here. Confirm
    # whether the "progressive similarity" promised above is still to be implemented.
    return get_element(enc.reg, enc.sector, value, d)
end

# --- Permutation Helper ---
# Circular shift of atom vector (used by Thermometer levels)

"""
    permute_atom(atom::Atom, shift::Int)

Circularly shift the contents of `atom` by `shift` positions, so that element `i` of
the result is element `i - shift` of the input (wrapping around).

Supports `SingleData` (the result holds a new `Vector{Float32}`) and `BinaryData`
(bits are shifted within the first `dim` positions). The original `atom` is returned
unchanged when the effective shift is zero, when the atom is empty, or when its data
is of any other kind.
"""
function permute_atom(atom::Atom, shift::Int)
    if atom.data isa SingleData
        return _permute_single(atom, shift)
    elseif atom.data isa BinaryData
        return _permute_binary(atom, shift)
    end
    return atom
end

# Rotate a SingleData-backed atom; returns `atom` itself when nothing would change.
function _permute_single(atom, shift::Int)
    src = atom.data.vec
    d = length(src)
    # An empty vector has nothing to rotate (and mod(shift, 0) would throw).
    d == 0 && return atom
    s = mod(shift, d)
    s == 0 && return atom

    # circshift! writes dest[i] = src[i - s] with wrap-around.
    rotated = circshift!(Vector{Float32}(undef, d), src, s)
    return Atom(SingleData(rotated))
end

# Rotate the first `dim` bits of a BinaryData-backed atom stored in 64-bit chunks;
# returns `atom` itself when nothing would change.
function _permute_binary(atom, shift::Int)
    chunks = atom.data.chunks
    dim = atom.data.dim
    # A zero-dimension atom has nothing to rotate (and mod(shift, 0) would throw).
    dim == 0 && return atom
    s = mod(shift, dim)
    s == 0 && return atom

    # Bit-by-bit copy: simple rather than fast. Chunk-level shifting would be quicker.
    new_chunks = zeros(UInt64, length(chunks))
    @inbounds for i in 1:dim
        src_idx = i - s
        if src_idx < 1
            src_idx += dim
        end

        # Locate the source bit: chunk index is 1-based, bit offset is 0-based.
        sc_idx = ((src_idx - 1) ÷ 64) + 1
        sb_idx = (src_idx - 1) % 64

        if ((chunks[sc_idx] >> sb_idx) & 1) == 1
            dc_idx = ((i - 1) ÷ 64) + 1
            db_idx = (i - 1) % 64
            new_chunks[dc_idx] |= UInt64(1) << db_idx
        end
    end
    return Atom(BinaryData(new_chunks, dim))
end

# --- Schema Definition ---

"""
    FieldSchema(name, encoder)

Pairs a field name with the encoder assigned to that field.
"""
struct FieldSchema
    name::String
    encoder::VSAEncoder
end