Spaces:
Sleeping
Sleeping
| # ============================================================================== | |
| # VSA ENCODING LAYER | |
| # Thermometer, Categorical, and Ordinal Encoders | |
| # Mirrors: atom_factory.rs (ThermometerEncoder, CategoricalEncoder, OrdinalEncoder) | |
| # ============================================================================== | |
| # --- Abstract Encoder --- | |
"""
    VSAEncoder

Abstract supertype of the field encoders defined in this file
(`ThermometerEncoder`, `CategoricalEncoder`, `OrdinalEncoder`).

Each concrete encoder provides an `encode(enc, value, d)` method, where `d` is the
requested vector dimension.
"""
abstract type VSAEncoder end
| # --- Thermometer Encoder --- | |
| # Numeric values → cumulative atom superposition | |
| # Close values share many levels → high similarity | |
| # Distant values share few levels → low similarity | |
"""
    ThermometerEncoder <: VSAEncoder

Encoder for numeric values over a fixed range, discretised into `levels` steps.

# Fields
- `reg`: registry the base atom is fetched from.
- `sector`: registry sector for the level atoms.
- `field_name`: field identifier.
- `min_val`, `max_val`: bounds of the encoded range; inputs are clamped to them
  by `encode`.
- `levels`: number of discretization levels.
"""
struct ThermometerEncoder <: VSAEncoder
    reg::VSARegistry
    sector::String       # registry sector for level atoms
    field_name::String   # field identifier
    min_val::Float64     # lower bound of the encoded range
    max_val::Float64     # upper bound of the encoded range
    levels::Int          # number of discretization levels
end
"""
    ThermometerEncoder(reg, field_name, min_val, max_val; levels=100)

Convenience constructor: derives the registry sector name from `field_name`
(`"thermo_<field_name>"`) and converts both bounds to `Float64`.

# Keywords
- `levels`: number of discretization levels (default `100`).
"""
function ThermometerEncoder(
    reg::VSARegistry, field_name::String, min_val, max_val; levels=100
)
    sector = "thermo_$(field_name)"
    lower = Float64(min_val)
    upper = Float64(max_val)
    return ThermometerEncoder(reg, sector, field_name, lower, upper, levels)
end
"""
    encode(enc::ThermometerEncoder, value::Real, d::Int)

Encode `value` as the sum of circularly shifted copies of the encoder's `"base"`
registry vector.

`value` is clamped to `[enc.min_val, enc.max_val]` and normalised to `[0, 1]`
(a non-positive range normalises to `0.5`). The number of active levels is
`max(1, ceil(normalized * enc.levels))`; level `i` contributes the base vector
shifted right by `mod(i, d)` positions. Nearby values therefore share most of
their shifted components.

# Arguments
- `enc`: the thermometer encoder (range, level count, registry and sector).
- `value`: number to encode.
- `d`: vector dimension; also used as the modulus for the shifts.

# Returns
An `Atom` wrapping a `SingleData` whose vector is a `Float32` vector of length `d`.

The base element is read through `base.data.vec`, i.e. it is assumed to carry
`SingleData` with at least `d` entries — TODO confirm against `get_element`.
"""
function encode(enc::ThermometerEncoder, value::Real, d::Int)
    # Clamp to range, then normalise to [0, 1].
    v = clamp(Float64(value), enc.min_val, enc.max_val)
    range_size = enc.max_val - enc.min_val
    normalized = range_size > 0 ? (v - enc.min_val) / range_size : 0.5

    # How many levels to activate (thermometer style); always at least one.
    num_active = max(1, ceil(Int, normalized * enc.levels))

    # The accumulator is the only buffer this function allocates itself.
    res_vec = zeros(Float32, d)
    base = get_element(enc.reg, enc.sector, "base", d)
    b_vec = base.data.vec

    for i in 1:num_active
        s = mod(i, d)
        if s == 0
            # Shift is a whole multiple of d: the unshifted base vector is bundled.
            bundle!(res_vec, b_vec)
        else
            # Accumulate the base vector shifted right by s with wrap-around:
            # b[1:d-s] lands on res[s+1:d], b[d-s+1:d] wraps onto res[1:s].
            @views res_vec[(s + 1):d] .+= b_vec[1:(d - s)]
            @views res_vec[1:s] .+= b_vec[(d - s + 1):d]
        end
    end

    return Atom(SingleData(res_vec))
end
"""
    expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)

Return the ratio of shared to total active thermometer levels for `v1` and `v2`.

Each value is normalised to `[0, 1]` over `[enc.min_val, enc.max_val]` (a
non-positive range normalises to `0.5`) and mapped to
`max(1, ceil(normalized * enc.levels))` active levels. The result is
`min(levels1, levels2) / max(levels1, levels2)` as a `Float32` in `(0, 1]`.

NOTE(review): this is an overlap ratio of level counts; whether it matches the
similarity measure actually applied to encoded atoms is not visible here — confirm.
"""
function expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)
    range_size = enc.max_val - enc.min_val

    # Number of thermometer levels a value activates (same rule as `encode`).
    function active_levels(v::Real)
        n = range_size > 0 ? clamp((Float64(v) - enc.min_val) / range_size, 0, 1) : 0.5
        return max(1, ceil(Int, n * enc.levels))
    end

    levels1 = active_levels(v1)
    levels2 = active_levels(v2)
    shared = min(levels1, levels2)
    total = max(levels1, levels2)

    # Both counts are at least 1, so the division is always well defined.
    return Float32(shared / total)
end
| # --- Categorical Encoder --- | |
| # Discrete labels → orthogonal atoms from Registry | |
| # Each category gets its own stable random atom | |
"""
    CategoricalEncoder <: VSAEncoder

Encoder for discrete labels; each label is looked up as its own registry element.

# Fields
- `reg`: registry the label atoms are fetched from.
- `sector`: registry sector used for this field's labels.
- `field_name`: field identifier.
- `categories`: the declared label set.
"""
struct CategoricalEncoder <: VSAEncoder
    reg::VSARegistry
    sector::String
    field_name::String
    categories::Vector{String}
end
"""
    CategoricalEncoder(reg, field_name, categories)

Convenience constructor: derives the registry sector name from `field_name`
(`"cat_<field_name>"`).
"""
function CategoricalEncoder(
    reg::VSARegistry, field_name::String, categories::Vector{String}
)
    sector = "cat_$(field_name)"
    return CategoricalEncoder(reg, sector, field_name, categories)
end
"""
    encode(enc::CategoricalEncoder, value::String, d::Int)

Return the registry element stored under `value` in this encoder's sector, at
dimension `d`.

`value` is passed straight to `get_element`; it is not checked against
`enc.categories`.
"""
function encode(enc::CategoricalEncoder, value::String, d::Int)
    registry, sector = enc.reg, enc.sector
    return get_element(registry, sector, value, d)
end
| # --- Ordinal Encoder --- | |
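| # Ordered discrete values → one registry atom per value. Progressive similarity between neighbouring values is the stated intent, but encode() below currently does a plain per-value registry lookup (same as CategoricalEncoder) | |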
"""
    OrdinalEncoder <: VSAEncoder

Encoder for ordered discrete values.

# Fields
- `reg`: registry the value atoms are fetched from.
- `sector`: registry sector used for this field's values.
- `field_name`: field identifier.
- `values`: the declared values, in order.
"""
struct OrdinalEncoder <: VSAEncoder
    reg::VSARegistry
    sector::String
    field_name::String
    values::Vector{String}
end
"""
    OrdinalEncoder(reg, field_name, values)

Convenience constructor: derives the registry sector name from `field_name`
(`"ord_<field_name>"`).
"""
function OrdinalEncoder(reg::VSARegistry, field_name::String, values::Vector{String})
    sector = "ord_$(field_name)"
    return OrdinalEncoder(reg, sector, field_name, values)
end
"""
    encode(enc::OrdinalEncoder, value::String, d::Int)

Return the registry element stored under `value` in this encoder's sector, at
dimension `d`.

NOTE(review): this is a plain per-value lookup; neither the position of `value`
in `enc.values` nor `enc.values` itself is consulted, so any similarity between
neighbouring values would have to come from `get_element` — confirm intent.
"""
function encode(enc::OrdinalEncoder, value::String, d::Int)
    registry, sector = enc.reg, enc.sector
    return get_element(registry, sector, value, d)
end
| # --- Permutation Helper --- | |
| # Circular shift of atom vector (used by Thermometer levels) | |
"""
    permute_atom(atom::Atom, shift::Int)

Return `atom` with its contents circularly shifted by `shift` positions.

The element (or bit) at position `i` moves to position `i + shift`, wrapping
around the end; negative shifts move the other way.

- `SingleData`: the vector is shifted and returned as a new `Vector{Float32}`.
- `BinaryData`: the first `dim` bits of `chunks` (64 bits per chunk) are shifted.
- Any other data kind: `atom` is returned unchanged.

When the effective shift is zero, or the atom is empty, the same `atom` object
is returned (not a copy).
"""
function permute_atom(atom::Atom, shift::Int)
    return _permute_data(atom, atom.data, shift)
end

# Fallback: data kinds without a shift rule are passed through untouched.
_permute_data(atom::Atom, data, shift::Int) = atom

# Circular shift of a dense vector payload.
function _permute_data(atom::Atom, data::SingleData, shift::Int)
    vec = data.vec
    d = length(vec)
    d == 0 && return atom          # nothing to shift; also avoids mod(shift, 0)
    s = mod(shift, d)
    s == 0 && return atom
    # circshift puts vec[i - s] (wrapped) at index i; the result is kept Float32.
    shifted = convert(Vector{Float32}, circshift(vec, s))
    return Atom(SingleData(shifted))
end

# Circular shift of a bit-packed payload.
function _permute_data(atom::Atom, data::BinaryData, shift::Int)
    chunks = data.chunks
    dim = data.dim
    dim == 0 && return atom        # nothing to shift; also avoids mod(shift, 0)
    s = mod(shift, dim)
    s == 0 && return atom

    new_chunks = zeros(UInt64, length(chunks))
    for i in 1:dim
        # Source bit position (1-based), wrapped into 1:dim.
        src_idx = mod1(i - s, dim)
        src_chunk, src_bit = divrem(src_idx - 1, 64)
        if ((chunks[src_chunk + 1] >> src_bit) & 1) == 1
            dst_chunk, dst_bit = divrem(i - 1, 64)
            new_chunks[dst_chunk + 1] |= UInt64(1) << dst_bit
        end
    end
    return Atom(BinaryData(new_chunks, dim))
end
| # --- Schema Definition --- | |
"""
    FieldSchema

Pairs a field name with the encoder used for that field.

# Fields
- `name`: field name.
- `encoder`: any `VSAEncoder` subtype. The field is typed with the abstract
  supertype, so one schema type can hold different encoder kinds.
"""
struct FieldSchema
    name::String
    encoder::VSAEncoder
end