# Atomic-VSA / src / vsa_encoding.jl
# marshad180's picture
# Update Atomic VSA deployment
# fa6bd30 verified
# ==============================================================================
# VSA ENCODING LAYER
# Thermometer, Categorical, and Ordinal Encoders
# Mirrors: atom_factory.rs (ThermometerEncoder, CategoricalEncoder, OrdinalEncoder)
# ==============================================================================
# --- Abstract Encoder ---
# Common supertype of the encoders defined in this file (ThermometerEncoder,
# CategoricalEncoder, OrdinalEncoder). FieldSchema stores its encoder under this type.
abstract type VSAEncoder end
# --- Thermometer Encoder ---
# Numeric values → cumulative atom superposition
# Close values share many levels → high similarity
# Distant values share few levels → low similarity
# Configuration for encoding a numeric field; read by the `encode` and
# `expected_similarity` methods that take a ThermometerEncoder.
struct ThermometerEncoder <: VSAEncoder
reg::VSARegistry # Registry that `encode` fetches the "base" element from
sector::String # Registry sector for level atoms
field_name::String # Field identifier
min_val::Float64 # Lower bound; `encode` clamps inputs into [min_val, max_val]
max_val::Float64 # Upper bound of the clamped range
levels::Int # Number of discretization levels
end
# Convenience constructor: builds the sector name by prefixing `field_name` with
# "thermo_" and converts both range bounds to Float64 before delegating to the
# field-wise constructor. `levels` defaults to 100.
function ThermometerEncoder(reg::VSARegistry, field_name::String, min_val, max_val; levels=100)
    sector_name = "thermo_$(field_name)"
    lower = Float64(min_val)
    upper = Float64(max_val)
    return ThermometerEncoder(reg, sector_name, field_name, lower, upper, levels)
end
# Encode a numeric `value` as a thermometer code of dimension `d`.
#
# The value is clamped to [enc.min_val, enc.max_val], normalized to [0, 1]
# (0.5 when the range is empty or inverted), and mapped to a level count
# `num_active = max(1, ceil(Int, normalized * enc.levels))`. The result is the
# element-wise sum of the registry's "base" vector rotated by 1, 2, ..., num_active
# positions (rotations wrap modulo `d`), so close values share most of their
# rotated copies and distant values share few.
#
# Returns `Atom(SingleData(v))` where `v` is a Vector{Float32} of length `d`.
# Assumes the "base" element exposes `.data.vec` with at least `d` entries; the
# inner loop runs under @inbounds — TODO confirm against `get_element`.
function encode(enc::ThermometerEncoder, value::Real, d::Int)
    # Clamp to range so out-of-range inputs saturate at the ends.
    v = clamp(Float64(value), enc.min_val, enc.max_val)

    # Normalize to [0, 1]; a non-positive range has no scale, so use the midpoint.
    range_size = enc.max_val - enc.min_val
    normalized = range_size > 0 ? (v - enc.min_val) / range_size : 0.5

    # How many levels to activate (thermometer style); always at least one.
    num_active = max(1, ceil(Int, normalized * enc.levels))

    # Single allocation for the result; every level is accumulated into it in place.
    res_vec = zeros(Float32, d)
    base = get_element(enc.reg, enc.sector, "base", d)
    b_vec = base.data.vec  # Base vector (SingleData)

    for i in 1:num_active
        s = mod(i, d)
        if s == 0
            # A rotation by a multiple of d is the identity, so the base vector is
            # added as-is. `bundle!` is defined elsewhere; presumably an in-place
            # accumulate into its first argument — confirm.
            bundle!(res_vec, b_vec)
        else
            # Add the base vector rotated right by s: source j lands on j + s,
            # wrapping past d back to the start.
            @inbounds for j in 1:d
                target_idx = j + s
                if target_idx > d
                    target_idx -= d
                end
                res_vec[target_idx] += b_vec[j]
            end
        end
    end
    return Atom(SingleData(res_vec))
end
# Predict the similarity between the thermometer encodings of `v1` and `v2` from
# their level counts alone; no vectors are built.
#
# Each value is normalized into [0, 1] over [enc.min_val, enc.max_val] (0.5 when
# the range is empty or inverted) and mapped to an active-level count with the
# same `max(1, ceil(Int, ...))` rule that `encode` applies. The result is
# min(count1, count2) / max(count1, count2), returned as a Float32 in (0, 1].
function expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)
    range_size = enc.max_val - enc.min_val

    # Active-level count for a single value.
    function active_levels(v::Real)
        frac = range_size > 0 ? clamp((Float64(v) - enc.min_val) / range_size, 0, 1) : 0.5
        return max(1, ceil(Int, frac * enc.levels))
    end

    levels1 = active_levels(v1)
    levels2 = active_levels(v2)
    shared = min(levels1, levels2)
    total = max(levels1, levels2)

    # Both counts are at least 1, so `total` is never zero and needs no guard.
    return Float32(shared / total)
end
# --- Categorical Encoder ---
# Discrete labels → orthogonal atoms from Registry
# Each category gets its own stable random atom
# Configuration for encoding a discrete-label field.
struct CategoricalEncoder <: VSAEncoder
reg::VSARegistry # Registry that `encode` passes to `get_element`
sector::String # Registry sector passed to `get_element`
field_name::String # Field identifier
categories::Vector{String} # Declared labels. NOTE(review): `encode` does not check values against this list
end
# Convenience constructor: the sector name is `field_name` prefixed with "cat_";
# all other fields are passed through unchanged.
function CategoricalEncoder(reg::VSARegistry, field_name::String, categories::Vector{String})
    sector_name = "cat_$(field_name)"
    return CategoricalEncoder(reg, sector_name, field_name, categories)
end
# Encode a categorical label by returning `get_element(enc.reg, enc.sector, value, d)`.
# The label is used as the lookup key as-is; it is not checked against
# `enc.categories`.
function encode(enc::CategoricalEncoder, value::String, d::Int)
    registry = enc.reg
    sector = enc.sector
    return get_element(registry, sector, value, d)
end
# --- Ordinal Encoder ---
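# Ordered discrete values → one registry atom per value.
# NOTE(review): `encode` below looks values up by name only and never reads the
# order in `values`, so any progressive similarity would have to come from
# `get_element` — confirm.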
# Configuration for encoding an ordered discrete field.
struct OrdinalEncoder <: VSAEncoder
reg::VSARegistry # Registry that `encode` passes to `get_element`
sector::String # Registry sector passed to `get_element`
field_name::String # Field identifier
values::Vector{String} # Declared values. NOTE(review): not read by `encode` in this file
end
# Convenience constructor: the sector name is `field_name` prefixed with "ord_";
# all other fields are passed through unchanged.
function OrdinalEncoder(reg::VSARegistry, field_name::String, values::Vector{String})
    sector_name = "ord_$(field_name)"
    return OrdinalEncoder(reg, sector_name, field_name, values)
end
# Encode an ordinal value by returning `get_element(enc.reg, enc.sector, value, d)`.
# The value string is the lookup key; its position in `enc.values` is not used here.
function encode(enc::OrdinalEncoder, value::String, d::Int)
    registry = enc.reg
    sector = enc.sector
    return get_element(registry, sector, value, d)
end
# --- Permutation Helper ---
# Circular shift of atom vector (the Thermometer encoder's `encode` applies the
# same rotation inline to its level atoms rather than calling this helper)
# Return `atom` with its payload rotated circularly by `shift` positions: element
# i of the result is element i - shift of the input, wrapping around. Negative
# and oversized shifts are reduced modulo the payload length.
#
# - SingleData: rotates `atom.data.vec` into a fresh Vector{Float32}.
# - BinaryData: rotates the first `dim` bits packed in `atom.data.chunks`
#   (64 bits per chunk, least-significant bit first); bits past `dim` are zero
#   in the result.
# - Any other payload: `atom` is returned unchanged.
#
# When the effective shift is zero, or the payload is empty, the input `atom`
# itself is returned rather than a copy.
function permute_atom(atom::Atom, shift::Int)
    data = atom.data
    if data isa SingleData
        src = data.vec
        d = length(src)
        # Nothing to rotate in an empty vector; also avoids mod(shift, 0).
        d == 0 && return atom
        s = mod(shift, d)
        s == 0 && return atom

        rotated = Vector{Float32}(undef, d)
        circshift!(rotated, src, (s,))
        return Atom(SingleData(rotated))
    elseif data isa BinaryData
        chunks = data.chunks
        dim = data.dim
        # Nothing to rotate in an empty bit vector; also avoids mod(shift, 0).
        dim == 0 && return atom
        s = mod(shift, dim)
        s == 0 && return atom

        # Bit-by-bit rotation: for each destination bit i, read source bit i - s
        # (wrapped) and set the destination bit only when the source bit is 1.
        new_chunks = zeros(UInt64, length(chunks))
        @inbounds for i in 1:dim
            src_idx = i - s
            if src_idx < 1
                src_idx += dim
            end
            sc_idx = ((src_idx - 1) ÷ 64) + 1
            sb_idx = (src_idx - 1) % 64
            if ((chunks[sc_idx] >> sb_idx) & 1) == 1
                dc_idx = ((i - 1) ÷ 64) + 1
                db_idx = (i - 1) % 64
                new_chunks[dc_idx] |= UInt64(1) << db_idx
            end
        end
        return Atom(BinaryData(new_chunks, dim))
    end
    return atom
end
# --- Schema Definition ---
# Pairs a field name with the encoder used for that field.
struct FieldSchema
name::String # Field name
encoder::VSAEncoder # Abstractly typed, so any VSAEncoder subtype can be stored here
end