# ==============================================================================
# VSA ENCODING LAYER
# Thermometer, Categorical, and Ordinal Encoders
# Mirrors: atom_factory.rs (ThermometerEncoder, CategoricalEncoder, OrdinalEncoder)
# ==============================================================================
# --- Abstract Encoder ---
# Common supertype of the encoders in this file; `encode` has one method per concrete subtype.
abstract type VSAEncoder end
# --- Thermometer Encoder ---
# Numeric values → cumulative atom superposition
# Close values share many levels → high similarity
# Distant values share few levels → low similarity
# Encoder configuration for one numeric field: `encode` clamps values to
# [min_val, max_val] and discretizes them into `levels` steps.
struct ThermometerEncoder <: VSAEncoder
reg::VSARegistry # Registry the base atom is fetched from
sector::String # Registry sector for level atoms ("thermo_<field_name>" via the outer constructor)
field_name::String # Field identifier
min_val::Float64 # Lower bound of the encoded range
max_val::Float64 # Upper bound of the encoded range
levels::Int # Number of discretization levels
end
# Convenience constructor: derives the registry sector name from `field_name`
# and converts both range bounds to Float64 before building the encoder.
function ThermometerEncoder(
    reg::VSARegistry, field_name::String, min_val, max_val; levels=100
)
    sector = "thermo_$(field_name)"
    lower = Float64(min_val)
    upper = Float64(max_val)
    return ThermometerEncoder(reg, sector, field_name, lower, upper, levels)
end
# Encode a numeric `value` as a thermometer code of dimension `d`.
#
# The value is clamped to [min_val, max_val] and mapped to a count of active
# levels between 1 and `enc.levels`. Level `i` is the sector's "base" vector
# circularly shifted by `i` positions; the active levels are summed into one
# Float32 vector, so nearby values share most of their levels.
#
# Returns an `Atom` wrapping the summed vector as `SingleData`.
# Throws `DimensionMismatch` if the base vector holds fewer than `d` elements.
function encode(enc::ThermometerEncoder, value::Real, d::Int)
    v = clamp(Float64(value), enc.min_val, enc.max_val)
    range_size = enc.max_val - enc.min_val
    # A degenerate (empty or inverted) range has no scale; fall back to the midpoint.
    normalized = range_size > 0 ? (v - enc.min_val) / range_size : 0.5
    # At least one level is always active, even for the minimum value.
    num_active = max(1, ceil(Int, normalized * enc.levels))

    res_vec = zeros(Float32, d)
    base = get_element(enc.reg, enc.sector, "base", d)
    b_vec = base.data.vec  # assumes the base atom stores SingleData — TODO confirm
    # The shift loops below skip bounds checks, so verify their index range up front.
    if length(b_vec) < d
        throw(DimensionMismatch(
            "base vector has $(length(b_vec)) elements, expected at least $d"
        ))
    end

    for i in 1:num_active
        s = mod(i, d)
        if s == 0
            # Zero effective shift: delegate to bundle! (presumably an in-place
            # accumulate of b_vec into res_vec — confirm against its definition).
            bundle!(res_vec, b_vec)
        else
            # res_vec[j + s] += b_vec[j], with target indices wrapping past d.
            # Split at the wrap point so neither loop needs a per-element branch.
            wrap_at = d - s
            @inbounds for j in 1:wrap_at
                res_vec[j + s] += b_vec[j]
            end
            @inbounds for j in (wrap_at + 1):d
                res_vec[j - wrap_at] += b_vec[j]
            end
        end
    end
    return Atom(SingleData(res_vec))
end
# Predict the similarity of the encodings of `v1` and `v2` as the ratio of the
# smaller active-level count to the larger one, returned as Float32.
function expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)
    span = enc.max_val - enc.min_val

    # Number of thermometer levels a value activates (always at least one).
    function active_levels(v)
        frac = if span > 0
            clamp((Float64(v) - enc.min_val) / span, 0, 1)
        else
            0.5
        end
        return max(1, ceil(Int, frac * enc.levels))
    end

    fewer, more = minmax(active_levels(v1), active_levels(v2))
    # `more` is at least 1, so the ratio is always well defined.
    return Float32(fewer / more)
end
# --- Categorical Encoder ---
# Discrete labels → orthogonal atoms from Registry
# Each category gets its own stable random atom
# Encoder configuration for one discrete field; `encode` looks each label up in the registry.
struct CategoricalEncoder <: VSAEncoder
reg::VSARegistry # Registry the label atoms are fetched from
sector::String # Registry sector ("cat_<field_name>" via the outer constructor)
field_name::String # Field identifier
categories::Vector{String} # Declared labels; not consulted by `encode` in this file
end
# Convenience constructor: places the field's atoms in the "cat_<field_name>" sector.
function CategoricalEncoder(
    reg::VSARegistry, field_name::String, categories::Vector{String}
)
    sector = "cat_$(field_name)"
    return CategoricalEncoder(reg, sector, field_name, categories)
end
# Encode a categorical label by fetching the atom registered under `value`
# in this encoder's sector, at dimension `d`.
encode(enc::CategoricalEncoder, value::String, d::Int) =
    get_element(enc.reg, enc.sector, value, d)
# --- Ordinal Encoder ---
# Ordered discrete values → indexed atoms with progressive similarity
# Encoder configuration for one ordered discrete field.
struct OrdinalEncoder <: VSAEncoder
reg::VSARegistry # Registry the value atoms are fetched from
sector::String # Registry sector ("ord_<field_name>" via the outer constructor)
field_name::String # Field identifier
values::Vector{String} # Declared values (presumably in rank order); not consulted by `encode` in this file
end
# Convenience constructor: places the field's atoms in the "ord_<field_name>" sector.
function OrdinalEncoder(
    reg::VSARegistry, field_name::String, values::Vector{String}
)
    sector = "ord_$(field_name)"
    return OrdinalEncoder(reg, sector, field_name, values)
end
# Encode an ordinal label by fetching its atom from this encoder's registry sector.
# NOTE(review): this is the same lookup CategoricalEncoder uses and never reads
# `enc.values`, so the "progressive similarity" promised by the section comment is
# not produced here unless `get_element` provides it — confirm intended behaviour.
encode(enc::OrdinalEncoder, value::String, d::Int) =
    get_element(enc.reg, enc.sector, value, d)
# --- Permutation Helper ---
# Circular shift of atom vector (used by Thermometer levels)
# Circularly shift an atom's contents by `shift` positions.
#
# Element (or bit) `i` of the result is taken from position `i - shift` of the
# input, wrapping around the ends; `shift` may be negative or exceed the length.
#
# - SingleData: returns a new atom holding a shifted Float32 copy of the vector.
# - BinaryData: returns a new atom whose first `dim` bits are rotated.
# - Any other payload, an empty atom, or an effective shift of zero: returns
#   `atom` itself (no copy is made).
#
# Throws `DimensionMismatch` if a BinaryData `dim` exceeds its chunk capacity.
function permute_atom(atom::Atom, shift::Int)
    # Bind the payload once so the `isa` checks narrow a local variable.
    data = atom.data
    if data isa SingleData
        src = data.vec
        len = length(src)
        # mod(shift, 0) would throw; an empty vector has nothing to shift.
        len == 0 && return atom
        s = mod(shift, len)
        s == 0 && return atom
        shifted = Vector{Float32}(undef, len)
        circshift!(shifted, src, s)
        return Atom(SingleData(shifted))
    elseif data isa BinaryData
        dim = data.dim
        dim == 0 && return atom
        s = mod(shift, dim)
        s == 0 && return atom
        return Atom(BinaryData(_rotate_bits(data.chunks, dim, s), dim))
    end
    return atom
end

# Rotate the first `dim` bits of `chunks` (64 bits per chunk, least-significant
# bit first) by `s` positions, where 0 < s < dim. Returns a fresh UInt64 chunk
# vector; bits at positions >= dim are left zero.
function _rotate_bits(chunks, dim, s)
    # The loop skips bounds checks, so every bit index must be backed by a chunk.
    if dim > 64 * length(chunks)
        throw(DimensionMismatch(
            "dim = $dim exceeds the $(64 * length(chunks)) bits stored in chunks"
        ))
    end
    rotated = zeros(UInt64, length(chunks))
    # Bit positions are zero-based here: chunk index is pos ÷ 64 + 1, offset is pos % 64.
    @inbounds for dst in 0:(dim - 1)
        src = dst - s
        if src < 0
            src += dim
        end
        if (chunks[(src ÷ 64) + 1] >> (src % 64)) & 1 == 1
            rotated[(dst ÷ 64) + 1] |= UInt64(1) << (dst % 64)
        end
    end
    return rotated
end
# --- Schema Definition ---
# Pairs a field name with the encoder used for that field's values.
struct FieldSchema
name::String # Field name
encoder::VSAEncoder # Any concrete encoder subtype (abstractly typed field)
end
|