File size: 5,780 Bytes
fa6bd30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# ==============================================================================
# VSA ENCODING LAYER
# Thermometer, Categorical, and Ordinal Encoders
# Mirrors: atom_factory.rs (ThermometerEncoder, CategoricalEncoder, OrdinalEncoder)
# ==============================================================================

# --- Abstract Encoder ---

# Common supertype of the field encoders declared in this file
# (ThermometerEncoder, CategoricalEncoder, OrdinalEncoder). FieldSchema stores
# its encoder under this abstract type.
abstract type VSAEncoder end

# --- Thermometer Encoder ---
# Numeric values → cumulative atom superposition
# Close values share many levels → high similarity
# Distant values share few levels → low similarity

# Configuration for encoding a numeric field thermometer-style: `encode` clamps
# the input to [min_val, max_val], normalizes it, and activates a number of
# levels proportional to its position in that range.
struct ThermometerEncoder <: VSAEncoder
    reg::VSARegistry     # Registry that `encode` queries (via get_element) for the "base" atom
    sector::String       # Registry sector for level atoms
    field_name::String   # Field identifier
    min_val::Float64     # Lower bound of the range; smaller inputs are clamped to it
    max_val::Float64     # Upper bound of the range; larger inputs are clamped to it
    levels::Int          # Number of discretization levels
end

"""
    ThermometerEncoder(reg, field_name, min_val, max_val; levels=100)

Build a `ThermometerEncoder` for `field_name`.

The registry sector is derived from the field name by prefixing it with
`thermo_`, and both bounds are converted to `Float64` before being stored.

# Arguments
- `reg::VSARegistry`: registry the encoder will draw its atoms from.
- `field_name::String`: field identifier; also determines the sector name.
- `min_val`, `max_val`: range bounds, anything accepted by `Float64(...)`.

# Keywords
- `levels`: number of discretization levels (default `100`).
"""
function ThermometerEncoder(
    reg::VSARegistry, field_name::String, min_val, max_val; levels=100
)
    sector_name = "thermo_$(field_name)"
    lower = Float64(min_val)
    upper = Float64(max_val)
    return ThermometerEncoder(reg, sector_name, field_name, lower, upper, levels)
end

"""
    encode(enc::ThermometerEncoder, value::Real, d::Int)

Encode a numeric `value` as a superposition of circularly shifted copies of the
encoder's "base" atom.

`value` is clamped to `[enc.min_val, enc.max_val]` and normalized to `[0, 1]`
(a non-positive range normalizes to `0.5`). The number of active levels is
`max(1, ceil(normalized * enc.levels))`; level `i` contributes the base vector
shifted by `mod(i, d)` positions, so nearby values share most of their levels.

# Arguments
- `enc`: encoder holding the registry, sector, range and level count.
- `value`: number to encode.
- `d`: dimensionality of the result; also passed to `get_element` when
  fetching the base atom.

# Returns
An `Atom` wrapping `SingleData` over a freshly allocated `Vector{Float32}` of
length `d`.
"""
function encode(enc::ThermometerEncoder, value::Real, d::Int)
    # Clamp to range
    v = clamp(Float64(value), enc.min_val, enc.max_val)

    # Normalize to [0, 1]; a zero or negative range has no meaningful position,
    # so fall back to the midpoint instead of dividing by it.
    range_size = enc.max_val - enc.min_val
    normalized = range_size > 0 ? (v - enc.min_val) / range_size : 0.5

    # How many levels to activate (thermometer style); always at least one so
    # the minimum value still produces a non-zero vector.
    num_active = max(1, ceil(Int, normalized * enc.levels))

    # The accumulator is the only buffer allocated here; every level is added
    # straight into it.
    res_vec = zeros(Float32, d)
    base = get_element(enc.reg, enc.sector, "base", d)

    # Base vector (SingleData)
    # NOTE(review): the @inbounds loop below assumes b_vec has at least d
    # elements — confirm get_element always returns a d-dimensional atom.
    b_vec = base.data.vec

    # In-place bundling of shifted levels
    for i in 1:num_active
        s = mod(i, d)
        if s == 0
            # Zero shift: hand the unshifted base vector to bundle! (defined
            # elsewhere in the project).
            bundle!(res_vec, b_vec)
        else
            # Accumulate the base vector circularly shifted by s positions:
            # source element j lands at j + s, wrapping past the end.
            @inbounds for j in 1:d
                target_idx = j + s
                if target_idx > d
                    target_idx -= d
                end
                res_vec[target_idx] += b_vec[j]
            end
        end
    end

    return Atom(SingleData(res_vec))
end

"""
    expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)

Return the ratio of shared to total active levels for two values.

Each value is normalized against `[enc.min_val, enc.max_val]` and clamped to
`[0, 1]` (a non-positive range normalizes to `0.5`), then converted to an
active-level count `max(1, ceil(normalized * enc.levels))`. The result is the
smaller count divided by the larger one, so it lies in `(0, 1]` and equals `1`
when both values activate the same number of levels.

# Returns
The ratio as a `Float32`.
"""
function expected_similarity(enc::ThermometerEncoder, v1::Real, v2::Real)
    range_size = enc.max_val - enc.min_val

    # Number of thermometer levels a value activates; never less than one.
    function active_levels(v::Real)
        normalized = if range_size > 0
            clamp((Float64(v) - enc.min_val) / range_size, 0, 1)
        else
            0.5
        end
        return max(1, ceil(Int, normalized * enc.levels))
    end

    levels1 = active_levels(v1)
    levels2 = active_levels(v2)

    # Both counts are >= 1, so the denominator is never zero and no fallback
    # branch is needed.
    shared = min(levels1, levels2)
    total = max(levels1, levels2)
    return Float32(shared / total)
end

# --- Categorical Encoder ---
# Discrete labels → orthogonal atoms from Registry
# Each category gets its own stable random atom

# Configuration for encoding discrete labels: `encode` looks each label up in
# `sector` of `reg` via get_element.
struct CategoricalEncoder <: VSAEncoder
    reg::VSARegistry             # Registry the per-label atoms are fetched from
    sector::String               # Registry sector used for this field's atoms
    field_name::String           # Field identifier
    categories::Vector{String}   # Declared labels; not read by the encode method in this file
end

"""
    CategoricalEncoder(reg, field_name, categories)

Build a `CategoricalEncoder` for `field_name`, deriving the registry sector
from the field name by prefixing it with `cat_`.

# Arguments
- `reg::VSARegistry`: registry the encoder will draw its atoms from.
- `field_name::String`: field identifier; also determines the sector name.
- `categories::Vector{String}`: labels stored on the encoder as given.
"""
function CategoricalEncoder(
    reg::VSARegistry, field_name::String, categories::Vector{String}
)
    sector_name = "cat_$(field_name)"
    return CategoricalEncoder(reg, sector_name, field_name, categories)
end

"""
    encode(enc::CategoricalEncoder, value::String, d::Int)

Return the registry element for label `value` in the encoder's sector.

The lookup is delegated entirely to `get_element(enc.reg, enc.sector, value, d)`;
`enc.categories` is not consulted, so `value` is not checked against the
declared labels here.
"""
function encode(enc::CategoricalEncoder, value::String, d::Int)
    registry = enc.reg
    sector_name = enc.sector
    label_atom = get_element(registry, sector_name, value, d)
    return label_atom
end

# --- Ordinal Encoder ---
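# Ordered discrete values → indexed atoms with progressive similarity
# NOTE(review): encode(::OrdinalEncoder, ...) below performs a plain per-value
# Registry lookup and never reads `values`, so the progressive similarity
# described here is not produced by the code in this section — confirm intent.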

# Configuration for encoding ordered discrete values: `encode` looks each value
# up in `sector` of `reg` via get_element.
struct OrdinalEncoder <: VSAEncoder
    reg::VSARegistry         # Registry the per-value atoms are fetched from
    sector::String           # Registry sector used for this field's atoms
    field_name::String       # Field identifier
    values::Vector{String}   # Declared values (presumably in rank order); not read by the encode method in this file
end

"""
    OrdinalEncoder(reg, field_name, values)

Build an `OrdinalEncoder` for `field_name`, deriving the registry sector from
the field name by prefixing it with `ord_`.

# Arguments
- `reg::VSARegistry`: registry the encoder will draw its atoms from.
- `field_name::String`: field identifier; also determines the sector name.
- `values::Vector{String}`: values stored on the encoder as given.
"""
function OrdinalEncoder(reg::VSARegistry, field_name::String, values::Vector{String})
    sector_name = "ord_$(field_name)"
    return OrdinalEncoder(reg, sector_name, field_name, values)
end

"""
    encode(enc::OrdinalEncoder, value::String, d::Int)

Return the registry element for `value` in the encoder's sector.

The lookup is delegated entirely to `get_element(enc.reg, enc.sector, value, d)`.
NOTE(review): `enc.values` is not read, so the position of `value` in the
ordering does not influence the result of this method — confirm that is intended.
"""
function encode(enc::OrdinalEncoder, value::String, d::Int)
    registry = enc.reg
    sector_name = enc.sector
    value_atom = get_element(registry, sector_name, value, d)
    return value_atom
end

# --- Permutation Helper ---
# Circular shift of an atom's vector or bit pattern (the Thermometer encoder
# applies the same shift inline to its levels rather than calling this helper)

"""
    permute_atom(atom::Atom, shift::Int)

Return `atom` with its contents circularly shifted by `shift` positions.

The effective shift is `mod(shift, n)` where `n` is the vector length
(`SingleData`) or the bit dimension (`BinaryData`), so negative and oversized
shifts wrap around. The element or bit at position `k` moves to position
`k + shift`, wrapping past the end back to the start.

# Returns
- A new `Atom` over a fresh `Vector{Float32}` for `SingleData`.
- A new `Atom` over fresh `UInt64` chunks for `BinaryData`; bits above `dim`
  in the last chunk are left zero.
- The input `atom` itself (not a copy) when the effective shift is zero or the
  data is of neither kind.
"""
function permute_atom(atom::Atom, shift::Int)
    payload = atom.data

    if payload isa SingleData
        source = payload.vec
        n = length(source)
        offset = mod(shift, n)
        offset == 0 && return atom

        # Rotate with two block copies: the leading n - offset elements move up
        # by offset, and the trailing offset elements wrap to the front.
        rotated = Vector{Float32}(undef, n)
        copyto!(rotated, offset + 1, source, 1, n - offset)
        copyto!(rotated, 1, source, n - offset + 1, offset)
        return Atom(SingleData(rotated))
    end

    if payload isa BinaryData
        words = payload.chunks
        nbits = payload.dim
        offset = mod(shift, nbits)
        offset == 0 && return atom

        # Bit-by-bit repack: visit every source bit and, if set, set the bit
        # at its shifted destination. Bit k lives in word (k - 1) ÷ 64 + 1 at
        # bit offset (k - 1) % 64.
        packed = zeros(UInt64, length(words))
        @inbounds for src_bit in 1:nbits
            src_word, src_off = divrem(src_bit - 1, 64)
            if ((words[src_word + 1] >> src_off) & 1) == 1
                dst_bit = src_bit + offset
                if dst_bit > nbits
                    dst_bit -= nbits
                end
                dst_word, dst_off = divrem(dst_bit - 1, 64)
                packed[dst_word + 1] |= UInt64(1) << dst_off
            end
        end
        return Atom(BinaryData(packed, nbits))
    end

    return atom
end

# --- Schema Definition ---

# Pairs a name with an encoder — presumably the field the encoder applies to;
# confirm against callers.
struct FieldSchema
    name::String          # Name stored alongside the encoder
    # Any VSAEncoder subtype. The field is declared with the abstract type, so
    # the concrete encoder type is only known at runtime.
    encoder::VSAEncoder
end