# Atomic-VSA/src/vsa_discovery.jl
# NOTE(review): the original lines here were repository-viewer residue
# ("marshad180's picture", "Update Atomic VSA deployment", "fa6bd30 verified");
# converted to this comment so the file parses as Julia.
# ==============================================================================
# VSA PATTERN MINING
# Discovers patterns you did NOT ask for — algebraically, without training.
#
# Mining operations:
# 1. Association Rules — "X AND Y → Z" discovered via resonance
# 2. Field Correlation — Which fields co-vary? (without statistics)
# 3. Co-occurrence — Which values appear together?
# 4. Population Drift — Is subset A different from subset B?
# 5. Anomaly Detection — Records that don't fit the population
# 6. Cluster Discovery — Find natural groups without K-means
# ==============================================================================
# --- 1. ASSOCIATION RULE MINING ---
# "IF Diagnosis=Hypertension THEN SBP>130?"
# We encode the antecedent, extract it from population, measure resonance
# with the consequent. HIGH resonance = strong rule.
"""
    mine_association_rules(db::VSADatabase; min_confidence::Float64=0.3)

Discover association rules "field_a=value_a → field_c=value_c" over the
categorical fields of `db` via resonance, without any training.

For each antecedent (field, value) pair: records whose extracted field
resonates with the encoded value (similarity > 0.05) form a sub-population;
the rule's confidence is the resonance of each candidate consequent against
that sub-population's superposition. Rules from the same field are skipped.

Returns a `Vector{Tuple{String, String, Float64}}` of
`(antecedent, consequent, confidence)` sorted by descending confidence;
only rules with confidence above `min_confidence` are kept.
"""
function mine_association_rules(db::VSADatabase; min_confidence::Float64=0.3)
    rules = Tuple{String, String, Float64}[]  # (antecedent, consequent, confidence)
    # Collect all encodable (field, value) pairs from categorical encoders.
    # (The original also stored a bound role⊗value atom per pair, but it was
    # never read — dropped.)
    pairs = Tuple{String, String}[]
    for (field_name, enc) in db.encoders
        if enc isa CategoricalEncoder
            for cat in enc.categories
                push!(pairs, (field_name, cat))
            end
        end
    end
    length(pairs) < 2 && return rules
    for (f_a, v_a) in pairs
        role_a = db.field_roles[f_a]
        # Hoisted out of the record loop: the antecedent target vector is
        # loop-invariant (the original re-encoded it once per record).
        target_a = encode(db.encoders[f_a], v_a, db.dim)
        # Find records matching the antecedent via resonance.
        matching_indices = Int[]
        for (idx, record) in enumerate(db.records)
            extracted = bind(record, role_a)
            if similarity(extracted, target_a) > 0.05
                push!(matching_indices, idx)
            end
        end
        isempty(matching_indices) && continue
        # Superpose the matching records into a single sub-population atom.
        sub_pop = bundle([db.records[idx] for idx in matching_indices])
        # Test all consequents drawn from DIFFERENT fields.
        for (f_c, v_c) in pairs
            f_c == f_a && continue  # same field → skip
            role_c = db.field_roles[f_c]
            extracted = bind(sub_pop, role_c)
            target_c = encode(db.encoders[f_c], v_c, db.dim)
            confidence = Float64(similarity(extracted, target_c))
            if confidence > min_confidence
                push!(rules, ("$(f_a)=$(v_a)", "$(f_c)=$(v_c)", confidence))
            end
        end
    end
    sort!(rules, by=x -> -x[3])
    return rules
end
# --- 2. FIELD CORRELATION ---
# Do two fields move together? Measure by bundling all (Bind(RoleA, ValA), Bind(RoleB, ValB))
# pairs from actual records, then checking resonance strength.
"""
    mine_field_correlations(db::VSADatabase)

Estimate which field pairs co-vary, without classical statistics.

For every unordered pair of fields, each record contributes a joint atom
`bind(bind(record, roleA), bind(record, roleB))`; the pair's score is the
mean resonance of those joint atoms against their own superposition (high
coherence ⇒ the fields move together). Pairs with fewer than two records
are skipped.

Returns a `Vector{Tuple{String, String, Float64}}` of `(fieldA, fieldB,
avg_coherence)` sorted by descending coherence.
"""
function mine_field_correlations(db::VSADatabase)
    names = collect(keys(db.field_roles))
    scores = Tuple{String, String, Float64}[]
    length(names) < 2 && return scores
    for a in eachindex(names), b in (a+1):length(names)
        fa, fb = names[a], names[b]
        ra, rb = db.field_roles[fa], db.field_roles[fb]
        # One joint extraction per record for this field pair.
        joints = Atom[bind(bind(rec, ra), bind(rec, rb)) for rec in db.records]
        length(joints) < 2 && continue
        super = bundle(joints)
        total = sum(Float64(similarity(atom, super)) for atom in joints)
        push!(scores, (fa, fb, total / length(joints)))
    end
    sort!(scores, by=t -> -t[3])
    return scores
end
# --- 3. CO-OCCURRENCE DISCOVERY ---
# Find which categorical values tend to appear together in records.
# "Male + Hypertension" vs "Female + Hypertension" — which is more common?
"""
    mine_cooccurrence(db::VSADatabase, field_a::String, field_b::String)

Count how often each categorical value of `field_a` co-occurs with each
categorical value of `field_b` across the records of `db`, using resonance
(similarity > 0.05 on both extracted fields) as the membership test.

Returns a `Vector{Tuple{String, String, Int}}` of `(value_a, value_b, count)`
sorted by descending count; pairs with zero count are omitted. Unknown or
non-categorical fields yield an empty (typed) result.
"""
function mine_cooccurrence(db::VSADatabase, field_a::String, field_b::String)
    results = Tuple{String, String, Int}[]
    # Typed empty result on the guard paths — the original returned an
    # untyped `Any[]` here, making the return type unstable.
    (haskey(db.encoders, field_a) && haskey(db.encoders, field_b)) || return results
    enc_a, enc_b = db.encoders[field_a], db.encoders[field_b]
    (enc_a isa CategoricalEncoder && enc_b isa CategoricalEncoder) || return results
    role_a, role_b = db.field_roles[field_a], db.field_roles[field_b]
    for cat_a in enc_a.categories
        target_a = encode(enc_a, cat_a, db.dim)
        for cat_b in enc_b.categories
            target_b = encode(enc_b, cat_b, db.dim)
            count = 0
            for record in db.records
                ext_a = bind(record, role_a)
                ext_b = bind(record, role_b)
                # A record counts only when BOTH fields resonate.
                if similarity(ext_a, target_a) > 0.05 && similarity(ext_b, target_b) > 0.05
                    count += 1
                end
            end
            if count > 0
                push!(results, (cat_a, cat_b, count))
            end
        end
    end
    sort!(results, by=x -> -x[3])
    return results
end
# --- 4. POPULATION DRIFT ---
# Is one subset of records fundamentally different from another?
# Split population → measure cross-similarity.
"""
    detect_drift(db::VSADatabase; split_at::Int=0)

Measure whether one part of the record population differs from the other.

Records are split at `split_at` (or at the midpoint when `split_at <= 0`;
the cut is clamped into `1:n-1`); each side is bundled into a superposition
and their similarity is returned. Low similarity suggests drift, high
similarity a stable population. Fewer than four records returns `0.0`.
"""
function detect_drift(db::VSADatabase; split_at::Int=0)
    n = length(db.records)
    if n < 4
        return 0.0
    end
    cut = clamp(split_at > 0 ? split_at : n ÷ 2, 1, n - 1)
    first_half = bundle(db.records[1:cut])
    second_half = bundle(db.records[(cut+1):end])
    return Float64(similarity(first_half, second_half))
end
# --- 5. ANOMALY DETECTION ---
# Records with LOW similarity to population superposition are anomalies.
"""
    detect_anomalies(db::VSADatabase; threshold::Float64=0.15)

Flag records whose similarity to the whole-population superposition falls
below `threshold`. Builds the superposition lazily via
`build_superposition!` if it is not cached yet.

Returns `(anomalies, normals)`, both `Vector{Tuple{String, Float64}}` of
`(record_id, similarity)`; anomalies are sorted ascending by similarity
(most anomalous first), normals are left in record order.
"""
function detect_anomalies(db::VSADatabase; threshold::Float64=0.15)
    anomalies = Tuple{String, Float64}[]
    normals = Tuple{String, Float64}[]
    if db.superposition[] === nothing
        build_superposition!(db)
    end
    pop = db.superposition[]
    # Typed empty results on the guard path — the original returned untyped
    # `Any[]` pairs here, inconsistent with the success path.
    pop === nothing && return anomalies, normals
    for (i, record) in enumerate(db.records)
        sim = Float64(similarity(record, pop))
        if sim < threshold
            push!(anomalies, (db.record_ids[i], sim))
        else
            push!(normals, (db.record_ids[i], sim))
        end
    end
    sort!(anomalies, by=x -> x[2])
    return anomalies, normals
end
# --- 6. CLUSTER DISCOVERY (Unsupervised) ---
# Find natural clusters without knowing categories.
# Greedy resonance: pick seed, pull in similar records, repeat.
"""
    discover_clusters(db::VSADatabase; min_sim::Float64=0.6, min_size::Int=2)

Find natural record clusters without predefined categories, by greedy
resonance: seeds are tried from most population-central to least; each seed
pulls in all unassigned records with similarity ≥ `min_sim`. Clusters
smaller than `min_size` are dissolved and their members released.

Returns a `Vector{Vector{Tuple{String, Float64}}}`; each cluster lists
`(record_id, similarity_to_seed)` sorted by descending similarity.
"""
function discover_clusters(db::VSADatabase; min_sim::Float64=0.6, min_size::Int=2)
    n = length(db.records)
    clusters = Vector{Vector{Tuple{String, Float64}}}()
    # Typed empty result on the guard path (the original returned `Any[]`).
    n < min_size && return clusters
    assigned = falses(n)
    # Most-central records first as seeds: rank by similarity to the
    # population superposition (built lazily if missing).
    if db.superposition[] === nothing
        build_superposition!(db)
    end
    pop = db.superposition[]
    # Hoisted: the reference atom is loop-invariant (the original evaluated
    # the `pop !== nothing` ternary once per record).
    ref = pop !== nothing ? pop : db.records[1]
    pop_sims = [(i, Float64(similarity(db.records[i], ref))) for i in 1:n]
    sort!(pop_sims, by=x -> -x[2])
    for (seed_idx, _) in pop_sims
        assigned[seed_idx] && continue
        # Grow a new cluster from this seed.
        cluster = Tuple{String, Float64}[]
        member_indices = Int[]  # track indices so release-back is exact
        seed = db.records[seed_idx]
        for j in 1:n
            assigned[j] && continue
            sim = Float64(similarity(seed, db.records[j]))
            if sim >= min_sim
                push!(cluster, (db.record_ids[j], sim))
                push!(member_indices, j)
                assigned[j] = true
            end
        end
        if length(cluster) >= min_size
            sort!(cluster, by=x -> -x[2])
            push!(clusters, cluster)
        else
            # Release undersized cluster members. The original re-resolved
            # indices with `findfirst` on ids — O(n) per member and wrong
            # when record ids are duplicated; releasing the tracked indices
            # is exact and O(cluster size).
            for j in member_indices
                assigned[j] = false
            end
        end
    end
    return clusters
end
# --- 7. FIELD CLUSTERING (Known field) ---
# Group records by a known categorical field via resonance extraction.
"""
    cluster_by_field(db::VSADatabase, field_name::String)

Group records by the categorical values of a known field: for each category,
collect the record ids whose extracted field resonates with the encoded
category (similarity > 0.05).

Returns a `Dict{String, Vector{String}}` mapping category → member record
ids; empty categories are omitted. Unknown or non-categorical fields yield
an empty dict.
"""
function cluster_by_field(db::VSADatabase, field_name::String)
    groups = Dict{String, Vector{String}}()
    haskey(db.field_roles, field_name) && haskey(db.encoders, field_name) || return groups
    enc = db.encoders[field_name]
    enc isa CategoricalEncoder || return groups
    role = db.field_roles[field_name]
    for cat in enc.categories
        prototype = encode(enc, cat, db.dim)
        members = String[]
        for (idx, rec) in enumerate(db.records)
            if similarity(bind(rec, role), prototype) > 0.05
                push!(members, db.record_ids[idx])
            end
        end
        isempty(members) || (groups[cat] = members)
    end
    return groups
end
# --- 8. POPULATION COHERENCE ---
"""
    measure_coherence(db::VSADatabase, record_ids::Vector{String})

Compute the mean pairwise similarity among the named records — a coherence
score for a candidate group. Ids not present in `db.record_ids` are skipped;
fewer than two resolved records returns `1.0` (a singleton is trivially
coherent).
"""
function measure_coherence(db::VSADatabase, record_ids::Vector{String})
    # Resolve ids to indices into a typed Int vector. The original built a
    # Vector{Union{Nothing, Int}} via comprehension and then filtered it.
    indices = Int[]
    for id in record_ids
        idx = findfirst(==(id), db.record_ids)
        idx === nothing || push!(indices, idx)
    end
    length(indices) < 2 && return 1.0
    atoms = [db.records[i] for i in indices]
    total_sim = 0.0
    npairs = 0
    # All unordered pairs.
    for i in 1:length(atoms), j in (i+1):length(atoms)
        total_sim += Float64(similarity(atoms[i], atoms[j]))
        npairs += 1
    end
    # npairs > 0 is guaranteed by the length check above; kept for safety.
    return npairs > 0 ? total_sim / npairs : 0.0
end
# --- DETERMINISM PROOF ---
"""
    prove_determinism(db::VSADatabase, field_name::String, value::Any)

Run the same exact query twice and verify both runs agree: same length,
same record ids in the same order, and scores equal within `1e-10`.

Returns `(identical::Bool, run1, run2)`.
"""
function prove_determinism(db::VSADatabase, field_name::String, value::Any)
    run1 = query_exact(db, field_name, value; top_k=5)
    run2 = query_exact(db, field_name, value; top_k=5)
    identical = length(run1) == length(run2)
    if identical
        for (a, b) in zip(run1, run2)
            if a[1] != b[1] || abs(a[2] - b[2]) > 1e-10
                identical = false
                break
            end
        end
    end
    return identical, run1, run2
end