# Atomic-VSA/src/vsa_discovery.jl
# NOTE(review): the original lines here were repository-viewer residue
# ("marshad180's picture", "Update Atomic VSA deployment", "fa6bd30 verified");
# converted to this comment so the file parses as Julia.
# ==============================================================================
# VSA PATTERN MINING
# Discovers patterns you did NOT ask for — algebraically, without training.
#
# Mining operations:
# 1. Association Rules — "X AND Y → Z" discovered via resonance
# 2. Field Correlation — Which fields co-vary? (without statistics)
# 3. Co-occurrence — Which values appear together?
# 4. Population Drift — Is subset A different from subset B?
# 5. Anomaly Detection — Records that don't fit the population
# 6. Cluster Discovery — Find natural groups without K-means
# ==============================================================================
# --- 1. ASSOCIATION RULE MINING ---
# "IF Diagnosis=Hypertension THEN SBP>130?"
# We encode the antecedent, extract it from population, measure resonance
# with the consequent. HIGH resonance = strong rule.
"""
    mine_association_rules(db::VSADatabase; min_confidence::Float64=0.3)

Discover association rules "field_a=value_a → field_c=value_c" over the
categorical fields of `db` via resonance, without any training.

For each antecedent (field, value) pair: records whose extracted field
resonates with the encoded value (similarity > 0.05) form a sub-population;
the rule's confidence is the resonance of each candidate consequent against
that sub-population's superposition. Rules from the same field are skipped.

Returns a `Vector{Tuple{String, String, Float64}}` of
`(antecedent, consequent, confidence)` sorted by descending confidence;
only rules with confidence above `min_confidence` are kept.
"""
function mine_association_rules(db::VSADatabase; min_confidence::Float64=0.3)
    rules = Tuple{String, String, Float64}[]  # (antecedent, consequent, confidence)
    # Collect all encodable (field, value) pairs from categorical encoders.
    # (The original also stored a bound role⊗value atom per pair, but it was
    # never read — dropped.)
    pairs = Tuple{String, String}[]
    for (field_name, enc) in db.encoders
        if enc isa CategoricalEncoder
            for cat in enc.categories
                push!(pairs, (field_name, cat))
            end
        end
    end
    length(pairs) < 2 && return rules
    for (f_a, v_a) in pairs
        role_a = db.field_roles[f_a]
        # Hoisted out of the record loop: the antecedent target vector is
        # loop-invariant (the original re-encoded it once per record).
        target_a = encode(db.encoders[f_a], v_a, db.dim)
        # Find records matching the antecedent via resonance.
        matching_indices = Int[]
        for (idx, record) in enumerate(db.records)
            extracted = bind(record, role_a)
            if similarity(extracted, target_a) > 0.05
                push!(matching_indices, idx)
            end
        end
        isempty(matching_indices) && continue
        # Superpose the matching records into a single sub-population atom.
        sub_pop = bundle([db.records[idx] for idx in matching_indices])
        # Test all consequents drawn from DIFFERENT fields.
        for (f_c, v_c) in pairs
            f_c == f_a && continue  # same field → skip
            role_c = db.field_roles[f_c]
            extracted = bind(sub_pop, role_c)
            target_c = encode(db.encoders[f_c], v_c, db.dim)
            confidence = Float64(similarity(extracted, target_c))
            if confidence > min_confidence
                push!(rules, ("$(f_a)=$(v_a)", "$(f_c)=$(v_c)", confidence))
            end
        end
    end
    sort!(rules, by=x -> -x[3])
    return rules
end
# --- 2. FIELD CORRELATION ---
# Do two fields move together? Measure by bundling all (Bind(RoleA, ValA), Bind(RoleB, ValB))
# pairs from actual records, then checking resonance strength.
"""
    mine_field_correlations(db::VSADatabase)

Estimate which field pairs co-vary, without classical statistics.

For every unordered pair of fields, each record contributes a joint atom
`bind(bind(record, roleA), bind(record, roleB))`; the pair's score is the
mean resonance of those joint atoms against their own superposition (high
coherence ⇒ the fields move together). Pairs with fewer than two records
are skipped.

Returns a `Vector{Tuple{String, String, Float64}}` of `(fieldA, fieldB,
avg_coherence)` sorted by descending coherence.
"""
function mine_field_correlations(db::VSADatabase)
    names = collect(keys(db.field_roles))
    scores = Tuple{String, String, Float64}[]
    length(names) < 2 && return scores
    for a in eachindex(names), b in (a+1):length(names)
        fa, fb = names[a], names[b]
        ra, rb = db.field_roles[fa], db.field_roles[fb]
        # One joint extraction per record for this field pair.
        joints = Atom[bind(bind(rec, ra), bind(rec, rb)) for rec in db.records]
        length(joints) < 2 && continue
        super = bundle(joints)
        total = sum(Float64(similarity(atom, super)) for atom in joints)
        push!(scores, (fa, fb, total / length(joints)))
    end
    sort!(scores, by=t -> -t[3])
    return scores
end
# --- 3. CO-OCCURRENCE DISCOVERY ---
# Find which categorical values tend to appear together in records.
# "Male + Hypertension" vs "Female + Hypertension" — which is more common?
"""
    mine_cooccurrence(db::VSADatabase, field_a::String, field_b::String)

Count how often each categorical value of `field_a` co-occurs with each
categorical value of `field_b` across the records of `db`, using resonance
(similarity > 0.05 on both extracted fields) as the membership test.

Returns a `Vector{Tuple{String, String, Int}}` of `(value_a, value_b, count)`
sorted by descending count; pairs with zero count are omitted. Unknown or
non-categorical fields yield an empty (typed) result.
"""
function mine_cooccurrence(db::VSADatabase, field_a::String, field_b::String)
    results = Tuple{String, String, Int}[]
    # Typed empty result on the guard paths — the original returned an
    # untyped `Any[]` here, making the return type unstable.
    (haskey(db.encoders, field_a) && haskey(db.encoders, field_b)) || return results
    enc_a, enc_b = db.encoders[field_a], db.encoders[field_b]
    (enc_a isa CategoricalEncoder && enc_b isa CategoricalEncoder) || return results
    role_a, role_b = db.field_roles[field_a], db.field_roles[field_b]
    for cat_a in enc_a.categories
        target_a = encode(enc_a, cat_a, db.dim)
        for cat_b in enc_b.categories
            target_b = encode(enc_b, cat_b, db.dim)
            count = 0
            for record in db.records
                ext_a = bind(record, role_a)
                ext_b = bind(record, role_b)
                # A record counts only when BOTH fields resonate.
                if similarity(ext_a, target_a) > 0.05 && similarity(ext_b, target_b) > 0.05
                    count += 1
                end
            end
            if count > 0
                push!(results, (cat_a, cat_b, count))
            end
        end
    end
    sort!(results, by=x -> -x[3])
    return results
end
# --- 4. POPULATION DRIFT ---
# Is one subset of records fundamentally different from another?
# Split population → measure cross-similarity.
"""
    detect_drift(db::VSADatabase; split_at::Int=0)

Measure whether one part of the record population differs from the other.

Records are split at `split_at` (or at the midpoint when `split_at <= 0`;
the cut is clamped into `1:n-1`); each side is bundled into a superposition
and their similarity is returned. Low similarity suggests drift, high
similarity a stable population. Fewer than four records returns `0.0`.
"""
function detect_drift(db::VSADatabase; split_at::Int=0)
    n = length(db.records)
    if n < 4
        return 0.0
    end
    cut = clamp(split_at > 0 ? split_at : n ÷ 2, 1, n - 1)
    first_half = bundle(db.records[1:cut])
    second_half = bundle(db.records[(cut+1):end])
    return Float64(similarity(first_half, second_half))
end
# --- 5. ANOMALY DETECTION ---
# Records with LOW similarity to population superposition are anomalies.
"""
    detect_anomalies(db::VSADatabase; threshold::Float64=0.15)

Flag records whose similarity to the whole-population superposition falls
below `threshold`. Builds the superposition lazily via
`build_superposition!` if it is not cached yet.

Returns `(anomalies, normals)`, both `Vector{Tuple{String, Float64}}` of
`(record_id, similarity)`; anomalies are sorted ascending by similarity
(most anomalous first), normals are left in record order.
"""
function detect_anomalies(db::VSADatabase; threshold::Float64=0.15)
    anomalies = Tuple{String, Float64}[]
    normals = Tuple{String, Float64}[]
    if db.superposition[] === nothing
        build_superposition!(db)
    end
    pop = db.superposition[]
    # Typed empty results on the guard path — the original returned untyped
    # `Any[]` pairs here, inconsistent with the success path.
    pop === nothing && return anomalies, normals
    for (i, record) in enumerate(db.records)
        sim = Float64(similarity(record, pop))
        if sim < threshold
            push!(anomalies, (db.record_ids[i], sim))
        else
            push!(normals, (db.record_ids[i], sim))
        end
    end
    sort!(anomalies, by=x -> x[2])
    return anomalies, normals
end
# --- 6. CLUSTER DISCOVERY (Unsupervised) ---
# Find natural clusters without knowing categories.
# Greedy resonance: pick seed, pull in similar records, repeat.
"""
    discover_clusters(db::VSADatabase; min_sim::Float64=0.6, min_size::Int=2)

Find natural record clusters without predefined categories, by greedy
resonance: seeds are tried from most population-central to least; each seed
pulls in all unassigned records with similarity ≥ `min_sim`. Clusters
smaller than `min_size` are dissolved and their members released.

Returns a `Vector{Vector{Tuple{String, Float64}}}`; each cluster lists
`(record_id, similarity_to_seed)` sorted by descending similarity.
"""
function discover_clusters(db::VSADatabase; min_sim::Float64=0.6, min_size::Int=2)
    n = length(db.records)
    clusters = Vector{Vector{Tuple{String, Float64}}}()
    # Typed empty result on the guard path (the original returned `Any[]`).
    n < min_size && return clusters
    assigned = falses(n)
    # Most-central records first as seeds: rank by similarity to the
    # population superposition (built lazily if missing).
    if db.superposition[] === nothing
        build_superposition!(db)
    end
    pop = db.superposition[]
    # Hoisted: the reference atom is loop-invariant (the original evaluated
    # the `pop !== nothing` ternary once per record).
    ref = pop !== nothing ? pop : db.records[1]
    pop_sims = [(i, Float64(similarity(db.records[i], ref))) for i in 1:n]
    sort!(pop_sims, by=x -> -x[2])
    for (seed_idx, _) in pop_sims
        assigned[seed_idx] && continue
        # Grow a new cluster from this seed.
        cluster = Tuple{String, Float64}[]
        member_indices = Int[]  # track indices so release-back is exact
        seed = db.records[seed_idx]
        for j in 1:n
            assigned[j] && continue
            sim = Float64(similarity(seed, db.records[j]))
            if sim >= min_sim
                push!(cluster, (db.record_ids[j], sim))
                push!(member_indices, j)
                assigned[j] = true
            end
        end
        if length(cluster) >= min_size
            sort!(cluster, by=x -> -x[2])
            push!(clusters, cluster)
        else
            # Release undersized cluster members. The original re-resolved
            # indices with `findfirst` on ids — O(n) per member and wrong
            # when record ids are duplicated; releasing the tracked indices
            # is exact and O(cluster size).
            for j in member_indices
                assigned[j] = false
            end
        end
    end
    return clusters
end
# --- 7. FIELD CLUSTERING (Known field) ---
# Group records by a known categorical field via resonance extraction.
"""
    cluster_by_field(db::VSADatabase, field_name::String)

Group records by the categorical values of a known field: for each category,
collect the record ids whose extracted field resonates with the encoded
category (similarity > 0.05).

Returns a `Dict{String, Vector{String}}` mapping category → member record
ids; empty categories are omitted. Unknown or non-categorical fields yield
an empty dict.
"""
function cluster_by_field(db::VSADatabase, field_name::String)
    groups = Dict{String, Vector{String}}()
    haskey(db.field_roles, field_name) && haskey(db.encoders, field_name) || return groups
    enc = db.encoders[field_name]
    enc isa CategoricalEncoder || return groups
    role = db.field_roles[field_name]
    for cat in enc.categories
        prototype = encode(enc, cat, db.dim)
        members = String[]
        for (idx, rec) in enumerate(db.records)
            if similarity(bind(rec, role), prototype) > 0.05
                push!(members, db.record_ids[idx])
            end
        end
        isempty(members) || (groups[cat] = members)
    end
    return groups
end
# --- 8. POPULATION COHERENCE ---
"""
    measure_coherence(db::VSADatabase, record_ids::Vector{String})

Compute the mean pairwise similarity among the named records — a coherence
score for a candidate group. Ids not present in `db.record_ids` are skipped;
fewer than two resolved records returns `1.0` (a singleton is trivially
coherent).
"""
function measure_coherence(db::VSADatabase, record_ids::Vector{String})
    # Resolve ids to indices into a typed Int vector. The original built a
    # Vector{Union{Nothing, Int}} via comprehension and then filtered it.
    indices = Int[]
    for id in record_ids
        idx = findfirst(==(id), db.record_ids)
        idx === nothing || push!(indices, idx)
    end
    length(indices) < 2 && return 1.0
    atoms = [db.records[i] for i in indices]
    total_sim = 0.0
    npairs = 0
    # All unordered pairs.
    for i in 1:length(atoms), j in (i+1):length(atoms)
        total_sim += Float64(similarity(atoms[i], atoms[j]))
        npairs += 1
    end
    # npairs > 0 is guaranteed by the length check above; kept for safety.
    return npairs > 0 ? total_sim / npairs : 0.0
end
# --- DETERMINISM PROOF ---
"""
    prove_determinism(db::VSADatabase, field_name::String, value::Any)

Run the same exact query twice and verify both runs agree: same length,
same record ids in the same order, and scores equal within `1e-10`.

Returns `(identical::Bool, run1, run2)`.
"""
function prove_determinism(db::VSADatabase, field_name::String, value::Any)
    run1 = query_exact(db, field_name, value; top_k=5)
    run2 = query_exact(db, field_name, value; top_k=5)
    identical = length(run1) == length(run2)
    if identical
        for (a, b) in zip(run1, run2)
            if a[1] != b[1] || abs(a[2] - b[2]) > 1e-10
                identical = false
                break
            end
        end
    end
    return identical, run1, run2
end