Spaces:
Sleeping
Sleeping
# ==============================================================================
# VSA PATTERN MINING
# Discovers patterns you did NOT ask for — algebraically, without training.
#
# Mining operations:
# 1. Association Rules    — "X AND Y → Z" discovered via resonance
# 2. Field Correlation    — Which fields co-vary? (without statistics)
# 3. Co-occurrence        — Which values appear together?
# 4. Population Drift     — Is subset A different from subset B?
# 5. Anomaly Detection    — Records that don't fit the population
# 6. Cluster Discovery    — Find natural groups without K-means
# 7. Field Clustering     — Group records by a known categorical field
# 8. Population Coherence — Mean pairwise similarity of a set of records
# Plus a determinism proof that runs the same query twice and compares results.
# ==============================================================================
# --- 1. ASSOCIATION RULE MINING ---
# Example rule: "IF Diagnosis=Hypertension THEN SBP>130".
# We select the records that match the antecedent, bundle them into a
# sub-population, extract the consequent field from it, and measure resonance
# with the consequent value. HIGH resonance = strong rule.
"""
    mine_association_rules(db; min_confidence=0.3, match_threshold=0.05)

Mine "antecedent → consequent" rules between categorical field values.

For every value of every field whose encoder is a `CategoricalEncoder`, the
records matching that value are bundled into a sub-population. Every value of
every *other* categorical field is then scored against that sub-population.

# Keywords
- `min_confidence`: a rule is kept only when its similarity score is strictly
  greater than this value.
- `match_threshold`: a record counts as matching the antecedent when the
  similarity between its extracted field and the encoded value is strictly
  greater than this value.

# Returns
A `Vector{Tuple{String, String, Float64}}` of
`("field=value", "field=value", confidence)` entries sorted by descending
confidence. The vector is empty when fewer than two categorical field/value
pairs exist.
"""
function mine_association_rules(db::VSADatabase; min_confidence::Float64=0.3,
                                match_threshold::Float64=0.05)
    rules = Tuple{String, String, Float64}[]  # (antecedent, consequent, confidence)

    # Every (field, value) pair that belongs to a categorical encoder.
    candidates = Tuple{String, String}[]
    for (field_name, enc) in db.encoders
        enc isa CategoricalEncoder || continue
        for cat in enc.categories
            push!(candidates, (field_name, cat))
        end
    end
    length(candidates) < 2 && return rules

    # Role and encoded value of each candidate, computed once instead of once
    # per record / per rule.
    roles = [db.field_roles[f] for (f, _) in candidates]
    targets = [encode(db.encoders[f], v, db.dim) for (f, v) in candidates]

    for (a, (f_a, v_a)) in enumerate(candidates)
        role_a, target_a = roles[a], targets[a]

        # Records whose extracted antecedent field resonates with the value.
        hits = findall(db.records) do record
            similarity(bind(record, role_a), target_a) > match_threshold
        end
        isempty(hits) && continue

        # Superposition of the matching records only.
        sub_pop = bundle([db.records[idx] for idx in hits])

        for (c, (f_c, v_c)) in enumerate(candidates)
            f_c == f_a && continue  # a field cannot predict itself

            extracted = bind(sub_pop, roles[c])
            confidence = Float64(similarity(extracted, targets[c]))
            if confidence > min_confidence
                push!(rules, ("$(f_a)=$(v_a)", "$(f_c)=$(v_c)", confidence))
            end
        end
    end

    sort!(rules, by = rule -> -rule[3])
    return rules
end
# --- 2. FIELD CORRELATION ---
# Do two fields move together? For every record, extract both fields and bind
# the two extractions into one joint atom; bundle all joint atoms, then measure
# how strongly each joint atom resonates with that bundle.
"""
    mine_field_correlations(db)

Score every unordered pair of fields in `db.field_roles` by joint coherence.

For each pair, every record is bound with both roles and the two results are
bound together into a joint atom. The score is the mean similarity between each
joint atom and the bundle of all joint atoms. Pairs are scored only when the
database holds at least two records.

Returns a `Vector{Tuple{String, String, Float64}}` of `(field1, field2, score)`
sorted by descending score; it is empty when fewer than two fields exist.
"""
function mine_field_correlations(db::VSADatabase)
    correlations = Tuple{String, String, Float64}[]
    field_names = collect(keys(db.field_roles))
    nfields = length(field_names)
    nfields < 2 && return correlations

    for i in 1:nfields, j in (i + 1):nfields
        f1 = field_names[i]
        f2 = field_names[j]
        role1 = db.field_roles[f1]
        role2 = db.field_roles[f2]

        # One joint atom per record: both field extractions bound together.
        joint_atoms = Atom[
            bind(bind(record, role1), bind(record, role2)) for record in db.records
        ]
        length(joint_atoms) < 2 && continue

        # Left-to-right accumulation starting from 0.0.
        joint_super = bundle(joint_atoms)
        total = foldl(joint_atoms; init=0.0) do acc, atom
            acc + Float64(similarity(atom, joint_super))
        end
        push!(correlations, (f1, f2, total / length(joint_atoms)))
    end

    sort!(correlations, by = entry -> -entry[3])
    return correlations
end
# --- 3. CO-OCCURRENCE DISCOVERY ---
# Find which categorical values tend to appear together in records.
# Example: "Male + Hypertension" vs. "Female + Hypertension" — which is more common?
"""
    mine_cooccurrence(db, field_a, field_b; match_threshold=0.05)

Count how many records match each combination of a `field_a` category and a
`field_b` category.

A record matches a category when the similarity between its extracted field and
the encoded category is strictly greater than `match_threshold`.

# Returns
A `Vector{Tuple{String, String, Int}}` of `(category_a, category_b, count)`
entries with `count > 0`, sorted by descending count. The vector is empty when
either field has no encoder or either encoder is not a `CategoricalEncoder`.
"""
function mine_cooccurrence(db::VSADatabase, field_a::String, field_b::String;
                           match_threshold::Float64=0.05)
    # The same typed container is returned on every path.
    results = Tuple{String, String, Int}[]

    (haskey(db.encoders, field_a) && haskey(db.encoders, field_b)) || return results
    enc_a, enc_b = db.encoders[field_a], db.encoders[field_b]
    (enc_a isa CategoricalEncoder && enc_b isa CategoricalEncoder) || return results
    role_a, role_b = db.field_roles[field_a], db.field_roles[field_b]

    # For one field: extract it from every record once, then build one Boolean
    # match mask per category. This replaces re-extracting every record for
    # every (category_a, category_b) combination.
    function category_masks(enc, role)
        extracted = [bind(record, role) for record in db.records]
        masks = Tuple{String, Vector{Bool}}[]
        for cat in enc.categories
            target = encode(enc, cat, db.dim)
            mask = Bool[similarity(ext, target) > match_threshold for ext in extracted]
            push!(masks, (cat, mask))
        end
        return masks
    end

    masks_a = category_masks(enc_a, role_a)
    masks_b = category_masks(enc_b, role_b)

    for (cat_a, mask_a) in masks_a, (cat_b, mask_b) in masks_b
        both = count(hit -> hit[1] && hit[2], zip(mask_a, mask_b))
        both > 0 && push!(results, (cat_a, cat_b, both))
    end

    sort!(results, by = entry -> -entry[3])
    return results
end
# --- 4. POPULATION DRIFT ---
# Is one subset of records fundamentally different from another?
# Split the population in two, bundle each part, and measure the similarity
# between the two bundles.
"""
    detect_drift(db; split_at=0)

Compare the first part of `db.records` with the remainder.

The records are split after index `split_at`, or at the midpoint when
`split_at` is not positive. The split index is clamped so both parts are
non-empty. Each part is bundled and the similarity of the two bundles is
returned as a `Float64`: a low value indicates drift, a high value a stable
population. Returns `0.0` when the database holds fewer than four records.
"""
function detect_drift(db::VSADatabase; split_at::Int=0)
    total = length(db.records)
    if total < 4
        return 0.0
    end

    requested = split_at > 0 ? split_at : div(total, 2)
    boundary = clamp(requested, 1, total - 1)

    head_pop = bundle(db.records[1:boundary])
    tail_pop = bundle(db.records[(boundary + 1):end])
    return Float64(similarity(head_pop, tail_pop))  # Low = drift, High = stable
end
# --- 5. ANOMALY DETECTION ---
# Records with LOW similarity to the population superposition are anomalies.
"""
    detect_anomalies(db; threshold=0.15)

Split the records into anomalies and normals by similarity to the population
superposition.

The superposition is built with `build_superposition!` when `db.superposition[]`
is `nothing`. A record is an anomaly when its similarity to the superposition is
strictly below `threshold`.

# Returns
`(anomalies, normals)`, both `Vector{Tuple{String, Float64}}` of
`(record_id, similarity)`. `anomalies` is sorted by ascending similarity;
`normals` keeps record order. Both are empty when no superposition is available.
"""
function detect_anomalies(db::VSADatabase; threshold::Float64=0.15)
    anomalies = Tuple{String, Float64}[]
    normals = Tuple{String, Float64}[]

    if db.superposition[] === nothing
        build_superposition!(db)
    end
    pop = db.superposition[]
    # Return the typed (empty) vectors so the result type is the same on every path.
    pop === nothing && return anomalies, normals

    for (i, record) in enumerate(db.records)
        sim = Float64(similarity(record, pop))
        bucket = sim < threshold ? anomalies : normals
        push!(bucket, (db.record_ids[i], sim))
    end

    sort!(anomalies, by = entry -> entry[2])  # most anomalous first
    return anomalies, normals
end
# --- 6. CLUSTER DISCOVERY (Unsupervised) ---
# Find natural clusters without knowing the categories in advance.
# Greedy resonance: pick a seed, pull in the records similar to it, repeat.
"""
    discover_clusters(db; min_sim=0.6, min_size=2)

Greedily group records around seed records.

Seeds are visited in order of decreasing similarity to the population
superposition (built on demand; the first record stands in when none is
available). Each unassigned seed collects every unassigned record whose
similarity to it is at least `min_sim`. A group of at least `min_size` records
becomes a cluster and its members are marked assigned; a smaller group is
discarded and its records stay available to later seeds.

# Returns
A `Vector{Vector{Tuple{String, Float64}}}`: one vector per cluster holding
`(record_id, similarity_to_seed)` sorted by descending similarity. The result is
empty when the database holds fewer than `min_size` records.
"""
function discover_clusters(db::VSADatabase; min_sim::Float64=0.6, min_size::Int=2)
    clusters = Vector{Vector{Tuple{String, Float64}}}()
    n = length(db.records)
    n < min_size && return clusters  # typed empty result, same type as the normal path

    if db.superposition[] === nothing
        build_superposition!(db)
    end
    pop = db.superposition[]

    # Most central records (highest similarity to the population) seed first.
    centrality = [
        Float64(similarity(db.records[i], pop !== nothing ? pop : db.records[1]))
        for i in 1:n
    ]
    seed_order = sortperm(centrality, by = s -> -s)

    assigned = falses(n)
    for seed_idx in seed_order
        assigned[seed_idx] && continue
        seed = db.records[seed_idx]

        # Collect candidate members by index. Nothing is marked assigned until
        # the group is accepted, so a rejected group needs no release step and
        # no ID-to-index lookup (which picks the wrong record for repeated IDs).
        member_idx = Int[]
        member_sim = Float64[]
        for j in 1:n
            assigned[j] && continue
            sim = Float64(similarity(seed, db.records[j]))
            if sim >= min_sim
                push!(member_idx, j)
                push!(member_sim, sim)
            end
        end

        length(member_idx) >= min_size || continue

        assigned[member_idx] .= true
        cluster = Tuple{String, Float64}[
            (db.record_ids[j], s) for (j, s) in zip(member_idx, member_sim)
        ]
        sort!(cluster, by = member -> -member[2])
        push!(clusters, cluster)
    end

    return clusters
end
# --- 7. FIELD CLUSTERING (Known field) ---
# Group records by a known categorical field via resonance extraction.
"""
    cluster_by_field(db, field_name; match_threshold=0.05)

Group record IDs by the categories of one categorical field.

A record joins a category's group when the similarity between its extracted
field and the encoded category is strictly greater than `match_threshold`.

# Returns
A `Dict{String, Vector{String}}` mapping each category with at least one match
to the matching record IDs in record order. The dictionary is empty when the
field has no role or encoder, or when its encoder is not a `CategoricalEncoder`.
"""
function cluster_by_field(db::VSADatabase, field_name::String;
                          match_threshold::Float64=0.05)
    clusters = Dict{String, Vector{String}}()

    if !haskey(db.field_roles, field_name) || !haskey(db.encoders, field_name)
        return clusters
    end
    enc = db.encoders[field_name]
    enc isa CategoricalEncoder || return clusters
    role = db.field_roles[field_name]

    # Extract the field from every record once; the result is the same for
    # every category, so it does not belong inside the category loop.
    extracted = [bind(record, role) for record in db.records]

    for cat in enc.categories
        cat_atom = encode(enc, cat, db.dim)
        members = String[
            db.record_ids[i] for (i, ext) in enumerate(extracted)
            if similarity(ext, cat_atom) > match_threshold
        ]
        if !isempty(members)
            clusters[cat] = members
        end
    end

    return clusters
end
# --- 8. POPULATION COHERENCE ---
# Mean pairwise similarity among a chosen set of records.
"""
    measure_coherence(db, record_ids)

Return the mean pairwise similarity of the records named by `record_ids`.

IDs that do not occur in `db.record_ids` are ignored; each remaining ID resolves
to its first occurrence. Returns `1.0` when fewer than two IDs resolve.
"""
function measure_coherence(db::VSADatabase, record_ids::Vector{String})
    # Resolve each requested ID to a record position, skipping unknown IDs.
    positions = Int[]
    for id in record_ids
        pos = findfirst(==(id), db.record_ids)
        pos === nothing || push!(positions, pos)
    end
    length(positions) < 2 && return 1.0

    atoms = [db.records[p] for p in positions]
    m = length(atoms)

    # Visit every unordered pair (i < j) exactly once.
    acc = 0.0
    npairs = 0
    for i in 1:m, j in (i + 1):m
        acc += Float64(similarity(atoms[i], atoms[j]))
        npairs += 1
    end

    return npairs > 0 ? acc / npairs : 0.0
end
# --- DETERMINISM PROOF ---
# Run the same exact-match query twice and check that both runs agree.
"""
    prove_determinism(db, field_name, value)

Run `query_exact(db, field_name, value; top_k=5)` twice and compare the results.

The runs count as identical when they have the same length and, position by
position, the first elements are equal and the second elements do not differ by
more than `1e-10`.

Returns `(identical, run1, run2)`.
"""
function prove_determinism(db::VSADatabase, field_name::String, value::Any)
    first_run = query_exact(db, field_name, value; top_k=5)
    second_run = query_exact(db, field_name, value; top_k=5)

    identical = false
    if length(first_run) == length(second_run)
        # `all` stops at the first mismatching position.
        identical = all(1:length(first_run)) do i
            a = first_run[i]
            b = second_run[i]
            !(a[1] != b[1] || abs(a[2] - b[2]) > 1e-10)
        end
    end

    return identical, first_run, second_run
end