File size: 7,958 Bytes
fa6bd30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
using Printf

# ==============================================================================
# VSA CSV LOADER — Universal CSV → VDBTable Pipeline
# Handles: quoted fields, multi-value cells, auto type detection
# Produces: VDBTable ready for VSA queries & SQL
# ==============================================================================

# --- CSV Parsing (handles quoted commas) ---

"""
    csv_parse_line(line::AbstractString) -> Vector{String}

Split one CSV record into its fields. Commas inside double-quoted
sections are treated as literal characters, and an escaped quote
(`""`) inside a quoted field yields a single literal `"` per
RFC 4180. Each field is stripped of surrounding whitespace.
"""
function csv_parse_line(line::AbstractString)
    fields = String[]
    buf = IOBuffer()
    in_quotes = false
    i = firstindex(line)
    last_i = lastindex(line)
    while i <= last_i
        c = line[i]
        if c == '"'
            if in_quotes && i < last_i && line[nextind(line, i)] == '"'
                # RFC 4180 escaped quote: "" inside a quoted field is a literal "
                write(buf, '"')
                i = nextind(line, i)  # consume the second quote of the pair
            else
                in_quotes = !in_quotes
            end
        elseif c == ',' && !in_quotes
            # Field boundary; take!() both extracts and resets the buffer.
            push!(fields, strip(String(take!(buf))))
        else
            write(buf, c)
        end
        i = nextind(line, i)
    end
    # Flush the final field (a record always has at least one field).
    push!(fields, strip(String(take!(buf))))
    return fields
end

"""
    csv_read(path::String; max_rows::Int=0) -> (headers, rows)

Read a CSV file, returning its header fields and a vector of data
rows (each a `Vector{String}`). Blank lines are skipped, and every
row is padded or truncated to the header width. `max_rows > 0` caps
the number of data rows read; `0` means read everything.
"""
function csv_read(path::String; max_rows::Int=0)
    lines = readlines(path)
    if isempty(lines)
        return (String[], Vector{Vector{String}}())
    end

    headers = csv_parse_line(first(lines))
    ncols = length(headers)
    rows = Vector{Vector{String}}()

    # Line index to stop at (header occupies line 1).
    stop = max_rows > 0 ? min(max_rows + 1, length(lines)) : length(lines)

    for idx in 2:stop
        raw = strip(lines[idx])
        isempty(raw) && continue
        cells = csv_parse_line(raw)
        # Normalize the row to exactly `ncols` entries.
        if length(cells) < ncols
            append!(cells, fill("", ncols - length(cells)))
        elseif length(cells) > ncols
            resize!(cells, ncols)
        end
        push!(rows, cells)
    end

    return (headers, rows)
end

# --- Auto Type Detection ---
# Decides if a column is numeric (THERMO) or categorical (CAT)

"""
    ColumnProfile

Per-column statistics gathered from sampled CSV rows, used to choose
an encoder for the column.

Fields:
- `name`: column header as it appears in the CSV
- `is_numeric`: true when the column should get a thermometer
  (numeric) encoder rather than a categorical one
- `min_val` / `max_val`: range of the cell values that parsed as
  `Float64` (defaults 0.0 / 100.0 when nothing parsed)
- `unique_values`: distinct non-empty raw strings seen in the column
- `sample_count`: number of non-empty cells profiled
"""
struct ColumnProfile
    name::String
    is_numeric::Bool
    min_val::Float64
    max_val::Float64
    unique_values::Set{String}
    sample_count::Int
end

"""
    profile_columns(headers, rows) -> Vector{ColumnProfile}

Inspect every column of the parsed CSV and build a `ColumnProfile`
for it. A column is flagged numeric when more than 80% of its
non-empty cells parse as `Float64` AND it has more than 10 distinct
values (so low-cardinality numeric codes stay categorical).
"""
function profile_columns(headers::Vector{String}, rows::Vector{Vector{String}})
    out = ColumnProfile[]

    for (col, colname) in enumerate(headers)
        # Skip short rows that don't reach this column.
        cells = [r[col] for r in rows if col <= length(r)]
        nonblank = [c for c in cells if !isempty(c)]

        # Collect the values that parse cleanly as floats.
        parsed = Float64[]
        for cell in nonblank
            num = tryparse(Float64, cell)
            num === nothing || push!(parsed, num)
        end

        ratio = isempty(nonblank) ? 0.0 : length(parsed) / length(nonblank)
        distinct = Set(nonblank)
        numeric = ratio > 0.8 && length(distinct) > 10

        # Fallback range [0, 100] when no cell was numeric.
        lo = isempty(parsed) ? 0.0 : minimum(parsed)
        hi = isempty(parsed) ? 100.0 : maximum(parsed)

        push!(out, ColumnProfile(colname, numeric, lo, hi, distinct, length(nonblank)))
    end

    return out
end

# --- Build VDBTable from CSV ---

"""
    csv_to_table(reg, path; dim, id_col, max_rows, max_categories, table_name)

Load a CSV file into a VDBTable.

Column types are auto-detected via `profile_columns`: numeric columns get a
`ThermometerEncoder` (100 levels, detected range widened by a 10% margin);
all other columns get a `CategoricalEncoder` capped at the `max_categories`
most frequent values. The `id_col` column supplies record IDs and is NOT
encoded as a field.

- `reg`: VSARegistry for atom allocation
- `path`: Path to CSV file
- `dim`: Vector dimension (default 2048)
- `id_col`: Column index to use as record ID (default 1)
- `max_rows`: Maximum rows to load (0 = all)
- `max_categories`: Maximum unique values for a CAT encoder (default 500)
- `table_name`: Table name; empty means derive it from the file name

Returns `(table, inserted)` where `inserted` is the number of rows stored.
Throws an error when the CSV contains no data rows.
"""
function csv_to_table(reg::VSARegistry, path::String;
                      dim::Int=2048,
                      id_col::Int=1,
                      max_rows::Int=0,
                      max_categories::Int=500,
                      table_name::String="")
    # Read CSV
    headers, rows = csv_read(path; max_rows=max_rows)
    isempty(rows) && error("Empty CSV: $path")
    
    # Auto-detect table name from filename ("data.csv" -> "data"),
    # sanitized to [a-zA-Z0-9_] so it is safe as an identifier.
    if isempty(table_name)
        table_name = replace(basename(path), ".csv" => "")
        table_name = replace(table_name, r"[^a-zA-Z0-9_]" => "_")
    end
    
    # Profile columns to decide numeric vs. categorical per column.
    profiles = profile_columns(headers, rows)
    
    # Build schema (skip the ID column from encoding)
    schema = Tuple{String, VSAEncoder}[]
    col_indices = Int[]  # Which CSV column index maps to which schema column
    
    for (j, prof) in enumerate(profiles)
        j == id_col && continue  # Skip ID column
        
        enc = if prof.is_numeric
            # Thermometer encoding for numeric data; widen the observed
            # range by 10% on each side so out-of-sample values still fit.
            margin = (prof.max_val - prof.min_val) * 0.1
            min_v = prof.min_val - margin
            max_v = prof.max_val + margin
            ThermometerEncoder(reg, prof.name, min_v, max_v; levels=100)
        else
            # Categorical encoding — collect top N categories
            cats = collect(prof.unique_values)
            if length(cats) > max_categories
                # Take top by frequency (recounted over the loaded rows);
                # ties are broken arbitrarily by Dict iteration order.
                freq = Dict{String,Int}()
                for row in rows
                    j <= length(row) && !isempty(row[j]) && (freq[row[j]] = get(freq, row[j], 0) + 1)
                end
                sorted = sort(collect(freq), by=x -> -x.second)
                cats = [x.first for x in sorted[1:min(max_categories, length(sorted))]]
            end
            CategoricalEncoder(reg, prof.name, cats)
        end
        
        push!(schema, (prof.name, enc))
        push!(col_indices, j)
    end
    
    # Create table
    table = create_table(reg, table_name, dim, schema)
    
    # Insert rows; missing/blank IDs get a synthetic "row_N" identifier.
    inserted = 0
    for row in rows
        id = id_col <= length(row) ? row[id_col] : "row_$(inserted+1)"
        isempty(id) && (id = "row_$(inserted+1)")
        
        # Build the field dict for this record; blank cells are omitted
        # rather than encoded, as are numeric cells that fail to parse.
        fields = Dict{String, Any}()
        for (si, ci) in enumerate(col_indices)
            ci <= length(row) || continue
            val_str = row[ci]
            isempty(val_str) && continue
            
            col_name = schema[si][1]
            enc = schema[si][2]
            
            if enc isa ThermometerEncoder
                v = tryparse(Float64, val_str)
                v !== nothing && (fields[col_name] = v)
            else
                fields[col_name] = val_str
            end
        end
        
        vdb_insert!(table, id, fields)
        inserted += 1
    end
    
    return table, inserted
end

# --- Summary ---

"""
    csv_summary(path::String; max_rows::Int=5)

Print a short profile of a CSV file to stdout: file name, sampled
row count, column count, and the detected type of each column
(numeric range, or number of distinct categorical values).
"""
function csv_summary(path::String; max_rows::Int=5)
    headers, rows = csv_read(path; max_rows=max_rows)
    profiles = profile_columns(headers, rows)

    println("  File: $(basename(path))")
    println("  Rows: $(length(rows)) (sampled for profiling)")
    println("  Columns: $(length(headers))")
    println("  ─────────────────────────────────────────────")
    for p in profiles
        if p.is_numeric
            kind = @sprintf("NUMERIC [%.1f, %.1f]", p.min_val, p.max_val)
        else
            kind = "CATEGORICAL ($(length(p.unique_values)) unique)"
        end
        @printf("  %-25s %s\n", p.name, kind)
    end
end

# --- Bulk Load Helper ---
# Load multiple CSVs into a single VSAEngine

"""
    csv_load_all!(engine, paths; max_rows=0, max_categories=500)

Load every CSV in `paths` into `engine`, registering each resulting
table in `engine.tables` under its name and printing a one-line
status per file. Returns a Dict mapping table name to a
`(table=..., rows=...)` NamedTuple.
"""
function csv_load_all!(engine::VSAEngine, paths::Vector{String};
                       max_rows::Int=0, max_categories::Int=500)
    loaded = Dict{String, NamedTuple{(:table, :rows), Tuple{VDBTable, Int}}}()

    for csv_path in paths
        # @elapsed times the load; the begin-block does not introduce a
        # new scope, so tbl / nrecords remain visible afterwards.
        elapsed = @elapsed begin
            tbl, nrecords = csv_to_table(engine.reg, csv_path;
                                         dim=engine.dim,
                                         max_rows=max_rows,
                                         max_categories=max_categories)
            engine.tables[tbl.name] = tbl
        end

        @printf("  ✓ %-25s  %5d records  (%.3f s)\n", tbl.name, nrecords, elapsed)
        loaded[tbl.name] = (table=tbl, rows=nrecords)
    end

    return loaded
end