using Printf
# ==============================================================================
# VSA CSV LOADER — Universal CSV → VDBTable Pipeline
# Handles: quoted fields, multi-value cells, auto type detection
# Produces: VDBTable ready for VSA queries & SQL
# ==============================================================================
# --- CSV Parsing (handles quoted commas) ---
# Split one CSV line into fields, honoring double-quoted sections so commas
# inside quotes are preserved. Quote characters themselves are dropped and
# every field is whitespace-stripped. Always returns at least one field.
function csv_parse_line(line::AbstractString)
    out = String[]
    buf = IOBuffer()
    quoted = false
    for ch in line
        if ch == '"'
            # Toggle quoting; the quote char is never emitted.
            quoted = !quoted
        elseif ch == ',' && !quoted
            # Field boundary: flush the buffer (take! also resets it).
            push!(out, strip(String(take!(buf))))
        else
            write(buf, ch)
        end
    end
    # Trailing field (or the single field of an empty line).
    push!(out, strip(String(take!(buf))))
    return out
end
# Read a CSV file into `(headers, rows)`. The first line supplies the
# headers; blank data lines are skipped; every row is padded with "" or
# truncated so it matches the header width. `max_rows = 0` means no limit.
function csv_read(path::String; max_rows::Int=0)
    lines = readlines(path)
    isempty(lines) && return (String[], Vector{Vector{String}}())
    headers = csv_parse_line(first(lines))
    ncols = length(headers)
    # +1 accounts for the header line occupying lines[1].
    last_idx = max_rows > 0 ? min(max_rows + 1, length(lines)) : length(lines)
    rows = Vector{Vector{String}}()
    for idx in 2:last_idx
        raw = strip(lines[idx])
        isempty(raw) && continue
        fields = csv_parse_line(raw)
        nf = length(fields)
        if nf < ncols
            # Short row: pad with empty cells.
            append!(fields, fill("", ncols - nf))
        elseif nf > ncols
            # Long row: drop the surplus cells.
            resize!(fields, ncols)
        end
        push!(rows, fields)
    end
    return (headers, rows)
end
# --- Auto Type Detection ---
# Decides if a column is numeric (THERMO) or categorical (CAT)
# Per-column statistics gathered from the sampled rows; downstream code uses
# this to choose between a thermometer (numeric) and categorical encoder.
struct ColumnProfile
    name::String                 # header name from the CSV
    is_numeric::Bool             # true → treat as numeric (THERMO) column
    min_val::Float64             # smallest parsed value (0.0 when nothing parsed)
    max_val::Float64             # largest parsed value (100.0 fallback when nothing parsed)
    unique_values::Set{String}   # distinct non-empty raw cell strings
    sample_count::Int            # number of non-empty cells observed
end
# Build a ColumnProfile for every header. A column counts as numeric when
# more than 80% of its non-empty cells parse as Float64 AND it has more
# than 10 distinct values (so small numeric code sets stay categorical).
function profile_columns(headers::Vector{String}, rows::Vector{Vector{String}})
    profiles = ColumnProfile[]
    for (col, name) in enumerate(headers)
        cells = String[row[col] for row in rows if col <= length(row)]
        filled = filter(!isempty, cells)
        # Collect every cell that parses cleanly as a Float64.
        parsed = Float64[]
        for cell in filled
            x = tryparse(Float64, cell)
            x === nothing || push!(parsed, x)
        end
        ratio = isempty(filled) ? 0.0 : length(parsed) / length(filled)
        distinct = Set(filled)
        numeric = ratio > 0.8 && length(distinct) > 10
        # Fallbacks when no value parsed: [0.0, 100.0] default range.
        lo = isempty(parsed) ? 0.0 : minimum(parsed)
        hi = isempty(parsed) ? 100.0 : maximum(parsed)
        push!(profiles, ColumnProfile(name, numeric, lo, hi, distinct, length(filled)))
    end
    return profiles
end
# --- Build VDBTable from CSV ---
"""
csv_to_table(reg, path; dim, id_col, max_rows, max_categories)
Load a CSV file into a VDBTable.
- `reg`: VSARegistry for atom allocation
- `path`: Path to CSV file
- `dim`: Vector dimension (default 2048)
- `id_col`: Column index to use as record ID (default 1)
- `max_rows`: Maximum rows to load (0 = all)
- `max_categories`: Maximum unique values for a CAT encoder (default 500)
"""
function csv_to_table(reg::VSARegistry, path::String;
dim::Int=2048,
id_col::Int=1,
max_rows::Int=0,
max_categories::Int=500,
table_name::String="")
# Read CSV
headers, rows = csv_read(path; max_rows=max_rows)
isempty(rows) && error("Empty CSV: $path")
# Auto-detect table name from filename
if isempty(table_name)
table_name = replace(basename(path), ".csv" => "")
table_name = replace(table_name, r"[^a-zA-Z0-9_]" => "_")
end
# Profile columns
profiles = profile_columns(headers, rows)
# Build schema (skip the ID column from encoding)
schema = Tuple{String, VSAEncoder}[]
col_indices = Int[] # Which CSV column index maps to which schema column
for (j, prof) in enumerate(profiles)
j == id_col && continue # Skip ID column
enc = if prof.is_numeric
# Thermometer encoding for numeric data
margin = (prof.max_val - prof.min_val) * 0.1
min_v = prof.min_val - margin
max_v = prof.max_val + margin
ThermometerEncoder(reg, prof.name, min_v, max_v; levels=100)
else
# Categorical encoding — collect top N categories
cats = collect(prof.unique_values)
if length(cats) > max_categories
# Take top by frequency
freq = Dict{String,Int}()
for row in rows
j <= length(row) && !isempty(row[j]) && (freq[row[j]] = get(freq, row[j], 0) + 1)
end
sorted = sort(collect(freq), by=x -> -x.second)
cats = [x.first for x in sorted[1:min(max_categories, length(sorted))]]
end
CategoricalEncoder(reg, prof.name, cats)
end
push!(schema, (prof.name, enc))
push!(col_indices, j)
end
# Create table
table = create_table(reg, table_name, dim, schema)
# Insert rows
inserted = 0
for row in rows
id = id_col <= length(row) ? row[id_col] : "row_$(inserted+1)"
isempty(id) && (id = "row_$(inserted+1)")
fields = Dict{String, Any}()
for (si, ci) in enumerate(col_indices)
ci <= length(row) || continue
val_str = row[ci]
isempty(val_str) && continue
col_name = schema[si][1]
enc = schema[si][2]
if enc isa ThermometerEncoder
v = tryparse(Float64, val_str)
v !== nothing && (fields[col_name] = v)
else
fields[col_name] = val_str
end
end
vdb_insert!(table, id, fields)
inserted += 1
end
return table, inserted
end
# --- Summary ---
# Print a quick profile of a CSV file: row/column counts plus, per column,
# either its numeric range or its categorical cardinality. Only the first
# `max_rows` data rows are sampled for profiling.
function csv_summary(path::String; max_rows::Int=5)
    headers, rows = csv_read(path; max_rows=max_rows)
    profiles = profile_columns(headers, rows)
    println(" File: $(basename(path))")
    println(" Rows: $(length(rows)) (sampled for profiling)")
    println(" Columns: $(length(headers))")
    println(" ─────────────────────────────────────────────")
    for prof in profiles
        if prof.is_numeric
            label = @sprintf("NUMERIC [%.1f, %.1f]", prof.min_val, prof.max_val)
        else
            label = "CATEGORICAL ($(length(prof.unique_values)) unique)"
        end
        @printf(" %-25s %s\n", prof.name, label)
    end
end
# --- Bulk Load Helper ---
# Load multiple CSVs into a single VSAEngine
# Load several CSVs into one engine: each file becomes a table registered in
# `engine.tables` under its filename-derived name. Prints a per-file timing
# line and returns a Dict of table name => (table=..., rows=...).
function csv_load_all!(engine::VSAEngine, paths::Vector{String};
                       max_rows::Int=0, max_categories::Int=500)
    loaded = Dict{String, NamedTuple{(:table, :rows), Tuple{VDBTable, Int}}}()
    for csv_path in paths
        local table, n
        elapsed = @elapsed begin
            table, n = csv_to_table(engine.reg, csv_path;
                                    dim=engine.dim,
                                    max_rows=max_rows,
                                    max_categories=max_categories)
            engine.tables[table.name] = table
        end
        @printf(" ✓ %-25s %5d records (%.3f s)\n", table.name, n, elapsed)
        loaded[table.name] = (table=table, rows=n)
    end
    return loaded
end