File size: 7,958 Bytes
fa6bd30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
using Printf

# ==============================================================================
# VSA CSV LOADER — Universal CSV → VDBTable Pipeline
# Handles: quoted fields, multi-value cells, auto type detection
# Produces: VDBTable ready for VSA queries & SQL
# ==============================================================================

# --- CSV Parsing (handles quoted commas) ---

"""
    csv_parse_line(line::AbstractString) -> Vector{String}

Split one CSV record into its fields. Commas inside double-quoted
sections are treated as literal characters, and an escaped quote
(`""`) inside a quoted field yields a single literal `"` per
RFC 4180. Each field is stripped of surrounding whitespace.
"""
function csv_parse_line(line::AbstractString)
    fields = String[]
    buf = IOBuffer()
    in_quotes = false
    i = firstindex(line)
    last_i = lastindex(line)
    while i <= last_i
        c = line[i]
        if c == '"'
            if in_quotes && i < last_i && line[nextind(line, i)] == '"'
                # RFC 4180 escaped quote: "" inside a quoted field is a literal "
                write(buf, '"')
                i = nextind(line, i)  # consume the second quote of the pair
            else
                in_quotes = !in_quotes
            end
        elseif c == ',' && !in_quotes
            # Field boundary; take!() both extracts and resets the buffer.
            push!(fields, strip(String(take!(buf))))
        else
            write(buf, c)
        end
        i = nextind(line, i)
    end
    # Flush the final field (a record always has at least one field).
    push!(fields, strip(String(take!(buf))))
    return fields
end

"""
    csv_read(path::String; max_rows::Int=0) -> (headers, rows)

Read a CSV file, returning its header fields and a vector of data
rows (each a `Vector{String}`). Blank lines are skipped, and every
row is padded or truncated to the header width. `max_rows > 0` caps
the number of data rows read; `0` means read everything.
"""
function csv_read(path::String; max_rows::Int=0)
    lines = readlines(path)
    if isempty(lines)
        return (String[], Vector{Vector{String}}())
    end

    headers = csv_parse_line(first(lines))
    ncols = length(headers)
    rows = Vector{Vector{String}}()

    # Line index to stop at (header occupies line 1).
    stop = max_rows > 0 ? min(max_rows + 1, length(lines)) : length(lines)

    for idx in 2:stop
        raw = strip(lines[idx])
        isempty(raw) && continue
        cells = csv_parse_line(raw)
        # Normalize the row to exactly `ncols` entries.
        if length(cells) < ncols
            append!(cells, fill("", ncols - length(cells)))
        elseif length(cells) > ncols
            resize!(cells, ncols)
        end
        push!(rows, cells)
    end

    return (headers, rows)
end

# --- Auto Type Detection ---
# Decides if a column is numeric (THERMO) or categorical (CAT)

"""
    ColumnProfile

Per-column statistics gathered from sampled CSV rows, used to choose
an encoder for the column.

Fields:
- `name`: column header as it appears in the CSV
- `is_numeric`: true when the column should get a thermometer
  (numeric) encoder rather than a categorical one
- `min_val` / `max_val`: range of the cell values that parsed as
  `Float64` (defaults 0.0 / 100.0 when nothing parsed)
- `unique_values`: distinct non-empty raw strings seen in the column
- `sample_count`: number of non-empty cells profiled
"""
struct ColumnProfile
    name::String
    is_numeric::Bool
    min_val::Float64
    max_val::Float64
    unique_values::Set{String}
    sample_count::Int
end

"""
    profile_columns(headers, rows) -> Vector{ColumnProfile}

Inspect every column of the parsed CSV and build a `ColumnProfile`
for it. A column is flagged numeric when more than 80% of its
non-empty cells parse as `Float64` AND it has more than 10 distinct
values (so low-cardinality numeric codes stay categorical).
"""
function profile_columns(headers::Vector{String}, rows::Vector{Vector{String}})
    out = ColumnProfile[]

    for (col, colname) in enumerate(headers)
        # Skip short rows that don't reach this column.
        cells = [r[col] for r in rows if col <= length(r)]
        nonblank = [c for c in cells if !isempty(c)]

        # Collect the values that parse cleanly as floats.
        parsed = Float64[]
        for cell in nonblank
            num = tryparse(Float64, cell)
            num === nothing || push!(parsed, num)
        end

        ratio = isempty(nonblank) ? 0.0 : length(parsed) / length(nonblank)
        distinct = Set(nonblank)
        numeric = ratio > 0.8 && length(distinct) > 10

        # Fallback range [0, 100] when no cell was numeric.
        lo = isempty(parsed) ? 0.0 : minimum(parsed)
        hi = isempty(parsed) ? 100.0 : maximum(parsed)

        push!(out, ColumnProfile(colname, numeric, lo, hi, distinct, length(nonblank)))
    end

    return out
end

# --- Build VDBTable from CSV ---

"""
    csv_to_table(reg, path; dim, id_col, max_rows, max_categories, table_name)

Load a CSV file into a VDBTable.

Column types are auto-detected via `profile_columns`: numeric columns get a
`ThermometerEncoder` (100 levels, detected range widened by a 10% margin);
all other columns get a `CategoricalEncoder` capped at the `max_categories`
most frequent values. The `id_col` column supplies record IDs and is NOT
encoded as a field.

- `reg`: VSARegistry for atom allocation
- `path`: Path to CSV file
- `dim`: Vector dimension (default 2048)
- `id_col`: Column index to use as record ID (default 1)
- `max_rows`: Maximum rows to load (0 = all)
- `max_categories`: Maximum unique values for a CAT encoder (default 500)
- `table_name`: Table name; empty means derive it from the file name

Returns `(table, inserted)` where `inserted` is the number of rows stored.
Throws an error when the CSV contains no data rows.
"""
function csv_to_table(reg::VSARegistry, path::String;
                      dim::Int=2048,
                      id_col::Int=1,
                      max_rows::Int=0,
                      max_categories::Int=500,
                      table_name::String="")
    # Read CSV
    headers, rows = csv_read(path; max_rows=max_rows)
    isempty(rows) && error("Empty CSV: $path")
    
    # Auto-detect table name from filename ("data.csv" -> "data"),
    # sanitized to [a-zA-Z0-9_] so it is safe as an identifier.
    if isempty(table_name)
        table_name = replace(basename(path), ".csv" => "")
        table_name = replace(table_name, r"[^a-zA-Z0-9_]" => "_")
    end
    
    # Profile columns to decide numeric vs. categorical per column.
    profiles = profile_columns(headers, rows)
    
    # Build schema (skip the ID column from encoding)
    schema = Tuple{String, VSAEncoder}[]
    col_indices = Int[]  # Which CSV column index maps to which schema column
    
    for (j, prof) in enumerate(profiles)
        j == id_col && continue  # Skip ID column
        
        enc = if prof.is_numeric
            # Thermometer encoding for numeric data; widen the observed
            # range by 10% on each side so out-of-sample values still fit.
            margin = (prof.max_val - prof.min_val) * 0.1
            min_v = prof.min_val - margin
            max_v = prof.max_val + margin
            ThermometerEncoder(reg, prof.name, min_v, max_v; levels=100)
        else
            # Categorical encoding — collect top N categories
            cats = collect(prof.unique_values)
            if length(cats) > max_categories
                # Take top by frequency (recounted over the loaded rows);
                # ties are broken arbitrarily by Dict iteration order.
                freq = Dict{String,Int}()
                for row in rows
                    j <= length(row) && !isempty(row[j]) && (freq[row[j]] = get(freq, row[j], 0) + 1)
                end
                sorted = sort(collect(freq), by=x -> -x.second)
                cats = [x.first for x in sorted[1:min(max_categories, length(sorted))]]
            end
            CategoricalEncoder(reg, prof.name, cats)
        end
        
        push!(schema, (prof.name, enc))
        push!(col_indices, j)
    end
    
    # Create table
    table = create_table(reg, table_name, dim, schema)
    
    # Insert rows; missing/blank IDs get a synthetic "row_N" identifier.
    inserted = 0
    for row in rows
        id = id_col <= length(row) ? row[id_col] : "row_$(inserted+1)"
        isempty(id) && (id = "row_$(inserted+1)")
        
        # Build the field dict for this record; blank cells are omitted
        # rather than encoded, as are numeric cells that fail to parse.
        fields = Dict{String, Any}()
        for (si, ci) in enumerate(col_indices)
            ci <= length(row) || continue
            val_str = row[ci]
            isempty(val_str) && continue
            
            col_name = schema[si][1]
            enc = schema[si][2]
            
            if enc isa ThermometerEncoder
                v = tryparse(Float64, val_str)
                v !== nothing && (fields[col_name] = v)
            else
                fields[col_name] = val_str
            end
        end
        
        vdb_insert!(table, id, fields)
        inserted += 1
    end
    
    return table, inserted
end

# --- Summary ---

"""
    csv_summary(path::String; max_rows::Int=5)

Print a short profile of a CSV file to stdout: file name, sampled
row count, column count, and the detected type of each column
(numeric range, or number of distinct categorical values).
"""
function csv_summary(path::String; max_rows::Int=5)
    headers, rows = csv_read(path; max_rows=max_rows)
    profiles = profile_columns(headers, rows)

    println("  File: $(basename(path))")
    println("  Rows: $(length(rows)) (sampled for profiling)")
    println("  Columns: $(length(headers))")
    println("  ─────────────────────────────────────────────")
    for p in profiles
        if p.is_numeric
            kind = @sprintf("NUMERIC [%.1f, %.1f]", p.min_val, p.max_val)
        else
            kind = "CATEGORICAL ($(length(p.unique_values)) unique)"
        end
        @printf("  %-25s %s\n", p.name, kind)
    end
end

# --- Bulk Load Helper ---
# Load multiple CSVs into a single VSAEngine

"""
    csv_load_all!(engine, paths; max_rows=0, max_categories=500)

Load every CSV in `paths` into `engine`, registering each resulting
table in `engine.tables` under its name and printing a one-line
status per file. Returns a Dict mapping table name to a
`(table=..., rows=...)` NamedTuple.
"""
function csv_load_all!(engine::VSAEngine, paths::Vector{String};
                       max_rows::Int=0, max_categories::Int=500)
    loaded = Dict{String, NamedTuple{(:table, :rows), Tuple{VDBTable, Int}}}()

    for csv_path in paths
        # @elapsed times the load; the begin-block does not introduce a
        # new scope, so tbl / nrecords remain visible afterwards.
        elapsed = @elapsed begin
            tbl, nrecords = csv_to_table(engine.reg, csv_path;
                                         dim=engine.dim,
                                         max_rows=max_rows,
                                         max_categories=max_categories)
            engine.tables[tbl.name] = tbl
        end

        @printf("  ✓ %-25s  %5d records  (%.3f s)\n", tbl.name, nrecords, elapsed)
        loaded[tbl.name] = (table=tbl, rows=nrecords)
    end

    return loaded
end