arthikrangan commited on
Commit
aa96015
·
verified ·
1 Parent(s): f50d923

Upload 2 files

Browse files
Files changed (2) hide show
  1. source_to_duckdb.py +781 -0
  2. streamlit_app.py +74 -18
source_to_duckdb.py ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Excel/CSV → DuckDB ingestion (generic, robust, multi-table, unified lineage)
3
+
4
+ - Supports Excel (.xlsx/.xlsm/.xls) and CSV (first row = headers)
5
+ - Hierarchical headers with merged-cell parent context (titles removed)
6
+ - Merged rows/cols resolved to master (top-left) value for consistent replication
7
+ - Multiple tables detected ONLY when separated by at least one completely empty row
8
+ - Footer detection (ignore trailing notes/summaries)
9
+ - Pivot detection (skip pivot-looking rows; optional sheet-level pivot/charthood skip)
10
+ - Optional LLM inference for unnamed columns and table titles (EXCEL_LLM_INFER=1)
11
+ - One DuckDB table per detected table block (Excel) or per file (CSV)
12
+ - Unified lineage tables for BOTH Excel and CSV:
13
+ __file_schema (file_name, sheet_name, table_name, column_ordinal, original_name, sql_column)
14
+ __file_tables (file_name, sheet_name, table_name, block_index, start_row, end_row,
15
+ header_rows_json, inferred_title, original_title_text)
16
+
17
+ Usage:
18
+ python source_to_duckdb.py --file /path/file.xlsx --duckdb /path/out.duckdb
19
+ python source_to_duckdb.py --file /path/file.csv --duckdb /path/out.duckdb
20
+ """
21
+
22
+ import os
23
+ import re
24
+ import sys
25
+ import json
26
+ import hashlib
27
+ from pathlib import Path
28
+ from typing import List, Tuple, Dict
29
+
30
+ from openpyxl import load_workbook
31
+ from openpyxl.worksheet.worksheet import Worksheet
32
+
33
+ # ------------------------- Small utilities -------------------------
34
+
35
+ def _nonempty(vals):
36
+ return [v for v in vals if v not in (None, "")]
37
+
38
+ def _is_numlike(x):
39
+ if isinstance(x, (int, float)):
40
+ return True
41
+ s = str(x).strip().replace(",", "")
42
+ if s.endswith("%"):
43
+ s = s[:-1]
44
+ if not s:
45
+ return False
46
+ if any(c.isalpha() for c in s):
47
+ return False
48
+ try:
49
+ float(s); return True
50
+ except: return False
51
+
52
+ def _is_year_token(x):
53
+ if isinstance(x, int) and 1800 <= x <= 2100: return True
54
+ s = str(x).strip()
55
+ return s.isdigit() and 1800 <= int(s) <= 2100
56
+
57
def sanitize_table_name(name: str) -> str:
    """Turn an arbitrary title into a safe SQL table identifier."""
    cleaned = re.sub(r"[^\w]", "_", str(name))
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")
    if cleaned and not cleaned[0].isalpha():
        # Identifiers must start with a letter.
        cleaned = "table_" + cleaned
    if not cleaned:
        return "sheet_data"
    return cleaned
62
+
63
def clean_col_name(s: str) -> str:
    """Normalise a header cell into a snake_case column name."""
    text = re.sub(r"[^\w\s%#‰]", "", str(s).strip())
    # Spell out the symbols we chose to keep.
    for symbol, word in (("%", " pct"), ("‰", " permille"), ("#", " count ")):
        text = text.replace(symbol, word)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+", "_", text)
    text = re.sub(r"_+", "_", text).strip("_")
    if text and text[0].isdigit():
        text = "col_" + text
    return text if text else "unnamed_column"
71
+
72
+ def ensure_unique(names):
73
+ seen = {}; out = []
74
+ for n in names:
75
+ base = (n or "unnamed_column").lower()
76
+ if base not in seen:
77
+ seen[base] = 0; out.append(n)
78
+ else:
79
+ i = seen[base] + 1
80
+ while f"{n}_{i}".lower() in seen: i += 1
81
+ seen[base] = i; out.append(f"{n}_{i}")
82
+ seen[(f"{n}_{i}").lower()] = 0
83
+ return out
84
+
85
def compose_col(parts):
    """Join hierarchical header parts into one column name.

    Blank parts and consecutive case-insensitive repeats (from merged parent
    cells replicated downward) are dropped before normalising.
    """
    kept = []
    last = None
    for part in parts:
        if not part:
            continue
        text = str(part).strip()
        if last is not None and text.lower() == last.lower():
            continue
        kept.append(text)
        last = text
    if not kept:
        return ""
    return clean_col_name("_".join(kept))
94
+
95
+ # ------------------------- Heuristics & detection -------------------------
96
+
97
def is_probably_footer(cells):
    """Heuristic: a sparse row starting with note/source-style text, or a
    sparse row with one long text blob, is probably a trailing footer."""
    filled = [(idx, val) for idx, val in enumerate(cells) if val not in (None, "")]
    if not filled:
        return False
    if len(filled) > 2:
        return False
    text = " ".join(str(val) for _, val in filled).strip().lower()
    if text.startswith(("note", "notes", "source", "summary", "disclaimer")):
        return True
    return len(text) > 50
105
+
106
def is_probably_data(cells, num_cols):
    """Decide whether a row looks like a data row (vs header/title noise)."""
    filled = [v for v in cells if v not in (None, "")]
    if not filled:
        return False
    numeric = [v for v in filled if _is_numlike(v)]
    n_num = len(numeric)
    n_text = len(filled) - n_num
    fill_ratio = len(filled) / max(1, num_cols)
    # Several year tokens next to several labels is a header row, not data.
    if n_num >= 2 and n_text >= 2 and all(_is_year_token(v) for v in numeric):
        return False
    if n_num >= max(2, n_text):
        return True
    if fill_ratio >= 0.6 and n_num >= 2:
        return True
    lead = str(filled[0]).strip().lower()
    return lead in ("total", "totals", "grand total")
119
+
120
# Strings that strongly suggest a pivot-table layout rather than raw data.
PIVOT_MARKERS = {"row labels","column labels","values","grand total","report filter","filters","∑ values","σ values","Σ values"}

def is_pivot_marker_string(s: str) -> bool:
    """True when *s* matches a known pivot-table marker or aggregate label."""
    if not s:
        return False
    token = str(s).strip().lower()
    if token in PIVOT_MARKERS:
        return True
    if token.startswith(("sum of ", "count of ", "avg of ", "average of ")):
        return True
    return token.endswith(" total") or token.startswith("total ")
128
+
129
def is_pivot_row(cells) -> bool:
    """A row is pivot-like if it contains a pivot marker, or at least two
    aggregate-style labels such as "Sum of ..."."""
    labels = [str(v).strip() for v in cells if v not in (None, "")]
    if not labels:
        return False
    if any(is_pivot_marker_string(label) for label in labels):
        return True
    prefixes = ("sum of", "count of", "avg of", "average of", "min of", "max of")
    hits = sum(1 for label in labels if label.lower().startswith(prefixes))
    return hits >= 2
135
+
136
def is_pivot_or_chart_sheet(ws: Worksheet) -> bool:
    """Heuristically flag sheets that hold charts/pivots rather than raw data.

    Checks, in order: embedded chart objects, openpyxl's pivot list, pivot
    marker rows in a bounded top-left scan window, and finally the sheet name.
    NOTE(review): `_charts` / `_pivots` are private openpyxl attributes —
    verify they still exist when upgrading openpyxl.
    """
    try:
        if getattr(ws, "_charts", None): return True
    except Exception: pass
    if hasattr(ws, "_pivots") and getattr(ws, "_pivots"): return True
    # Scan only up to 40x20 cells so huge sheets stay fast.
    scan_rows = min(ws.max_row, 40); scan_cols = min(ws.max_column, 20)
    pivotish = 0
    for r in range(1, scan_rows+1):
        row = [ws.cell(r,c).value for c in range(1, scan_cols+1)]
        if is_pivot_row(row):
            pivotish += 1
            # Two pivot-looking rows are enough to classify the whole sheet.
            if pivotish >= 2: return True
    name = (ws.title or "").lower()
    if any(k in name for k in ("pivot","dashboard","chart","charts")): return True
    return False
151
+
152
+ def _samples_for_column(rows, col_idx, max_items=20):
153
+ vals = []
154
+ for row in rows:
155
+ if col_idx < len(row):
156
+ v = row[col_idx]
157
+ if v not in (None, ""): vals.append(v)
158
+ if len(vals) >= max_items: break
159
+ return vals
160
+
161
def _heuristic_infer_col_name(samples):
    """Guess a generic column name from sample values.

    Returns one of "year", "percentage", "count", "value", "question",
    "range", "category", or None when no pattern dominates.
    """
    if not samples:
        return None
    if sum(1 for v in samples if _is_year_token(v)) >= max(2, int(0.8 * len(samples))):
        return "year"
    pct_hits = 0
    for v in samples:
        s = str(v).strip()
        if s.endswith("%"):
            pct_hits += 1
        else:
            try:
                f = float(s.replace(",", ""))
            except ValueError:  # was a bare except that also hid real bugs
                continue
            # NOTE(review): the first range is subsumed by the second; kept
            # as-is to preserve behaviour — any value in [0, 100] half-counts.
            if 0 <= f <= 1.0 or 0 <= f <= 100:
                pct_hits += 0.5
    if pct_hits >= max(2, int(0.7 * len(samples))):
        return "percentage"
    if sum(1 for v in samples if _is_numlike(v)) >= max(3, int(0.7 * len(samples))):
        intish = 0
        for v in samples:
            try:
                # int(float("inf")) raises OverflowError, hence the second type.
                if float(str(v).replace(",", "")) == int(float(str(v).replace(",", ""))):
                    intish += 1
            except (ValueError, OverflowError):
                pass
        if intish >= max(2, int(0.6 * len(samples))):
            return "count"
        return "value"
    uniq = {str(v).strip().lower() for v in samples}
    if len(uniq) <= 3 and max(len(str(v)) for v in samples) >= 30:
        return "question"
    if sum(1 for v in samples if re.search(r"\d", str(v)) and ("-" in str(v) or "–" in str(v))) >= max(2, int(0.6 * len(samples))):
        return "range"
    if len(uniq) < max(5, int(0.5 * len(samples))):
        return "category"
    return None
187
+
188
def used_bounds(ws: Worksheet) -> Tuple[int,int,int,int]:
    """Return (min_row, max_row, min_col, max_col) of cells with visible
    content; (1, 0, 1, 0) when the sheet is effectively empty."""
    top = left = None
    bottom = right = 0
    for row in ws.iter_rows():
        for cell in row:
            value = cell.value
            if value is None or str(value).strip() == "":
                continue
            if top is None or cell.row < top:
                top = cell.row
            bottom = max(bottom, cell.row)
            if left is None or cell.column < left:
                left = cell.column
            right = max(right, cell.column)
    if top is None:
        # Empty sheet sentinel: max bounds below min bounds.
        return 1, 0, 1, 0
    return top, bottom, left, right
200
+
201
def build_merged_master_map(ws: Worksheet):
    """Map every (row, col) inside a merged range to its top-left master cell."""
    master_of = {}
    for rng in ws.merged_cells.ranges:
        anchor = (rng.min_row, rng.min_col)
        for row in range(rng.min_row, rng.max_row + 1):
            for col in range(rng.min_col, rng.max_col + 1):
                master_of[(row, col)] = anchor
    return master_of
210
+
211
def build_value_grid(ws: Worksheet, min_row: int, max_row: int, min_col: int, max_col: int):
    """Materialise the sheet as a 2-D list of values, replicating each merged
    range's master (top-left) value into every cell the range covers."""
    master_of = build_merged_master_map(ws)
    grid = []
    for r in range(min_row, max_row + 1):
        row_out = []
        for c in range(min_col, max_col + 1):
            anchor = master_of.get((r, c))
            if anchor:
                row_out.append(ws.cell(anchor[0], anchor[1]).value)
            else:
                row_out.append(ws.cell(r, c).value)
        grid.append(row_out)
    return grid
225
+
226
def row_vals_from_grid(grid, r, min_row):
    """Return the grid row for absolute sheet row *r* (grid starts at *min_row*)."""
    offset = r - min_row
    return grid[offset]
228
+
229
def is_empty_row_vals(vals):
    """True when every cell in the row is None or the empty string."""
    return all(v in (None, "") for v in vals)
231
+
232
def is_title_like_row_vals(vals, total_cols=20):
    """Heuristic: does this row look like a (possibly merged) table title?"""
    filled = _nonempty(vals)
    if not filled:
        return False
    # A single populated cell is almost always a title.
    if len(filled) == 1:
        return True
    coverage = len(filled) / max(1, total_cols)
    if coverage <= 0.2 and all(isinstance(v, str) and len(str(v)) > 20 for v in filled):
        return True
    # All cells carrying the same text = a merged title replicated sideways.
    if len({str(v).strip().lower() for v in filled}) == 1:
        return True
    known_titles = {"local currency unit per us dollar","exchange rate","average annual exchange rate"}
    return any(str(v).strip().lower() in known_titles for v in filled)
243
+
244
def is_header_candidate_row_vals(vals, total_cols=20):
    """Heuristic: could this row be a table header row?

    Empty and title-like rows are rejected.  Rows mixing >=2 year tokens with
    text labels (e.g. "Country | 2019 | 2020") are headers.  Numeric-heavy
    rows only qualify when most of their numbers are year tokens; otherwise
    any multi-cell row (or one with >=2 distinct text labels) qualifies.
    """
    vals_ne = _nonempty(vals)
    if not vals_ne: return False
    if is_title_like_row_vals(vals, total_cols): return False
    nums = sum(1 for v in vals_ne if _is_numlike(v))
    years = sum(1 for v in vals_ne if _is_year_token(v))
    has_text = any(not _is_numlike(v) for v in vals_ne)
    if years >= 2 and has_text: return True
    # Mostly-numeric row: header only if >=60% of its cells are years.
    if nums >= max(2, len(vals_ne)-nums): return years >= max(2, int(0.6*len(vals_ne)))
    uniq_labels = {str(v).strip().lower() for v in vals_ne if not _is_numlike(v)}
    return (len(vals_ne) >= 2) or (len(uniq_labels) >= 2)
255
+
256
def detect_tables_fast(ws: Worksheet, grid, min_row, max_row, min_col, max_col):
    """Scan the value grid top-to-bottom and return detected table blocks.

    Each block is a dict with keys: header_rows (absolute row numbers),
    data_start, data_end (inclusive data row range), and title_text (text of
    a title-like row just above the header, if any).  Blocks are separated by
    completely empty rows; footer/pivot rows terminate a block.
    """
    blocks = []
    if is_pivot_or_chart_sheet(ws): return blocks
    total_cols = max_col - min_col + 1
    r = min_row
    while r <= max_row:
        vals = row_vals_from_grid(grid, r, min_row)
        # Skip rows that cannot start a table.
        if is_empty_row_vals(vals) or is_title_like_row_vals(vals, total_cols) or is_pivot_row(vals):
            r += 1; continue
        if not is_probably_data(vals, total_cols):
            r += 1; continue
        data_start = r
        header_rows = []
        # Walk upward from the first data row to find a contiguous header run.
        up = data_start - 1
        while up >= min_row:
            vup = row_vals_from_grid(grid, up, min_row)
            if is_empty_row_vals(vup): break
            if is_title_like_row_vals(vup, total_cols) or is_pivot_row(vup):
                up -= 1; continue
            if is_header_candidate_row_vals(vup, total_cols):
                header_rows = []
                hdr_row = up
                # Extend the header run upward while rows keep qualifying.
                while hdr_row >= min_row:
                    hdr_vals = row_vals_from_grid(grid, hdr_row, min_row)
                    if is_empty_row_vals(hdr_vals): break
                    if is_header_candidate_row_vals(hdr_vals, total_cols):
                        header_rows.insert(0, hdr_row); hdr_row -= 1
                    else: break
            break
        # Walk downward to find the last data row of the block.
        data_end = data_start
        rr = data_start + 1
        while rr <= max_row:
            v = row_vals_from_grid(grid, rr, min_row)
            if is_probably_footer(v) or is_pivot_row(v): break
            if is_empty_row_vals(v): break
            if is_probably_data(v, total_cols) or is_header_candidate_row_vals(v, total_cols):
                data_end = rr
            rr += 1
        # Look up to 3 rows above the header for a title-like row.
        title_text = None
        if header_rows:
            top = header_rows[0]
            for tr in range(max(min_row, top-3), top):
                tv = row_vals_from_grid(grid, tr, min_row)
                if is_title_like_row_vals(tv, total_cols):
                    first = next((str(x).strip() for x in tv if x not in (None,"")), None)
                    if first: title_text = first
                    break
        # Keep blocks with a header or at least two data rows.
        if (header_rows or data_end - data_start >= 1) and data_start <= data_end:
            blocks.append({"header_rows": header_rows, "data_start": data_start, "data_end": data_end, "title_text": title_text})
        # Resume scanning after the block and any trailing empty rows.
        r = data_end + 1
        while r <= max_row and is_empty_row_vals(row_vals_from_grid(grid, r, min_row)):
            r += 1
    return blocks
309
+
310
def expand_headers_from_grid(grid, header_rows, min_row, min_col, eff_max_col):
    """Build a header matrix (one list per header row), forward-filling blank
    cells from the left so merged parent labels replicate across columns."""
    if not header_rows:
        return []
    matrix = []
    for r in header_rows:
        source = row_vals_from_grid(grid, r, min_row)
        row = []
        for c in range(eff_max_col):
            cell = source[c]
            row.append("" if cell is None else str(cell).strip())
        carry = ""
        for i, cell in enumerate(row):
            if cell == "" and i > 0:
                row[i] = carry
            else:
                carry = cell
        matrix.append(row)
    return matrix
322
+
323
def sheet_block_to_df_fast(ws, grid, min_row, max_row, min_col, max_col, header_rows, data_start, data_end):
    """Convert one detected block into (DataFrame, header_matrix, kept_cols).

    Falls back to the row above data_start (or data_start itself) as a header
    when none was detected; returns an empty DataFrame when no header can be
    found.  Columns that are empty across all data rows are dropped, and
    remaining unnamed columns get heuristic names from their sample values.
    """
    import pandas as pd
    total_cols = max_col - min_col + 1
    # Fallback 1: row immediately above the data looks like a header.
    if (not header_rows) and data_start and data_start > min_row:
        prev = row_vals_from_grid(grid, data_start - 1, min_row)
        if is_header_candidate_row_vals(prev, total_cols):
            header_rows = [data_start - 1]
    # Fallback 2: first data row is actually the header of the row below it.
    if (not header_rows) and data_start:
        cur = row_vals_from_grid(grid, data_start, min_row)
        nxt = row_vals_from_grid(grid, data_start + 1, min_row) if data_start + 1 <= max_row else []
        if is_header_candidate_row_vals(cur, total_cols) and is_probably_data(nxt, total_cols):
            header_rows = [data_start]; data_start += 1
    if not header_rows or data_start is None or data_end is None or data_end < data_start:
        import pandas as _pd
        return _pd.DataFrame(), [], []
    # Rightmost column actually used by the header or data rows.
    def used_upto_col():
        maxc = 0
        for r in list(header_rows) + list(range(data_start, data_end+1)):
            vals = row_vals_from_grid(grid, r, min_row)
            for c_off in range(total_cols):
                v = vals[c_off]
                if v not in (None, ""): maxc = max(maxc, c_off+1)
        return maxc or total_cols
    eff_max_col = used_upto_col()
    header_mat = expand_headers_from_grid(grid, header_rows, min_row, min_col, eff_max_col)
    # A header level that is sparse or one repeated label is a title, not a level.
    def is_title_level(values):
        total = len(values)
        filled = [str(v).strip() for v in values if v not in (None, "")]
        if total == 0: return False
        coverage = len(filled) / total
        if coverage <= 0.2 and len(filled) <= 2: return True
        if filled:
            uniq = {v.lower() for v in filled}
            if len(uniq) == 1:
                label = next(iter(uniq))
                dom = sum(1 for v in values if isinstance(v,str) and v.strip().lower() == label)
                if dom / total >= 0.6: return True
        return False
    usable_levels = [i for i in range(len(header_mat)) if not is_title_level(header_mat[i])]
    if not usable_levels and header_mat: usable_levels = [len(header_mat) - 1]
    cols = []
    for c_off in range(eff_max_col):
        parts = [header_mat[l][c_off] for l in range(usable_levels[0], usable_levels[-1]+1)] if usable_levels else []
        cols.append(compose_col(parts))
    cols = ensure_unique([clean_col_name(x) for x in cols])
    data_rows = []
    for r in range(data_start, data_end+1):
        vals = row_vals_from_grid(grid, r, min_row)
        row = [vals[c_off] for c_off in range(eff_max_col)]
        # A footer row ends the data even inside the declared range.
        if is_probably_footer(row): break
        data_rows.append(row[:len(cols)])
    if not data_rows:
        import pandas as _pd
        return _pd.DataFrame(columns=cols), header_mat, cols
    # Drop columns that are empty in every data row.
    keep_mask = [any(row[i] not in (None, "") for row in data_rows) for i in range(len(cols))]
    kept_cols = [c for c,k in zip(cols, keep_mask) if k]
    trimmed_rows = [[v for v,k in zip(row, keep_mask) if k] for row in data_rows]
    import pandas as pd
    df = pd.DataFrame(trimmed_rows, columns=kept_cols)
    # Give remaining "unnamed_column*" columns heuristic names from samples.
    if any(str(c).startswith("unnamed_column") for c in df.columns):
        new_names = list(df.columns)
        for idx, name in enumerate(list(df.columns)):
            if not str(name).startswith("unnamed_column"): continue
            samples = _samples_for_column(trimmed_rows, idx, max_items=20)
            guess = _heuristic_infer_col_name(samples)
            if guess: new_names[idx] = clean_col_name(guess)
        df.columns = ensure_unique([clean_col_name(x) for x in new_names])
    return df, header_mat, kept_cols
391
+
392
+ # ------------------------- Optional LLM title inference -------------------------
393
+
394
def _llm_infer_table_title(header_mat, sample_rows, sheet_name):
    """Optionally ask an OpenAI chat model for a short table title.

    Returns None unless EXCEL_LLM_INFER=1 and OPENAI_API_KEY are set, and on
    any API or parsing failure — the whole function is best-effort by design.
    """
    if os.environ.get("EXCEL_LLM_INFER","0") != "1": return None
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key: return None
    # Compose up to 10 flattened header names and 5x6 sample cells as context.
    headers = []
    if header_mat:
        for c in range(len(header_mat[0])):
            parts = [header_mat[l][c] for l in range(len(header_mat))]
            parts = [p for p in parts if p]
            if parts: headers.append("_".join(parts))
        headers = headers[:10]
    samples = [[str(x) for x in r[:6]] for r in sample_rows[:5]]
    prompt = (
        "Propose a short, human-readable title for a data table.\n"
        "Keep it 3-6 words, Title Case, no punctuation at the end.\n"
        f"Sheet: {sheet_name}\nHeaders: {headers}\nRow samples: {samples}\n"
        "Answer with JSON: {\"title\": \"...\"}"
    )
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model=os.environ.get("OPENAI_MODEL","gpt-4o-mini"),
            messages=[{"role":"user","content":prompt}], temperature=0.2,
        )
        text = resp.choices[0].message.content.strip()
    except Exception:
        return None
    import re as _re, json as _json
    # Extract the first {...} blob; models often wrap the JSON in prose.
    m = _re.search(r"\{.*\}", text, re.S)
    if not m: return None
    try:
        obj = _json.loads(m.group(0)); title = obj.get("title","").strip()
        return title or None
    except Exception: return None
429
+
430
+ def _heuristic_table_title(header_mat, sheet_name, idx):
431
+ if header_mat:
432
+ parts = []
433
+ levels = len(header_mat)
434
+ cols = len(header_mat[0]) if header_mat else 0
435
+ for c in range(min(6, cols)):
436
+ colparts = [header_mat[l][c] for l in range(min(levels, 2)) if header_mat[l][c]]
437
+ if colparts: parts.extend(colparts)
438
+ if parts:
439
+ base = " ".join(dict.fromkeys(parts))
440
+ return base[:60]
441
+ return f"{sheet_name} Table {idx}"
442
+
443
def infer_table_title(header_mat, sample_rows, sheet_name, idx):
    """Prefer an LLM-proposed title when enabled; otherwise the heuristic one."""
    fallback = _heuristic_table_title(header_mat, sheet_name, idx)
    llm_title = _llm_infer_table_title(header_mat, sample_rows, sheet_name)
    if llm_title:
        return llm_title
    return fallback
447
+
448
+ # ------------------------- Unified lineage helpers -------------------------
449
+
450
# Unified lineage table names shared by the Excel and CSV ingestion paths.
FILE_SCHEMA_TABLE = "__file_schema"
FILE_TABLES_TABLE = "__file_tables"

def ensure_lineage_tables(con):
    """Create the two lineage tables if they do not exist yet (idempotent)."""
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS {FILE_SCHEMA_TABLE} (
            file_name TEXT,
            sheet_name TEXT,
            table_name TEXT,
            column_ordinal INTEGER,
            original_name TEXT,
            sql_column TEXT
        )
    """)
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS {FILE_TABLES_TABLE} (
            file_name TEXT,
            sheet_name TEXT,
            table_name TEXT,
            block_index INTEGER,
            start_row INTEGER,
            end_row INTEGER,
            header_rows_json TEXT,
            inferred_title TEXT,
            original_title_text TEXT
        )
    """)
477
+
478
def record_table_schema(con, file_name, sheet_name, table_name, columns):
    """
    Replace the column-lineage rows for one table in __file_schema.

    columns: list of tuples (column_ordinal, original_name, sql_column)
    sheet_name may be None for CSV sources, requiring a NULL-safe delete.
    """
    ensure_lineage_tables(con)
    # DuckDB doesn't support `IS ?` with NULL; branch the delete
    if sheet_name is None:
        con.execute(
            f"DELETE FROM {FILE_SCHEMA_TABLE} WHERE file_name = ? AND sheet_name IS NULL AND table_name = ?",
            [file_name, table_name],
        )
    else:
        con.execute(
            f"DELETE FROM {FILE_SCHEMA_TABLE} WHERE file_name = ? AND sheet_name = ? AND table_name = ?",
            [file_name, sheet_name, table_name],
        )
    con.executemany(
        f"INSERT INTO {FILE_SCHEMA_TABLE} (file_name, sheet_name, table_name, column_ordinal, original_name, sql_column) VALUES (?, ?, ?, ?, ?, ?)",
        [(file_name, sheet_name, table_name, i, orig, sql) for (i, orig, sql) in columns],
    )
498
+
499
def record_table_block(con, file_name, sheet_name, table_name, block_index, start_row, end_row, header_rows_json, inferred_title, original_title_text):
    """Delete-then-insert the __file_tables metadata row for one table block.

    sheet_name may be None for CSV sources, requiring a NULL-safe delete.
    A missing block_index defaults to 0; row bounds may be None.
    """
    ensure_lineage_tables(con)
    # DuckDB doesn't support `IS ?` with NULL; branch the delete
    if sheet_name is None:
        con.execute(
            f"DELETE FROM {FILE_TABLES_TABLE} WHERE file_name = ? AND sheet_name IS NULL AND table_name = ? AND block_index = ?",
            [file_name, table_name, int(block_index) if block_index is not None else 0],
        )
    else:
        con.execute(
            f"DELETE FROM {FILE_TABLES_TABLE} WHERE file_name = ? AND sheet_name = ? AND table_name = ? AND block_index = ?",
            [file_name, sheet_name, table_name, int(block_index) if block_index is not None else 0],
        )
    con.execute(
        f"""INSERT INTO {FILE_TABLES_TABLE}
        (file_name, sheet_name, table_name, block_index, start_row, end_row, header_rows_json, inferred_title, original_title_text)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        [
            file_name, sheet_name, table_name,
            int(block_index) if block_index is not None else 0,
            int(start_row) if start_row is not None else None,
            int(end_row) if end_row is not None else None,
            header_rows_json, inferred_title, original_title_text
        ]
    )
524
+
525
+ # --- block coalescing to avoid nested/overlapping duplicates ---
526
def coalesce_blocks(blocks: List[Dict]) -> List[Dict]:
    """Keep only maximal blocks: drop any block whose data row range is fully
    contained in an already-kept block (nested duplicate detection)."""
    if not blocks:
        return blocks
    ordered = sorted(blocks, key=lambda blk: (blk["data_start"], blk["data_end"]))
    kept: List[Dict] = []
    for blk in ordered:
        contained = any(
            blk["data_start"] >= other["data_start"] and blk["data_end"] <= other["data_end"]
            for other in kept
        )
        if not contained:
            kept.append(blk)
    return kept
536
+
537
+ # ------------------------- Persistence: Excel -------------------------
538
+
539
def persist(excel_path, duckdb_path):
    """Ingest every detectable table block of an Excel workbook into DuckDB.

    One DuckDB table per block; lineage rows are written to __file_schema and
    __file_tables.  Exits the process on unreadable input or a missing duckdb
    package.
    """
    try:
        from duckdb import connect
    except ImportError:
        print("Error: DuckDB library not installed. Install with: pip install duckdb"); sys.exit(1)
    try:
        # data_only=True reads cached formula results instead of formulas.
        wb = load_workbook(excel_path, data_only=True)
    except FileNotFoundError:
        print(f"Error: Excel file not found: {excel_path}"); sys.exit(1)
    except Exception as e:
        print(f"Error loading Excel file: {e}"); sys.exit(1)

    file_name = Path(excel_path).name
    db_path = Path(duckdb_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    new_db = not db_path.exists()
    con = connect(str(db_path))
    if new_db: print(f"Created new DuckDB at: {db_path}")

    # Ensure unified lineage tables exist
    ensure_lineage_tables(con)

    used_names = set(); total_tables = 0; total_rows = 0

    for sheet in wb.sheetnames:
        ws = wb[sheet]
        try:
            # Chartsheets are not Worksheet instances and carry no cell data.
            if not isinstance(ws, Worksheet):
                print(f"Skipping chartsheet: {sheet}"); continue
        except Exception: pass
        if is_pivot_or_chart_sheet(ws):
            print(f"Skipping pivot/chart-like sheet: {sheet}"); continue

        min_row, max_row, min_col, max_col = used_bounds(ws)
        if max_row < min_row: continue  # empty-sheet sentinel from used_bounds
        grid = build_value_grid(ws, min_row, max_row, min_col, max_col)

        blocks = detect_tables_fast(ws, grid, min_row, max_row, min_col, max_col)
        blocks = coalesce_blocks(blocks)
        if not blocks: continue

        # per-sheet content hash set to avoid identical duplicate content
        seen_content = set()

        for idx, blk in enumerate(blocks, start=1):
            df, header_mat, kept_cols = sheet_block_to_df_fast(
                ws, grid, min_row, max_row, min_col, max_col,
                blk["header_rows"], blk["data_start"], blk["data_end"]
            )
            if df.empty: continue

            # Content hash (stable CSV representation)
            csv_bytes = df.to_csv(index=False).encode("utf-8")
            h = hashlib.sha256(csv_bytes).hexdigest()
            if h in seen_content:
                print(f"Skipping duplicate content on sheet {sheet} (block {idx})")
                continue
            seen_content.add(h)

            # Build original composite header names for lineage mapping
            original_cols = []
            if header_mat:
                levels = len(header_mat)
                cols = len(header_mat[0]) if header_mat else 0
                for c in range(cols):
                    parts = [header_mat[l][c] for l in range(levels)]
                    original_cols.append("_".join([p for p in parts if p]))
            else:
                original_cols = list(df.columns)
                while len(original_cols) < len(df.columns): original_cols.append("unnamed")

            title_orig = blk.get("title_text")
            title = title_orig or infer_table_title(header_mat, df.values.tolist(), sheet, idx)
            candidate = title if title else f"{sheet} Table {idx}"
            table = ensure_unique_table_name(used_names, candidate)

            # Create/replace table via a registered temp view of the DataFrame
            con.execute(f'DROP TABLE IF EXISTS "{table}"')
            con.register(f"{table}_temp", df)
            con.execute(f'CREATE TABLE "{table}" AS SELECT * FROM {table}_temp')
            con.unregister(f"{table}_temp")

            # Record lineage (schema + block)
            schema_rows = []
            for cidx, (orig, sqlc) in enumerate(zip(original_cols[:len(df.columns)], df.columns), start=1):
                schema_rows.append((cidx, str(orig), str(sqlc)))
            record_table_schema(
                con,
                file_name=file_name,
                sheet_name=sheet,
                table_name=table,
                columns=schema_rows,
            )
            record_table_block(
                con,
                file_name=file_name,
                sheet_name=sheet,
                table_name=table,
                block_index=idx,
                start_row=int(blk["data_start"]),
                end_row=int(blk["data_end"]),
                header_rows_json=json.dumps(blk["header_rows"]),
                inferred_title=title if title else None,
                original_title_text=title_orig if title_orig else None,
            )

            print(f"Created table {table} from sheet {sheet} with {len(df)} rows and {len(df.columns)} columns.")
            total_tables += 1; total_rows += len(df)

    con.close()
    print(f"""\n✅ Completed.
- Created {total_tables} tables with {total_rows} total rows
- Column lineage: {FILE_SCHEMA_TABLE}
- Block metadata: {FILE_TABLES_TABLE}""")
653
+
654
+ # ------------------------- Persistence: CSV -------------------------
655
+
656
def persist_csv(csv_path, duckdb_path):
    """
    Ingest a single CSV file into DuckDB AND write lineage, aligned with Excel.
    - First row is headers.
    - One table named from the CSV file name.
    - Cleans headers and ensures uniqueness.
    - Records lineage in __file_schema and __file_tables using the unified schema (with file_name).
    """
    import pandas as pd
    from duckdb import connect

    csv_path = Path(csv_path)
    if not csv_path.exists():
        print(f"Error: CSV file not found: {csv_path}")
        sys.exit(1)

    # Keep original header names for lineage before cleaning
    try:
        # utf-8-sig strips a leading BOM if present.
        df_raw = pd.read_csv(csv_path, header=0, encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Fall back to pandas' default encoding handling.
        df_raw = pd.read_csv(csv_path, header=0)

    original_headers = list(df_raw.columns)

    # Clean/normalize column names
    def _clean_hdr(s):
        s = str(s) if s is not None else ""
        s = s.strip()
        s = re.sub(r"\s+", " ", s)
        return clean_col_name(s)

    cleaned_cols = ensure_unique([_clean_hdr(c) for c in original_headers])
    df = df_raw.copy()
    df.columns = cleaned_cols

    # Compute table name from file name
    table = sanitize_table_name(csv_path.stem)

    # Open / create DuckDB
    db_path = Path(duckdb_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    new_db = not db_path.exists()

    con = connect(str(db_path))
    if new_db:
        print(f"Created new DuckDB at: {db_path}")

    # Ensure unified lineage tables (with file_name) exist
    ensure_lineage_tables(con)

    # Create/replace the data table via a registered temp view
    con.execute(f'DROP TABLE IF EXISTS "{table}"')
    con.register(f"{table}_temp_df", df)
    con.execute(f'CREATE TABLE "{table}" AS SELECT * FROM {table}_temp_df')
    con.unregister(f"{table}_temp_df")

    # Write lineage
    file_name = csv_path.name
    sheet_name = None  # CSV has no sheet
    block_index = 1  # single block/table for CSV
    start_row = 2  # header is row 1, data starts at 2
    end_row = len(df) + 1  # header + data rows
    header_rows_json = "[1]"  # header row index list as JSON
    inferred_title = None
    original_title_text = None

    # Map original->sql columns
    schema_rows = []
    for i, (orig, sql) in enumerate(zip(original_headers, cleaned_cols), start=1):
        schema_rows.append((i, str(orig), str(sql)))

    record_table_schema(
        con,
        file_name=file_name,
        sheet_name=sheet_name,
        table_name=table,
        columns=schema_rows
    )
    record_table_block(
        con,
        file_name=file_name,
        sheet_name=sheet_name,
        table_name=table,
        block_index=block_index,
        start_row=start_row,
        end_row=end_row,
        header_rows_json=header_rows_json,
        inferred_title=inferred_title,
        original_title_text=original_title_text
    )

    print(f'Created table {table} from CSV "{csv_path.name}" with {len(df)} rows and {len(df.columns)} columns.')
    con.close()
749
+
750
+ # ------------------------- CLI -------------------------
751
+
752
def ensure_unique_table_name(existing: set, name: str) -> str:
    """Sanitise *name*, suffix it with _2, _3, ... until unused, record the
    chosen name in *existing* (mutated in place), and return it."""
    base = sanitize_table_name(name) or "table"
    if base not in existing:
        existing.add(base)
        return base
    counter = 2
    candidate = f"{base}_{counter}"
    while candidate in existing:
        counter += 1
        candidate = f"{base}_{counter}"
    existing.add(candidate)
    return candidate
759
+
760
def main():
    """CLI entry point: route --file to the Excel or CSV ingester."""
    import argparse
    parser = argparse.ArgumentParser(description="Excel/CSV → DuckDB (unified --file + lineage).")
    parser.add_argument("--file", required=True, help="Path to .xlsx/.xlsm/.xls or .csv")
    parser.add_argument("--duckdb", required=True, help="Path to DuckDB file")
    args = parser.parse_args()

    if not os.path.exists(args.file):
        print(f"Error: file not found: {args.file}")
        sys.exit(1)

    suffix = Path(args.file).suffix.lower()
    if suffix in (".xlsx", ".xlsm", ".xls"):
        persist(args.file, args.duckdb)
    elif suffix == ".csv":
        persist_csv(args.file, args.duckdb)
    else:
        print("Error: unsupported file type. Use .xlsx/.xlsm/.xls or .csv")
        sys.exit(2)
779
+
780
+ if __name__ == "__main__":
781
+ main()
streamlit_app.py CHANGED
@@ -21,7 +21,7 @@ st.set_page_config(page_title="Excel → Dataset", page_icon="📊", layout="wid
21
  PRIMARY_DIR = Path(__file__).parent.resolve()
22
  UPLOAD_DIR = PRIMARY_DIR / "uploads"
23
  DB_DIR = PRIMARY_DIR / "dbs"
24
- SCRIPT_PATH = PRIMARY_DIR / "excel_to_duckdb.py" # must be colocated
25
 
26
  UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
27
  DB_DIR.mkdir(parents=True, exist_ok=True)
@@ -82,7 +82,18 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
82
  if names:
83
  return names
84
  except Exception:
85
- pass
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # 2) duckdb_tables()
88
  try:
@@ -103,11 +114,22 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
103
  if names:
104
  return names
105
  except Exception:
106
- pass
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # 3) Fallback to metadata table
109
  try:
110
- meta = con.execute("SELECT DISTINCT table_name FROM __excel_tables").fetchall()
111
  names = []
112
  for (t,) in meta:
113
  try:
@@ -117,7 +139,19 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
117
  continue
118
  return names
119
  except Exception:
120
- return []
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  def get_columns(con: duckdb.DuckDBPyConnection, table: str) -> List[Tuple[str,str]]:
123
  # Normalize table name for information_schema lookup
@@ -202,7 +236,7 @@ def table_mapping(con: duckdb.DuckDBPyConnection, user_tables: List[str]) -> Dic
202
  try:
203
  rows = con.execute(
204
  "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
205
- "FROM __excel_tables ORDER BY block_index, start_row"
206
  ).fetchall()
207
  for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
208
  if table_name not in want_names:
@@ -210,14 +244,25 @@ def table_mapping(con: duckdb.DuckDBPyConnection, user_tables: List[str]) -> Dic
210
  title = inferred_title or original_title_text or 'untitled'
211
  mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
212
  except Exception:
213
- pass
 
 
 
 
 
 
 
 
 
 
 
214
  return mapping
215
 
216
  def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict], max_cols: int = 8) -> Dict[str, List[str]]:
217
  """ Return up to max_cols original column names per table_name (normalized) for LLM hints. """
218
  samples: Dict[str, List[str]] = {}
219
  try:
220
- rows = con.execute("SELECT sheet_name, table_name, column_ordinal, original_name FROM __excel_schema ORDER BY sheet_name, table_name, column_ordinal").fetchall()
221
  for sheet_name, table_name, ordn, orig in rows:
222
  if table_name not in mapping:
223
  continue
@@ -225,7 +270,18 @@ def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict
225
  if orig and len(lst) < max_cols:
226
  lst.append(str(orig))
227
  except Exception:
228
- pass
 
 
 
 
 
 
 
 
 
 
 
229
  return samples
230
 
231
  # ---------- OpenAI ----------
@@ -261,7 +317,7 @@ Context (JSON):
261
  return resp.choices[0].message.content.strip()
262
 
263
  # ---------- Orchestration ----------
264
- def run_ingestion_pipeline(xlsx_path: Path, db_path: Path, log_placeholder):
265
  # Combined log function
266
  log_lines: List[str] = []
267
  def _append(line: str):
@@ -283,7 +339,7 @@ def run_ingestion_pipeline(xlsx_path: Path, db_path: Path, log_placeholder):
283
  env = os.environ.copy()
284
  env["PYTHONIOENCODING"] = "utf-8"
285
 
286
- cmd = [sys.executable, str(SCRIPT_PATH), "--excel", str(xlsx_path), "--duckdb", str(db_path)]
287
  try:
288
  proc = subprocess.Popen(
289
  cmd, cwd=str(PRIMARY_DIR),
@@ -358,10 +414,10 @@ def analyze_and_summarize(con: duckdb.DuckDBPyConnection):
358
  return overview_md, preview_items
359
 
360
  # ---------- UI flow ----------
361
- file = st.file_uploader("Upload an Excel file", type=["xlsx"])
362
 
363
  if file is None and not st.session_state.last_overview_md:
364
- st.info("Upload an .xlsx file to begin.")
365
 
366
  # Only show logs AFTER there is an upload or some result to show
367
  logs_placeholder = None
@@ -372,7 +428,7 @@ if file is not None or st.session_state.processing or st.session_state.last_over
372
  if file is not None:
373
  key = _file_key(file)
374
  stem = Path(file.name).stem
375
- saved_xlsx = UPLOAD_DIR / f"{stem}.xlsx"
376
  db_path = DB_DIR / f"{stem}.duckdb"
377
 
378
  # --- CLEAR state immediately on new upload ---
@@ -394,11 +450,11 @@ if file is not None:
394
  logs_placeholder = logs_exp.empty()
395
 
396
  # Save uploaded file
397
- with open(saved_xlsx, "wb") as f:
398
  f.write(file.getbuffer())
399
 
400
  try:
401
- con, app_log = run_ingestion_pipeline(saved_xlsx, db_path, logs_placeholder)
402
  # Analyze + overview
403
  app_log("[app] Analyzing data…")
404
  overview_md, preview_items = analyze_and_summarize(con)
@@ -505,7 +561,7 @@ if st.session_state.get("last_overview_md"):
505
  _db_path = _candidates[0] if _candidates else None
506
 
507
  if not _db_path or not Path(_db_path).exists():
508
- stream_placeholder.error("No dataset found. Please re-upload the Excel file in this session.")
509
  else:
510
  # Call agent lazily
511
  get_schema_summary, make_llm, answer_question = _lazy_imports()
@@ -571,4 +627,4 @@ if st.session_state.get("last_overview_md"):
571
  try:
572
  con2.close()
573
  except Exception:
574
- pass
 
21
  PRIMARY_DIR = Path(__file__).parent.resolve()
22
  UPLOAD_DIR = PRIMARY_DIR / "uploads"
23
  DB_DIR = PRIMARY_DIR / "dbs"
24
+ SCRIPT_PATH = PRIMARY_DIR / "source_to_duckdb.py" # must be colocated
25
 
26
  UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
27
  DB_DIR.mkdir(parents=True, exist_ok=True)
 
82
  if names:
83
  return names
84
  except Exception:
85
+ try:
86
+ rows = con.execute(
87
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
88
+ "FROM __excel_tables ORDER BY block_index, start_row"
89
+ ).fetchall()
90
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
91
+ if table_name not in want_names:
92
+ continue
93
+ title = inferred_title or original_title_text or 'untitled'
94
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
95
+ except Exception:
96
+ pass
97
 
98
  # 2) duckdb_tables()
99
  try:
 
114
  if names:
115
  return names
116
  except Exception:
117
+ try:
118
+ rows = con.execute(
119
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
120
+ "FROM __excel_tables ORDER BY block_index, start_row"
121
+ ).fetchall()
122
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
123
+ if table_name not in want_names:
124
+ continue
125
+ title = inferred_title or original_title_text or 'untitled'
126
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
127
+ except Exception:
128
+ pass
129
 
130
  # 3) Fallback to metadata table
131
  try:
132
+ meta = con.execute("SELECT DISTINCT table_name FROM __file_tables").fetchall()
133
  names = []
134
  for (t,) in meta:
135
  try:
 
139
  continue
140
  return names
141
  except Exception:
142
+ # Fallback to legacy excel metadata table if unified not present
143
+ try:
144
+ meta = con.execute("SELECT DISTINCT table_name FROM __excel_tables").fetchall()
145
+ names = []
146
+ for (t,) in meta:
147
+ try:
148
+ con.execute(f'SELECT 1 FROM "{t}" LIMIT 1').fetchone()
149
+ names.append(t)
150
+ except Exception:
151
+ continue
152
+ return names
153
+ except Exception:
154
+ return []
155
 
156
  def get_columns(con: duckdb.DuckDBPyConnection, table: str) -> List[Tuple[str,str]]:
157
  # Normalize table name for information_schema lookup
 
236
  try:
237
  rows = con.execute(
238
  "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
239
+ "FROM __file_tables ORDER BY block_index, start_row"
240
  ).fetchall()
241
  for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
242
  if table_name not in want_names:
 
244
  title = inferred_title or original_title_text or 'untitled'
245
  mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
246
  except Exception:
247
+ try:
248
+ rows = con.execute(
249
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
250
+ "FROM __excel_tables ORDER BY block_index, start_row"
251
+ ).fetchall()
252
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
253
+ if table_name not in want_names:
254
+ continue
255
+ title = inferred_title or original_title_text or 'untitled'
256
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
257
+ except Exception:
258
+ pass
259
  return mapping
260
 
261
  def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict], max_cols: int = 8) -> Dict[str, List[str]]:
262
  """ Return up to max_cols original column names per table_name (normalized) for LLM hints. """
263
  samples: Dict[str, List[str]] = {}
264
  try:
265
+ rows = con.execute("SELECT sheet_name, table_name, column_ordinal, original_name FROM __file_schema ORDER BY sheet_name, table_name, column_ordinal").fetchall()
266
  for sheet_name, table_name, ordn, orig in rows:
267
  if table_name not in mapping:
268
  continue
 
270
  if orig and len(lst) < max_cols:
271
  lst.append(str(orig))
272
  except Exception:
273
+ try:
274
+ rows = con.execute(
275
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
276
+ "FROM __excel_tables ORDER BY block_index, start_row"
277
+ ).fetchall()
278
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
279
+ if table_name not in want_names:
280
+ continue
281
+ title = inferred_title or original_title_text or 'untitled'
282
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
283
+ except Exception:
284
+ pass
285
  return samples
286
 
287
  # ---------- OpenAI ----------
 
317
  return resp.choices[0].message.content.strip()
318
 
319
  # ---------- Orchestration ----------
320
+ def run_ingestion_pipeline(file_path: Path, db_path: Path, log_placeholder):
321
  # Combined log function
322
  log_lines: List[str] = []
323
  def _append(line: str):
 
339
  env = os.environ.copy()
340
  env["PYTHONIOENCODING"] = "utf-8"
341
 
342
+ cmd = [sys.executable, str(SCRIPT_PATH), "--file", str(file_path), "--duckdb", str(db_path)]
343
  try:
344
  proc = subprocess.Popen(
345
  cmd, cwd=str(PRIMARY_DIR),
 
414
  return overview_md, preview_items
415
 
416
  # ---------- UI flow ----------
417
+ file = st.file_uploader("Upload an Excel or CSV file", type=["xlsx", "csv"])
418
 
419
  if file is None and not st.session_state.last_overview_md:
420
+ st.info("Upload a .xlsx or .csv file to begin.")
421
 
422
  # Only show logs AFTER there is an upload or some result to show
423
  logs_placeholder = None
 
428
  if file is not None:
429
  key = _file_key(file)
430
  stem = Path(file.name).stem
431
+ saved_file = UPLOAD_DIR / file.name
432
  db_path = DB_DIR / f"{stem}.duckdb"
433
 
434
  # --- CLEAR state immediately on new upload ---
 
450
  logs_placeholder = logs_exp.empty()
451
 
452
  # Save uploaded file
453
+ with open(saved_file, "wb") as f:
454
  f.write(file.getbuffer())
455
 
456
  try:
457
+ con, app_log = run_ingestion_pipeline(saved_file, db_path, logs_placeholder)
458
  # Analyze + overview
459
  app_log("[app] Analyzing data…")
460
  overview_md, preview_items = analyze_and_summarize(con)
 
561
  _db_path = _candidates[0] if _candidates else None
562
 
563
  if not _db_path or not Path(_db_path).exists():
564
+ stream_placeholder.error("No dataset found. Please re-upload the file in this session.")
565
  else:
566
  # Call agent lazily
567
  get_schema_summary, make_llm, answer_question = _lazy_imports()
 
627
  try:
628
  con2.close()
629
  except Exception:
630
+ pass