Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- source_to_duckdb.py +781 -0
- streamlit_app.py +74 -18
source_to_duckdb.py
ADDED
|
@@ -0,0 +1,781 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Excel/CSV → DuckDB ingestion (generic, robust, multi-table, unified lineage)
|
| 3 |
+
|
| 4 |
+
- Supports Excel (.xlsx/.xlsm/.xls) and CSV (first row = headers)
|
| 5 |
+
- Hierarchical headers with merged-cell parent context (titles removed)
|
| 6 |
+
- Merged rows/cols resolved to master (top-left) value for consistent replication
|
| 7 |
+
- Multiple tables detected ONLY when separated by at least one completely empty row
|
| 8 |
+
- Footer detection (ignore trailing notes/summaries)
|
| 9 |
+
- Pivot detection (skip pivot-looking rows; optional sheet-level pivot/charthood skip)
|
| 10 |
+
- Optional LLM inference for unnamed columns and table titles (EXCEL_LLM_INFER=1)
|
| 11 |
+
- One DuckDB table per detected table block (Excel) or per file (CSV)
|
| 12 |
+
- Unified lineage tables for BOTH Excel and CSV:
|
| 13 |
+
__file_schema (file_name, sheet_name, table_name, column_ordinal, original_name, sql_column)
|
| 14 |
+
__file_tables (file_name, sheet_name, table_name, block_index, start_row, end_row,
|
| 15 |
+
header_rows_json, inferred_title, original_title_text)
|
| 16 |
+
|
| 17 |
+
Usage:
|
| 18 |
+
python source_to_duckdb.py --file /path/file.xlsx --duckdb /path/out.duckdb
|
| 19 |
+
python source_to_duckdb.py --file /path/file.csv --duckdb /path/out.duckdb
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import os
|
| 23 |
+
import re
|
| 24 |
+
import sys
|
| 25 |
+
import json
|
| 26 |
+
import hashlib
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
from typing import List, Tuple, Dict
|
| 29 |
+
|
| 30 |
+
from openpyxl import load_workbook
|
| 31 |
+
from openpyxl.worksheet.worksheet import Worksheet
|
| 32 |
+
|
| 33 |
+
# ------------------------- Small utilities -------------------------
|
| 34 |
+
|
| 35 |
+
def _nonempty(vals):
|
| 36 |
+
return [v for v in vals if v not in (None, "")]
|
| 37 |
+
|
| 38 |
+
def _is_numlike(x):
|
| 39 |
+
if isinstance(x, (int, float)):
|
| 40 |
+
return True
|
| 41 |
+
s = str(x).strip().replace(",", "")
|
| 42 |
+
if s.endswith("%"):
|
| 43 |
+
s = s[:-1]
|
| 44 |
+
if not s:
|
| 45 |
+
return False
|
| 46 |
+
if any(c.isalpha() for c in s):
|
| 47 |
+
return False
|
| 48 |
+
try:
|
| 49 |
+
float(s); return True
|
| 50 |
+
except: return False
|
| 51 |
+
|
| 52 |
+
def _is_year_token(x):
|
| 53 |
+
if isinstance(x, int) and 1800 <= x <= 2100: return True
|
| 54 |
+
s = str(x).strip()
|
| 55 |
+
return s.isdigit() and 1800 <= int(s) <= 2100
|
| 56 |
+
|
| 57 |
+
def sanitize_table_name(name: str) -> str:
    """Turn an arbitrary string into a safe SQL table identifier.

    Non-word characters become underscores, runs of underscores collapse,
    and names not starting with a letter get a "table_" prefix. Falls back
    to "sheet_data" when nothing usable remains.
    """
    cleaned = re.sub(r"_+", "_", re.sub(r"[^\w]", "_", str(name))).strip("_")
    if cleaned and not cleaned[0].isalpha():
        cleaned = "table_" + cleaned
    return cleaned if cleaned else "sheet_data"
|
| 62 |
+
|
| 63 |
+
def clean_col_name(s: str) -> str:
    """Normalise a raw header cell into a snake_case column name.

    Keeps word characters plus %, #, ‰ (which are spelled out as pct /
    count / permille); whitespace runs become single underscores; names
    starting with a digit get a "col_" prefix. Empty input yields
    "unnamed_column".
    """
    text = re.sub(r"[^\w\s%#‰]", "", str(s).strip())
    for symbol, word in (("%", " pct"), ("‰", " permille"), ("#", " count ")):
        text = text.replace(symbol, word)
    text = re.sub(r"\s+", "_", text)
    text = re.sub(r"_+", "_", text).strip("_")
    if text and text[0].isdigit():
        text = "col_" + text
    return text or "unnamed_column"
|
| 71 |
+
|
| 72 |
+
def ensure_unique(names):
    """De-duplicate names case-insensitively by appending _1, _2, ...

    The first occurrence keeps its original spelling; generated aliases
    are themselves reserved so they can never collide later.
    """
    taken = {}
    result = []
    for name in names:
        key = (name or "unnamed_column").lower()
        if key not in taken:
            taken[key] = 0
            result.append(name)
        else:
            suffix = taken[key] + 1
            while f"{name}_{suffix}".lower() in taken:
                suffix += 1
            taken[key] = suffix
            alias = f"{name}_{suffix}"
            result.append(alias)
            taken[alias.lower()] = 0
    return result
|
| 84 |
+
|
| 85 |
+
def compose_col(parts):
    """Join hierarchical header fragments into one column name.

    Blank fragments are skipped and immediate case-insensitive repeats
    (typical of merged header cells) are collapsed before delegating to
    clean_col_name. Returns "" when nothing usable remains.
    """
    pieces = []
    last = None
    for part in parts:
        if not part:
            continue
        token = str(part).strip()
        if last is not None and token.lower() == last.lower():
            continue
        pieces.append(token)
        last = token
    return clean_col_name("_".join(pieces)) if pieces else ""
|
| 94 |
+
|
| 95 |
+
# ------------------------- Heuristics & detection -------------------------
|
| 96 |
+
|
| 97 |
+
def is_probably_footer(cells):
    """Heuristic for trailing note/summary rows.

    A row with at most two populated cells that either starts with a
    note/source/summary keyword or is one long blob of text is treated
    as a footer rather than data.
    """
    filled = [(idx, val) for idx, val in enumerate(cells) if val not in (None, "")]
    if not filled:
        return False
    if len(filled) > 2:
        return False
    joined = " ".join(str(val) for _, val in filled).strip().lower()
    if joined.startswith(("note", "notes", "source", "summary", "disclaimer")):
        return True
    return len(joined) > 50
|
| 105 |
+
|
| 106 |
+
def is_probably_data(cells, num_cols):
    """Decide whether a row looks like table data (vs header/title/noise)."""
    filled = [c for c in cells if c not in (None, "")]
    if not filled:
        return False
    numeric = [c for c in filled if _is_numlike(c)]
    n_num = len(numeric)
    n_text = len(filled) - n_num
    fill_ratio = len(filled) / max(1, num_cols)
    # A row whose numbers are all year-like, sitting next to 2+ labels,
    # is more likely a header than data.
    if n_num >= 2 and n_text >= 2 and all(_is_year_token(c) for c in numeric):
        return False
    if n_num >= max(2, n_text):
        return True
    if n_num >= 2 and fill_ratio >= 0.6:
        return True
    # "Total" summary rows still belong to the data block.
    return str(filled[0]).strip().lower() in ("total", "totals", "grand total")
|
| 119 |
+
|
| 120 |
+
# Markers identifying Excel pivot-table chrome. Stored lowercase because
# is_pivot_marker_string lowercases its input before the lookup; the previous
# set also contained "Σ values", which could never match after .lower()
# ("Σ".lower() == "σ") and duplicated the existing "σ values" entry.
PIVOT_MARKERS = {"row labels","column labels","values","grand total","report filter","filters","∑ values","σ values"}

def is_pivot_marker_string(s: str) -> bool:
    """Return True if *s* looks like pivot-table chrome rather than data.

    Matches known marker labels, aggregate prefixes ("Sum of ..."), and
    total rows ("... Total" / "Total ..."); comparison is case-insensitive.
    """
    if not s:
        return False
    t = str(s).strip().lower()
    if t in PIVOT_MARKERS:
        return True
    if t.startswith(("sum of ", "count of ", "avg of ", "average of ")):
        return True
    if t.endswith(" total") or t.startswith("total "):
        return True
    return False
|
| 128 |
+
|
| 129 |
+
def is_pivot_row(cells) -> bool:
    """True when a row carries pivot chrome: any marker label, or at least
    two cells starting with an aggregate prefix like "Sum of"."""
    labels = [str(c).strip() for c in cells if c not in (None, "")]
    if not labels:
        return False
    for label in labels:
        if is_pivot_marker_string(label):
            return True
    prefixes = ("sum of", "count of", "avg of", "average of", "min of", "max of")
    hits = sum(1 for label in labels if label.lower().startswith(prefixes))
    return hits >= 2
|
| 135 |
+
|
| 136 |
+
def is_pivot_or_chart_sheet(ws: Worksheet) -> bool:
    """Heuristically decide whether a worksheet is pivot/chart chrome and
    should be skipped entirely by table detection.

    Checks, in order: embedded chart objects, pivot definitions, a bounded
    scan for pivot-style rows, and finally name hints in the sheet title.
    """
    # _charts is an openpyxl-private attribute; guard in case it is absent
    # or raises on some versions.
    try:
        if getattr(ws, "_charts", None): return True
    except Exception: pass
    # _pivots likewise is private; truthy means the sheet hosts a pivot table.
    if hasattr(ws, "_pivots") and getattr(ws, "_pivots"): return True
    # Bounded scan (top-left 40x20 window) so huge sheets stay cheap.
    scan_rows = min(ws.max_row, 40); scan_cols = min(ws.max_column, 20)
    pivotish = 0
    for r in range(1, scan_rows+1):
        row = [ws.cell(r,c).value for c in range(1, scan_cols+1)]
        if is_pivot_row(row):
            pivotish += 1
            # Two pivot-looking rows is enough evidence to skip the sheet.
            if pivotish >= 2: return True
    # Last resort: sheet-name hints.
    name = (ws.title or "").lower()
    if any(k in name for k in ("pivot","dashboard","chart","charts")): return True
    return False
|
| 151 |
+
|
| 152 |
+
def _samples_for_column(rows, col_idx, max_items=20):
|
| 153 |
+
vals = []
|
| 154 |
+
for row in rows:
|
| 155 |
+
if col_idx < len(row):
|
| 156 |
+
v = row[col_idx]
|
| 157 |
+
if v not in (None, ""): vals.append(v)
|
| 158 |
+
if len(vals) >= max_items: break
|
| 159 |
+
return vals
|
| 160 |
+
|
| 161 |
+
def _heuristic_infer_col_name(samples):
    """Guess a generic name for an unnamed column from its sample values.

    Returns one of "year", "percentage", "count", "value", "question",
    "range", "category", or None when no pattern dominates.
    """
    if not samples:
        return None
    # Mostly year-like tokens -> "year".
    if sum(1 for v in samples if _is_year_token(v)) >= max(2, int(0.8 * len(samples))):
        return "year"
    # Percent detection: explicit "%" counts fully; a bare number in
    # [0, 100] counts half (original `0<=f<=1.0 or 0<=f<=100` collapsed to
    # the equivalent single range).
    pct_hits = 0
    for v in samples:
        s = str(v).strip()
        if s.endswith("%"):
            pct_hits += 1
        else:
            try:
                f = float(s.replace(",", ""))
                if 0 <= f <= 100:
                    pct_hits += 0.5
            except ValueError:  # was a bare except; float(str) raises ValueError
                pass
    if pct_hits >= max(2, int(0.7 * len(samples))):
        return "percentage"
    # Mostly numeric -> integer-heavy means "count", otherwise "value".
    if sum(1 for v in samples if _is_numlike(v)) >= max(3, int(0.7 * len(samples))):
        intish = 0
        for v in samples:
            try:
                f = float(str(v).replace(",", ""))
                if f == int(f):
                    intish += 1
            # OverflowError guards int(float("inf")); the original bare
            # except silently absorbed it too.
            except (ValueError, OverflowError):
                pass
        if intish >= max(2, int(0.6 * len(samples))):
            return "count"
        return "value"
    uniq = {str(v).strip().lower() for v in samples}
    # Few distinct, very long strings -> likely survey question text.
    if len(uniq) <= 3 and max(len(str(v)) for v in samples) >= 30:
        return "question"
    # Digits plus a dash/en-dash in most samples -> a range like "10-20".
    if sum(1 for v in samples if re.search(r"\d", str(v)) and ("-" in str(v) or "–" in str(v))) >= max(2, int(0.6 * len(samples))):
        return "range"
    # Low cardinality -> categorical.
    if len(uniq) < max(5, int(0.5 * len(samples))):
        return "category"
    return None
|
| 187 |
+
|
| 188 |
+
def used_bounds(ws: Worksheet) -> Tuple[int,int,int,int]:
    """Return (min_row, max_row, min_col, max_col) of cells with non-blank
    values, scanning the whole sheet.

    For an empty sheet returns the sentinel (1, 0, 1, 0); callers detect
    emptiness via max_row < min_row.
    """
    min_row, max_row, min_col, max_col = None, 0, None, 0
    for r in ws.iter_rows():
        for c in r:
            v = c.value
            # Whitespace-only strings count as blank.
            if v is not None and str(v).strip() != "":
                if min_row is None or c.row < min_row: min_row = c.row
                if c.row > max_row: max_row = c.row
                if min_col is None or c.column < min_col: min_col = c.column
                if c.column > max_col: max_col = c.column
    if min_row is None: return 1,0,1,0
    return min_row, max_row, min_col, max_col
|
| 200 |
+
|
| 201 |
+
def build_merged_master_map(ws: Worksheet):
    """Map every (row, col) inside a merged range to the range's master
    (top-left) cell coordinate.

    Only the master cell holds the value in openpyxl; this map lets the
    grid builder replicate that value across the whole merged area.
    """
    mapping = {}
    for mr in ws.merged_cells.ranges:
        min_col, min_row, max_col, max_row = mr.min_col, mr.min_row, mr.max_col, mr.max_row
        master = (min_row, min_col)
        for r in range(min_row, max_row+1):
            for c in range(min_col, max_col+1):
                mapping[(r,c)] = master
    return mapping
|
| 210 |
+
|
| 211 |
+
def build_value_grid(ws: Worksheet, min_row: int, max_row: int, min_col: int, max_col: int):
    """Materialise the used range as a 2-D list of cell values.

    Cells inside a merged range receive the master (top-left) cell's value,
    so merged headers/labels are replicated across all covered cells.
    Grid indices are zero-based offsets from (min_row, min_col).
    """
    merged_map = build_merged_master_map(ws)
    nrows = max_row - min_row + 1; ncols = max_col - min_col + 1
    grid = [[None]*ncols for _ in range(nrows)]
    for r in range(min_row, max_row+1):
        rr = r - min_row
        for c in range(min_col, max_col+1):
            cc = c - min_col
            master = merged_map.get((r,c))
            if master:
                # Merged cell: read the value from the range's master cell.
                mr, mc = master; grid[rr][cc] = ws.cell(mr, mc).value
            else:
                grid[rr][cc] = ws.cell(r, c).value
    return grid
|
| 225 |
+
|
| 226 |
+
def row_vals_from_grid(grid, r, min_row):
    """Return the value row for absolute worksheet row *r* from a grid
    whose first entry corresponds to *min_row*."""
    offset = r - min_row
    return grid[offset]
|
| 228 |
+
|
| 229 |
+
def is_empty_row_vals(vals):
    """True when every cell is None or "" (vacuously true for an empty row)."""
    return all(v in (None, "") for v in vals)
|
| 231 |
+
|
| 232 |
+
def is_title_like_row_vals(vals, total_cols=20):
    """Heuristic: does this row look like a table title / caption rather
    than headers or data?

    Triggers on: a single populated cell; sparse rows of long strings; one
    value replicated across the row (merged-cell titles); or a known
    title phrase.
    """
    filled = [v for v in vals if v not in (None, "")]  # inline of _nonempty()
    if not filled:
        return False
    if len(filled) == 1:
        return True
    coverage = len(filled) / max(1, total_cols)
    if coverage <= 0.2 and all(isinstance(v, str) and len(str(v)) > 20 for v in filled):
        return True
    distinct = {str(v).strip().lower() for v in filled}
    if len(distinct) == 1:
        return True
    known_titles = {"local currency unit per us dollar","exchange rate","average annual exchange rate"}
    return any(str(v).strip().lower() in known_titles for v in filled)
|
| 243 |
+
|
| 244 |
+
def is_header_candidate_row_vals(vals, total_cols=20):
    """Heuristic: could this row be a (possibly multi-level) header row?"""
    filled = _nonempty(vals)
    if not filled:
        return False
    # Title rows are handled separately and never double as headers.
    if is_title_like_row_vals(vals, total_cols):
        return False
    n = len(filled)
    num_count = sum(1 for v in filled if _is_numlike(v))
    year_count = sum(1 for v in filled if _is_year_token(v))
    has_text = any(not _is_numlike(v) for v in filled)
    if year_count >= 2 and has_text:
        return True
    if num_count >= max(2, n - num_count):
        # Mostly numeric: only a header if those numbers are mostly years.
        return year_count >= max(2, int(0.6 * n))
    labels = {str(v).strip().lower() for v in filled if not _is_numlike(v)}
    return n >= 2 or len(labels) >= 2
|
| 255 |
+
|
| 256 |
+
def detect_tables_fast(ws: Worksheet, grid, min_row, max_row, min_col, max_col):
    """Scan the value grid and return detected table blocks.

    Each block is a dict with keys: header_rows (absolute row numbers,
    possibly empty), data_start, data_end (inclusive absolute rows), and
    title_text (first cell of a title-like row just above the headers, or
    None). Blocks are separated by at least one completely empty row.
    Pivot/chart-like sheets yield no blocks.
    """
    blocks = []
    if is_pivot_or_chart_sheet(ws): return blocks
    total_cols = max_col - min_col + 1
    r = min_row
    while r <= max_row:
        vals = row_vals_from_grid(grid, r, min_row)
        # Skip blanks, titles and pivot chrome until a data-looking row is found.
        if is_empty_row_vals(vals) or is_title_like_row_vals(vals, total_cols) or is_pivot_row(vals):
            r += 1; continue
        if not is_probably_data(vals, total_cols):
            r += 1; continue
        data_start = r
        # Walk upward from the first data row to find contiguous header rows.
        header_rows = []
        up = data_start - 1
        while up >= min_row:
            vup = row_vals_from_grid(grid, up, min_row)
            if is_empty_row_vals(vup): break
            # Titles/pivot rows above data don't stop the search; keep climbing.
            if is_title_like_row_vals(vup, total_cols) or is_pivot_row(vup):
                up -= 1; continue
            if is_header_candidate_row_vals(vup, total_cols):
                # Found the bottom header row; extend upward through any
                # stacked (multi-level) header rows.
                header_rows = []
                hdr_row = up
                while hdr_row >= min_row:
                    hdr_vals = row_vals_from_grid(grid, hdr_row, min_row)
                    if is_empty_row_vals(hdr_vals): break
                    if is_header_candidate_row_vals(hdr_vals, total_cols):
                        header_rows.insert(0, hdr_row); hdr_row -= 1
                    else: break
            break
        # Extend downward through contiguous data rows; stop on a blank row,
        # footer, or pivot chrome.
        data_end = data_start
        rr = data_start + 1
        while rr <= max_row:
            v = row_vals_from_grid(grid, rr, min_row)
            if is_probably_footer(v) or is_pivot_row(v): break
            if is_empty_row_vals(v): break
            if is_probably_data(v, total_cols) or is_header_candidate_row_vals(v, total_cols):
                data_end = rr
            rr += 1
        # Look up to three rows above the top header for a title-like row.
        title_text = None
        if header_rows:
            top = header_rows[0]
            for tr in range(max(min_row, top-3), top):
                tv = row_vals_from_grid(grid, tr, min_row)
                if is_title_like_row_vals(tv, total_cols):
                    first = next((str(x).strip() for x in tv if x not in (None,"")), None)
                    if first: title_text = first
                    break
        # Keep the block if it has headers or spans at least two data rows.
        if (header_rows or data_end - data_start >= 1) and data_start <= data_end:
            blocks.append({"header_rows": header_rows, "data_start": data_start, "data_end": data_end, "title_text": title_text})
        # Resume scanning after this block, skipping separator blank rows.
        r = data_end + 1
        while r <= max_row and is_empty_row_vals(row_vals_from_grid(grid, r, min_row)):
            r += 1
    return blocks
|
| 309 |
+
|
| 310 |
+
def expand_headers_from_grid(grid, header_rows, min_row, min_col, eff_max_col):
    """Materialise the header rows as a matrix of stripped strings.

    Blank header cells (other than the first column) inherit the nearest
    value to their left, mimicking merged-cell parent context.
    """
    if not header_rows:
        return []
    mat = []
    for r in header_rows:
        source = grid[r - min_row]  # inline of row_vals_from_grid()
        row = ["" if source[c] is None else str(source[c]).strip() for c in range(eff_max_col)]
        fill = ""
        for i, cell in enumerate(row):
            if cell == "" and i > 0:
                row[i] = fill
            else:
                fill = cell
        mat.append(row)
    return mat
|
| 322 |
+
|
| 323 |
+
def sheet_block_to_df_fast(ws, grid, min_row, max_row, min_col, max_col, header_rows, data_start, data_end):
    """Convert one detected block into (DataFrame, header_matrix, kept_cols).

    Rescues missing headers from the row just above (or at) data_start,
    trims trailing unused columns, drops fully-empty columns, stops at a
    footer row, and renames "unnamed_column*" columns via sample-based
    heuristics. Returns an empty DataFrame when no usable headers/data.
    """
    import pandas as pd
    total_cols = max_col - min_col + 1
    # Rescue 1: no headers detected, but the row above the data looks like one.
    if (not header_rows) and data_start and data_start > min_row:
        prev = row_vals_from_grid(grid, data_start - 1, min_row)
        if is_header_candidate_row_vals(prev, total_cols):
            header_rows = [data_start - 1]
    # Rescue 2: the first "data" row is actually the header and the next row is data.
    if (not header_rows) and data_start:
        cur = row_vals_from_grid(grid, data_start, min_row)
        nxt = row_vals_from_grid(grid, data_start + 1, min_row) if data_start + 1 <= max_row else []
        if is_header_candidate_row_vals(cur, total_cols) and is_probably_data(nxt, total_cols):
            header_rows = [data_start]; data_start += 1
    if not header_rows or data_start is None or data_end is None or data_end < data_start:
        import pandas as _pd
        return _pd.DataFrame(), [], []
    def used_upto_col():
        # Rightmost column actually populated in the header or data rows.
        maxc = 0
        for r in list(header_rows) + list(range(data_start, data_end+1)):
            vals = row_vals_from_grid(grid, r, min_row)
            for c_off in range(total_cols):
                v = vals[c_off]
                if v not in (None, ""): maxc = max(maxc, c_off+1)
        return maxc or total_cols
    eff_max_col = used_upto_col()
    header_mat = expand_headers_from_grid(grid, header_rows, min_row, min_col, eff_max_col)
    def is_title_level(values):
        # A header level that is sparse long text or one dominant repeated
        # label is treated as a title band and excluded from column names.
        total = len(values)
        filled = [str(v).strip() for v in values if v not in (None, "")]
        if total == 0: return False
        coverage = len(filled) / total
        if coverage <= 0.2 and len(filled) <= 2: return True
        if filled:
            uniq = {v.lower() for v in filled}
            if len(uniq) == 1:
                label = next(iter(uniq))
                dom = sum(1 for v in values if isinstance(v,str) and v.strip().lower() == label)
                if dom / total >= 0.6: return True
        return False
    usable_levels = [i for i in range(len(header_mat)) if not is_title_level(header_mat[i])]
    # If every level looks like a title, keep at least the bottom one.
    if not usable_levels and header_mat: usable_levels = [len(header_mat) - 1]
    # Compose column names from the usable contiguous span of header levels.
    cols = []
    for c_off in range(eff_max_col):
        parts = [header_mat[l][c_off] for l in range(usable_levels[0], usable_levels[-1]+1)] if usable_levels else []
        cols.append(compose_col(parts))
    cols = ensure_unique([clean_col_name(x) for x in cols])
    data_rows = []
    for r in range(data_start, data_end+1):
        vals = row_vals_from_grid(grid, r, min_row)
        row = [vals[c_off] for c_off in range(eff_max_col)]
        # A footer row terminates the data block early.
        if is_probably_footer(row): break
        data_rows.append(row[:len(cols)])
    if not data_rows:
        import pandas as _pd
        return _pd.DataFrame(columns=cols), header_mat, cols
    # Drop columns that are empty across every data row.
    keep_mask = [any(row[i] not in (None, "") for row in data_rows) for i in range(len(cols))]
    kept_cols = [c for c,k in zip(cols, keep_mask) if k]
    trimmed_rows = [[v for v,k in zip(row, keep_mask) if k] for row in data_rows]
    import pandas as pd
    df = pd.DataFrame(trimmed_rows, columns=kept_cols)
    # Sample-based renaming for columns that ended up "unnamed_column*".
    if any(str(c).startswith("unnamed_column") for c in df.columns):
        new_names = list(df.columns)
        for idx, name in enumerate(list(df.columns)):
            if not str(name).startswith("unnamed_column"): continue
            samples = _samples_for_column(trimmed_rows, idx, max_items=20)
            guess = _heuristic_infer_col_name(samples)
            if guess: new_names[idx] = clean_col_name(guess)
        df.columns = ensure_unique([clean_col_name(x) for x in new_names])
    # NOTE(review): kept_cols is returned as the pre-rename column list;
    # df.columns may differ after the heuristic renaming above.
    return df, header_mat, kept_cols
|
| 391 |
+
|
| 392 |
+
# ------------------------- Optional LLM title inference -------------------------
|
| 393 |
+
|
| 394 |
+
def _llm_infer_table_title(header_mat, sample_rows, sheet_name):
    """Optionally ask an OpenAI model for a short table title.

    Gated by EXCEL_LLM_INFER=1 and OPENAI_API_KEY; returns None on any
    failure or non-JSON reply so callers fall back to the heuristic title.
    """
    if os.environ.get("EXCEL_LLM_INFER","0") != "1": return None
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key: return None
    # Flatten up to 10 composed header paths for the prompt.
    headers = []
    if header_mat:
        for c in range(len(header_mat[0])):
            parts = [header_mat[l][c] for l in range(len(header_mat))]
            parts = [p for p in parts if p]
            if parts: headers.append("_".join(parts))
    headers = headers[:10]
    # At most 5 sample rows, truncated to 6 cells each.
    samples = [[str(x) for x in r[:6]] for r in sample_rows[:5]]
    prompt = (
        "Propose a short, human-readable title for a data table.\n"
        "Keep it 3-6 words, Title Case, no punctuation at the end.\n"
        f"Sheet: {sheet_name}\nHeaders: {headers}\nRow samples: {samples}\n"
        "Answer with JSON: {\"title\": \"...\"}"
    )
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model=os.environ.get("OPENAI_MODEL","gpt-4o-mini"),
            messages=[{"role":"user","content":prompt}], temperature=0.2,
        )
        text = resp.choices[0].message.content.strip()
    except Exception:
        # SDK missing / network error / API failure: degrade silently.
        return None
    import re as _re, json as _json
    # Extract the first {...} span from the reply.
    # NOTE(review): mixes the local _re alias with the module-level re for
    # the flag (re.S) -- works, but inconsistent.
    m = _re.search(r"\{.*\}", text, re.S)
    if not m: return None
    try:
        obj = _json.loads(m.group(0)); title = obj.get("title","").strip()
        return title or None
    except Exception: return None
|
| 429 |
+
|
| 430 |
+
def _heuristic_table_title(header_mat, sheet_name, idx):
|
| 431 |
+
if header_mat:
|
| 432 |
+
parts = []
|
| 433 |
+
levels = len(header_mat)
|
| 434 |
+
cols = len(header_mat[0]) if header_mat else 0
|
| 435 |
+
for c in range(min(6, cols)):
|
| 436 |
+
colparts = [header_mat[l][c] for l in range(min(levels, 2)) if header_mat[l][c]]
|
| 437 |
+
if colparts: parts.extend(colparts)
|
| 438 |
+
if parts:
|
| 439 |
+
base = " ".join(dict.fromkeys(parts))
|
| 440 |
+
return base[:60]
|
| 441 |
+
return f"{sheet_name} Table {idx}"
|
| 442 |
+
|
| 443 |
+
def infer_table_title(header_mat, sample_rows, sheet_name, idx):
    """Return the LLM-inferred title when available, else the heuristic one."""
    fallback = _heuristic_table_title(header_mat, sheet_name, idx)
    return _llm_infer_table_title(header_mat, sample_rows, sheet_name) or fallback
|
| 447 |
+
|
| 448 |
+
# ------------------------- Unified lineage helpers -------------------------
|
| 449 |
+
|
| 450 |
+
# Names of the unified lineage tables shared by Excel and CSV ingestion.
FILE_SCHEMA_TABLE = "__file_schema"
FILE_TABLES_TABLE = "__file_tables"

def ensure_lineage_tables(con):
    """Create the lineage tables on *con* if they don't exist (idempotent).

    __file_schema maps each ingested column back to its original header;
    __file_tables records where each detected block came from in the file.
    """
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS {FILE_SCHEMA_TABLE} (
            file_name TEXT,
            sheet_name TEXT,
            table_name TEXT,
            column_ordinal INTEGER,
            original_name TEXT,
            sql_column TEXT
        )
    """)
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS {FILE_TABLES_TABLE} (
            file_name TEXT,
            sheet_name TEXT,
            table_name TEXT,
            block_index INTEGER,
            start_row INTEGER,
            end_row INTEGER,
            header_rows_json TEXT,
            inferred_title TEXT,
            original_title_text TEXT
        )
    """)
|
| 477 |
+
|
| 478 |
+
def record_table_schema(con, file_name, sheet_name, table_name, columns):
    """
    Replace the __file_schema lineage rows for one ingested table.

    columns: list of tuples (column_ordinal, original_name, sql_column)
    sheet_name may be None (CSV sources), which requires the IS NULL branch.
    """
    ensure_lineage_tables(con)
    # DuckDB doesn't support `IS ?` with NULL; branch the delete
    if sheet_name is None:
        con.execute(
            f"DELETE FROM {FILE_SCHEMA_TABLE} WHERE file_name = ? AND sheet_name IS NULL AND table_name = ?",
            [file_name, table_name],
        )
    else:
        con.execute(
            f"DELETE FROM {FILE_SCHEMA_TABLE} WHERE file_name = ? AND sheet_name = ? AND table_name = ?",
            [file_name, sheet_name, table_name],
        )
    # Re-insert the current column mapping in one batch.
    con.executemany(
        f"INSERT INTO {FILE_SCHEMA_TABLE} (file_name, sheet_name, table_name, column_ordinal, original_name, sql_column) VALUES (?, ?, ?, ?, ?, ?)",
        [(file_name, sheet_name, table_name, i, orig, sql) for (i, orig, sql) in columns],
    )
|
| 498 |
+
|
| 499 |
+
def record_table_block(con, file_name, sheet_name, table_name, block_index, start_row, end_row, header_rows_json, inferred_title, original_title_text):
    """Replace the __file_tables lineage row for one detected block.

    sheet_name may be None (CSV sources); block_index defaults to 0 when
    None. Row numbers are stored as ints, or NULL when unknown.
    """
    ensure_lineage_tables(con)
    # DuckDB doesn't support `IS ?` with NULL; branch the delete
    if sheet_name is None:
        con.execute(
            f"DELETE FROM {FILE_TABLES_TABLE} WHERE file_name = ? AND sheet_name IS NULL AND table_name = ? AND block_index = ?",
            [file_name, table_name, int(block_index) if block_index is not None else 0],
        )
    else:
        con.execute(
            f"DELETE FROM {FILE_TABLES_TABLE} WHERE file_name = ? AND sheet_name = ? AND table_name = ? AND block_index = ?",
            [file_name, sheet_name, table_name, int(block_index) if block_index is not None else 0],
        )
    con.execute(
        f"""INSERT INTO {FILE_TABLES_TABLE}
            (file_name, sheet_name, table_name, block_index, start_row, end_row, header_rows_json, inferred_title, original_title_text)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        [
            file_name, sheet_name, table_name,
            int(block_index) if block_index is not None else 0,
            int(start_row) if start_row is not None else None,
            int(end_row) if end_row is not None else None,
            header_rows_json, inferred_title, original_title_text
        ]
    )
|
| 524 |
+
|
| 525 |
+
# --- block coalescing to avoid nested/overlapping duplicates ---
|
| 526 |
+
def coalesce_blocks(blocks: List[Dict]) -> List[Dict]:
    """Keep only maximal blocks: drop any block whose data-row range is
    fully contained in a block already kept. Output is sorted by
    (data_start, data_end)."""
    if not blocks:
        return blocks
    kept: List[Dict] = []
    for blk in sorted(blocks, key=lambda b: (b["data_start"], b["data_end"])):
        contained = any(
            k["data_start"] <= blk["data_start"] and blk["data_end"] <= k["data_end"]
            for k in kept
        )
        if not contained:
            kept.append(blk)
    return kept
|
| 536 |
+
|
| 537 |
+
# ------------------------- Persistence: Excel -------------------------
|
| 538 |
+
|
| 539 |
+
def persist(excel_path, duckdb_path):
    """Ingest every detected table block of an Excel workbook into DuckDB.

    For each worksheet (chartsheets and pivot/chart-like sheets are skipped),
    table blocks are detected, converted to DataFrames, written as DuckDB
    tables, and lineage is recorded via record_table_schema/record_table_block.

    Args:
        excel_path: Path to the .xlsx/.xlsm/.xls workbook.
        duckdb_path: Path to the DuckDB database file (created if missing).

    Side effects: creates/replaces tables in the DuckDB file, writes lineage
    rows, prints progress, and calls sys.exit(1) on unrecoverable errors.
    """
    try:
        from duckdb import connect
    except ImportError:
        print("Error: DuckDB library not installed. Install with: pip install duckdb"); sys.exit(1)
    try:
        # data_only=True reads cached formula results instead of formulas.
        wb = load_workbook(excel_path, data_only=True)
    except FileNotFoundError:
        print(f"Error: Excel file not found: {excel_path}"); sys.exit(1)
    except Exception as e:
        print(f"Error loading Excel file: {e}"); sys.exit(1)

    file_name = Path(excel_path).name
    db_path = Path(duckdb_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    new_db = not db_path.exists()
    con = connect(str(db_path))
    if new_db: print(f"Created new DuckDB at: {db_path}")

    # Ensure unified lineage tables exist
    ensure_lineage_tables(con)

    # used_names guarantees table-name uniqueness across the whole workbook.
    used_names = set(); total_tables = 0; total_rows = 0

    for sheet in wb.sheetnames:
        ws = wb[sheet]
        try:
            # Chartsheets are not Worksheet instances and carry no cell data.
            if not isinstance(ws, Worksheet):
                print(f"Skipping chartsheet: {sheet}"); continue
        except Exception: pass
        if is_pivot_or_chart_sheet(ws):
            print(f"Skipping pivot/chart-like sheet: {sheet}"); continue

        min_row, max_row, min_col, max_col = used_bounds(ws)
        if max_row < min_row: continue  # sheet has no used cells
        grid = build_value_grid(ws, min_row, max_row, min_col, max_col)

        blocks = detect_tables_fast(ws, grid, min_row, max_row, min_col, max_col)
        blocks = coalesce_blocks(blocks)
        if not blocks: continue

        # per-sheet content hash set to avoid identical duplicate content
        seen_content = set()

        for idx, blk in enumerate(blocks, start=1):
            df, header_mat, kept_cols = sheet_block_to_df_fast(
                ws, grid, min_row, max_row, min_col, max_col,
                blk["header_rows"], blk["data_start"], blk["data_end"]
            )
            if df.empty: continue

            # Content hash (stable CSV representation)
            csv_bytes = df.to_csv(index=False).encode("utf-8")
            h = hashlib.sha256(csv_bytes).hexdigest()
            if h in seen_content:
                print(f"Skipping duplicate content on sheet {sheet} (block {idx})")
                continue
            seen_content.add(h)

            # Build original composite header names for lineage mapping:
            # multi-row headers are joined top-to-bottom with underscores.
            original_cols = []
            if header_mat:
                levels = len(header_mat)
                cols = len(header_mat[0]) if header_mat else 0
                for c in range(cols):
                    parts = [header_mat[l][c] for l in range(levels)]
                    original_cols.append("_".join([p for p in parts if p]))
            else:
                original_cols = list(df.columns)
            # Pad in case the header matrix has fewer columns than the frame.
            while len(original_cols) < len(df.columns): original_cols.append("unnamed")

            title_orig = blk.get("title_text")
            title = title_orig or infer_table_title(header_mat, df.values.tolist(), sheet, idx)
            candidate = title if title else f"{sheet} Table {idx}"
            table = ensure_unique_table_name(used_names, candidate)

            # Create/replace table via a registered temporary DataFrame view.
            con.execute(f'DROP TABLE IF EXISTS "{table}"')
            con.register(f"{table}_temp", df)
            con.execute(f'CREATE TABLE "{table}" AS SELECT * FROM {table}_temp')
            con.unregister(f"{table}_temp")

            # Record lineage (schema + block)
            schema_rows = []
            for cidx, (orig, sqlc) in enumerate(zip(original_cols[:len(df.columns)], df.columns), start=1):
                schema_rows.append((cidx, str(orig), str(sqlc)))
            record_table_schema(
                con,
                file_name=file_name,
                sheet_name=sheet,
                table_name=table,
                columns=schema_rows,
            )
            record_table_block(
                con,
                file_name=file_name,
                sheet_name=sheet,
                table_name=table,
                block_index=idx,
                start_row=int(blk["data_start"]),
                end_row=int(blk["data_end"]),
                header_rows_json=json.dumps(blk["header_rows"]),
                inferred_title=title if title else None,
                original_title_text=title_orig if title_orig else None,
            )

            print(f"Created table {table} from sheet {sheet} with {len(df)} rows and {len(df.columns)} columns.")
            total_tables += 1; total_rows += len(df)

    con.close()
    print(f"""\n✅ Completed.
- Created {total_tables} tables with {total_rows} total rows
- Column lineage: {FILE_SCHEMA_TABLE}
- Block metadata: {FILE_TABLES_TABLE}""")
|
| 653 |
+
|
| 654 |
+
# ------------------------- Persistence: CSV -------------------------
|
| 655 |
+
|
| 656 |
+
def persist_csv(csv_path, duckdb_path):
    """
    Ingest a single CSV file into DuckDB AND write lineage, aligned with Excel.
    - First row is headers.
    - One table named from the CSV file name.
    - Cleans headers and ensures uniqueness.
    - Records lineage in __file_schema and __file_tables using the unified schema (with file_name).

    Args:
        csv_path: Path to the .csv file (sys.exit(1) if missing).
        duckdb_path: Path to the DuckDB database file (created if missing).
    """
    import pandas as pd
    from duckdb import connect

    csv_path = Path(csv_path)
    if not csv_path.exists():
        print(f"Error: CSV file not found: {csv_path}")
        sys.exit(1)

    # Keep original header names for lineage before cleaning.
    # utf-8-sig transparently strips an Excel/Windows BOM. If the bytes are
    # not valid UTF-8 at all, retry with latin-1 (accepts any byte sequence);
    # retrying with pandas' default encoding would fail identically.
    try:
        df_raw = pd.read_csv(csv_path, header=0, encoding="utf-8-sig")
    except UnicodeDecodeError:
        df_raw = pd.read_csv(csv_path, header=0, encoding="latin-1")

    original_headers = list(df_raw.columns)

    # Clean/normalize column names
    def _clean_hdr(s):
        # Collapse internal whitespace, then apply the shared cleaner.
        s = str(s) if s is not None else ""
        s = s.strip()
        s = re.sub(r"\s+", " ", s)
        return clean_col_name(s)

    cleaned_cols = ensure_unique([_clean_hdr(c) for c in original_headers])
    df = df_raw.copy()
    df.columns = cleaned_cols

    # Compute table name from file name (sanitized for SQL use)
    table = sanitize_table_name(csv_path.stem)

    # Open / create DuckDB
    db_path = Path(duckdb_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    new_db = not db_path.exists()

    con = connect(str(db_path))
    if new_db:
        print(f"Created new DuckDB at: {db_path}")

    # Ensure unified lineage tables (with file_name) exist
    ensure_lineage_tables(con)

    # Create/replace the data table. The registered view name is quoted in the
    # SELECT as well, so a table name that requires quoting cannot break it.
    con.execute(f'DROP TABLE IF EXISTS "{table}"')
    con.register(f"{table}_temp_df", df)
    con.execute(f'CREATE TABLE "{table}" AS SELECT * FROM "{table}_temp_df"')
    con.unregister(f"{table}_temp_df")

    # Write lineage
    file_name = csv_path.name
    sheet_name = None        # CSV has no sheet
    block_index = 1          # single block/table for CSV
    start_row = 2            # header is row 1, data starts at 2
    end_row = len(df) + 1    # header + data rows
    header_rows_json = "[1]" # header row index list as JSON
    inferred_title = None
    original_title_text = None

    # Map original->sql columns (1-based ordinal for lineage)
    schema_rows = []
    for i, (orig, sql) in enumerate(zip(original_headers, cleaned_cols), start=1):
        schema_rows.append((i, str(orig), str(sql)))

    record_table_schema(
        con,
        file_name=file_name,
        sheet_name=sheet_name,
        table_name=table,
        columns=schema_rows
    )
    record_table_block(
        con,
        file_name=file_name,
        sheet_name=sheet_name,
        table_name=table,
        block_index=block_index,
        start_row=start_row,
        end_row=end_row,
        header_rows_json=header_rows_json,
        inferred_title=inferred_title,
        original_title_text=original_title_text
    )

    print(f'Created table {table} from CSV "{csv_path.name}" with {len(df)} rows and {len(df.columns)} columns.')
    con.close()
|
| 749 |
+
|
| 750 |
+
# ------------------------- CLI -------------------------
|
| 751 |
+
|
| 752 |
+
def ensure_unique_table_name(existing: set, name: str) -> str:
    """Return a table name unique within *existing*, registering it there.

    The name is sanitized first (falling back to "table" when sanitization
    yields nothing); collisions are resolved with a numeric suffix starting
    at 2 (name_2, name_3, ...). The chosen name is added to *existing*.
    """
    candidate = sanitize_table_name(name) or "table"
    if candidate in existing:
        suffix = 2
        while f"{candidate}_{suffix}" in existing:
            suffix += 1
        candidate = f"{candidate}_{suffix}"
    existing.add(candidate)
    return candidate
|
| 759 |
+
|
| 760 |
+
def main():
    """CLI entry point: ingest one Excel or CSV file into a DuckDB database."""
    import argparse
    parser = argparse.ArgumentParser(description="Excel/CSV → DuckDB (unified --file + lineage).")
    parser.add_argument("--file", required=True, help="Path to .xlsx/.xlsm/.xls or .csv")
    parser.add_argument("--duckdb", required=True, help="Path to DuckDB file")
    args = parser.parse_args()

    # Fail fast on a missing input file before touching the database.
    if not os.path.exists(args.file):
        print(f"Error: file not found: {args.file}")
        sys.exit(1)

    # Dispatch on the (case-insensitive) file extension.
    suffix = Path(args.file).suffix.lower()
    if suffix in (".xlsx", ".xlsm", ".xls"):
        persist(args.file, args.duckdb)
    elif suffix == ".csv":
        persist_csv(args.file, args.duckdb)
    else:
        print("Error: unsupported file type. Use .xlsx/.xlsm/.xls or .csv")
        sys.exit(2)

if __name__ == "__main__":
    main()
|
streamlit_app.py
CHANGED
|
@@ -21,7 +21,7 @@ st.set_page_config(page_title="Excel → Dataset", page_icon="📊", layout="wid
|
|
| 21 |
PRIMARY_DIR = Path(__file__).parent.resolve()
|
| 22 |
UPLOAD_DIR = PRIMARY_DIR / "uploads"
|
| 23 |
DB_DIR = PRIMARY_DIR / "dbs"
|
| 24 |
-
SCRIPT_PATH = PRIMARY_DIR / "
|
| 25 |
|
| 26 |
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
| 27 |
DB_DIR.mkdir(parents=True, exist_ok=True)
|
|
@@ -82,7 +82,18 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
|
|
| 82 |
if names:
|
| 83 |
return names
|
| 84 |
except Exception:
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
# 2) duckdb_tables()
|
| 88 |
try:
|
|
@@ -103,11 +114,22 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
|
|
| 103 |
if names:
|
| 104 |
return names
|
| 105 |
except Exception:
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
# 3) Fallback to metadata table
|
| 109 |
try:
|
| 110 |
-
meta = con.execute("SELECT DISTINCT table_name FROM
|
| 111 |
names = []
|
| 112 |
for (t,) in meta:
|
| 113 |
try:
|
|
@@ -117,7 +139,19 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
|
|
| 117 |
continue
|
| 118 |
return names
|
| 119 |
except Exception:
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
def get_columns(con: duckdb.DuckDBPyConnection, table: str) -> List[Tuple[str,str]]:
|
| 123 |
# Normalize table name for information_schema lookup
|
|
@@ -202,7 +236,7 @@ def table_mapping(con: duckdb.DuckDBPyConnection, user_tables: List[str]) -> Dic
|
|
| 202 |
try:
|
| 203 |
rows = con.execute(
|
| 204 |
"SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
|
| 205 |
-
"FROM
|
| 206 |
).fetchall()
|
| 207 |
for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
|
| 208 |
if table_name not in want_names:
|
|
@@ -210,14 +244,25 @@ def table_mapping(con: duckdb.DuckDBPyConnection, user_tables: List[str]) -> Dic
|
|
| 210 |
title = inferred_title or original_title_text or 'untitled'
|
| 211 |
mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
|
| 212 |
except Exception:
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
return mapping
|
| 215 |
|
| 216 |
def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict], max_cols: int = 8) -> Dict[str, List[str]]:
|
| 217 |
""" Return up to max_cols original column names per table_name (normalized) for LLM hints. """
|
| 218 |
samples: Dict[str, List[str]] = {}
|
| 219 |
try:
|
| 220 |
-
rows = con.execute("SELECT sheet_name, table_name, column_ordinal, original_name FROM
|
| 221 |
for sheet_name, table_name, ordn, orig in rows:
|
| 222 |
if table_name not in mapping:
|
| 223 |
continue
|
|
@@ -225,7 +270,18 @@ def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict
|
|
| 225 |
if orig and len(lst) < max_cols:
|
| 226 |
lst.append(str(orig))
|
| 227 |
except Exception:
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
return samples
|
| 230 |
|
| 231 |
# ---------- OpenAI ----------
|
|
@@ -261,7 +317,7 @@ Context (JSON):
|
|
| 261 |
return resp.choices[0].message.content.strip()
|
| 262 |
|
| 263 |
# ---------- Orchestration ----------
|
| 264 |
-
def run_ingestion_pipeline(
|
| 265 |
# Combined log function
|
| 266 |
log_lines: List[str] = []
|
| 267 |
def _append(line: str):
|
|
@@ -283,7 +339,7 @@ def run_ingestion_pipeline(xlsx_path: Path, db_path: Path, log_placeholder):
|
|
| 283 |
env = os.environ.copy()
|
| 284 |
env["PYTHONIOENCODING"] = "utf-8"
|
| 285 |
|
| 286 |
-
cmd = [sys.executable, str(SCRIPT_PATH), "--
|
| 287 |
try:
|
| 288 |
proc = subprocess.Popen(
|
| 289 |
cmd, cwd=str(PRIMARY_DIR),
|
|
@@ -358,10 +414,10 @@ def analyze_and_summarize(con: duckdb.DuckDBPyConnection):
|
|
| 358 |
return overview_md, preview_items
|
| 359 |
|
| 360 |
# ---------- UI flow ----------
|
| 361 |
-
file = st.file_uploader("Upload an Excel file", type=["xlsx"])
|
| 362 |
|
| 363 |
if file is None and not st.session_state.last_overview_md:
|
| 364 |
-
st.info("Upload
|
| 365 |
|
| 366 |
# Only show logs AFTER there is an upload or some result to show
|
| 367 |
logs_placeholder = None
|
|
@@ -372,7 +428,7 @@ if file is not None or st.session_state.processing or st.session_state.last_over
|
|
| 372 |
if file is not None:
|
| 373 |
key = _file_key(file)
|
| 374 |
stem = Path(file.name).stem
|
| 375 |
-
|
| 376 |
db_path = DB_DIR / f"{stem}.duckdb"
|
| 377 |
|
| 378 |
# --- CLEAR state immediately on new upload ---
|
|
@@ -394,11 +450,11 @@ if file is not None:
|
|
| 394 |
logs_placeholder = logs_exp.empty()
|
| 395 |
|
| 396 |
# Save uploaded file
|
| 397 |
-
with open(
|
| 398 |
f.write(file.getbuffer())
|
| 399 |
|
| 400 |
try:
|
| 401 |
-
con, app_log = run_ingestion_pipeline(
|
| 402 |
# Analyze + overview
|
| 403 |
app_log("[app] Analyzing data…")
|
| 404 |
overview_md, preview_items = analyze_and_summarize(con)
|
|
@@ -505,7 +561,7 @@ if st.session_state.get("last_overview_md"):
|
|
| 505 |
_db_path = _candidates[0] if _candidates else None
|
| 506 |
|
| 507 |
if not _db_path or not Path(_db_path).exists():
|
| 508 |
-
stream_placeholder.error("No dataset found. Please re-upload the
|
| 509 |
else:
|
| 510 |
# Call agent lazily
|
| 511 |
get_schema_summary, make_llm, answer_question = _lazy_imports()
|
|
@@ -571,4 +627,4 @@ if st.session_state.get("last_overview_md"):
|
|
| 571 |
try:
|
| 572 |
con2.close()
|
| 573 |
except Exception:
|
| 574 |
-
pass
|
|
|
|
| 21 |
PRIMARY_DIR = Path(__file__).parent.resolve()
|
| 22 |
UPLOAD_DIR = PRIMARY_DIR / "uploads"
|
| 23 |
DB_DIR = PRIMARY_DIR / "dbs"
|
| 24 |
+
SCRIPT_PATH = PRIMARY_DIR / "source_to_duckdb.py" # must be colocated
|
| 25 |
|
| 26 |
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
| 27 |
DB_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 82 |
if names:
|
| 83 |
return names
|
| 84 |
except Exception:
|
| 85 |
+
try:
|
| 86 |
+
rows = con.execute(
|
| 87 |
+
"SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
|
| 88 |
+
"FROM __excel_tables ORDER BY block_index, start_row"
|
| 89 |
+
).fetchall()
|
| 90 |
+
for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
|
| 91 |
+
if table_name not in want_names:
|
| 92 |
+
continue
|
| 93 |
+
title = inferred_title or original_title_text or 'untitled'
|
| 94 |
+
mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
|
| 95 |
+
except Exception:
|
| 96 |
+
pass
|
| 97 |
|
| 98 |
# 2) duckdb_tables()
|
| 99 |
try:
|
|
|
|
| 114 |
if names:
|
| 115 |
return names
|
| 116 |
except Exception:
|
| 117 |
+
try:
|
| 118 |
+
rows = con.execute(
|
| 119 |
+
"SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
|
| 120 |
+
"FROM __excel_tables ORDER BY block_index, start_row"
|
| 121 |
+
).fetchall()
|
| 122 |
+
for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
|
| 123 |
+
if table_name not in want_names:
|
| 124 |
+
continue
|
| 125 |
+
title = inferred_title or original_title_text or 'untitled'
|
| 126 |
+
mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
|
| 127 |
+
except Exception:
|
| 128 |
+
pass
|
| 129 |
|
| 130 |
# 3) Fallback to metadata table
|
| 131 |
try:
|
| 132 |
+
meta = con.execute("SELECT DISTINCT table_name FROM __file_tables").fetchall()
|
| 133 |
names = []
|
| 134 |
for (t,) in meta:
|
| 135 |
try:
|
|
|
|
| 139 |
continue
|
| 140 |
return names
|
| 141 |
except Exception:
|
| 142 |
+
# Fallback to legacy excel metadata table if unified not present
|
| 143 |
+
try:
|
| 144 |
+
meta = con.execute("SELECT DISTINCT table_name FROM __excel_tables").fetchall()
|
| 145 |
+
names = []
|
| 146 |
+
for (t,) in meta:
|
| 147 |
+
try:
|
| 148 |
+
con.execute(f'SELECT 1 FROM "{t}" LIMIT 1').fetchone()
|
| 149 |
+
names.append(t)
|
| 150 |
+
except Exception:
|
| 151 |
+
continue
|
| 152 |
+
return names
|
| 153 |
+
except Exception:
|
| 154 |
+
return []
|
| 155 |
|
| 156 |
def get_columns(con: duckdb.DuckDBPyConnection, table: str) -> List[Tuple[str,str]]:
|
| 157 |
# Normalize table name for information_schema lookup
|
|
|
|
| 236 |
try:
|
| 237 |
rows = con.execute(
|
| 238 |
"SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
|
| 239 |
+
"FROM __file_tables ORDER BY block_index, start_row"
|
| 240 |
).fetchall()
|
| 241 |
for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
|
| 242 |
if table_name not in want_names:
|
|
|
|
| 244 |
title = inferred_title or original_title_text or 'untitled'
|
| 245 |
mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
|
| 246 |
except Exception:
|
| 247 |
+
try:
|
| 248 |
+
rows = con.execute(
|
| 249 |
+
"SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
|
| 250 |
+
"FROM __excel_tables ORDER BY block_index, start_row"
|
| 251 |
+
).fetchall()
|
| 252 |
+
for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
|
| 253 |
+
if table_name not in want_names:
|
| 254 |
+
continue
|
| 255 |
+
title = inferred_title or original_title_text or 'untitled'
|
| 256 |
+
mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
|
| 257 |
+
except Exception:
|
| 258 |
+
pass
|
| 259 |
return mapping
|
| 260 |
|
| 261 |
def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict], max_cols: int = 8) -> Dict[str, List[str]]:
|
| 262 |
""" Return up to max_cols original column names per table_name (normalized) for LLM hints. """
|
| 263 |
samples: Dict[str, List[str]] = {}
|
| 264 |
try:
|
| 265 |
+
rows = con.execute("SELECT sheet_name, table_name, column_ordinal, original_name FROM __file_schema ORDER BY sheet_name, table_name, column_ordinal").fetchall()
|
| 266 |
for sheet_name, table_name, ordn, orig in rows:
|
| 267 |
if table_name not in mapping:
|
| 268 |
continue
|
|
|
|
| 270 |
if orig and len(lst) < max_cols:
|
| 271 |
lst.append(str(orig))
|
| 272 |
except Exception:
|
| 273 |
+
try:
|
| 274 |
+
rows = con.execute(
|
| 275 |
+
"SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
|
| 276 |
+
"FROM __excel_tables ORDER BY block_index, start_row"
|
| 277 |
+
).fetchall()
|
| 278 |
+
for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
|
| 279 |
+
if table_name not in want_names:
|
| 280 |
+
continue
|
| 281 |
+
title = inferred_title or original_title_text or 'untitled'
|
| 282 |
+
mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
|
| 283 |
+
except Exception:
|
| 284 |
+
pass
|
| 285 |
return samples
|
| 286 |
|
| 287 |
# ---------- OpenAI ----------
|
|
|
|
| 317 |
return resp.choices[0].message.content.strip()
|
| 318 |
|
| 319 |
# ---------- Orchestration ----------
|
| 320 |
+
def run_ingestion_pipeline(file_path: Path, db_path: Path, log_placeholder):
|
| 321 |
# Combined log function
|
| 322 |
log_lines: List[str] = []
|
| 323 |
def _append(line: str):
|
|
|
|
| 339 |
env = os.environ.copy()
|
| 340 |
env["PYTHONIOENCODING"] = "utf-8"
|
| 341 |
|
| 342 |
+
cmd = [sys.executable, str(SCRIPT_PATH), "--file", str(file_path), "--duckdb", str(db_path)]
|
| 343 |
try:
|
| 344 |
proc = subprocess.Popen(
|
| 345 |
cmd, cwd=str(PRIMARY_DIR),
|
|
|
|
| 414 |
return overview_md, preview_items
|
| 415 |
|
| 416 |
# ---------- UI flow ----------
|
| 417 |
+
file = st.file_uploader("Upload an Excel or CSV file", type=["xlsx", "csv"])
|
| 418 |
|
| 419 |
if file is None and not st.session_state.last_overview_md:
|
| 420 |
+
st.info("Upload a .xlsx or .csv file to begin.")
|
| 421 |
|
| 422 |
# Only show logs AFTER there is an upload or some result to show
|
| 423 |
logs_placeholder = None
|
|
|
|
| 428 |
if file is not None:
|
| 429 |
key = _file_key(file)
|
| 430 |
stem = Path(file.name).stem
|
| 431 |
+
saved_file = UPLOAD_DIR / file.name
|
| 432 |
db_path = DB_DIR / f"{stem}.duckdb"
|
| 433 |
|
| 434 |
# --- CLEAR state immediately on new upload ---
|
|
|
|
| 450 |
logs_placeholder = logs_exp.empty()
|
| 451 |
|
| 452 |
# Save uploaded file
|
| 453 |
+
with open(saved_file, "wb") as f:
|
| 454 |
f.write(file.getbuffer())
|
| 455 |
|
| 456 |
try:
|
| 457 |
+
con, app_log = run_ingestion_pipeline(saved_file, db_path, logs_placeholder)
|
| 458 |
# Analyze + overview
|
| 459 |
app_log("[app] Analyzing data…")
|
| 460 |
overview_md, preview_items = analyze_and_summarize(con)
|
|
|
|
| 561 |
_db_path = _candidates[0] if _candidates else None
|
| 562 |
|
| 563 |
if not _db_path or not Path(_db_path).exists():
|
| 564 |
+
stream_placeholder.error("No dataset found. Please re-upload the file in this session.")
|
| 565 |
else:
|
| 566 |
# Call agent lazily
|
| 567 |
get_schema_summary, make_llm, answer_question = _lazy_imports()
|
|
|
|
| 627 |
try:
|
| 628 |
con2.close()
|
| 629 |
except Exception:
|
| 630 |
+
pass
|