arthikrangan commited on
Commit
aa96015
·
verified ·
1 Parent(s): f50d923

Upload 2 files

Browse files
Files changed (2) hide show
  1. source_to_duckdb.py +781 -0
  2. streamlit_app.py +74 -18
source_to_duckdb.py ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Excel/CSV → DuckDB ingestion (generic, robust, multi-table, unified lineage)
3
+
4
+ - Supports Excel (.xlsx/.xlsm/.xls) and CSV (first row = headers)
5
+ - Hierarchical headers with merged-cell parent context (titles removed)
6
+ - Merged rows/cols resolved to master (top-left) value for consistent replication
7
+ - Multiple tables detected ONLY when separated by at least one completely empty row
8
+ - Footer detection (ignore trailing notes/summaries)
9
+ - Pivot detection (skip pivot-looking rows; optional sheet-level pivot/charthood skip)
10
+ - Optional LLM inference for unnamed columns and table titles (EXCEL_LLM_INFER=1)
11
+ - One DuckDB table per detected table block (Excel) or per file (CSV)
12
+ - Unified lineage tables for BOTH Excel and CSV:
13
+ __file_schema (file_name, sheet_name, table_name, column_ordinal, original_name, sql_column)
14
+ __file_tables (file_name, sheet_name, table_name, block_index, start_row, end_row,
15
+ header_rows_json, inferred_title, original_title_text)
16
+
17
+ Usage:
18
+ python source_to_duckdb.py --file /path/file.xlsx --duckdb /path/out.duckdb
19
+ python source_to_duckdb.py --file /path/file.csv --duckdb /path/out.duckdb
20
+ """
21
+
22
+ import os
23
+ import re
24
+ import sys
25
+ import json
26
+ import hashlib
27
+ from pathlib import Path
28
+ from typing import List, Tuple, Dict
29
+
30
+ from openpyxl import load_workbook
31
+ from openpyxl.worksheet.worksheet import Worksheet
32
+
33
+ # ------------------------- Small utilities -------------------------
34
+
35
+ def _nonempty(vals):
36
+ return [v for v in vals if v not in (None, "")]
37
+
38
+ def _is_numlike(x):
39
+ if isinstance(x, (int, float)):
40
+ return True
41
+ s = str(x).strip().replace(",", "")
42
+ if s.endswith("%"):
43
+ s = s[:-1]
44
+ if not s:
45
+ return False
46
+ if any(c.isalpha() for c in s):
47
+ return False
48
+ try:
49
+ float(s); return True
50
+ except: return False
51
+
52
+ def _is_year_token(x):
53
+ if isinstance(x, int) and 1800 <= x <= 2100: return True
54
+ s = str(x).strip()
55
+ return s.isdigit() and 1800 <= int(s) <= 2100
56
+
57
def sanitize_table_name(name: str) -> str:
    """Turn an arbitrary title into a safe SQL table identifier."""
    cleaned = re.sub(r"[^\w]", "_", str(name))
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")
    if cleaned and not cleaned[0].isalpha():
        # Identifiers must start with a letter.
        cleaned = "table_" + cleaned
    if not cleaned:
        return "sheet_data"
    return cleaned
62
+
63
def clean_col_name(s: str) -> str:
    """Normalise a header cell into a snake_case column name."""
    text = re.sub(r"[^\w\s%#‰]", "", str(s).strip())
    # Spell out the symbols we chose to keep.
    for symbol, word in (("%", " pct"), ("‰", " permille"), ("#", " count ")):
        text = text.replace(symbol, word)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\s+", "_", text)
    text = re.sub(r"_+", "_", text).strip("_")
    if text and text[0].isdigit():
        text = "col_" + text
    return text if text else "unnamed_column"
71
+
72
+ def ensure_unique(names):
73
+ seen = {}; out = []
74
+ for n in names:
75
+ base = (n or "unnamed_column").lower()
76
+ if base not in seen:
77
+ seen[base] = 0; out.append(n)
78
+ else:
79
+ i = seen[base] + 1
80
+ while f"{n}_{i}".lower() in seen: i += 1
81
+ seen[base] = i; out.append(f"{n}_{i}")
82
+ seen[(f"{n}_{i}").lower()] = 0
83
+ return out
84
+
85
def compose_col(parts):
    """Join hierarchical header parts into one column name.

    Blank parts and consecutive case-insensitive repeats (from merged parent
    cells replicated downward) are dropped before normalising.
    """
    kept = []
    last = None
    for part in parts:
        if not part:
            continue
        text = str(part).strip()
        if last is not None and text.lower() == last.lower():
            continue
        kept.append(text)
        last = text
    if not kept:
        return ""
    return clean_col_name("_".join(kept))
94
+
95
+ # ------------------------- Heuristics & detection -------------------------
96
+
97
def is_probably_footer(cells):
    """Heuristic: a sparse row starting with note/source-style text, or a
    sparse row with one long text blob, is probably a trailing footer."""
    filled = [(idx, val) for idx, val in enumerate(cells) if val not in (None, "")]
    if not filled:
        return False
    if len(filled) > 2:
        return False
    text = " ".join(str(val) for _, val in filled).strip().lower()
    if text.startswith(("note", "notes", "source", "summary", "disclaimer")):
        return True
    return len(text) > 50
105
+
106
def is_probably_data(cells, num_cols):
    """Decide whether a row looks like a data row (vs header/title noise)."""
    filled = [v for v in cells if v not in (None, "")]
    if not filled:
        return False
    numeric = [v for v in filled if _is_numlike(v)]
    n_num = len(numeric)
    n_text = len(filled) - n_num
    fill_ratio = len(filled) / max(1, num_cols)
    # Several year tokens next to several labels is a header row, not data.
    if n_num >= 2 and n_text >= 2 and all(_is_year_token(v) for v in numeric):
        return False
    if n_num >= max(2, n_text):
        return True
    if fill_ratio >= 0.6 and n_num >= 2:
        return True
    lead = str(filled[0]).strip().lower()
    return lead in ("total", "totals", "grand total")
119
+
120
# Strings that strongly suggest a pivot-table layout rather than raw data.
PIVOT_MARKERS = {"row labels","column labels","values","grand total","report filter","filters","∑ values","σ values","Σ values"}

def is_pivot_marker_string(s: str) -> bool:
    """True when *s* matches a known pivot-table marker or aggregate label."""
    if not s:
        return False
    token = str(s).strip().lower()
    if token in PIVOT_MARKERS:
        return True
    if token.startswith(("sum of ", "count of ", "avg of ", "average of ")):
        return True
    return token.endswith(" total") or token.startswith("total ")
128
+
129
def is_pivot_row(cells) -> bool:
    """A row is pivot-like if it contains a pivot marker, or at least two
    aggregate-style labels such as "Sum of ..."."""
    labels = [str(v).strip() for v in cells if v not in (None, "")]
    if not labels:
        return False
    if any(is_pivot_marker_string(label) for label in labels):
        return True
    prefixes = ("sum of", "count of", "avg of", "average of", "min of", "max of")
    hits = sum(1 for label in labels if label.lower().startswith(prefixes))
    return hits >= 2
135
+
136
def is_pivot_or_chart_sheet(ws: Worksheet) -> bool:
    """Heuristically flag sheets that hold charts/pivots rather than raw data.

    Checks, in order: embedded chart objects, openpyxl's pivot list, pivot
    marker rows in a bounded top-left scan window, and finally the sheet name.
    NOTE(review): `_charts` / `_pivots` are private openpyxl attributes —
    verify they still exist when upgrading openpyxl.
    """
    try:
        if getattr(ws, "_charts", None): return True
    except Exception: pass
    if hasattr(ws, "_pivots") and getattr(ws, "_pivots"): return True
    # Scan only up to 40x20 cells so huge sheets stay fast.
    scan_rows = min(ws.max_row, 40); scan_cols = min(ws.max_column, 20)
    pivotish = 0
    for r in range(1, scan_rows+1):
        row = [ws.cell(r,c).value for c in range(1, scan_cols+1)]
        if is_pivot_row(row):
            pivotish += 1
            # Two pivot-looking rows are enough to classify the whole sheet.
            if pivotish >= 2: return True
    name = (ws.title or "").lower()
    if any(k in name for k in ("pivot","dashboard","chart","charts")): return True
    return False
151
+
152
+ def _samples_for_column(rows, col_idx, max_items=20):
153
+ vals = []
154
+ for row in rows:
155
+ if col_idx < len(row):
156
+ v = row[col_idx]
157
+ if v not in (None, ""): vals.append(v)
158
+ if len(vals) >= max_items: break
159
+ return vals
160
+
161
def _heuristic_infer_col_name(samples):
    """Guess a generic column name from sample values.

    Returns one of "year", "percentage", "count", "value", "question",
    "range", "category", or None when no pattern dominates.
    """
    if not samples:
        return None
    if sum(1 for v in samples if _is_year_token(v)) >= max(2, int(0.8 * len(samples))):
        return "year"
    pct_hits = 0
    for v in samples:
        s = str(v).strip()
        if s.endswith("%"):
            pct_hits += 1
        else:
            try:
                f = float(s.replace(",", ""))
            except ValueError:  # was a bare except that also hid real bugs
                continue
            # NOTE(review): the first range is subsumed by the second; kept
            # as-is to preserve behaviour — any value in [0, 100] half-counts.
            if 0 <= f <= 1.0 or 0 <= f <= 100:
                pct_hits += 0.5
    if pct_hits >= max(2, int(0.7 * len(samples))):
        return "percentage"
    if sum(1 for v in samples if _is_numlike(v)) >= max(3, int(0.7 * len(samples))):
        intish = 0
        for v in samples:
            try:
                # int(float("inf")) raises OverflowError, hence the second type.
                if float(str(v).replace(",", "")) == int(float(str(v).replace(",", ""))):
                    intish += 1
            except (ValueError, OverflowError):
                pass
        if intish >= max(2, int(0.6 * len(samples))):
            return "count"
        return "value"
    uniq = {str(v).strip().lower() for v in samples}
    if len(uniq) <= 3 and max(len(str(v)) for v in samples) >= 30:
        return "question"
    if sum(1 for v in samples if re.search(r"\d", str(v)) and ("-" in str(v) or "–" in str(v))) >= max(2, int(0.6 * len(samples))):
        return "range"
    if len(uniq) < max(5, int(0.5 * len(samples))):
        return "category"
    return None
187
+
188
def used_bounds(ws: Worksheet) -> Tuple[int,int,int,int]:
    """Return (min_row, max_row, min_col, max_col) of cells with visible
    content; (1, 0, 1, 0) when the sheet is effectively empty."""
    top = left = None
    bottom = right = 0
    for row in ws.iter_rows():
        for cell in row:
            value = cell.value
            if value is None or str(value).strip() == "":
                continue
            if top is None or cell.row < top:
                top = cell.row
            bottom = max(bottom, cell.row)
            if left is None or cell.column < left:
                left = cell.column
            right = max(right, cell.column)
    if top is None:
        # Empty sheet sentinel: max bounds below min bounds.
        return 1, 0, 1, 0
    return top, bottom, left, right
200
+
201
def build_merged_master_map(ws: Worksheet):
    """Map every (row, col) inside a merged range to its top-left master cell."""
    master_of = {}
    for rng in ws.merged_cells.ranges:
        anchor = (rng.min_row, rng.min_col)
        for row in range(rng.min_row, rng.max_row + 1):
            for col in range(rng.min_col, rng.max_col + 1):
                master_of[(row, col)] = anchor
    return master_of
210
+
211
def build_value_grid(ws: Worksheet, min_row: int, max_row: int, min_col: int, max_col: int):
    """Materialise the sheet as a 2-D list of values, replicating each merged
    range's master (top-left) value into every cell the range covers."""
    master_of = build_merged_master_map(ws)
    grid = []
    for r in range(min_row, max_row + 1):
        row_out = []
        for c in range(min_col, max_col + 1):
            anchor = master_of.get((r, c))
            if anchor:
                row_out.append(ws.cell(anchor[0], anchor[1]).value)
            else:
                row_out.append(ws.cell(r, c).value)
        grid.append(row_out)
    return grid
225
+
226
def row_vals_from_grid(grid, r, min_row):
    """Return the grid row for absolute sheet row *r* (grid starts at *min_row*)."""
    offset = r - min_row
    return grid[offset]
228
+
229
def is_empty_row_vals(vals):
    """True when every cell in the row is None or the empty string."""
    return all(v in (None, "") for v in vals)
231
+
232
def is_title_like_row_vals(vals, total_cols=20):
    """Heuristic: does this row look like a (possibly merged) table title?"""
    filled = _nonempty(vals)
    if not filled:
        return False
    # A single populated cell is almost always a title.
    if len(filled) == 1:
        return True
    coverage = len(filled) / max(1, total_cols)
    if coverage <= 0.2 and all(isinstance(v, str) and len(str(v)) > 20 for v in filled):
        return True
    # All cells carrying the same text = a merged title replicated sideways.
    if len({str(v).strip().lower() for v in filled}) == 1:
        return True
    known_titles = {"local currency unit per us dollar","exchange rate","average annual exchange rate"}
    return any(str(v).strip().lower() in known_titles for v in filled)
243
+
244
def is_header_candidate_row_vals(vals, total_cols=20):
    """Heuristic: could this row be a table header row?

    Empty and title-like rows are rejected.  Rows mixing >=2 year tokens with
    text labels (e.g. "Country | 2019 | 2020") are headers.  Numeric-heavy
    rows only qualify when most of their numbers are year tokens; otherwise
    any multi-cell row (or one with >=2 distinct text labels) qualifies.
    """
    vals_ne = _nonempty(vals)
    if not vals_ne: return False
    if is_title_like_row_vals(vals, total_cols): return False
    nums = sum(1 for v in vals_ne if _is_numlike(v))
    years = sum(1 for v in vals_ne if _is_year_token(v))
    has_text = any(not _is_numlike(v) for v in vals_ne)
    if years >= 2 and has_text: return True
    # Mostly-numeric row: header only if >=60% of its cells are years.
    if nums >= max(2, len(vals_ne)-nums): return years >= max(2, int(0.6*len(vals_ne)))
    uniq_labels = {str(v).strip().lower() for v in vals_ne if not _is_numlike(v)}
    return (len(vals_ne) >= 2) or (len(uniq_labels) >= 2)
255
+
256
def detect_tables_fast(ws: Worksheet, grid, min_row, max_row, min_col, max_col):
    """Scan the value grid top-to-bottom and return detected table blocks.

    Each block is a dict with keys: header_rows (absolute row numbers),
    data_start, data_end (inclusive data row range), and title_text (text of
    a title-like row just above the header, if any).  Blocks are separated by
    completely empty rows; footer/pivot rows terminate a block.
    """
    blocks = []
    if is_pivot_or_chart_sheet(ws): return blocks
    total_cols = max_col - min_col + 1
    r = min_row
    while r <= max_row:
        vals = row_vals_from_grid(grid, r, min_row)
        # Skip rows that cannot start a table.
        if is_empty_row_vals(vals) or is_title_like_row_vals(vals, total_cols) or is_pivot_row(vals):
            r += 1; continue
        if not is_probably_data(vals, total_cols):
            r += 1; continue
        data_start = r
        header_rows = []
        # Walk upward from the first data row to find a contiguous header run.
        up = data_start - 1
        while up >= min_row:
            vup = row_vals_from_grid(grid, up, min_row)
            if is_empty_row_vals(vup): break
            if is_title_like_row_vals(vup, total_cols) or is_pivot_row(vup):
                up -= 1; continue
            if is_header_candidate_row_vals(vup, total_cols):
                header_rows = []
                hdr_row = up
                # Extend the header run upward while rows keep qualifying.
                while hdr_row >= min_row:
                    hdr_vals = row_vals_from_grid(grid, hdr_row, min_row)
                    if is_empty_row_vals(hdr_vals): break
                    if is_header_candidate_row_vals(hdr_vals, total_cols):
                        header_rows.insert(0, hdr_row); hdr_row -= 1
                    else: break
            break
        # Walk downward to find the last data row of the block.
        data_end = data_start
        rr = data_start + 1
        while rr <= max_row:
            v = row_vals_from_grid(grid, rr, min_row)
            if is_probably_footer(v) or is_pivot_row(v): break
            if is_empty_row_vals(v): break
            if is_probably_data(v, total_cols) or is_header_candidate_row_vals(v, total_cols):
                data_end = rr
            rr += 1
        # Look up to 3 rows above the header for a title-like row.
        title_text = None
        if header_rows:
            top = header_rows[0]
            for tr in range(max(min_row, top-3), top):
                tv = row_vals_from_grid(grid, tr, min_row)
                if is_title_like_row_vals(tv, total_cols):
                    first = next((str(x).strip() for x in tv if x not in (None,"")), None)
                    if first: title_text = first
                    break
        # Keep blocks with a header or at least two data rows.
        if (header_rows or data_end - data_start >= 1) and data_start <= data_end:
            blocks.append({"header_rows": header_rows, "data_start": data_start, "data_end": data_end, "title_text": title_text})
        # Resume scanning after the block and any trailing empty rows.
        r = data_end + 1
        while r <= max_row and is_empty_row_vals(row_vals_from_grid(grid, r, min_row)):
            r += 1
    return blocks
309
+
310
def expand_headers_from_grid(grid, header_rows, min_row, min_col, eff_max_col):
    """Build a header matrix (one list per header row), forward-filling blank
    cells from the left so merged parent labels replicate across columns."""
    if not header_rows:
        return []
    matrix = []
    for r in header_rows:
        source = row_vals_from_grid(grid, r, min_row)
        row = []
        for c in range(eff_max_col):
            cell = source[c]
            row.append("" if cell is None else str(cell).strip())
        carry = ""
        for i, cell in enumerate(row):
            if cell == "" and i > 0:
                row[i] = carry
            else:
                carry = cell
        matrix.append(row)
    return matrix
322
+
323
def sheet_block_to_df_fast(ws, grid, min_row, max_row, min_col, max_col, header_rows, data_start, data_end):
    """Convert one detected block into (DataFrame, header_matrix, kept_cols).

    Falls back to the row above data_start (or data_start itself) as a header
    when none was detected; returns an empty DataFrame when no header can be
    found.  Columns that are empty across all data rows are dropped, and
    remaining unnamed columns get heuristic names from their sample values.
    """
    import pandas as pd
    total_cols = max_col - min_col + 1
    # Fallback 1: row immediately above the data looks like a header.
    if (not header_rows) and data_start and data_start > min_row:
        prev = row_vals_from_grid(grid, data_start - 1, min_row)
        if is_header_candidate_row_vals(prev, total_cols):
            header_rows = [data_start - 1]
    # Fallback 2: first data row is actually the header of the row below it.
    if (not header_rows) and data_start:
        cur = row_vals_from_grid(grid, data_start, min_row)
        nxt = row_vals_from_grid(grid, data_start + 1, min_row) if data_start + 1 <= max_row else []
        if is_header_candidate_row_vals(cur, total_cols) and is_probably_data(nxt, total_cols):
            header_rows = [data_start]; data_start += 1
    if not header_rows or data_start is None or data_end is None or data_end < data_start:
        import pandas as _pd
        return _pd.DataFrame(), [], []
    # Rightmost column actually used by the header or data rows.
    def used_upto_col():
        maxc = 0
        for r in list(header_rows) + list(range(data_start, data_end+1)):
            vals = row_vals_from_grid(grid, r, min_row)
            for c_off in range(total_cols):
                v = vals[c_off]
                if v not in (None, ""): maxc = max(maxc, c_off+1)
        return maxc or total_cols
    eff_max_col = used_upto_col()
    header_mat = expand_headers_from_grid(grid, header_rows, min_row, min_col, eff_max_col)
    # A header level that is sparse or one repeated label is a title, not a level.
    def is_title_level(values):
        total = len(values)
        filled = [str(v).strip() for v in values if v not in (None, "")]
        if total == 0: return False
        coverage = len(filled) / total
        if coverage <= 0.2 and len(filled) <= 2: return True
        if filled:
            uniq = {v.lower() for v in filled}
            if len(uniq) == 1:
                label = next(iter(uniq))
                dom = sum(1 for v in values if isinstance(v,str) and v.strip().lower() == label)
                if dom / total >= 0.6: return True
        return False
    usable_levels = [i for i in range(len(header_mat)) if not is_title_level(header_mat[i])]
    if not usable_levels and header_mat: usable_levels = [len(header_mat) - 1]
    cols = []
    for c_off in range(eff_max_col):
        parts = [header_mat[l][c_off] for l in range(usable_levels[0], usable_levels[-1]+1)] if usable_levels else []
        cols.append(compose_col(parts))
    cols = ensure_unique([clean_col_name(x) for x in cols])
    data_rows = []
    for r in range(data_start, data_end+1):
        vals = row_vals_from_grid(grid, r, min_row)
        row = [vals[c_off] for c_off in range(eff_max_col)]
        # A footer row ends the data even inside the declared range.
        if is_probably_footer(row): break
        data_rows.append(row[:len(cols)])
    if not data_rows:
        import pandas as _pd
        return _pd.DataFrame(columns=cols), header_mat, cols
    # Drop columns that are empty in every data row.
    keep_mask = [any(row[i] not in (None, "") for row in data_rows) for i in range(len(cols))]
    kept_cols = [c for c,k in zip(cols, keep_mask) if k]
    trimmed_rows = [[v for v,k in zip(row, keep_mask) if k] for row in data_rows]
    import pandas as pd
    df = pd.DataFrame(trimmed_rows, columns=kept_cols)
    # Give remaining "unnamed_column*" columns heuristic names from samples.
    if any(str(c).startswith("unnamed_column") for c in df.columns):
        new_names = list(df.columns)
        for idx, name in enumerate(list(df.columns)):
            if not str(name).startswith("unnamed_column"): continue
            samples = _samples_for_column(trimmed_rows, idx, max_items=20)
            guess = _heuristic_infer_col_name(samples)
            if guess: new_names[idx] = clean_col_name(guess)
        df.columns = ensure_unique([clean_col_name(x) for x in new_names])
    return df, header_mat, kept_cols
391
+
392
+ # ------------------------- Optional LLM title inference -------------------------
393
+
394
def _llm_infer_table_title(header_mat, sample_rows, sheet_name):
    """Optionally ask an OpenAI chat model for a short table title.

    Returns None unless EXCEL_LLM_INFER=1 and OPENAI_API_KEY are set, and on
    any API or parsing failure — the whole function is best-effort by design.
    """
    if os.environ.get("EXCEL_LLM_INFER","0") != "1": return None
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key: return None
    # Compose up to 10 flattened header names and 5x6 sample cells as context.
    headers = []
    if header_mat:
        for c in range(len(header_mat[0])):
            parts = [header_mat[l][c] for l in range(len(header_mat))]
            parts = [p for p in parts if p]
            if parts: headers.append("_".join(parts))
        headers = headers[:10]
    samples = [[str(x) for x in r[:6]] for r in sample_rows[:5]]
    prompt = (
        "Propose a short, human-readable title for a data table.\n"
        "Keep it 3-6 words, Title Case, no punctuation at the end.\n"
        f"Sheet: {sheet_name}\nHeaders: {headers}\nRow samples: {samples}\n"
        "Answer with JSON: {\"title\": \"...\"}"
    )
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model=os.environ.get("OPENAI_MODEL","gpt-4o-mini"),
            messages=[{"role":"user","content":prompt}], temperature=0.2,
        )
        text = resp.choices[0].message.content.strip()
    except Exception:
        return None
    import re as _re, json as _json
    # Extract the first {...} blob; models often wrap the JSON in prose.
    m = _re.search(r"\{.*\}", text, re.S)
    if not m: return None
    try:
        obj = _json.loads(m.group(0)); title = obj.get("title","").strip()
        return title or None
    except Exception: return None
429
+
430
+ def _heuristic_table_title(header_mat, sheet_name, idx):
431
+ if header_mat:
432
+ parts = []
433
+ levels = len(header_mat)
434
+ cols = len(header_mat[0]) if header_mat else 0
435
+ for c in range(min(6, cols)):
436
+ colparts = [header_mat[l][c] for l in range(min(levels, 2)) if header_mat[l][c]]
437
+ if colparts: parts.extend(colparts)
438
+ if parts:
439
+ base = " ".join(dict.fromkeys(parts))
440
+ return base[:60]
441
+ return f"{sheet_name} Table {idx}"
442
+
443
def infer_table_title(header_mat, sample_rows, sheet_name, idx):
    """Prefer an LLM-proposed title when enabled; otherwise the heuristic one."""
    fallback = _heuristic_table_title(header_mat, sheet_name, idx)
    llm_title = _llm_infer_table_title(header_mat, sample_rows, sheet_name)
    if llm_title:
        return llm_title
    return fallback
447
+
448
+ # ------------------------- Unified lineage helpers -------------------------
449
+
450
# Unified lineage table names shared by the Excel and CSV ingestion paths.
FILE_SCHEMA_TABLE = "__file_schema"
FILE_TABLES_TABLE = "__file_tables"

def ensure_lineage_tables(con):
    """Create the two lineage tables if they do not exist yet (idempotent)."""
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS {FILE_SCHEMA_TABLE} (
            file_name TEXT,
            sheet_name TEXT,
            table_name TEXT,
            column_ordinal INTEGER,
            original_name TEXT,
            sql_column TEXT
        )
    """)
    con.execute(f"""
        CREATE TABLE IF NOT EXISTS {FILE_TABLES_TABLE} (
            file_name TEXT,
            sheet_name TEXT,
            table_name TEXT,
            block_index INTEGER,
            start_row INTEGER,
            end_row INTEGER,
            header_rows_json TEXT,
            inferred_title TEXT,
            original_title_text TEXT
        )
    """)
477
+
478
def record_table_schema(con, file_name, sheet_name, table_name, columns):
    """
    Replace the column-lineage rows for one table in __file_schema.

    columns: list of tuples (column_ordinal, original_name, sql_column)
    sheet_name may be None for CSV sources, requiring a NULL-safe delete.
    """
    ensure_lineage_tables(con)
    # DuckDB doesn't support `IS ?` with NULL; branch the delete
    if sheet_name is None:
        con.execute(
            f"DELETE FROM {FILE_SCHEMA_TABLE} WHERE file_name = ? AND sheet_name IS NULL AND table_name = ?",
            [file_name, table_name],
        )
    else:
        con.execute(
            f"DELETE FROM {FILE_SCHEMA_TABLE} WHERE file_name = ? AND sheet_name = ? AND table_name = ?",
            [file_name, sheet_name, table_name],
        )
    con.executemany(
        f"INSERT INTO {FILE_SCHEMA_TABLE} (file_name, sheet_name, table_name, column_ordinal, original_name, sql_column) VALUES (?, ?, ?, ?, ?, ?)",
        [(file_name, sheet_name, table_name, i, orig, sql) for (i, orig, sql) in columns],
    )
498
+
499
def record_table_block(con, file_name, sheet_name, table_name, block_index, start_row, end_row, header_rows_json, inferred_title, original_title_text):
    """Delete-then-insert the __file_tables metadata row for one table block.

    sheet_name may be None for CSV sources, requiring a NULL-safe delete.
    A missing block_index defaults to 0; row bounds may be None.
    """
    ensure_lineage_tables(con)
    # DuckDB doesn't support `IS ?` with NULL; branch the delete
    if sheet_name is None:
        con.execute(
            f"DELETE FROM {FILE_TABLES_TABLE} WHERE file_name = ? AND sheet_name IS NULL AND table_name = ? AND block_index = ?",
            [file_name, table_name, int(block_index) if block_index is not None else 0],
        )
    else:
        con.execute(
            f"DELETE FROM {FILE_TABLES_TABLE} WHERE file_name = ? AND sheet_name = ? AND table_name = ? AND block_index = ?",
            [file_name, sheet_name, table_name, int(block_index) if block_index is not None else 0],
        )
    con.execute(
        f"""INSERT INTO {FILE_TABLES_TABLE}
        (file_name, sheet_name, table_name, block_index, start_row, end_row, header_rows_json, inferred_title, original_title_text)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        [
            file_name, sheet_name, table_name,
            int(block_index) if block_index is not None else 0,
            int(start_row) if start_row is not None else None,
            int(end_row) if end_row is not None else None,
            header_rows_json, inferred_title, original_title_text
        ]
    )
524
+
525
+ # --- block coalescing to avoid nested/overlapping duplicates ---
526
def coalesce_blocks(blocks: List[Dict]) -> List[Dict]:
    """Keep only maximal blocks: drop any block whose data row range is fully
    contained in an already-kept block (nested duplicate detection)."""
    if not blocks:
        return blocks
    ordered = sorted(blocks, key=lambda blk: (blk["data_start"], blk["data_end"]))
    kept: List[Dict] = []
    for blk in ordered:
        contained = any(
            blk["data_start"] >= other["data_start"] and blk["data_end"] <= other["data_end"]
            for other in kept
        )
        if not contained:
            kept.append(blk)
    return kept
536
+
537
+ # ------------------------- Persistence: Excel -------------------------
538
+
539
def persist(excel_path, duckdb_path):
    """Ingest every detectable table block of an Excel workbook into DuckDB.

    One DuckDB table per block; lineage rows are written to __file_schema and
    __file_tables.  Exits the process on unreadable input or a missing duckdb
    package.
    """
    try:
        from duckdb import connect
    except ImportError:
        print("Error: DuckDB library not installed. Install with: pip install duckdb"); sys.exit(1)
    try:
        # data_only=True reads cached formula results instead of formulas.
        wb = load_workbook(excel_path, data_only=True)
    except FileNotFoundError:
        print(f"Error: Excel file not found: {excel_path}"); sys.exit(1)
    except Exception as e:
        print(f"Error loading Excel file: {e}"); sys.exit(1)

    file_name = Path(excel_path).name
    db_path = Path(duckdb_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    new_db = not db_path.exists()
    con = connect(str(db_path))
    if new_db: print(f"Created new DuckDB at: {db_path}")

    # Ensure unified lineage tables exist
    ensure_lineage_tables(con)

    used_names = set(); total_tables = 0; total_rows = 0

    for sheet in wb.sheetnames:
        ws = wb[sheet]
        try:
            # Chartsheets are not Worksheet instances and carry no cell data.
            if not isinstance(ws, Worksheet):
                print(f"Skipping chartsheet: {sheet}"); continue
        except Exception: pass
        if is_pivot_or_chart_sheet(ws):
            print(f"Skipping pivot/chart-like sheet: {sheet}"); continue

        min_row, max_row, min_col, max_col = used_bounds(ws)
        if max_row < min_row: continue  # empty-sheet sentinel from used_bounds
        grid = build_value_grid(ws, min_row, max_row, min_col, max_col)

        blocks = detect_tables_fast(ws, grid, min_row, max_row, min_col, max_col)
        blocks = coalesce_blocks(blocks)
        if not blocks: continue

        # per-sheet content hash set to avoid identical duplicate content
        seen_content = set()

        for idx, blk in enumerate(blocks, start=1):
            df, header_mat, kept_cols = sheet_block_to_df_fast(
                ws, grid, min_row, max_row, min_col, max_col,
                blk["header_rows"], blk["data_start"], blk["data_end"]
            )
            if df.empty: continue

            # Content hash (stable CSV representation)
            csv_bytes = df.to_csv(index=False).encode("utf-8")
            h = hashlib.sha256(csv_bytes).hexdigest()
            if h in seen_content:
                print(f"Skipping duplicate content on sheet {sheet} (block {idx})")
                continue
            seen_content.add(h)

            # Build original composite header names for lineage mapping
            original_cols = []
            if header_mat:
                levels = len(header_mat)
                cols = len(header_mat[0]) if header_mat else 0
                for c in range(cols):
                    parts = [header_mat[l][c] for l in range(levels)]
                    original_cols.append("_".join([p for p in parts if p]))
            else:
                original_cols = list(df.columns)
                while len(original_cols) < len(df.columns): original_cols.append("unnamed")

            title_orig = blk.get("title_text")
            title = title_orig or infer_table_title(header_mat, df.values.tolist(), sheet, idx)
            candidate = title if title else f"{sheet} Table {idx}"
            table = ensure_unique_table_name(used_names, candidate)

            # Create/replace table via a registered temp view of the DataFrame
            con.execute(f'DROP TABLE IF EXISTS "{table}"')
            con.register(f"{table}_temp", df)
            con.execute(f'CREATE TABLE "{table}" AS SELECT * FROM {table}_temp')
            con.unregister(f"{table}_temp")

            # Record lineage (schema + block)
            schema_rows = []
            for cidx, (orig, sqlc) in enumerate(zip(original_cols[:len(df.columns)], df.columns), start=1):
                schema_rows.append((cidx, str(orig), str(sqlc)))
            record_table_schema(
                con,
                file_name=file_name,
                sheet_name=sheet,
                table_name=table,
                columns=schema_rows,
            )
            record_table_block(
                con,
                file_name=file_name,
                sheet_name=sheet,
                table_name=table,
                block_index=idx,
                start_row=int(blk["data_start"]),
                end_row=int(blk["data_end"]),
                header_rows_json=json.dumps(blk["header_rows"]),
                inferred_title=title if title else None,
                original_title_text=title_orig if title_orig else None,
            )

            print(f"Created table {table} from sheet {sheet} with {len(df)} rows and {len(df.columns)} columns.")
            total_tables += 1; total_rows += len(df)

    con.close()
    print(f"""\n✅ Completed.
- Created {total_tables} tables with {total_rows} total rows
- Column lineage: {FILE_SCHEMA_TABLE}
- Block metadata: {FILE_TABLES_TABLE}""")
653
+
654
+ # ------------------------- Persistence: CSV -------------------------
655
+
656
def persist_csv(csv_path, duckdb_path):
    """
    Ingest a single CSV file into DuckDB AND write lineage, aligned with Excel.
    - First row is headers.
    - One table named from the CSV file name.
    - Cleans headers and ensures uniqueness.
    - Records lineage in __file_schema and __file_tables using the unified schema (with file_name).
    """
    import pandas as pd
    from duckdb import connect

    csv_path = Path(csv_path)
    if not csv_path.exists():
        print(f"Error: CSV file not found: {csv_path}")
        sys.exit(1)

    # Keep original header names for lineage before cleaning
    try:
        # utf-8-sig strips a leading BOM if present.
        df_raw = pd.read_csv(csv_path, header=0, encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Fall back to pandas' default encoding handling.
        df_raw = pd.read_csv(csv_path, header=0)

    original_headers = list(df_raw.columns)

    # Clean/normalize column names
    def _clean_hdr(s):
        s = str(s) if s is not None else ""
        s = s.strip()
        s = re.sub(r"\s+", " ", s)
        return clean_col_name(s)

    cleaned_cols = ensure_unique([_clean_hdr(c) for c in original_headers])
    df = df_raw.copy()
    df.columns = cleaned_cols

    # Compute table name from file name
    table = sanitize_table_name(csv_path.stem)

    # Open / create DuckDB
    db_path = Path(duckdb_path)
    db_path.parent.mkdir(parents=True, exist_ok=True)
    new_db = not db_path.exists()

    con = connect(str(db_path))
    if new_db:
        print(f"Created new DuckDB at: {db_path}")

    # Ensure unified lineage tables (with file_name) exist
    ensure_lineage_tables(con)

    # Create/replace the data table via a registered temp view
    con.execute(f'DROP TABLE IF EXISTS "{table}"')
    con.register(f"{table}_temp_df", df)
    con.execute(f'CREATE TABLE "{table}" AS SELECT * FROM {table}_temp_df')
    con.unregister(f"{table}_temp_df")

    # Write lineage
    file_name = csv_path.name
    sheet_name = None  # CSV has no sheet
    block_index = 1  # single block/table for CSV
    start_row = 2  # header is row 1, data starts at 2
    end_row = len(df) + 1  # header + data rows
    header_rows_json = "[1]"  # header row index list as JSON
    inferred_title = None
    original_title_text = None

    # Map original->sql columns
    schema_rows = []
    for i, (orig, sql) in enumerate(zip(original_headers, cleaned_cols), start=1):
        schema_rows.append((i, str(orig), str(sql)))

    record_table_schema(
        con,
        file_name=file_name,
        sheet_name=sheet_name,
        table_name=table,
        columns=schema_rows
    )
    record_table_block(
        con,
        file_name=file_name,
        sheet_name=sheet_name,
        table_name=table,
        block_index=block_index,
        start_row=start_row,
        end_row=end_row,
        header_rows_json=header_rows_json,
        inferred_title=inferred_title,
        original_title_text=original_title_text
    )

    print(f'Created table {table} from CSV "{csv_path.name}" with {len(df)} rows and {len(df.columns)} columns.')
    con.close()
749
+
750
+ # ------------------------- CLI -------------------------
751
+
752
def ensure_unique_table_name(existing: set, name: str) -> str:
    """Sanitise *name*, suffix it with _2, _3, ... until unused, record the
    chosen name in *existing* (mutated in place), and return it."""
    base = sanitize_table_name(name) or "table"
    if base not in existing:
        existing.add(base)
        return base
    counter = 2
    candidate = f"{base}_{counter}"
    while candidate in existing:
        counter += 1
        candidate = f"{base}_{counter}"
    existing.add(candidate)
    return candidate
759
+
760
def main():
    """CLI entry point: route --file to the Excel or CSV ingester."""
    import argparse
    parser = argparse.ArgumentParser(description="Excel/CSV → DuckDB (unified --file + lineage).")
    parser.add_argument("--file", required=True, help="Path to .xlsx/.xlsm/.xls or .csv")
    parser.add_argument("--duckdb", required=True, help="Path to DuckDB file")
    args = parser.parse_args()

    if not os.path.exists(args.file):
        print(f"Error: file not found: {args.file}")
        sys.exit(1)

    suffix = Path(args.file).suffix.lower()
    if suffix in (".xlsx", ".xlsm", ".xls"):
        persist(args.file, args.duckdb)
    elif suffix == ".csv":
        persist_csv(args.file, args.duckdb)
    else:
        print("Error: unsupported file type. Use .xlsx/.xlsm/.xls or .csv")
        sys.exit(2)
779
+
780
+ if __name__ == "__main__":
781
+ main()
streamlit_app.py CHANGED
@@ -21,7 +21,7 @@ st.set_page_config(page_title="Excel → Dataset", page_icon="📊", layout="wid
21
  PRIMARY_DIR = Path(__file__).parent.resolve()
22
  UPLOAD_DIR = PRIMARY_DIR / "uploads"
23
  DB_DIR = PRIMARY_DIR / "dbs"
24
- SCRIPT_PATH = PRIMARY_DIR / "excel_to_duckdb.py" # must be colocated
25
 
26
  UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
27
  DB_DIR.mkdir(parents=True, exist_ok=True)
@@ -82,7 +82,18 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
82
  if names:
83
  return names
84
  except Exception:
85
- pass
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  # 2) duckdb_tables()
88
  try:
@@ -103,11 +114,22 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
103
  if names:
104
  return names
105
  except Exception:
106
- pass
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # 3) Fallback to metadata table
109
  try:
110
- meta = con.execute("SELECT DISTINCT table_name FROM __excel_tables").fetchall()
111
  names = []
112
  for (t,) in meta:
113
  try:
@@ -117,7 +139,19 @@ def list_user_tables(con: duckdb.DuckDBPyConnection) -> List[str]:
117
  continue
118
  return names
119
  except Exception:
120
- return []
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  def get_columns(con: duckdb.DuckDBPyConnection, table: str) -> List[Tuple[str,str]]:
123
  # Normalize table name for information_schema lookup
@@ -202,7 +236,7 @@ def table_mapping(con: duckdb.DuckDBPyConnection, user_tables: List[str]) -> Dic
202
  try:
203
  rows = con.execute(
204
  "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
205
- "FROM __excel_tables ORDER BY block_index, start_row"
206
  ).fetchall()
207
  for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
208
  if table_name not in want_names:
@@ -210,14 +244,25 @@ def table_mapping(con: duckdb.DuckDBPyConnection, user_tables: List[str]) -> Dic
210
  title = inferred_title or original_title_text or 'untitled'
211
  mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
212
  except Exception:
213
- pass
 
 
 
 
 
 
 
 
 
 
 
214
  return mapping
215
 
216
  def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict], max_cols: int = 8) -> Dict[str, List[str]]:
217
  """ Return up to max_cols original column names per table_name (normalized) for LLM hints. """
218
  samples: Dict[str, List[str]] = {}
219
  try:
220
- rows = con.execute("SELECT sheet_name, table_name, column_ordinal, original_name FROM __excel_schema ORDER BY sheet_name, table_name, column_ordinal").fetchall()
221
  for sheet_name, table_name, ordn, orig in rows:
222
  if table_name not in mapping:
223
  continue
@@ -225,7 +270,18 @@ def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict
225
  if orig and len(lst) < max_cols:
226
  lst.append(str(orig))
227
  except Exception:
228
- pass
 
 
 
 
 
 
 
 
 
 
 
229
  return samples
230
 
231
  # ---------- OpenAI ----------
@@ -261,7 +317,7 @@ Context (JSON):
261
  return resp.choices[0].message.content.strip()
262
 
263
  # ---------- Orchestration ----------
264
- def run_ingestion_pipeline(xlsx_path: Path, db_path: Path, log_placeholder):
265
  # Combined log function
266
  log_lines: List[str] = []
267
  def _append(line: str):
@@ -283,7 +339,7 @@ def run_ingestion_pipeline(xlsx_path: Path, db_path: Path, log_placeholder):
283
  env = os.environ.copy()
284
  env["PYTHONIOENCODING"] = "utf-8"
285
 
286
- cmd = [sys.executable, str(SCRIPT_PATH), "--excel", str(xlsx_path), "--duckdb", str(db_path)]
287
  try:
288
  proc = subprocess.Popen(
289
  cmd, cwd=str(PRIMARY_DIR),
@@ -358,10 +414,10 @@ def analyze_and_summarize(con: duckdb.DuckDBPyConnection):
358
  return overview_md, preview_items
359
 
360
  # ---------- UI flow ----------
361
- file = st.file_uploader("Upload an Excel file", type=["xlsx"])
362
 
363
  if file is None and not st.session_state.last_overview_md:
364
- st.info("Upload an .xlsx file to begin.")
365
 
366
  # Only show logs AFTER there is an upload or some result to show
367
  logs_placeholder = None
@@ -372,7 +428,7 @@ if file is not None or st.session_state.processing or st.session_state.last_over
372
  if file is not None:
373
  key = _file_key(file)
374
  stem = Path(file.name).stem
375
- saved_xlsx = UPLOAD_DIR / f"{stem}.xlsx"
376
  db_path = DB_DIR / f"{stem}.duckdb"
377
 
378
  # --- CLEAR state immediately on new upload ---
@@ -394,11 +450,11 @@ if file is not None:
394
  logs_placeholder = logs_exp.empty()
395
 
396
  # Save uploaded file
397
- with open(saved_xlsx, "wb") as f:
398
  f.write(file.getbuffer())
399
 
400
  try:
401
- con, app_log = run_ingestion_pipeline(saved_xlsx, db_path, logs_placeholder)
402
  # Analyze + overview
403
  app_log("[app] Analyzing data…")
404
  overview_md, preview_items = analyze_and_summarize(con)
@@ -505,7 +561,7 @@ if st.session_state.get("last_overview_md"):
505
  _db_path = _candidates[0] if _candidates else None
506
 
507
  if not _db_path or not Path(_db_path).exists():
508
- stream_placeholder.error("No dataset found. Please re-upload the Excel file in this session.")
509
  else:
510
  # Call agent lazily
511
  get_schema_summary, make_llm, answer_question = _lazy_imports()
@@ -571,4 +627,4 @@ if st.session_state.get("last_overview_md"):
571
  try:
572
  con2.close()
573
  except Exception:
574
- pass
 
21
  PRIMARY_DIR = Path(__file__).parent.resolve()
22
  UPLOAD_DIR = PRIMARY_DIR / "uploads"
23
  DB_DIR = PRIMARY_DIR / "dbs"
24
+ SCRIPT_PATH = PRIMARY_DIR / "source_to_duckdb.py" # must be colocated
25
 
26
  UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
27
  DB_DIR.mkdir(parents=True, exist_ok=True)
 
82
  if names:
83
  return names
84
  except Exception:
85
+ try:
86
+ rows = con.execute(
87
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
88
+ "FROM __excel_tables ORDER BY block_index, start_row"
89
+ ).fetchall()
90
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
91
+ if table_name not in want_names:
92
+ continue
93
+ title = inferred_title or original_title_text or 'untitled'
94
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
95
+ except Exception:
96
+ pass
97
 
98
  # 2) duckdb_tables()
99
  try:
 
114
  if names:
115
  return names
116
  except Exception:
117
+ try:
118
+ rows = con.execute(
119
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
120
+ "FROM __excel_tables ORDER BY block_index, start_row"
121
+ ).fetchall()
122
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
123
+ if table_name not in want_names:
124
+ continue
125
+ title = inferred_title or original_title_text or 'untitled'
126
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
127
+ except Exception:
128
+ pass
129
 
130
  # 3) Fallback to metadata table
131
  try:
132
+ meta = con.execute("SELECT DISTINCT table_name FROM __file_tables").fetchall()
133
  names = []
134
  for (t,) in meta:
135
  try:
 
139
  continue
140
  return names
141
  except Exception:
142
+ # Fallback to legacy excel metadata table if unified not present
143
+ try:
144
+ meta = con.execute("SELECT DISTINCT table_name FROM __excel_tables").fetchall()
145
+ names = []
146
+ for (t,) in meta:
147
+ try:
148
+ con.execute(f'SELECT 1 FROM "{t}" LIMIT 1').fetchone()
149
+ names.append(t)
150
+ except Exception:
151
+ continue
152
+ return names
153
+ except Exception:
154
+ return []
155
 
156
  def get_columns(con: duckdb.DuckDBPyConnection, table: str) -> List[Tuple[str,str]]:
157
  # Normalize table name for information_schema lookup
 
236
  try:
237
  rows = con.execute(
238
  "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
239
+ "FROM __file_tables ORDER BY block_index, start_row"
240
  ).fetchall()
241
  for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
242
  if table_name not in want_names:
 
244
  title = inferred_title or original_title_text or 'untitled'
245
  mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
246
  except Exception:
247
+ try:
248
+ rows = con.execute(
249
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
250
+ "FROM __excel_tables ORDER BY block_index, start_row"
251
+ ).fetchall()
252
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
253
+ if table_name not in want_names:
254
+ continue
255
+ title = inferred_title or original_title_text or 'untitled'
256
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
257
+ except Exception:
258
+ pass
259
  return mapping
260
 
261
  def excel_schema_samples(con: duckdb.DuckDBPyConnection, mapping: Dict[str, Dict], max_cols: int = 8) -> Dict[str, List[str]]:
262
  """ Return up to max_cols original column names per table_name (normalized) for LLM hints. """
263
  samples: Dict[str, List[str]] = {}
264
  try:
265
+ rows = con.execute("SELECT sheet_name, table_name, column_ordinal, original_name FROM __file_schema ORDER BY sheet_name, table_name, column_ordinal").fetchall()
266
  for sheet_name, table_name, ordn, orig in rows:
267
  if table_name not in mapping:
268
  continue
 
270
  if orig and len(lst) < max_cols:
271
  lst.append(str(orig))
272
  except Exception:
273
+ try:
274
+ rows = con.execute(
275
+ "SELECT sheet_name, table_name, inferred_title, original_title_text, block_index, start_row "
276
+ "FROM __excel_tables ORDER BY block_index, start_row"
277
+ ).fetchall()
278
+ for sheet_name, table_name, inferred_title, original_title_text, block_index, start_row in rows:
279
+ if table_name not in want_names:
280
+ continue
281
+ title = inferred_title or original_title_text or 'untitled'
282
+ mapping[table_name] = {'sheet_name': sheet_name, 'title': title}
283
+ except Exception:
284
+ pass
285
  return samples
286
 
287
  # ---------- OpenAI ----------
 
317
  return resp.choices[0].message.content.strip()
318
 
319
  # ---------- Orchestration ----------
320
+ def run_ingestion_pipeline(file_path: Path, db_path: Path, log_placeholder):
321
  # Combined log function
322
  log_lines: List[str] = []
323
  def _append(line: str):
 
339
  env = os.environ.copy()
340
  env["PYTHONIOENCODING"] = "utf-8"
341
 
342
+ cmd = [sys.executable, str(SCRIPT_PATH), "--file", str(file_path), "--duckdb", str(db_path)]
343
  try:
344
  proc = subprocess.Popen(
345
  cmd, cwd=str(PRIMARY_DIR),
 
414
  return overview_md, preview_items
415
 
416
  # ---------- UI flow ----------
417
+ file = st.file_uploader("Upload an Excel or CSV file", type=["xlsx", "csv"])
418
 
419
  if file is None and not st.session_state.last_overview_md:
420
+ st.info("Upload a .xlsx or .csv file to begin.")
421
 
422
  # Only show logs AFTER there is an upload or some result to show
423
  logs_placeholder = None
 
428
  if file is not None:
429
  key = _file_key(file)
430
  stem = Path(file.name).stem
431
+ saved_file = UPLOAD_DIR / file.name
432
  db_path = DB_DIR / f"{stem}.duckdb"
433
 
434
  # --- CLEAR state immediately on new upload ---
 
450
  logs_placeholder = logs_exp.empty()
451
 
452
  # Save uploaded file
453
+ with open(saved_file, "wb") as f:
454
  f.write(file.getbuffer())
455
 
456
  try:
457
+ con, app_log = run_ingestion_pipeline(saved_file, db_path, logs_placeholder)
458
  # Analyze + overview
459
  app_log("[app] Analyzing data…")
460
  overview_md, preview_items = analyze_and_summarize(con)
 
561
  _db_path = _candidates[0] if _candidates else None
562
 
563
  if not _db_path or not Path(_db_path).exists():
564
+ stream_placeholder.error("No dataset found. Please re-upload the file in this session.")
565
  else:
566
  # Call agent lazily
567
  get_schema_summary, make_llm, answer_question = _lazy_imports()
 
627
  try:
628
  con2.close()
629
  except Exception:
630
+ pass