File size: 2,795 Bytes
28035e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Excel → PostgreSQL data synchronization script.

Usage:
    python data_sync.py path/to/file.xlsx
    python data_sync.py path/to/folder/       (syncs all .xlsx files)

Normalizes column names to lowercase with underscores,
then upserts each sheet/file into a PostgreSQL table.
"""

import os
import re
import sys

import pandas as pd
from sqlalchemy import text

from db.connection import get_engine


def normalize_column(name: str) -> str:
    """Lowercase, strip, and replace non-alphanumeric chars with underscore."""
    name = str(name).strip().lower()
    name = re.sub(r"[^a-z0-9]+", "_", name)
    name = name.strip("_")
    return name or "unnamed_col"


def sync_dataframe(df: pd.DataFrame, table_name: str) -> None:
    """Write a DataFrame to PostgreSQL, replacing the existing table."""
    engine = get_engine()

    # Normalize columns
    df.columns = [normalize_column(c) for c in df.columns]

    # Deduplicate column names
    seen: dict[str, int] = {}
    new_cols: list[str] = []
    for col in df.columns:
        if col in seen:
            seen[col] += 1
            new_cols.append(f"{col}_{seen[col]}")
        else:
            seen[col] = 0
            new_cols.append(col)
    df.columns = new_cols

    df.to_sql(table_name, engine, if_exists="replace", index=False)
    print(f"  ✓ Table '{table_name}' synced — {len(df)} rows, {len(df.columns)} columns")


def sync_excel(filepath: str) -> None:
    """Sync all sheets in an Excel file to separate tables."""
    basename = os.path.splitext(os.path.basename(filepath))[0]
    table_name = normalize_column(basename)

    xls = pd.ExcelFile(filepath)
    sheets = xls.sheet_names

    if len(sheets) == 1:
        df = pd.read_excel(filepath, sheet_name=sheets[0])
        sync_dataframe(df, table_name)
    else:
        for sheet in sheets:
            df = pd.read_excel(filepath, sheet_name=sheet)
            sheet_table = f"{table_name}_{normalize_column(sheet)}"
            sync_dataframe(df, sheet_table)


def main() -> None:
    if len(sys.argv) < 2:
        print("Usage: python data_sync.py <path_to_xlsx_or_folder>")
        sys.exit(1)

    target = sys.argv[1]

    if os.path.isdir(target):
        files = [
            os.path.join(target, f)
            for f in os.listdir(target)
            if f.endswith((".xlsx", ".xls"))
        ]
        if not files:
            print(f"No Excel files found in {target}")
            sys.exit(1)
        for fp in sorted(files):
            print(f"Syncing: {fp}")
            sync_excel(fp)
    elif os.path.isfile(target):
        print(f"Syncing: {target}")
        sync_excel(target)
    else:
        print(f"Path not found: {target}")
        sys.exit(1)

    print("\n✓ Data sync complete.")


if __name__ == "__main__":
    main()