Spaces:
Running
Running
File size: 2,795 Bytes
28035e9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | """Excel → PostgreSQL data synchronization script.
Usage:
python data_sync.py path/to/file.xlsx
python data_sync.py path/to/folder/ (syncs all .xlsx files)
Normalizes column names to lowercase with underscores,
then upserts each sheet/file into a PostgreSQL table.
"""
import os
import re
import sys
import pandas as pd
from sqlalchemy import text
from db.connection import get_engine
def normalize_column(name: str) -> str:
"""Lowercase, strip, and replace non-alphanumeric chars with underscore."""
name = str(name).strip().lower()
name = re.sub(r"[^a-z0-9]+", "_", name)
name = name.strip("_")
return name or "unnamed_col"
def sync_dataframe(df: pd.DataFrame, table_name: str) -> None:
"""Write a DataFrame to PostgreSQL, replacing the existing table."""
engine = get_engine()
# Normalize columns
df.columns = [normalize_column(c) for c in df.columns]
# Deduplicate column names
seen: dict[str, int] = {}
new_cols: list[str] = []
for col in df.columns:
if col in seen:
seen[col] += 1
new_cols.append(f"{col}_{seen[col]}")
else:
seen[col] = 0
new_cols.append(col)
df.columns = new_cols
df.to_sql(table_name, engine, if_exists="replace", index=False)
print(f" ✓ Table '{table_name}' synced — {len(df)} rows, {len(df.columns)} columns")
def sync_excel(filepath: str) -> None:
"""Sync all sheets in an Excel file to separate tables."""
basename = os.path.splitext(os.path.basename(filepath))[0]
table_name = normalize_column(basename)
xls = pd.ExcelFile(filepath)
sheets = xls.sheet_names
if len(sheets) == 1:
df = pd.read_excel(filepath, sheet_name=sheets[0])
sync_dataframe(df, table_name)
else:
for sheet in sheets:
df = pd.read_excel(filepath, sheet_name=sheet)
sheet_table = f"{table_name}_{normalize_column(sheet)}"
sync_dataframe(df, sheet_table)
def main() -> None:
if len(sys.argv) < 2:
print("Usage: python data_sync.py <path_to_xlsx_or_folder>")
sys.exit(1)
target = sys.argv[1]
if os.path.isdir(target):
files = [
os.path.join(target, f)
for f in os.listdir(target)
if f.endswith((".xlsx", ".xls"))
]
if not files:
print(f"No Excel files found in {target}")
sys.exit(1)
for fp in sorted(files):
print(f"Syncing: {fp}")
sync_excel(fp)
elif os.path.isfile(target):
print(f"Syncing: {target}")
sync_excel(target)
else:
print(f"Path not found: {target}")
sys.exit(1)
print("\n✓ Data sync complete.")
if __name__ == "__main__":
main()
|