File size: 4,207 Bytes
73f3860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import asyncio
from datetime import time, timedelta, datetime
import pandas as pd
import numpy as np
from sqlalchemy import select
from src.configs import DatabaseConfig
from src.entities.incident import IncidentModel
from src.entities.apparatus import ApparatusModel
from src.entities.personnel import PersonnelModel
from src.entities.auv_incidentbase import IncidentBaseModel
from src.entities.auv_incidentapparatus import IncidentApparatusModel
from src.entities.auv_incidentpersonnel import IncidentPersonnelModel


EXCEL_FILE = "ProjectTablesData.xlsx"

EXCEL_SHEET_MODEL_MAP = {
    0: IncidentModel,
    1: ApparatusModel,
    2: PersonnelModel,
    3: IncidentBaseModel,
    4: IncidentApparatusModel,
    5: IncidentPersonnelModel
}


def clean_value(value):

    # Handle NaN
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        pass

    # Handle pandas Timestamp → return datetime object (NOT string)
    if isinstance(value, pd.Timestamp):
        if value == pd.Timestamp("1900-01-01"):  # Excel empty date default
            return None
        return value.to_pydatetime()  # ✅ Return actual datetime, not string

    # Convert Python time -> string
    if isinstance(value, time):
        return value.strftime("%H:%M:%S")

    # Convert timedelta -> string
    if isinstance(value, timedelta):
        total_seconds = int(value.total_seconds())
        h = total_seconds // 3600
        m = (total_seconds % 3600) // 60
        s = total_seconds % 60
        return f"{h:02}:{m:02}:{s:02}"

    # Convert numpy types
    if isinstance(value, np.generic):
        return value.item()

    # Strip strings
    if isinstance(value, str):
        value = value.strip()
        return value if value else None

    return value


def get_model_columns(model):
    """Extract SQLAlchemy column names"""
    return {column.name for column in model.__table__.columns}


def get_primary_keys(model):
    """Get primary key column names"""
    return [column.name for column in model.__table__.primary_key.columns]


def row_to_model(model, row):
    """Convert pandas row → SQLAlchemy model"""
    model_columns = get_model_columns(model)

    data = {}

    for column in model_columns:
        if column in row:
            data[column] = clean_value(row[column])

    return model(**data)


async def get_existing_primary_keys(session, model, pk_column):
    """
    Fetch existing primary keys from DB
    """
    result = await session.execute(
        select(getattr(model, pk_column))
    )

    return {row[0] for row in result.fetchall()}


async def ingest_excel():

    print("Loading Excel file...")

    excel_data = pd.read_excel(EXCEL_FILE, sheet_name=None)

    async with DatabaseConfig.async_session() as session:

        total_inserted = 0

        for sheet_index, model in EXCEL_SHEET_MODEL_MAP.items():

            sheet_name = list(excel_data.keys())[sheet_index]

            print(f"\nProcessing Sheet: {sheet_name}")
            print(f"Mapped Model: {model.__name__}")

            df = excel_data[sheet_name]

            pk_columns = get_primary_keys(model)

            existing_keys = set()
            seen_keys = set()

            if pk_columns:
                pk_column = pk_columns[0]
                existing_keys = await get_existing_primary_keys(session, model, pk_column)

            objects = []
            skipped = 0

            for _, row in df.iterrows():

                obj = row_to_model(model, row)

                if pk_columns:
                    pk_value = getattr(obj, pk_columns[0], None)

                    if pk_value in existing_keys or pk_value in seen_keys:
                        skipped += 1
                        continue

                    seen_keys.add(pk_value)

                objects.append(obj)

            session.add_all(objects)

            inserted = len(objects)

            print(f"Inserted: {inserted}")
            print(f"Skipped duplicates: {skipped}")

            total_inserted += inserted

        await session.commit()

        print("\nTotal rows inserted:", total_inserted)


if __name__ == "__main__":
    asyncio.run(ingest_excel())