Spaces:
Sleeping
Sleeping
marintosti12 committed on
Commit ·
5b51a2a
1
Parent(s): d61744e
feat(seeder/dataset) : add model / migration dataset
Browse files
alembic/versions/b48f06bd8fd6_create_employee_dataset.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""create employee_dataset
|
| 2 |
+
|
| 3 |
+
Revision ID: b48f06bd8fd6
|
| 4 |
+
Revises: 24251a13df00
|
| 5 |
+
Create Date: 2025-09-26 17:49:21.505347
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
from typing import Sequence, Union
|
| 9 |
+
|
| 10 |
+
from alembic import op
|
| 11 |
+
import sqlalchemy as sa
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# revision identifiers, used by Alembic.
|
| 15 |
+
revision: str = 'b48f06bd8fd6'
|
| 16 |
+
down_revision: Union[str, Sequence[str], None] = '24251a13df00'
|
| 17 |
+
branch_labels: Union[str, Sequence[str], None] = None
|
| 18 |
+
depends_on: Union[str, Sequence[str], None] = None
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def upgrade() -> None:
    """Create the ``employee_dataset`` table.

    The table has an auto-incrementing ``id`` primary key, a required and
    indexed ``id_employee``, a series of optional feature columns, an
    optional ``source_file`` and a required ``created_at`` whose value
    comes from a server-side default.
    """
    # Optional feature columns, listed in the order they are created.
    feature_columns = (
        ("age", sa.Integer),
        ("genre", sa.String(16)),
        ("revenu_mensuel", sa.Integer),
        ("statut_marital", sa.Text),
        ("departement", sa.Text),
        ("poste", sa.Text),
        ("nombre_experiences_precedentes", sa.Integer),
        ("nombre_heures_travailless", sa.Integer),
        ("annee_experience_totale", sa.Integer),
        ("annees_dans_l_entreprise", sa.Integer),
        ("annees_dans_le_poste_actuel", sa.Integer),
        ("a_quitte_l_entreprise", sa.Integer),
        ("nombre_participation_pee", sa.Integer),
        ("nb_formations_suivies", sa.Integer),
        ("nombre_employee_sous_responsabilite", sa.Integer),
        ("code_sondage", sa.Text),
        ("distance_domicile_travail", sa.Integer),
        ("niveau_education", sa.Text),
        ("domaine_etude", sa.Text),
        ("ayant_enfants", sa.Text),
        ("frequence_deplacement", sa.Text),
        ("annees_depuis_la_derniere_promotion", sa.Integer),
        ("annes_sous_responsable_actuel", sa.Integer),
        ("satisfaction_employee_environnement", sa.Integer),
        ("note_evaluation_precedente", sa.Integer),
        ("niveau_hierarchique_poste", sa.Integer),
        ("satisfaction_employee_nature_travail", sa.Integer),
        ("satisfaction_employee_equipe", sa.Integer),
        ("satisfaction_employee_equilibre_pro_perso", sa.Integer),
        ("eval_number", sa.Text),
        ("note_evaluation_actuelle", sa.Integer),
        ("heure_supplementaires", sa.Text),
        ("augementation_salaire_precedente", sa.Text),
    )

    op.create_table(
        "employee_dataset",
        sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True),
        sa.Column("id_employee", sa.Integer, nullable=False, index=True),
        *[sa.Column(name, type_) for name, type_ in feature_columns],
        sa.Column("source_file", sa.Text, nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.text("timezone('utc', now())"),
            nullable=False,
        ),
    )
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def downgrade() -> None:
    """Drop the ``employee_dataset`` table and the index created with it.

    ``upgrade()`` creates a single index, the one implied by ``index=True``
    on ``id_employee``. This revision never creates indexes on
    ``code_sondage`` or ``eval_number``, so they must not be dropped here:
    dropping an index that does not exist makes the downgrade fail.
    """
    op.drop_index("ix_employee_dataset_id_employee", table_name="employee_dataset")
    op.drop_table("employee_dataset")
|
src/models/employee_dataset.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlalchemy as sa
|
| 2 |
+
from datetime import datetime, timezone
|
| 3 |
+
from sqlalchemy.orm import Mapped, mapped_column
|
| 4 |
+
from sqlalchemy import BigInteger, Integer, String, DateTime
|
| 5 |
+
from .base import Base
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class EmployeeDataset(Base):
    """ORM mapping of the ``employee_dataset`` table.

    ``id``, ``created_at`` and ``id_employee`` are required. Every other
    column is annotated as optional so that the mapping agrees with
    migration ``b48f06bd8fd6``, which creates those columns without a
    NOT NULL constraint.
    """

    # NOTE(review): migration b48f06bd8fd6 declares code_sondage,
    # niveau_education and augementation_salaire_precedente as Text, uses
    # String(16) for genre and Text for the other string columns, and creates
    # no index on departement or eval_number. Confirm which side is
    # authoritative before relying on autogenerate.

    __tablename__ = "employee_dataset"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    # Filled in by the database through the server-side default.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=sa.text("timezone('utc', now())"),
        nullable=False,
    )

    id_employee: Mapped[int] = mapped_column(Integer, index=True, nullable=False)

    age: Mapped[int | None] = mapped_column(Integer)

    genre: Mapped[str | None] = mapped_column(String(20))
    revenu_mensuel: Mapped[int | None] = mapped_column(Integer)
    statut_marital: Mapped[str | None] = mapped_column(String(50))
    departement: Mapped[str | None] = mapped_column(String(100), index=True)
    poste: Mapped[str | None] = mapped_column(String(100))

    nombre_experiences_precedentes: Mapped[int | None] = mapped_column(Integer)
    nombre_heures_travailless: Mapped[int | None] = mapped_column(Integer)
    annee_experience_totale: Mapped[int | None] = mapped_column(Integer)
    annees_dans_l_entreprise: Mapped[int | None] = mapped_column(Integer)
    annees_dans_le_poste_actuel: Mapped[int | None] = mapped_column(Integer)

    a_quitte_l_entreprise: Mapped[int | None] = mapped_column(Integer)

    nombre_participation_pee: Mapped[int | None] = mapped_column(Integer)
    nb_formations_suivies: Mapped[int | None] = mapped_column(Integer)
    nombre_employee_sous_responsabilite: Mapped[int | None] = mapped_column(Integer)

    code_sondage: Mapped[int | None] = mapped_column(Integer)
    distance_domicile_travail: Mapped[int | None] = mapped_column(Integer)
    niveau_education: Mapped[int | None] = mapped_column(Integer)
    domaine_etude: Mapped[str | None] = mapped_column(String(100))

    ayant_enfants: Mapped[str | None] = mapped_column(String(10))
    frequence_deplacement: Mapped[str | None] = mapped_column(String(50))

    annees_depuis_la_derniere_promotion: Mapped[int | None] = mapped_column(Integer)
    annes_sous_responsable_actuel: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_environnement: Mapped[int | None] = mapped_column(Integer)
    note_evaluation_precedente: Mapped[int | None] = mapped_column(Integer)
    niveau_hierarchique_poste: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_nature_travail: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_equipe: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_equilibre_pro_perso: Mapped[int | None] = mapped_column(Integer)

    eval_number: Mapped[str | None] = mapped_column(String(50), index=True)
    note_evaluation_actuelle: Mapped[int | None] = mapped_column(Integer)
    heure_supplementaires: Mapped[str | None] = mapped_column(String(10))
    augementation_salaire_precedente: Mapped[int | None] = mapped_column(Integer)

    source_file: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
src/seeds/employee_dataset_seed.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, csv, re
|
| 2 |
+
from sqlalchemy import create_engine, text
|
| 3 |
+
from sqlalchemy.orm import Session
|
| 4 |
+
|
| 5 |
+
# python-dotenv is optional: load a local .env file when the package is
# installed, otherwise use the process environment as it is. Only a missing
# package is tolerated, so a genuine failure inside load_dotenv() is not hidden.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

# Required: a missing DATABASE_URL raises KeyError when the module is imported.
RAW_URL = os.environ["DATABASE_URL"]

# Location and field delimiter of the CSV to load, overridable from the environment.
CSV_PATH = os.getenv("CSV_PATH", "artifacts/df_merged.csv")
CSV_DELIM = os.getenv("CSV_DELIM", ";")

engine = create_engine(RAW_URL, future=True)

# Lower-case spellings recognised by map_bool_to_int.
YES = {"oui", "y", "true", "1"}
NO = {"non", "n", "false", "0"}
|
| 20 |
+
|
| 21 |
+
def map_bool_to_int(v: str | None):
    """Translate a yes/no style value into ``1``, ``0`` or ``None``.

    The value is converted to a string, stripped and lower-cased, then
    looked up in ``YES`` (giving ``1``) and ``NO`` (giving ``0``).
    ``None`` and unrecognised values give ``None``.
    """
    if v is None:
        return None
    normalized = str(v).strip().lower()
    if normalized in YES:
        return 1
    return 0 if normalized in NO else None
|
| 27 |
+
|
| 28 |
+
def map_percent_to_int(v: str | None):
|
| 29 |
+
if not v: return None
|
| 30 |
+
m = re.search(r"-?\d+", str(v))
|
| 31 |
+
return int(m.group(0)) if m else None
|
| 32 |
+
|
| 33 |
+
def seed_employee_dataset(session: Session):
    """Bulk-insert the rows of the CSV at ``CSV_PATH`` into ``employee_dataset``.

    The CSV header supplies the column names of the INSERT statement. Two
    columns are normalised before insertion: ``a_quitte_l_entreprise``
    through ``map_bool_to_int`` and ``augementation_salaire_precedente``
    through ``map_percent_to_int``. Nothing is executed when the file has no
    data rows. The session is not committed here.

    Args:
        session: Session used to execute the INSERT.

    Raises:
        RuntimeError: If the CSV has no header, or if a header name is not a
            bare identifier.
    """
    with open(CSV_PATH, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter=CSV_DELIM)
        cols = reader.fieldnames or []
        if not cols:
            raise RuntimeError("CSV sans en-tête.")

        # Column names are interpolated into the SQL text (they cannot be
        # bound as parameters), so refuse anything that is not a bare
        # identifier before the statement is built.
        invalid = [c for c in cols if not c.isidentifier()]
        if invalid:
            raise RuntimeError(f"Colonnes CSV invalides : {invalid}")

        rows = []
        for r in reader:
            r["a_quitte_l_entreprise"] = map_bool_to_int(r.get("a_quitte_l_entreprise"))
            r["augementation_salaire_precedente"] = map_percent_to_int(
                r.get("augementation_salaire_precedente")
            )
            rows.append(r)

    if not rows:
        return

    column_list = ", ".join(cols)
    placeholders = ", ".join(f":{c}" for c in cols)
    sql = text(f"INSERT INTO employee_dataset ({column_list}) VALUES ({placeholders})")
    # Passing a list of dicts runs the statement once per row (executemany).
    session.execute(sql, rows)
|
| 56 |
+
|
| 57 |
+
def main():
    """Seed ``employee_dataset`` inside one session, then commit."""
    with Session(engine) as session:
        seed_employee_dataset(session)
        session.commit()


if __name__ == "__main__":
    main()
|
src/seeds/ml_models_seed.py
CHANGED
|
@@ -23,7 +23,7 @@ UPSERT = text("""
|
|
| 23 |
def seed_ml_models(session: Session):
|
| 24 |
rows = [
|
| 25 |
{"id": "5b1c7b3a-0000-4000-8000-000000000001", "name": "baseline", "description": "Baseline model", "is_active": True},
|
| 26 |
-
{"id": "5b1c7b3a-0000-4000-8000-000000000002", "name": "
|
| 27 |
]
|
| 28 |
now = datetime.now(timezone.utc)
|
| 29 |
for r in rows:
|
|
|
|
| 23 |
def seed_ml_models(session: Session):
|
| 24 |
rows = [
|
| 25 |
{"id": "5b1c7b3a-0000-4000-8000-000000000001", "name": "baseline", "description": "Baseline model", "is_active": True},
|
| 26 |
+
{"id": "5b1c7b3a-0000-4000-8000-000000000002", "name": "best_model", "description": "Best model", "is_active": False},
|
| 27 |
]
|
| 28 |
now = datetime.now(timezone.utc)
|
| 29 |
for r in rows:
|