marintosti12 committed on
Commit
5b51a2a
·
1 Parent(s): d61744e

feat(seeder/dataset) : add model / migration dataset

Browse files
alembic/versions/b48f06bd8fd6_create_employee_dataset.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """create employee_dataset
2
+
3
+ Revision ID: b48f06bd8fd6
4
+ Revises: 24251a13df00
5
+ Create Date: 2025-09-26 17:49:21.505347
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+
13
+
14
# Identifiers Alembic reads to place this script in the migration graph.
revision: str = "b48f06bd8fd6"
down_revision: Union[str, Sequence[str], None] = "24251a13df00"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
19
+
20
+
21
def upgrade() -> None:
    """Create the ``employee_dataset`` table.

    The table has a surrogate ``id`` primary key, a mandatory indexed
    ``id_employee``, a run of unconstrained data columns, an optional
    ``source_file`` and a ``created_at`` timestamp filled by the server.
    """
    # Unconstrained data columns, listed in the order they appear in the table.
    plain_columns = [
        ("age", sa.Integer),
        ("genre", sa.String(16)),
        ("revenu_mensuel", sa.Integer),
        ("statut_marital", sa.Text),
        ("departement", sa.Text),
        ("poste", sa.Text),
        ("nombre_experiences_precedentes", sa.Integer),
        ("nombre_heures_travailless", sa.Integer),
        ("annee_experience_totale", sa.Integer),
        ("annees_dans_l_entreprise", sa.Integer),
        ("annees_dans_le_poste_actuel", sa.Integer),
        ("a_quitte_l_entreprise", sa.Integer),
        ("nombre_participation_pee", sa.Integer),
        ("nb_formations_suivies", sa.Integer),
        ("nombre_employee_sous_responsabilite", sa.Integer),
        ("code_sondage", sa.Text),
        ("distance_domicile_travail", sa.Integer),
        ("niveau_education", sa.Text),
        ("domaine_etude", sa.Text),
        ("ayant_enfants", sa.Text),
        ("frequence_deplacement", sa.Text),
        ("annees_depuis_la_derniere_promotion", sa.Integer),
        ("annes_sous_responsable_actuel", sa.Integer),
        ("satisfaction_employee_environnement", sa.Integer),
        ("note_evaluation_precedente", sa.Integer),
        ("niveau_hierarchique_poste", sa.Integer),
        ("satisfaction_employee_nature_travail", sa.Integer),
        ("satisfaction_employee_equipe", sa.Integer),
        ("satisfaction_employee_equilibre_pro_perso", sa.Integer),
        ("eval_number", sa.Text),
        ("note_evaluation_actuelle", sa.Integer),
        ("heure_supplementaires", sa.Text),
        ("augementation_salaire_precedente", sa.Text),
    ]

    op.create_table(
        "employee_dataset",
        sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True),
        sa.Column("id_employee", sa.Integer, nullable=False, index=True),
        *(sa.Column(name, type_) for name, type_ in plain_columns),
        sa.Column("source_file", sa.Text, nullable=True),
        sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            # NOTE(review): timezone('utc', now()) looks PostgreSQL-specific — confirm target DB.
            server_default=sa.text("timezone('utc', now())"),
            nullable=False,
        ),
    )
62
+
63
+
64
def downgrade() -> None:
    """Revert ``upgrade``: drop the ``id_employee`` index, then the table.

    ``upgrade`` declares ``index=True`` on ``id_employee`` only, so
    ``ix_employee_dataset_id_employee`` is the only index this revision owns.
    The previous version also dropped indexes on ``code_sondage`` and
    ``eval_number``; this revision never creates them, so those calls failed
    on a missing index and blocked the downgrade.
    """
    op.drop_index("ix_employee_dataset_id_employee", table_name="employee_dataset")
    op.drop_table("employee_dataset")
src/models/employee_dataset.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlalchemy as sa
2
+ from datetime import datetime, timezone
3
+ from sqlalchemy.orm import Mapped, mapped_column
4
+ from sqlalchemy import BigInteger, Integer, String, DateTime
5
+ from .base import Base
6
+
7
+
8
class EmployeeDataset(Base):
    """ORM mapping for the ``employee_dataset`` table.

    Only ``id``, ``id_employee`` and ``created_at`` are mandatory. Every other
    column is annotated as optional so the mapped nullability matches the
    migration that creates the table (it declares those columns without
    ``nullable=False``).
    """

    __tablename__ = "employee_dataset"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    # Filled by the database server, not by Python.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=sa.text("timezone('utc', now())"),
        nullable=False,
    )

    id_employee: Mapped[int] = mapped_column(Integer, index=True, nullable=False)

    age: Mapped[int | None] = mapped_column(Integer)

    # NOTE(review): the migration declares the string columns below as Text
    # (String(16) for genre), not the bounded String lengths used here — confirm
    # which side is authoritative.
    genre: Mapped[str | None] = mapped_column(String(20))
    revenu_mensuel: Mapped[int | None] = mapped_column(Integer)
    statut_marital: Mapped[str | None] = mapped_column(String(50))
    # NOTE(review): the migration creates no index on departement — confirm.
    departement: Mapped[str | None] = mapped_column(String(100), index=True)
    poste: Mapped[str | None] = mapped_column(String(100))

    nombre_experiences_precedentes: Mapped[int | None] = mapped_column(Integer)
    nombre_heures_travailless: Mapped[int | None] = mapped_column(Integer)
    annee_experience_totale: Mapped[int | None] = mapped_column(Integer)
    annees_dans_l_entreprise: Mapped[int | None] = mapped_column(Integer)
    annees_dans_le_poste_actuel: Mapped[int | None] = mapped_column(Integer)

    a_quitte_l_entreprise: Mapped[int | None] = mapped_column(Integer)

    nombre_participation_pee: Mapped[int | None] = mapped_column(Integer)
    nb_formations_suivies: Mapped[int | None] = mapped_column(Integer)
    nombre_employee_sous_responsabilite: Mapped[int | None] = mapped_column(Integer)

    # NOTE(review): code_sondage and niveau_education are Text in the
    # migration but Integer here — confirm the intended type.
    code_sondage: Mapped[int | None] = mapped_column(Integer)
    distance_domicile_travail: Mapped[int | None] = mapped_column(Integer)
    niveau_education: Mapped[int | None] = mapped_column(Integer)
    domaine_etude: Mapped[str | None] = mapped_column(String(100))

    ayant_enfants: Mapped[str | None] = mapped_column(String(10))
    frequence_deplacement: Mapped[str | None] = mapped_column(String(50))

    annees_depuis_la_derniere_promotion: Mapped[int | None] = mapped_column(Integer)
    annes_sous_responsable_actuel: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_environnement: Mapped[int | None] = mapped_column(Integer)
    note_evaluation_precedente: Mapped[int | None] = mapped_column(Integer)
    niveau_hierarchique_poste: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_nature_travail: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_equipe: Mapped[int | None] = mapped_column(Integer)
    satisfaction_employee_equilibre_pro_perso: Mapped[int | None] = mapped_column(Integer)

    # NOTE(review): the migration creates no index on eval_number — confirm.
    eval_number: Mapped[str | None] = mapped_column(String(50), index=True)
    note_evaluation_actuelle: Mapped[int | None] = mapped_column(Integer)
    heure_supplementaires: Mapped[str | None] = mapped_column(String(10))
    # NOTE(review): Text in the migration, Integer here — confirm the intended type.
    augementation_salaire_precedente: Mapped[int | None] = mapped_column(Integer)

    source_file: Mapped[str | None] = mapped_column(String(255), nullable=True)
src/seeds/employee_dataset_seed.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, csv, re
2
+ from sqlalchemy import create_engine, text
3
+ from sqlalchemy.orm import Session
4
+
5
# python-dotenv is an optional dependency: load a local .env file only when the
# package is installed. Only ImportError is tolerated, so a genuine failure
# inside load_dotenv() is no longer silently discarded.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

# Mandatory setting: indexing os.environ raises KeyError when it is missing.
RAW_URL = os.environ["DATABASE_URL"]

# Optional settings with defaults for the CSV to import.
CSV_PATH = os.getenv("CSV_PATH", "artifacts/df_merged.csv")
CSV_DELIM = os.getenv("CSV_DELIM", ";")

engine = create_engine(RAW_URL, future=True)
17
+
18
+ YES = {"oui", "y", "true", "1"}
19
+ NO = {"non", "n", "false", "0"}
20
+
21
+ def map_bool_to_int(v: str | None):
22
+ if v is None: return None
23
+ s = str(v).strip().lower()
24
+ if s in YES: return 1
25
+ if s in NO: return 0
26
+ return None
27
+
28
+ def map_percent_to_int(v: str | None):
29
+ if not v: return None
30
+ m = re.search(r"-?\d+", str(v))
31
+ return int(m.group(0)) if m else None
32
+
33
def seed_employee_dataset(session: Session):
    """Insert every row of the CSV at ``CSV_PATH`` into ``employee_dataset``.

    The CSV header supplies the column list of the INSERT statement. Two
    columns are normalised before insertion: ``a_quitte_l_entreprise`` through
    ``map_bool_to_int`` and ``augementation_salaire_precedente`` through
    ``map_percent_to_int``. Nothing is executed when the file has no data row,
    and the caller is responsible for committing the session.

    Args:
        session: SQLAlchemy session used to execute the bulk INSERT.

    Raises:
        RuntimeError: if the CSV has no header, or if a header name is not a
            plain identifier (letters, digits, underscore).
    """
    # "utf-8-sig" decodes plain UTF-8 identically but strips a leading BOM,
    # which would otherwise become part of the first column name.
    with open(CSV_PATH, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=CSV_DELIM)
        cols = reader.fieldnames or []
        if not cols:
            raise RuntimeError("CSV sans en-tête.")

        # Header names are interpolated into the SQL text (identifiers cannot
        # be bound as parameters), so refuse anything that is not a plain
        # identifier rather than sending arbitrary text to the database.
        invalid = [c for c in cols if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", c)]
        if invalid:
            raise RuntimeError(f"Colonnes CSV invalides : {invalid!r}")

        rows = []
        for r in reader:
            r["a_quitte_l_entreprise"] = map_bool_to_int(r.get("a_quitte_l_entreprise"))
            r["augementation_salaire_precedente"] = map_percent_to_int(
                r.get("augementation_salaire_precedente")
            )
            rows.append(r)

    if not rows:
        return

    column_list = ", ".join(cols)
    placeholders = ", ".join(":" + c for c in cols)
    sql = text(f"INSERT INTO employee_dataset ({column_list}) VALUES ({placeholders})")
    # A list of dicts makes SQLAlchemy run the statement once per row (executemany).
    session.execute(sql, rows)
56
+
57
def main():
    """Open a session on the module-level engine, seed the table and commit."""
    session = Session(engine)
    with session:
        seed_employee_dataset(session)
        session.commit()


if __name__ == "__main__":
    main()
src/seeds/ml_models_seed.py CHANGED
@@ -23,7 +23,7 @@ UPSERT = text("""
23
  def seed_ml_models(session: Session):
24
  rows = [
25
  {"id": "5b1c7b3a-0000-4000-8000-000000000001", "name": "baseline", "description": "Baseline model", "is_active": True},
26
- {"id": "5b1c7b3a-0000-4000-8000-000000000002", "name": "xgboost_v1", "description": "XGB v1", "is_active": False},
27
  ]
28
  now = datetime.now(timezone.utc)
29
  for r in rows:
 
23
  def seed_ml_models(session: Session):
24
  rows = [
25
  {"id": "5b1c7b3a-0000-4000-8000-000000000001", "name": "baseline", "description": "Baseline model", "is_active": True},
26
+ {"id": "5b1c7b3a-0000-4000-8000-000000000002", "name": "best_model", "description": "Best model", "is_active": False},
27
  ]
28
  now = datetime.now(timezone.utc)
29
  for r in rows: