Bromeo777 committed on
Commit
644b58f
·
verified ·
1 Parent(s): 36272bd

Add app\models\data.py

Browse files
Files changed (1) hide show
  1. app/models/data.py +76 -0
app/models/data.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/models/data.py
2
+
3
import enum
from datetime import datetime, timezone
from typing import Any, Dict, Optional

from sqlalchemy import DateTime, Enum, Float, ForeignKey, Integer, JSON, String
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.models.base import Base
9
+
10
+ # -----------------------------
11
+ # Job Status Enum
12
+ # -----------------------------
13
class DataJobStatus(str, enum.Enum):
    """Status values a data cleaning job can take.

    The ``str`` mixin makes every member an instance of ``str`` that compares
    equal to its lowercase value, e.g. ``DataJobStatus.PENDING == "pending"``.
    """

    PENDING = "pending"
    PROFILING = "profiling"
    CLEANING = "cleaning"
    COMPLETED = "completed"
    FAILED = "failed"
19
+
20
+ # -----------------------------
21
+ # Dataset Model
22
+ # -----------------------------
23
class Dataset(Base):
    """ORM model mapped to the ``datasets`` table.

    A row stores a file name and storage path, a reference to a row in
    ``users``, and optional descriptive metadata. The ``cleaning_jobs``
    relationship cascades ``all, delete-orphan``, so jobs removed from the
    collection (or belonging to a deleted dataset) are deleted as well.
    """

    __tablename__ = "datasets"

    # String primary key (max 64 chars); no default is configured here, so
    # the caller must supply the id.
    id: Mapped[str] = mapped_column(String(64), primary_key=True)
    user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"))
    filename: Mapped[str] = mapped_column(String(255))
    storage_path: Mapped[str] = mapped_column(String(500))

    # Optional (nullable) descriptive fields.
    institution_id: Mapped[Optional[str]] = mapped_column(String(100))
    row_count: Mapped[Optional[int]] = mapped_column(Integer)
    # Free-form JSON; its schema is not defined in this module.
    column_metadata: Mapped[Optional[Dict[str, Any]]] = mapped_column(JSON)

    is_public_domain: Mapped[bool] = mapped_column(default=False)

    # The column is timezone-aware, so the default must be an aware datetime.
    # ``datetime.utcnow`` returns a naive value (and is deprecated since
    # Python 3.12); an aware UTC timestamp is generated per insert instead.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
    )

    cleaning_jobs: Mapped[list["DataCleaningJob"]] = relationship(
        "DataCleaningJob",
        back_populates="dataset",
        cascade="all, delete-orphan",
    )
39
+
40
+ # -----------------------------
41
+ # Data Cleaning Job Model
42
+ # -----------------------------
43
class DataCleaningJob(Base):
    """ORM model mapped to the ``data_cleaning_jobs`` table.

    Every job references one ``Dataset`` through ``dataset_id`` and owns a
    collection of ``CleaningDecision`` rows; that collection cascades
    ``all, delete-orphan``.
    """

    __tablename__ = "data_cleaning_jobs"

    # --- identity / ownership -------------------------------------------
    id: Mapped[str] = mapped_column(String(64), primary_key=True)
    dataset_id: Mapped[str] = mapped_column(
        String(64),
        ForeignKey("datasets.id"),
    )

    # --- state -----------------------------------------------------------
    # New jobs start as PENDING.
    # NOTE(review): SQLAlchemy's Enum type persists member *names* unless
    # ``values_callable`` is given — confirm that is the intended storage.
    status: Mapped[DataJobStatus] = mapped_column(
        Enum(DataJobStatus),
        default=DataJobStatus.PENDING,
    )

    # --- optional results (all nullable) -----------------------------------
    study_design: Mapped[Optional[str]] = mapped_column(String(50))
    privacy_score: Mapped[Optional[float]] = mapped_column(Float)
    # Free-form JSON; its schema is not defined in this module.
    bias_metrics: Mapped[Optional[Dict[str, Any]]] = mapped_column(JSON)
    cleaned_file_path: Mapped[Optional[str]] = mapped_column(String(500))
    reproducibility_script_path: Mapped[Optional[str]] = mapped_column(
        String(500)
    )

    # --- timing (timezone-aware, no defaults configured here) -------------
    started_at: Mapped[Optional[datetime]] = mapped_column(
        DateTime(timezone=True)
    )
    completed_at: Mapped[Optional[datetime]] = mapped_column(
        DateTime(timezone=True)
    )

    # --- relationships -----------------------------------------------------
    dataset: Mapped["Dataset"] = relationship(
        "Dataset",
        back_populates="cleaning_jobs",
    )
    decisions: Mapped[list["CleaningDecision"]] = relationship(
        "CleaningDecision",
        back_populates="job",
        cascade="all, delete-orphan",
    )
61
+
62
+ # -----------------------------
63
+ # Cleaning Decision Model
64
+ # -----------------------------
65
class CleaningDecision(Base):
    """ORM model mapped to the ``cleaning_decisions`` table.

    Each row belongs to one ``DataCleaningJob`` (via ``job_id``) and stores a
    target column name, an action type, and a textual reasoning, plus a flag
    marking whether the decision has been reversed.
    """

    __tablename__ = "cleaning_decisions"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    job_id: Mapped[str] = mapped_column(
        String(64),
        ForeignKey("data_cleaning_jobs.id"),
    )

    # Required text fields.
    target_column: Mapped[str] = mapped_column(String(255), nullable=False)
    action_type: Mapped[str] = mapped_column(String(255), nullable=False)
    reasoning: Mapped[str] = mapped_column(String(1000), nullable=False)

    is_reversed: Mapped[bool] = mapped_column(default=False)

    # The column is timezone-aware, so the default must be an aware datetime.
    # ``datetime.utcnow`` returns a naive value (and is deprecated since
    # Python 3.12); an aware UTC timestamp is generated per insert instead.
    timestamp: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
    )

    job: Mapped["DataCleaningJob"] = relationship(
        "DataCleaningJob",
        back_populates="decisions",
    )