File size: 1,963 Bytes
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
16ec2cf
c7d4394
 
 
16ec2cf
c7d4394
 
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
models/anomaly-detection/src/entity/artifact_entity.py
Artifact entities for pipeline outputs
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path


@dataclass
class DataIngestionArtifact:
    """Summary record emitted by the data ingestion step.

    A plain data holder: it declares fields only and performs no validation
    or computation of its own. Field meanings below are read from the field
    names and annotations; confirm details against the component that
    populates this artifact.

    Attributes:
        raw_data_path: Location of the raw data, kept as a string.
        total_records: Overall record count.
        records_from_sqlite: Record count attributed to the SQLite source.
        records_from_csv: Record count attributed to the CSV source.
        ingestion_timestamp: Ingestion time as a string (the format is not
            fixed by this class).
        is_data_available: Flag telling whether data is available.
    """

    # Where the raw data lives
    raw_data_path: str

    # Record counts
    total_records: int
    records_from_sqlite: int
    records_from_csv: int

    # Run bookkeeping
    ingestion_timestamp: str
    is_data_available: bool


@dataclass
class DataValidationArtifact:
    """Summary record emitted by the data validation step.

    Holds paths, counts and the outcome of validation. The class itself
    enforces nothing (for example, it does not check that the valid and
    invalid counts add up to the total).

    Attributes:
        validated_data_path: Location of the validated data, as a string.
        validation_report_path: Location of the validation report, as a
            string.
        total_records: Overall record count.
        valid_records: Count of records considered valid.
        invalid_records: Count of records considered invalid.
        validation_status: Boolean outcome of the validation step.
        validation_errors: List of string-keyed dictionaries describing
            errors; their schema is not defined in this module.
    """

    # Output locations
    validated_data_path: str
    validation_report_path: str

    # Record counts
    total_records: int
    valid_records: int
    invalid_records: int

    # Outcome and error details
    validation_status: bool
    validation_errors: List[Dict[str, Any]]


@dataclass
class DataTransformationArtifact:
    """Summary record emitted by the data transformation step.

    A field-only container; nothing is computed or checked here.

    Attributes:
        transformed_data_path: Location of the transformed data, as a string.
        vector_embeddings_path: Location of the vector embeddings, as a
            string.
        feature_store_path: Location of the feature store, as a string.
        total_records: Overall record count.
        language_distribution: Mapping from string key to integer value
            (presumably language -> record count; confirm with the producer).
        transformation_report: Free-form, string-keyed report dictionary;
            its schema is not defined in this module.
    """

    # Output locations
    transformed_data_path: str
    vector_embeddings_path: str
    feature_store_path: str

    # Statistics and reporting
    total_records: int
    language_distribution: Dict[str, int]
    transformation_report: Dict[str, Any]


@dataclass
class ModelTrainerArtifact:
    """Summary record emitted by the model training step.

    Every field is required at construction time: the ``Optional`` fields
    accept ``None`` but declare no default, so callers must pass them
    explicitly.

    Attributes:
        best_model_name: Name of the selected best model.
        best_model_path: Location of the best model, as a string.
        best_model_metrics: Mapping of metric name to float value for the
            best model.
        trained_models: One string-keyed dictionary per trained model; the
            schema is not defined in this module.
        mlflow_run_id: MLflow run identifier, as a string.
        mlflow_experiment_id: MLflow experiment identifier, as a string.
        n_clusters: Cluster count, or ``None``.
        n_anomalies: Anomaly count, or ``None``.
        anomaly_indices: Integer indices of anomalies, or ``None``
            (what they index into is not visible here).
        training_duration_seconds: Training duration in seconds.
        optuna_study_name: Optuna study name, or ``None``.
    """

    # --- Selected (best) model ---
    best_model_name: str
    best_model_path: str
    best_model_metrics: Dict[str, float]

    # --- Every model that was trained ---
    trained_models: List[Dict[str, Any]]

    # --- MLflow identifiers ---
    mlflow_run_id: str
    mlflow_experiment_id: str

    # --- Clustering / anomaly outputs (each may be None) ---
    n_clusters: Optional[int]
    n_anomalies: Optional[int]
    anomaly_indices: Optional[List[int]]

    # --- Training run details ---
    training_duration_seconds: float
    optuna_study_name: Optional[str]


@dataclass
class PipelineArtifact:
    """Aggregate record for one complete pipeline run.

    Bundles the artifact of each pipeline step together with run-level
    metadata. It is a field-only container with no behavior of its own.

    Attributes:
        data_ingestion: Artifact of the data ingestion step.
        data_validation: Artifact of the data validation step.
        data_transformation: Artifact of the data transformation step.
        model_trainer: Artifact of the model training step.
        pipeline_run_id: Identifier of the pipeline run, as a string.
        pipeline_start_time: Start time as a string (the format is not
            fixed by this class).
        pipeline_end_time: End time as a string (the format is not fixed
            by this class).
        pipeline_status: Status of the run as a string; the set of valid
            values is not defined in this module.
    """

    # Per-step artifacts, in the order the fields are declared
    data_ingestion: DataIngestionArtifact
    data_validation: DataValidationArtifact
    data_transformation: DataTransformationArtifact
    model_trainer: ModelTrainerArtifact

    # Run-level metadata
    pipeline_run_id: str
    pipeline_start_time: str
    pipeline_end_time: str
    pipeline_status: str