File size: 3,962 Bytes
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
16ec2cf
c7d4394
 
 
 
16ec2cf
c7d4394
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
4134ab0
c7d4394
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
 
16ec2cf
c7d4394
 
 
16ec2cf
c7d4394
 
 
 
 
 
 
 
 
 
 
 
 
16ec2cf
c7d4394
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
models/anomaly-detection/src/entity/config_entity.py
Configuration entities for the anomaly detection pipeline
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
import os


@dataclass
class DataIngestionConfig:
    """Configuration for data ingestion component"""
    sqlite_db_path: str = field(default_factory=lambda: os.getenv(
        "SQLITE_DB_PATH",
        str(Path(__file__).parent.parent.parent.parent.parent / "data" / "feeds" / "feed_cache.db")
    ))
    csv_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.parent.parent / "datasets" / "political_feeds"
    ))
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_ingestion"
    ))
    batch_size: int = 1000
    min_text_length: int = 10


@dataclass
class DataValidationConfig:
    """Configuration for data validation component.

    Paths are resolved lazily relative to this file; ``parents[2]`` is the
    anomaly-detection model directory.
    """
    # YAML schema the incoming dataset is checked against.
    schema_file: str = field(default_factory=lambda: str(
        Path(__file__).parents[2] / "data_schema" / "schema.yaml"
    ))
    # Columns every record must carry to pass validation.
    required_columns: List[str] = field(default_factory=lambda: list((
        "post_id", "timestamp", "platform", "category", "text", "content_hash",
    )))
    # Destination for this stage's artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parents[2] / "artifacts" / "data_validation"
    ))


@dataclass
class DataTransformationConfig:
    """Configuration for data transformation/vectorization component.

    Paths are resolved lazily relative to this file; ``parents[2]`` is the
    anomaly-detection model directory.
    """
    # Local cache directory for downloaded Huggingface models.
    models_cache_dir: str = field(default_factory=lambda: str(
        Path(__file__).parents[2] / "models_cache"
    ))

    # Per-language BERT checkpoints (Huggingface model ids).
    english_model: str = "distilbert-base-uncased"
    sinhala_model: str = "keshan/SinhalaBERTo"
    tamil_model: str = "l3cube-pune/tamil-bert"

    # FastText language-identification model (lid.176.bin) inside the cache dir.
    fasttext_model_path: str = field(default_factory=lambda: str(
        Path(__file__).parents[2] / "models_cache" / "lid.176.bin"
    ))

    # Embedding dimensionality — the standard BERT hidden size.
    vector_dim: int = 768

    # Destination for this stage's artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parents[2] / "artifacts" / "data_transformation"
    ))


@dataclass
class ModelTrainerConfig:
    """Configuration for model training component.

    MLflow settings come from the environment when present; credentials
    default to empty strings so a missing env var never raises.
    """
    # MLflow tracking server (DagsHub by default).
    mlflow_tracking_uri: str = field(default_factory=lambda: os.environ.get(
        "MLFLOW_TRACKING_URI", "https://dagshub.com/sliitguy/Model-X.mlflow"
    ))
    mlflow_username: str = field(
        default_factory=lambda: os.environ.get("MLFLOW_TRACKING_USERNAME", "")
    )
    mlflow_password: str = field(
        default_factory=lambda: os.environ.get("MLFLOW_TRACKING_PASSWORD", "")
    )
    experiment_name: str = "anomaly_detection_feeds"

    # Candidate algorithms trained and compared per run.
    models_to_train: List[str] = field(default_factory=lambda: list((
        "dbscan", "kmeans", "hdbscan", "isolation_forest", "lof",
    )))

    # Optuna hyperparameter search budget.
    n_optuna_trials: int = 50
    optuna_timeout_seconds: int = 3600  # abort tuning after one hour

    # Destination for this stage's artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parents[2] / "artifacts" / "model_trainer"
    ))


@dataclass
class PipelineConfig:
    """Master configuration for the entire pipeline.

    Aggregates one fresh config object per stage; each is built lazily so
    environment-variable lookups happen at instantiation time, not import time.
    """
    data_ingestion: DataIngestionConfig = field(
        default_factory=lambda: DataIngestionConfig()
    )
    data_validation: DataValidationConfig = field(
        default_factory=lambda: DataValidationConfig()
    )
    data_transformation: DataTransformationConfig = field(
        default_factory=lambda: DataTransformationConfig()
    )
    model_trainer: ModelTrainerConfig = field(
        default_factory=lambda: ModelTrainerConfig()
    )

    # Scheduling: retrain once this many new records accumulate...
    batch_threshold: int = 1000
    # ...or after this many hours regardless (daily fallback).
    run_interval_hours: int = 24