"""
models/anomaly-detection/src/entity/config_entity.py

Configuration entities for the anomaly detection pipeline.
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import List
import os
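# Note: the .parent chains below assume the documented location of this file,
# repo_root/models/anomaly-detection/src/entity/config_entity.py, so that
# Path(__file__).parent.parent.parent resolves to models/anomaly-detection/
# and five .parent hops resolve to the repository root.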
|
|
@dataclass
class DataIngestionConfig:
    """Configuration for the data ingestion component."""

    # SQLite feed cache; overridable via the SQLITE_DB_PATH environment variable.
    sqlite_db_path: str = field(default_factory=lambda: os.getenv(
        "SQLITE_DB_PATH",
        str(Path(__file__).parent.parent.parent.parent.parent / "data" / "feeds" / "feed_cache.db")
    ))
    # Raw CSV feed exports, located at the repository root.
    csv_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.parent.parent / "datasets" / "political_feeds"
    ))
    # Ingestion artifacts land under models/anomaly-detection/artifacts/.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_ingestion"
    ))
    batch_size: int = 1000      # records processed per ingestion batch
    min_text_length: int = 10   # minimum text length for a post to be kept
|
|
@dataclass
class DataValidationConfig:
    """Configuration for the data validation component."""

    # YAML schema that ingested data is validated against.
    schema_file: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "data_schema" / "schema.yaml"
    ))
    # Columns every ingested record must contain.
    required_columns: List[str] = field(default_factory=lambda: [
        "post_id", "timestamp", "platform", "category", "text", "content_hash"
    ])
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_validation"
    ))
|
|
@dataclass
class DataTransformationConfig:
    """Configuration for the data transformation/vectorization component."""

    # Local cache directory for downloaded model files.
    models_cache_dir: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "models_cache"
    ))

    # Per-language transformer checkpoints used for text embedding.
    english_model: str = "distilbert-base-uncased"
    sinhala_model: str = "keshan/SinhalaBERTo"
    tamil_model: str = "l3cube-pune/tamil-bert"

    # fastText language-identification model (lid.176.bin, 176 languages),
    # used to pick the per-language encoder above.
    fasttext_model_path: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "models_cache" / "lid.176.bin"
    ))

    # Embedding dimensionality expected from the encoders above.
    vector_dim: int = 768

    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "data_transformation"
    ))
|
|
@dataclass
class ModelTrainerConfig:
    """Configuration for the model training component."""

    # MLflow tracking server (DagsHub); credentials come from the environment.
    mlflow_tracking_uri: str = field(default_factory=lambda: os.getenv(
        "MLFLOW_TRACKING_URI", "https://dagshub.com/sliitguy/Model-X.mlflow"
    ))
    mlflow_username: str = field(default_factory=lambda: os.getenv(
        "MLFLOW_TRACKING_USERNAME", ""
    ))
    mlflow_password: str = field(default_factory=lambda: os.getenv(
        "MLFLOW_TRACKING_PASSWORD", ""
    ))
    experiment_name: str = "anomaly_detection_feeds"

    # Candidate clustering / anomaly-detection algorithms to train and compare.
    models_to_train: List[str] = field(default_factory=lambda: [
        "dbscan", "kmeans", "hdbscan", "isolation_forest", "lof"
    ])

    # Optuna hyperparameter search budget.
    n_optuna_trials: int = 50
    optuna_timeout_seconds: int = 3600

    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent / "artifacts" / "model_trainer"
    ))
|
|
@dataclass
class PipelineConfig:
    """Master configuration for the entire pipeline."""

    data_ingestion: DataIngestionConfig = field(default_factory=DataIngestionConfig)
    data_validation: DataValidationConfig = field(default_factory=DataValidationConfig)
    data_transformation: DataTransformationConfig = field(default_factory=DataTransformationConfig)
    model_trainer: ModelTrainerConfig = field(default_factory=ModelTrainerConfig)

    # Scheduling knobs: record-count threshold for a batch run, and run cadence.
    batch_threshold: int = 1000
    run_interval_hours: int = 24
|
|
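# Minimal usage sketch (illustrative only; not invoked by the pipeline).
# Because the os.getenv lookups live in default_factory lambdas, environment
# variables such as SQLITE_DB_PATH and MLFLOW_TRACKING_URI are read when the
# config object is constructed, not at import time.
if __name__ == "__main__":
    config = PipelineConfig()
    print("SQLite DB:      ", config.data_ingestion.sqlite_db_path)
    print("MLflow URI:     ", config.model_trainer.mlflow_tracking_uri)
    print("Models to train:", config.model_trainer.models_to_train)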