# Provenance: uploaded to nivakaran's Hugging Face repo via huggingface_hub
# (commit 4134ab0, verified)
"""
models/anomaly-detection/src/entity/config_entity.py
Configuration entities for the anomaly detection pipeline
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
import os
@dataclass
class DataIngestionConfig:
    """Settings for the data ingestion stage.

    Pulls raw feed records from a SQLite cache and from CSV exports, and
    writes this stage's artifacts under the pipeline artifact tree.
    """
    # SQLite feed cache; the SQLITE_DB_PATH env var overrides the
    # repo-relative default (five directory levels above this module).
    sqlite_db_path: str = field(default_factory=lambda: os.environ.get(
        "SQLITE_DB_PATH",
        str(Path(__file__).parent.parent.parent.parent.parent.joinpath(
            "data", "feeds", "feed_cache.db"
        )),
    ))
    # Directory holding CSV exports of political feed data.
    csv_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.parent.parent.joinpath(
            "datasets", "political_feeds"
        )
    ))
    # Destination for this stage's output artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("artifacts", "data_ingestion")
    ))
    # Number of records processed per ingestion batch.
    batch_size: int = 1000
    # Posts with fewer characters than this are discarded.
    min_text_length: int = 10
@dataclass
class DataValidationConfig:
    """Settings for the data validation stage."""
    # YAML schema describing the expected dataset layout.
    schema_file: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("data_schema", "schema.yaml")
    ))
    # Columns every ingested record must carry to pass validation.
    required_columns: List[str] = field(default_factory=lambda: [
        "post_id", "timestamp", "platform", "category", "text", "content_hash",
    ])
    # Destination for validation reports and artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("artifacts", "data_validation")
    ))
@dataclass
class DataTransformationConfig:
    """Settings for the data transformation / vectorization stage."""
    # Local cache directory for downloaded Hugging Face models.
    models_cache_dir: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("models_cache")
    ))
    # Per-language BERT checkpoints used for embedding.
    english_model: str = "distilbert-base-uncased"
    sinhala_model: str = "keshan/SinhalaBERTo"
    tamil_model: str = "l3cube-pune/tamil-bert"
    # FastText language-identification model (lid.176.bin) inside the cache.
    fasttext_model_path: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("models_cache", "lid.176.bin")
    ))
    # Embedding dimensionality — the standard BERT hidden size.
    vector_dim: int = 768
    # Destination for this stage's output artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("artifacts", "data_transformation")
    ))
@dataclass
class ModelTrainerConfig:
    """Settings for the model training stage (MLflow tracking + Optuna tuning)."""
    # MLflow tracking endpoint and credentials; environment variables
    # override the DagsHub defaults (credentials default to empty strings).
    mlflow_tracking_uri: str = field(default_factory=lambda: os.environ.get(
        "MLFLOW_TRACKING_URI", "https://dagshub.com/sliitguy/Model-X.mlflow"
    ))
    mlflow_username: str = field(default_factory=lambda: os.environ.get(
        "MLFLOW_TRACKING_USERNAME", ""
    ))
    mlflow_password: str = field(default_factory=lambda: os.environ.get(
        "MLFLOW_TRACKING_PASSWORD", ""
    ))
    # MLflow experiment under which runs are logged.
    experiment_name: str = "anomaly_detection_feeds"
    # Clustering / outlier-detection models trained on each run.
    models_to_train: List[str] = field(default_factory=lambda: [
        "dbscan", "kmeans", "hdbscan", "isolation_forest", "lof",
    ])
    # Optuna search budget: number of trials and wall-clock cap.
    n_optuna_trials: int = 50
    optuna_timeout_seconds: int = 3600  # one hour
    # Destination for trained-model artifacts.
    output_directory: str = field(default_factory=lambda: str(
        Path(__file__).parent.parent.parent.joinpath("artifacts", "model_trainer")
    ))
@dataclass
class PipelineConfig:
    """Top-level container wiring together every pipeline stage.

    Each stage gets its own freshly constructed config object per instance,
    alongside the pipeline-wide scheduling knobs.
    """
    data_ingestion: DataIngestionConfig = field(default_factory=DataIngestionConfig)
    data_validation: DataValidationConfig = field(default_factory=DataValidationConfig)
    data_transformation: DataTransformationConfig = field(default_factory=DataTransformationConfig)
    model_trainer: ModelTrainerConfig = field(default_factory=ModelTrainerConfig)
    # Retrain once this many new records have accumulated ...
    batch_threshold: int = 1000
    # ... or at least once every this many hours as a fallback.
    run_interval_hours: int = 24