"""Feature engineering for news, weather and supply-chain data sets.

Each ``engineer_*`` function takes a cleaned :class:`pandas.DataFrame`, adds
derived columns and returns the resulting frame.  Run as a script, the module
reads the ``*_clean.csv`` files from ``artifacts/data/processed`` and writes a
matching ``*_features.csv`` next to each of them.
"""

import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Make the sibling ``utils`` package importable when this file is executed
# directly as a script.
sys.path.append(str(Path(__file__).resolve().parents[1]))

# ``utils.logger`` is imported only for its import-time side effects: no name
# it exports is referenced in this file (presumably it configures logging --
# TODO confirm).  The feature functions work without it, so a missing module
# degrades to a warning instead of making the whole module unimportable.
try:
    from utils.logger import *  # noqa: F401,F403
except ImportError as _logger_import_error:
    _LOGGER_SETUP_ERROR = _logger_import_error
else:
    _LOGGER_SETUP_ERROR = None

import logging

logger = logging.getLogger(__name__)

if _LOGGER_SETUP_ERROR is not None:
    logger.warning(
        "utils.logger could not be imported (%s); using default logging.",
        _LOGGER_SETUP_ERROR,
    )

# Event category -> keywords.  Keywords are joined into a case-insensitive
# regular expression and matched as substrings (so "fire" also hits "fired").
_NEWS_EVENT_KEYWORDS = {
    "strike": ["strike", "walkout", "protest"],
    "disaster": ["earthquake", "flood", "hurricane", "typhoon", "fire", "storm"],
    "accident": ["collision", "accident", "spill", "blockage"],
}

# Text columns scanned for news keywords.
_NEWS_TEXT_COLUMNS = ("title", "description")

# Substrings of ``weather_main`` that mark a row as severe weather.
_SEVERE_WEATHER_WORDS = ["Storm", "Thunderstorm", "Rain", "Snow", "Hurricane", "Extreme"]

# Supply-chain categorical columns that get one-hot encoded when present.
_SUPPLY_CHAIN_CATEGORICALS = [
    "Order Status",
    "Product Status",
    "Shipping Mode",
    "Order Region",
    "Order Country",
]


def engineer_news_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add keyword flags and publication-time features to news articles.

    Adds one boolean column per event category (``is_strike``,
    ``is_disaster``, ``is_accident``) that is True when any keyword of the
    category occurs in ``title`` or ``description``.  Only the text columns
    that actually exist are scanned; if neither exists every flag is False.
    Missing text values count as "no match".

    When ``publishedAt`` is present, ``event_weekday`` (Monday = 0) and
    ``event_hour`` are derived from it; unparseable timestamps become NaN.

    Args:
        df: Cleaned news articles.  The frame is modified in place.

    Returns:
        The same frame, with the new columns added.
    """
    text_columns = [col for col in _NEWS_TEXT_COLUMNS if col in df.columns]

    for key, words in _NEWS_EVENT_KEYWORDS.items():
        pattern = "|".join(words)
        matched = pd.Series(False, index=df.index)
        for col in text_columns:
            matched = matched | df[col].str.contains(pattern, case=False, na=False)
        df[f"is_{key}"] = matched

    if "publishedAt" in df.columns:
        # Parse once and reuse for both derived columns.
        published = pd.to_datetime(df["publishedAt"], errors="coerce")
        df["event_weekday"] = published.dt.weekday
        df["event_hour"] = published.dt.hour

    logger.info("Engineered news event features: %s", df.shape)
    return df


def engineer_weather_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add severity, one-hot weather type and calendar features.

    When ``weather_main`` is present, a boolean ``severe_weather`` column is
    added (case-insensitive match against the severe-weather words) and
    ``weather_main`` is replaced by ``weather_*`` dummy columns.  When it is
    absent neither is produced.

    When ``timestamp`` is present, ``month`` (1-12) and ``season`` are added,
    where season is 1 for Dec-Feb, 2 for Mar-May, 3 for Jun-Aug and 4 for
    Sep-Nov.  Unparseable timestamps become NaN.

    Args:
        df: Cleaned weather alerts.  New columns are written to this frame
            before any one-hot encoding creates a new one.

    Returns:
        The frame with engineered features.
    """
    if "weather_main" in df.columns:
        # Must be computed before get_dummies drops the source column.
        df["severe_weather"] = df["weather_main"].str.contains(
            "|".join(_SEVERE_WEATHER_WORDS), case=False, na=False
        )
        df = pd.get_dummies(df, columns=["weather_main"], prefix="weather")

    if "timestamp" in df.columns:
        month = pd.to_datetime(df["timestamp"], errors="coerce").dt.month
        df["month"] = month
        # Shift December to 0 so that Dec-Feb fall into the same bucket.
        df["season"] = month % 12 // 3 + 1

    logger.info("Engineered weather features: %s", df.shape)
    return df


def engineer_supply_chain_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add lead time, one-hot categoricals and a lateness flag.

    * ``lead_time_days``: whole days between ``order date (DateOrders)`` and
      ``shipping date (DateOrders)``, when both columns exist.
    * Each known categorical column that exists is replaced by dummy columns
      prefixed with the column name (spaces turned into underscores).
    * ``is_late``: True where ``Late_delivery_risk`` is greater than zero,
      when that column exists.

    Args:
        df: Cleaned supply-chain orders.  New columns are written to this
            frame before any one-hot encoding creates a new one.

    Returns:
        The frame with engineered features.
    """
    order_col = "order date (DateOrders)"
    ship_col = "shipping date (DateOrders)"
    if order_col in df.columns and ship_col in df.columns:
        shipped = pd.to_datetime(df[ship_col], errors="coerce")
        ordered = pd.to_datetime(df[order_col], errors="coerce")
        df["lead_time_days"] = (shipped - ordered).dt.days

    for col in _SUPPLY_CHAIN_CATEGORICALS:
        if col in df.columns:
            df = pd.get_dummies(df, columns=[col], prefix=col.replace(" ", "_"))

    if "Late_delivery_risk" in df.columns:
        df["is_late"] = df["Late_delivery_risk"] > 0

    logger.info("Engineered supply chain features: %s", df.shape)
    return df


def _run_stage(label, source, target, engineer, **read_kwargs):
    """Read *source*, apply *engineer* and write the result to *target*.

    Failures are logged with their traceback and swallowed so that one broken
    data set does not stop the remaining stages from running.
    """
    try:
        frame = pd.read_csv(source, **read_kwargs)
        engineer(frame).to_csv(target, index=False)
        logger.info("Saved engineered %s features.", label)
    except Exception as e:  # best-effort: keep processing the other data sets
        logger.exception("Error engineering %s features: %s", label, e)


def main():
    """Engineer features for every processed data set on disk."""
    processed_dir = Path(__file__).resolve().parents[2] / "artifacts" / "data" / "processed"

    _run_stage(
        "news",
        processed_dir / "news_events_clean.csv",
        processed_dir / "news_events_features.csv",
        engineer_news_features,
    )
    _run_stage(
        "weather",
        processed_dir / "weather_alerts_clean.csv",
        processed_dir / "weather_alerts_features.csv",
        engineer_weather_features,
    )
    _run_stage(
        "supply chain",
        processed_dir / "supply_chain_disruptions_clean.csv",
        processed_dir / "supply_chain_disruptions_features.csv",
        engineer_supply_chain_features,
        encoding="utf-8",
    )


if __name__ == "__main__":
    main()