Spaces:

nivakaran
/

modelx

Sleeping

App Files Files Community

nivakaran committed on Dec 8, 2025

Commit

765b37c

verified ·

1 Parent(s): c7d4394

Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

main.py +99 -48
models/anomaly-detection/main.py +9 -9
models/anomaly-detection/src/pipeline/train.py +44 -0
models/currency-volatility-prediction/main.py +7 -5
models/currency-volatility-prediction/src/pipeline/train.py +58 -0
models/stock-price-prediction/main.py +7 -5
models/stock-price-prediction/src/pipeline/train.py +63 -0
models/weather-prediction/main.py +7 -5
models/weather-prediction/pyproject.toml +1 -0
models/weather-prediction/src/components/data_ingestion.py +65 -0
models/weather-prediction/src/pipeline/train.py +55 -0
models/weather-prediction/src/utils/tutiempo_scraper.py +18 -5
models/weather-prediction/uv.lock +44 -0
requirements.txt +2 -0
src/config/intel_config.json +82 -69
src/nodes/intelligenceAgentNode.py +114 -80
src/nodes/socialAgentNode.py +128 -89
src/utils/utils.py +36 -25

main.py CHANGED Viewed

@@ -727,21 +727,57 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
         # Get recent feeds
         feeds = storage_manager.get_recent_feeds(limit=100)
         if not _load_anomaly_components():
-            # Return high-severity events as proxy for anomalies
-            anomalies = [
-                {**f, "anomaly_score": 0.8, "is_anomaly": True}
-                for f in feeds
-                if f.get("severity") in ["critical", "high"]
-            ][:limit]
             return {
-                "anomalies": anomalies,
                 "total": len(anomalies),
-                "model_status": "not_trained",
-                "fallback": "severity_based"
             }
-        # Score each feed
         anomalies = []
         for feed in feeds:
             summary = feed.get("summary", "")
@@ -758,13 +794,13 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
                 else:
                     score = 1.0 if prediction == -1 else 0.0
-                # Normalize score to 0-1 range (rough approximation)
                 normalized_score = max(0, min(1, (score + 0.5)))
                 if prediction == -1 or normalized_score >= threshold:
                     anomalies.append({
                         **feed,
-                        "anomaly_score": float(normalized_score),
                         "is_anomaly": prediction == -1,
                         "language": lang
                     })
@@ -783,7 +819,7 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
             "anomalies": anomalies,
             "total": len(anomalies),
             "threshold": threshold,
-            "model_status": "loaded"
         }
     except Exception as e:
@@ -1117,42 +1153,57 @@ def remove_intel_target(target_type: str, value: str, platform: Optional[str] =
 _weather_predictor = None
 def get_weather_predictor():
-    """Lazy-load the weather predictor."""
     global _weather_predictor
-    if _weather_predictor is None:
-        try:
-            import sys
-            import importlib
-            from pathlib import Path
-            weather_path = Path(__file__).parent / "models" / "weather-prediction" / "src"
-            weather_path_str = str(weather_path)
-            # Ensure path is in sys.path
-            if weather_path_str not in sys.path:
-                sys.path.insert(0, weather_path_str)
-            # CRITICAL FIX: Handle 'components' package name collision
-            # If 'components' is already loaded from another model (e.g. currency), unload it
-            if 'components' in sys.modules:
-                existing_path = getattr(sys.modules['components'], '__file__', '')
-                if existing_path and weather_path_str not in str(existing_path):
-                    logger.warning(f"[WeatherAPI] components collision detected. Unloading {existing_path}")
-                    # Unload generic modules to force reload from new path
-                    for mod in list(sys.modules.keys()):
-                        if mod.startswith('components') or mod.startswith('utils'):
-                            del sys.modules[mod]
-            # Now import fresh
-            from components.predictor import WeatherPredictor
-            _weather_predictor = WeatherPredictor()
-            logger.info("[WeatherAPI] Weather predictor initialized")
-        except Exception as e:
-            logger.warning(f"[WeatherAPI] Failed to initialize predictor: {e}")
-            import traceback
-            logger.warning(traceback.format_exc())
-            _weather_predictor = None
-    return _weather_predictor
 @app.get("/api/weather/predictions")

         # Get recent feeds
         feeds = storage_manager.get_recent_feeds(limit=100)
+        if not feeds:
+            # No feeds yet - return helpful message
+            return {
+                "anomalies": [],
+                "total": 0,
+                "model_status": "no_data",
+                "message": "No feed data available yet. Wait for graph execution to complete."
+            }
         if not _load_anomaly_components():
+            # Use severity + keyword-based scoring as intelligent fallback
+            anomalies = []
+            anomaly_keywords = ["emergency", "crisis", "breaking", "urgent", "alert",
+                               "warning", "critical", "disaster", "flood", "protest"]
+            for f in feeds:
+                score = 0.0
+                summary = str(f.get("summary", "")).lower()
+                severity = f.get("severity", "low")
+                # Severity-based scoring
+                if severity == "critical": score = 0.9
+                elif severity == "high": score = 0.75
+                elif severity == "medium": score = 0.5
+                else: score = 0.25
+                # Keyword boosting
+                keyword_matches = sum(1 for kw in anomaly_keywords if kw in summary)
+                if keyword_matches > 0:
+                    score = min(1.0, score + (keyword_matches * 0.1))
+                # Only include if above threshold
+                if score >= threshold:
+                    anomalies.append({
+                        **f,
+                        "anomaly_score": round(score, 3),
+                        "is_anomaly": score >= 0.7
+                    })
+            # Sort by anomaly score
+            anomalies.sort(key=lambda x: x.get("anomaly_score", 0), reverse=True)
             return {
+                "anomalies": anomalies[:limit],
                 "total": len(anomalies),
+                "threshold": threshold,
+                "model_status": "fallback_scoring",
+                "message": "Using severity + keyword scoring. Train ML model for advanced detection."
             }
+        # ML Model is loaded - use it for scoring
         anomalies = []
         for feed in feeds:
             summary = feed.get("summary", "")
                 else:
                     score = 1.0 if prediction == -1 else 0.0
+                # Normalize score to 0-1 range
                 normalized_score = max(0, min(1, (score + 0.5)))
                 if prediction == -1 or normalized_score >= threshold:
                     anomalies.append({
                         **feed,
+                        "anomaly_score": float(round(normalized_score, 3)),
                         "is_anomaly": prediction == -1,
                         "language": lang
                     })
             "anomalies": anomalies,
             "total": len(anomalies),
             "threshold": threshold,
+            "model_status": "ml_active"
         }
     except Exception as e:
 _weather_predictor = None
 def get_weather_predictor():
+    """Lazy-load the weather predictor using isolated import.
+
+    Returns the cached predictor if one was already created. Otherwise loads
+    models/weather-prediction/src/components/predictor.py by file path,
+    instantiates its WeatherPredictor, caches it in the module-level
+    _weather_predictor and returns it. Returns None (after logging the
+    error) when predictor.py is missing or loading fails.
+    """
     global _weather_predictor
+    if _weather_predictor is not None:
+        return _weather_predictor
+    try:
+        # `import sys` must run before the first use of `sys` below: a
+        # function-level import makes `sys` local to this function, so
+        # touching sys.modules ahead of the import raises UnboundLocalError.
+        import sys
+        import importlib.util
+        from pathlib import Path
+        # Use importlib.util for fully isolated import (avoids package collisions)
+        weather_src = Path(__file__).parent / "models" / "weather-prediction" / "src"
+        predictor_path = weather_src / "components" / "predictor.py"
+        if not predictor_path.exists():
+            logger.error(f"[WeatherAPI] predictor.py not found at {predictor_path}")
+            return None
+        # First, ensure entity module is loadable
+        entity_path = weather_src / "entity" / "config_entity.py"
+        if entity_path.exists():
+            entity_spec = importlib.util.spec_from_file_location(
+                "weather_config_entity",
+                str(entity_path)
+            )
+            entity_module = importlib.util.module_from_spec(entity_spec)
+            # Register under a unique name before executing the module
+            sys.modules["weather_config_entity"] = entity_module
+            entity_spec.loader.exec_module(entity_module)
+        # Put weather src on sys.path so predictor.py's own imports resolve
+        weather_src_str = str(weather_src)
+        if weather_src_str not in sys.path:
+            sys.path.insert(0, weather_src_str)
+        # Now load predictor module
+        spec = importlib.util.spec_from_file_location(
+            "weather_predictor_module",
+            str(predictor_path)
+        )
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+        _weather_predictor = module.WeatherPredictor()
+        logger.info("[WeatherAPI] ✓ Weather predictor initialized via isolated import")
+        return _weather_predictor
+    except Exception as e:
+        logger.error(f"[WeatherAPI] Failed to initialize predictor: {e}")
+        import traceback
+        logger.debug(traceback.format_exc())
+        return None
 @app.get("/api/weather/predictions")

models/anomaly-detection/main.py CHANGED Viewed

@@ -4,16 +4,11 @@ Entry point for the anomaly detection training pipeline
 """
 import os
 import sys
-import logging
 from pathlib import Path
-# Add src to path
-sys.path.insert(0, str(Path(__file__).parent / "src"))
-from src.pipeline import run_training_pipeline
-from src.entity import PipelineConfig
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
@@ -22,9 +17,14 @@ logging.basicConfig(
         logging.FileHandler("training.log")
     ]
 )
 logger = logging.getLogger("main")
 def main():
     """Run the anomaly detection training pipeline"""

 """
 import os
 import sys
+import logging  # Import standard library BEFORE path manipulation
 from pathlib import Path
+# CRITICAL: Configure logging BEFORE adding src/ to path
+# (src/logging/ directory would otherwise shadow the standard module)
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         logging.FileHandler("training.log")
     ]
 )
 logger = logging.getLogger("main")
+# Add src to path - AFTER logging is configured
+sys.path.insert(0, str(Path(__file__).parent / "src"))
+from src.pipeline import run_training_pipeline
+from src.entity import PipelineConfig
 def main():
     """Run the anomaly detection training pipeline"""

models/anomaly-detection/src/pipeline/train.py ADDED Viewed

	@@ -0,0 +1,44 @@

+"""
+Anomaly Detection Training Script
+Convenience wrapper for: python models/anomaly-detection/main.py
+Usage:
+    python models/anomaly-detection/src/pipeline/train.py [--help-only]
+"""
+import sys
+import argparse
+import logging  # Import BEFORE path manipulation
+from pathlib import Path
+# Configure logging BEFORE adding src/ to path
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Add parent directories to path - AFTER logging is configured
+PIPELINE_ROOT = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(PIPELINE_ROOT))
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Anomaly Detection Training")
+    parser.add_argument("--help-only", action="store_true", help="Show help and exit")
+    # parse_known_args tolerates unrecognised arguments instead of exiting with an error
+    args, _ = parser.parse_known_args()
+    if args.help_only:
+        # Honour the flag before any heavy pipeline module gets imported
+        parser.print_help()
+        sys.exit(0)
+    print("=" * 60)
+    print("ANOMALY DETECTION - TRAINING PIPELINE")
+    print("=" * 60)
+    # Import and run from main.py
+    from main import main
+    result = main()
+    # Only print the summary when main() returned a truthy result
+    if result:
+        print("=" * 60)
+        print("TRAINING COMPLETE!")
+        print(f"Best model: {result.model_trainer.best_model_name}")
+        print("=" * 60)

models/currency-volatility-prediction/main.py CHANGED Viewed

@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
 """
 import os
 import sys
-import logging
 import argparse
 from pathlib import Path
 from datetime import datetime
-# Setup paths
-PIPELINE_ROOT = Path(__file__).parent
-sys.path.insert(0, str(PIPELINE_ROOT / "src"))
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger("currency_prediction")
 def run_data_ingestion(period: str = "2y"):
     """Run data ingestion from yfinance."""

 """
 import os
 import sys
+import logging  # Import standard library BEFORE path manipulation
 import argparse
 from pathlib import Path
 from datetime import datetime
+# CRITICAL: Configure logging BEFORE adding src/ to path
+# (src/logging/ directory would otherwise shadow the standard module)
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger("currency_prediction")
+# Setup paths - AFTER logging is configured
+PIPELINE_ROOT = Path(__file__).parent
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
 def run_data_ingestion(period: str = "2y"):
     """Run data ingestion from yfinance."""

models/currency-volatility-prediction/src/pipeline/train.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""
+Currency Volatility Prediction Training Script
+Thin command-line wrapper around models/currency-volatility-prediction/main.py
+(equivalent to running it with --mode train).
+Usage:
+    python models/currency-volatility-prediction/src/pipeline/train.py [--epochs 100] [--period 2y]
+"""
+import sys
+import argparse
+import logging  # CRITICAL: must be imported before src/ is put on sys.path
+from pathlib import Path
+# Logging is configured first because a src/logging/ directory on sys.path
+# would otherwise shadow the standard-library module
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Three levels up from this file is the pipeline root; expose it and its src/
+PIPELINE_ROOT = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(PIPELINE_ROOT))
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser(description="Currency Prediction Training")
+    cli.add_argument("--epochs", type=int, default=100, help="Training epochs")
+    cli.add_argument("--period", type=str, default="2y", help="Data period (1y, 2y, 5y)")
+    cli.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
+    args = cli.parse_args()
+    # main.py is importable only now that the path entries are in place
+    from main import run_training, run_full_pipeline, run_data_ingestion
+    banner = "=" * 60
+    print(banner)
+    print("CURRENCY (USD/LKR) PREDICTION - TRAINING PIPELINE")
+    print(banner)
+    if not args.full:
+        # Reuse data already on disk; ingest only when none is found
+        try:
+            from components.data_ingestion import CurrencyDataIngestion
+            existing = CurrencyDataIngestion().load_existing()
+            print(f"✓ Found existing data: {len(existing)} records")
+        except FileNotFoundError:
+            print("No existing data, running ingestion first...")
+            run_data_ingestion(period=args.period)
+        # Train on whatever data is now available
+        run_training(epochs=args.epochs)
+    else:
+        run_full_pipeline()
+    print(banner)
+    print("TRAINING COMPLETE!")
+    print(banner)

models/stock-price-prediction/main.py CHANGED Viewed

@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
 """
 import os
 import sys
-import logging
 import argparse
 from pathlib import Path
 from datetime import datetime
-# Setup paths
-PIPELINE_ROOT = Path(__file__).parent
-sys.path.insert(0, str(PIPELINE_ROOT / "src"))
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger("stock_prediction")
 def run_data_ingestion():
     """Run data ingestion for all stocks."""

 """
 import os
 import sys
+import logging  # Import standard library BEFORE path manipulation
 import argparse
 from pathlib import Path
 from datetime import datetime
+# CRITICAL: Configure logging BEFORE adding src/ to path
+# (src/logging/ directory would otherwise shadow the standard module)
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger("stock_prediction")
+# Setup paths - AFTER logging is configured
+PIPELINE_ROOT = Path(__file__).parent
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
 def run_data_ingestion():
     """Run data ingestion for all stocks."""

models/stock-price-prediction/src/pipeline/train.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""
+Stock Price Prediction Training Script
+Convenience wrapper for: python models/stock-price-prediction/main.py --mode train
+Usage:
+    python models/stock-price-prediction/src/pipeline/train.py [--stock JKH] [--no-optuna] [--full]
+"""
+import sys
+import argparse
+import logging  # CRITICAL: Import BEFORE path manipulation
+from pathlib import Path
+# Configure logging BEFORE adding src/ to path
+# (src/logging/ directory would otherwise shadow the standard module)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Add parent directories to path - AFTER logging is configured
+PIPELINE_ROOT = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(PIPELINE_ROOT))
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Stock Price Prediction Training")
+    parser.add_argument("--stock", type=str, default=None, help="Specific stock to train (e.g., JKH, COMB)")
+    parser.add_argument("--no-optuna", action="store_true", help="Disable Optuna hyperparameter optimization")
+    parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
+    args = parser.parse_args()
+    use_optuna = not args.no_optuna
+    # Import from main.py (after path setup)
+    from main import run_training, run_full_pipeline, run_data_ingestion
+    print("=" * 60)
+    print("STOCK PRICE (CSE) PREDICTION - TRAINING PIPELINE")
+    print("=" * 60)
+    if args.full:
+        run_full_pipeline(use_optuna=use_optuna)
+    else:
+        # Probe the first configured stock to decide whether data already exists.
+        # Any failure of the probe is treated as "no data", but the reason is
+        # logged instead of being silently discarded.
+        stocks = []
+        try:
+            from components.data_ingestion import StockDataIngestion
+            ingestion = StockDataIngestion()
+            stocks = list(ingestion.config.stocks.keys())
+            has_data = ingestion.load_stock_data(stocks[0]) is not None
+        except Exception as exc:
+            logging.getLogger(__name__).warning("Existing-data check failed: %r", exc)
+            has_data = False
+        if has_data:
+            print(f"✓ Found existing data for {len(stocks)} stocks")
+        else:
+            print("No existing data, running ingestion first...")
+            run_data_ingestion()
+        # Run training
+        run_training(use_optuna=use_optuna, stock=args.stock)
+    print("=" * 60)
+    print("TRAINING COMPLETE!")
+    print("=" * 60)

models/weather-prediction/main.py CHANGED Viewed

@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
 """
 import os
 import sys
-import logging
 import argparse
 from pathlib import Path
 from datetime import datetime
-# Setup paths
-PIPELINE_ROOT = Path(__file__).parent
-sys.path.insert(0, str(PIPELINE_ROOT / "src"))
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger("weather_prediction")
 def run_data_ingestion(months: int = 12):
     """Run data ingestion for all stations."""

 """
 import os
 import sys
+import logging  # Import standard library BEFORE path manipulation
 import argparse
 from pathlib import Path
 from datetime import datetime
+# CRITICAL: Configure logging BEFORE adding src/ to path
+# (src/logging/ directory would otherwise shadow the standard module)
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger("weather_prediction")
+# Setup paths - AFTER logging is configured
+PIPELINE_ROOT = Path(__file__).parent
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
 def run_data_ingestion(months: int = 12):
     """Run data ingestion for all stations."""

models/weather-prediction/pyproject.toml CHANGED Viewed

@@ -10,6 +10,7 @@ dependencies = [
     "fastapi>=0.122.0",
     "mlflow>=3.6.0",
     "numpy>=2.3.5",
     "pandas>=2.3.3",
     "pyaml>=25.7.0",
     "pymongo[srv]>=4.15.4",

     "fastapi>=0.122.0",
     "mlflow>=3.6.0",
     "numpy>=2.3.5",
+    "optuna>=4.6.0",
     "pandas>=2.3.3",
     "pyaml>=25.7.0",
     "pymongo[srv]>=4.15.4",

models/weather-prediction/src/components/data_ingestion.py CHANGED Viewed

@@ -36,6 +36,7 @@ class DataIngestion:
     def ingest_all(self) -> str:
         """
         Ingest historical weather data for all stations.
         Returns:
             Path to saved CSV file
@@ -55,9 +56,73 @@ class DataIngestion:
             save_path=save_path
         )
         logger.info(f"[DATA_INGESTION] ✓ Ingested {len(df)} total records")
         return save_path
     def ingest_station(self, station_name: str, months: int = None) -> pd.DataFrame:
         """
         Ingest data for a single station.

     def ingest_all(self) -> str:
         """
         Ingest historical weather data for all stations.
+        Falls back to synthetic data if scraping fails.
         Returns:
             Path to saved CSV file
             save_path=save_path
         )
+        # Fallback to synthetic data if scraping failed
+        if df.empty or len(df) < 100:
+            logger.warning("[DATA_INGESTION] Scraping failed or insufficient data. Generating synthetic training data.")
+            df = self._generate_synthetic_data()
+            df.to_csv(save_path, index=False)
+            logger.info(f"[DATA_INGESTION] Generated {len(df)} synthetic records")
         logger.info(f"[DATA_INGESTION] ✓ Ingested {len(df)} total records")
         return save_path
+    def _generate_synthetic_data(self) -> pd.DataFrame:
+        """
+        Generate synthetic weather data for training when scraping fails.
+        Builds 365 daily records (today and the 364 days before it) for every
+        priority station that is present in self.config.stations. All values
+        are random draws, so the output differs between calls.
+        Returns:
+            DataFrame sorted by station_name and date. If none of the priority
+            stations is configured, an empty DataFrame with the expected
+            columns is returned.
+        """
+        import numpy as np
+        columns = [
+            "date", "year", "month", "day", "station_code", "station_name",
+            "temp_mean", "temp_max", "temp_min", "rainfall", "humidity",
+            "wind_speed", "pressure",
+        ]
+        # Generate 1 year of daily data for priority stations
+        priority_stations = ["COLOMBO", "KANDY", "JAFFNA", "BATTICALOA", "RATNAPURA"]
+        # Read the clock once so every station covers the same date range
+        now = datetime.now()
+        records = []
+        for station in priority_stations:
+            if station not in self.config.stations:
+                continue
+            station_cfg = self.config.stations[station]
+            # Generate 365 days of data
+            for day_offset in range(365):
+                date = now - pd.Timedelta(days=day_offset)
+                month = date.month
+                # Mean temperature: base 28 for March-August, 26 otherwise,
+                # plus Gaussian noise with standard deviation 2
+                base_temp = 28 if month in [3, 4, 5, 6, 7, 8] else 26
+                temp_variation = np.random.normal(0, 2)
+                temp_mean = base_temp + temp_variation
+                # Rainfall is exponentially distributed with a month-dependent scale.
+                # NOTE(review): the month sets below (Oct-Dec, May-Jul) differ from the
+                # usual monsoon definitions (NE: Dec-Feb, SW: May-Sep) - confirm intent.
+                if month in [10, 11, 12]:  # labelled NE monsoon - heavy rain
+                    rainfall = max(0, np.random.exponential(15))
+                elif month in [5, 6, 7]:  # labelled SW monsoon - moderate rain
+                    rainfall = max(0, np.random.exponential(10))
+                else:  # Inter-monsoon / dry
+                    rainfall = max(0, np.random.exponential(3))
+                records.append({
+                    "date": date.strftime("%Y-%m-%d"),
+                    "year": date.year,
+                    "month": month,
+                    "day": date.day,
+                    "station_code": station_cfg["code"],
+                    "station_name": station,
+                    "temp_mean": round(temp_mean, 1),
+                    "temp_max": round(temp_mean + np.random.uniform(3, 6), 1),
+                    "temp_min": round(temp_mean - np.random.uniform(3, 5), 1),
+                    "rainfall": round(rainfall, 1),
+                    "humidity": round(np.random.uniform(65, 90), 1),
+                    "wind_speed": round(np.random.uniform(5, 25), 1),
+                    "pressure": round(np.random.uniform(1008, 1015), 1),
+                })
+        if not records:
+            # A DataFrame built from an empty list has no columns, so the
+            # "date" lookup below would raise KeyError
+            return pd.DataFrame(columns=columns)
+        df = pd.DataFrame(records, columns=columns)
+        df["date"] = pd.to_datetime(df["date"])
+        df = df.sort_values(["station_name", "date"]).reset_index(drop=True)
+        return df
     def ingest_station(self, station_name: str, months: int = None) -> pd.DataFrame:
         """
         Ingest data for a single station.

models/weather-prediction/src/pipeline/train.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""
+Weather Prediction Training Script
+Convenience wrapper for: python models/weather-prediction/main.py --mode train
+Usage:
+    python models/weather-prediction/src/pipeline/train.py [--station COLOMBO] [--epochs 100] [--months 3]
+"""
+import sys
+import argparse
+from pathlib import Path
+# CRITICAL: Import standard library logging BEFORE adding src/ to path
+# (src/logging/ directory would otherwise shadow the standard module)
+import logging
+# Add parent directories to path
+PIPELINE_ROOT = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(PIPELINE_ROOT))
+sys.path.insert(0, str(PIPELINE_ROOT / "src"))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Weather Prediction Training")
+    parser.add_argument("--station", type=str, default=None, help="Station to train (e.g., COLOMBO)")
+    parser.add_argument("--epochs", type=int, default=100, help="Training epochs")
+    # Previously hard-coded to 3; the default keeps the old behaviour
+    parser.add_argument("--months", type=int, default=3, help="Months of history to ingest when no data exists")
+    parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
+    args = parser.parse_args()
+    # Import from main.py (after path setup)
+    from main import run_training, run_full_pipeline, run_data_ingestion
+    print("=" * 60)
+    print("WEATHER PREDICTION - TRAINING PIPELINE")
+    print("=" * 60)
+    if args.full:
+        run_full_pipeline()
+    else:
+        # Run data ingestion first if no data exists
+        try:
+            from components.data_ingestion import DataIngestion
+            ingestion = DataIngestion()
+            df = ingestion.load_existing()
+            print(f"✓ Found existing data: {len(df)} records")
+        except FileNotFoundError:
+            print("No existing data, running ingestion first...")
+            run_data_ingestion(months=args.months)
+        # Run training
+        run_training(station=args.station, epochs=args.epochs)
+    print("=" * 60)
+    print("TRAINING COMPLETE!")
+    print("=" * 60)

models/weather-prediction/src/utils/tutiempo_scraper.py CHANGED Viewed

@@ -175,19 +175,32 @@ class TutiempoScraper:
             DataFrame with all historical records
         """
         all_records = []
         current = datetime.now()
         for i in range(months):
-            target_date = current - timedelta(days=30 * i)
             year = target_date.year
             month = target_date.month
             records = self.scrape_month(station_code, year, month)
-            for r in records:
-                r["station_name"] = station_name
-            all_records.extend(records)
             # Be nice to the server
             time.sleep(1)

             DataFrame with all historical records
         """
         all_records = []
+        # IMPORTANT: TuTiempo has data publication delay of ~2-3 months
+        # Start from 3 months ago to avoid 404 errors on recent months
         current = datetime.now()
+        start_date = current - timedelta(days=90)  # Start 3 months ago
+        consecutive_failures = 0
+        max_consecutive_failures = 3
         for i in range(months):
+            target_date = start_date - timedelta(days=30 * i)
             year = target_date.year
             month = target_date.month
             records = self.scrape_month(station_code, year, month)
+            if not records:
+                consecutive_failures += 1
+                if consecutive_failures >= max_consecutive_failures:
+                    logger.warning(f"[TUTIEMPO] {max_consecutive_failures} consecutive failures for {station_name}, stopping")
+                    break
+            else:
+                consecutive_failures = 0  # Reset on success
+                for r in records:
+                    r["station_name"] = station_name
+                all_records.extend(records)
             # Be nice to the server
             time.sleep(1)

models/weather-prediction/uv.lock CHANGED Viewed

@@ -298,6 +298,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]
 [[package]]
 name = "contourpy"
 version = "1.3.3"
@@ -1375,6 +1387,7 @@ dependencies = [
     { name = "fastapi" },
     { name = "mlflow" },
     { name = "numpy" },
     { name = "pandas" },
     { name = "pyaml" },
     { name = "pymongo" },
@@ -1391,6 +1404,7 @@ requires-dist = [
     { name = "fastapi", specifier = ">=0.122.0" },
     { name = "mlflow", specifier = ">=3.6.0" },
     { name = "numpy", specifier = ">=2.3.5" },
     { name = "pandas", specifier = ">=2.3.3" },
     { name = "pyaml", specifier = ">=25.7.0" },
     { name = "pymongo", extras = ["srv"], specifier = ">=4.15.4" },
@@ -1659,6 +1673,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
 ]
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -2615,6 +2647,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
 ]
 [[package]]
 name = "treelib"
 version = "1.8.0"

     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]
+[[package]]
+name = "colorlog"
+version = "6.10.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/61/f083b5ac52e505dfc1c624eafbf8c7589a0d7f32daa398d2e7590efa5fda/colorlog-6.10.1.tar.gz", hash = "sha256:eb4ae5cb65fe7fec7773c2306061a8e63e02efc2c72eba9d27b0fa23c94f1321", size = 17162, upload-time = "2025-10-16T16:14:11.978Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" },
+]
 [[package]]
 name = "contourpy"
 version = "1.3.3"
     { name = "fastapi" },
     { name = "mlflow" },
     { name = "numpy" },
+    { name = "optuna" },
     { name = "pandas" },
     { name = "pyaml" },
     { name = "pymongo" },
     { name = "fastapi", specifier = ">=0.122.0" },
     { name = "mlflow", specifier = ">=3.6.0" },
     { name = "numpy", specifier = ">=2.3.5" },
+    { name = "optuna", specifier = ">=4.6.0" },
     { name = "pandas", specifier = ">=2.3.3" },
     { name = "pyaml", specifier = ">=25.7.0" },
     { name = "pymongo", extras = ["srv"], specifier = ">=4.15.4" },
     { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
 ]
+[[package]]
+name = "optuna"
+version = "4.6.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "alembic" },
+    { name = "colorlog" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "pyyaml" },
+    { name = "sqlalchemy" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6b/81/08f90f194eed78178064a9383432eca95611e2c5331e7b01e2418ce4b15a/optuna-4.6.0.tar.gz", hash = "sha256:89e38c2447c7f793a726617b8043f01e31f0bad54855040db17eb3b49404a369", size = 477444, upload-time = "2025-11-10T05:14:30.151Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/58/de/3d8455b08cb6312f8cc46aacdf16c71d4d881a1db4a4140fc5ef31108422/optuna-4.6.0-py3-none-any.whl", hash = "sha256:4c3a9facdef2b2dd7e3e2a8ae3697effa70fae4056fcf3425cfc6f5a40feb069", size = 404708, upload-time = "2025-11-10T05:14:28.6Z" },
+]
 [[package]]
 name = "packaging"
 version = "25.0"
     { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
 ]
+[[package]]
+name = "tqdm"
+version = "4.67.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
+]
 [[package]]
 name = "treelib"
 version = "1.8.0"

requirements.txt CHANGED Viewed

@@ -111,3 +111,5 @@ pyyaml
 # Utilities
 joblib
 tqdm

 # Utilities
 joblib
 tqdm
+optuna

src/config/intel_config.json CHANGED Viewed

@@ -1,72 +1,85 @@
 {
-    "user_profiles": {
-        "twitter": [],
-        "facebook": [],
-        "linkedin": []
-    },
-    "user_keywords": [],
-    "user_products": [],
-    "operational_keywords": {
-        "infrastructure": [
-            "Colombo port",
-            "Hambantota port",
-            "port strike",
-            "power outage",
-            "water shortage",
-            "fuel shortage",
-            "airport delay",
-            "customs clearance",
-            "road closure",
-            "railway disruption"
-        ],
-        "government": [
-            "cabinet decision",
-            "new policy",
-            "regulation change",
-            "tax amendment",
-            "import restriction",
-            "export ban",
-            "license requirement",
-            "central bank",
-            "budget announcement"
-        ],
-        "opportunity": [
-            "investment",
-            "expansion",
-            "new factory",
-            "job creation",
-            "export growth",
-            "tourism boost",
-            "infrastructure project",
-            "development grant",
-            "FDI",
-            "trade agreement"
-        ]
-    },
-    "alert_thresholds": {
-        "trending_momentum_min": 2.0,
-        "spike_multiplier": 3.0,
-        "high_risk_score": 0.7,
-        "high_opportunity_score": 0.6
-    },
-    "default_competitors": {
-        "telecom": {
-            "twitter": [
-                "DialogSriLanka",
-                "Mobaborang",
-                "HutchSL"
-            ],
-            "facebook": [
-                "Dialog",
-                "SLT-Mobitel",
-                "Hutch"
-            ]
-        }
-    },
-    "notes": {
-        "removed_profiles": [
-            "SLTMobitel - Twitter profile not found/restricted"
-        ],
-        "last_verified": "2025-12-08"
     }
 }

 {
+  "user_profiles": {
+    "twitter": [
+      "nivakaran"
+    ],
+    "facebook": [
+      "Nivakaran"
+    ],
+    "linkedin": [
+      "nivakaran"
+    ]
+  },
+  "user_keywords": [
+    "Colombo",
+    "nivakaran",
+    "telco"
+  ],
+  "user_products": [
+    "iphone",
+    "anchor"
+  ],
+  "operational_keywords": {
+    "infrastructure": [
+      "Colombo port",
+      "Hambantota port",
+      "port strike",
+      "power outage",
+      "water shortage",
+      "fuel shortage",
+      "airport delay",
+      "customs clearance",
+      "road closure",
+      "railway disruption"
+    ],
+    "government": [
+      "cabinet decision",
+      "new policy",
+      "regulation change",
+      "tax amendment",
+      "import restriction",
+      "export ban",
+      "license requirement",
+      "central bank",
+      "budget announcement"
+    ],
+    "opportunity": [
+      "investment",
+      "expansion",
+      "new factory",
+      "job creation",
+      "export growth",
+      "tourism boost",
+      "infrastructure project",
+      "development grant",
+      "FDI",
+      "trade agreement"
+    ]
+  },
+  "alert_thresholds": {
+    "trending_momentum_min": 2.0,
+    "spike_multiplier": 3.0,
+    "high_risk_score": 0.7,
+    "high_opportunity_score": 0.6
+  },
+  "default_competitors": {
+    "telecom": {
+      "twitter": [
+        "DialogSriLanka",
+        "Mobaborang",
+        "HutchSL"
+      ],
+      "facebook": [
+        "Dialog",
+        "SLT-Mobitel",
+        "Hutch"
+      ]
     }
+  },
+  "notes": {
+    "removed_profiles": [
+      "SLTMobitel - Twitter profile not found/restricted"
+    ],
+    "last_verified": "2025-12-08"
+  }
 }

src/nodes/intelligenceAgentNode.py CHANGED Viewed

@@ -396,16 +396,19 @@ class IntelligenceAgentNode:
     def generate_llm_summary(self, state: IntelligenceAgentState) -> Dict[str, Any]:
         """
-        Generate competitive intelligence summary using LLM
         """
-        print("[MODULE 3B] Generating LLM Summary")
         all_results = state.get("worker_results", [])
         profile_feeds = state.get("profile_feeds", {})
         competitor_feeds = state.get("competitor_feeds", {})
         product_feeds = state.get("product_review_feeds", {})
-        # Prepare summary prompt
         summary_data = {
             "total_results": len(all_results),
             "profiles_monitored": list(profile_feeds.keys()),
@@ -415,42 +418,90 @@ class IntelligenceAgentNode:
             "global_competitors": len(state.get("global_intel", []))
         }
-        prompt = f"""
-        Analyze this competitive intelligence data and provide a strategic summary.
-        Data Overview:
-        - Total intelligence collected: {summary_data['total_results']} items
-        - Competitor profiles monitored: {', '.join(summary_data['profiles_monitored'])}
-        - Competitor mentions tracked: {', '.join(summary_data['competitors_tracked'])}
-        - Products analyzed: {', '.join(summary_data['products_analyzed'])}
-        - Local market intelligence: {summary_data['local_competitors']} items
-        - Global market intelligence: {summary_data['global_competitors']} items
-        Provide:
-        1. Key competitive insights
-        2. Market trends observed
-        3. Threats and opportunities
-        4. Recommended actions
-        Keep it concise and actionable.
-        """
         try:
             response = self.llm.invoke(prompt)
-            llm_summary = response.content if hasattr(response, 'content') else str(response)
-            print("  ✓ Generated LLM summary")
         except Exception as e:
-            llm_summary = f"LLM Summary unavailable: {e}"
             print(f"  ⚠️ LLM error: {e}")
         return {
             "llm_summary": llm_summary,
             "structured_output": summary_data
         }
     def format_final_output(self, state: IntelligenceAgentState) -> Dict[str, Any]:
         """
-        Module 3C: Format final competitive intelligence feed
         """
         print("[MODULE 3C] Formatting Final Output")
@@ -458,6 +509,7 @@ class IntelligenceAgentNode:
         competitor_feeds = state.get("competitor_feeds", {})
         product_feeds = state.get("product_review_feeds", {})
         llm_summary = state.get("llm_summary", "No summary available")
         local_intel = state.get("local_intel", [])
         global_intel = state.get("global_intel", [])
@@ -491,84 +543,66 @@ Source: Multi-platform competitive intelligence (Twitter, Facebook, LinkedIn, In
 """
         # Create integration output with structured data
-        # FIXED: Pass actual feed data, not just counts
         structured_feeds = {
-            "profiles": profile_feeds,  # Full profile data, not counts
-            "competitors": competitor_feeds,  # Full competitor data
-            "products": product_feeds,  # Full product review data
             "local_intel": local_intel,
             "global_intel": global_intel
         }
-        # Create list for per-item domain_insights (FRONTEND COMPATIBLE)
         domain_insights = []
         timestamp = datetime.utcnow().isoformat()
-        # 1. Create per-profile intelligence insights
-        for profile_name, posts in profile_feeds.items():
-            if not isinstance(posts, list):
-                continue
-            for post in posts[:5]:
-                post_text = post.get("text", "") or post.get("title", "")
-                if not post_text or len(post_text) < 10:
-                    continue
                 domain_insights.append({
                     "source_event_id": str(uuid.uuid4()),
                     "domain": "intelligence",
-                    "summary": f"Profile ({profile_name}): {post_text[:200]}",
-                    "severity": "medium",
-                    "impact_type": "risk",
-                    "timestamp": timestamp
                 })
-        # 2. Create per-competitor intelligence insights
-        for competitor, posts in competitor_feeds.items():
-            if not isinstance(posts, list):
-                continue
-            for post in posts[:5]:
-                post_text = post.get("text", "") or post.get("title", "")
-                if not post_text or len(post_text) < 10:
-                    continue
-                severity = "high" if any(kw in post_text.lower() for kw in ["launch", "expansion", "acquisition"]) else "medium"
-                domain_insights.append({
-                    "source_event_id": str(uuid.uuid4()),
-                    "domain": "intelligence",
-                    "summary": f"Competitor ({competitor}): {post_text[:200]}",
-                    "severity": severity,
-                    "impact_type": "risk",
-                    "timestamp": timestamp
-                })
-        # 3. Create per-product review insights
-        for product, reviews in product_feeds.items():
-            if not isinstance(reviews, list):
-                continue
-            for review in reviews[:5]:
-                review_text = review.get("text", "") or review.get("title", "")
-                if not review_text or len(review_text) < 10:
                     continue
-                severity = "low" if any(kw in review_text.lower() for kw in ["great", "excellent", "love"]) else "medium"
-                impact = "opportunity" if severity == "low" else "risk"
-                domain_insights.append({
-                    "source_event_id": str(uuid.uuid4()),
-                    "domain": "intelligence",
-                    "summary": f"Product Review ({product}): {review_text[:200]}",
-                    "severity": severity,
-                    "impact_type": impact,
-                    "timestamp": timestamp
-                })
-        # 4. Add executive summary insight
         domain_insights.append({
             "source_event_id": str(uuid.uuid4()),
             "structured_data": structured_feeds,
             "domain": "intelligence",
-            "summary": f"Business Intelligence Summary: {llm_summary[:300]}",
             "severity": "medium",
-            "impact_type": "risk"
         })
-        print(f"  ✓ Created {len(domain_insights)} intelligence insights")
         return {
             "final_feed": bulletin,

     def generate_llm_summary(self, state: IntelligenceAgentState) -> Dict[str, Any]:
         """
+        Generate competitive intelligence summary AND structured insights using LLM
         """
+        print("[MODULE 3B] Generating LLM Summary + Competitive Insights")
         all_results = state.get("worker_results", [])
         profile_feeds = state.get("profile_feeds", {})
         competitor_feeds = state.get("competitor_feeds", {})
         product_feeds = state.get("product_review_feeds", {})
+        llm_summary = "Competitive intelligence summary unavailable."
+        llm_insights = []
+        # Prepare summary data
         summary_data = {
             "total_results": len(all_results),
             "profiles_monitored": list(profile_feeds.keys()),
             "global_competitors": len(state.get("global_intel", []))
         }
+        # Collect sample data for LLM analysis
+        sample_posts = []
+        for profile, posts in profile_feeds.items():
+            if isinstance(posts, list):
+                for p in posts[:2]:
+                    text = p.get("text", "") or p.get("title", "") or p.get("raw_content", "")[:200]
+                    if text:
+                        sample_posts.append(f"[PROFILE: {profile}] {text[:150]}")
+        for competitor, posts in competitor_feeds.items():
+            if isinstance(posts, list):
+                for p in posts[:2]:
+                    text = p.get("text", "") or p.get("title", "") or p.get("raw_content", "")[:200]
+                    if text:
+                        sample_posts.append(f"[COMPETITOR: {competitor}] {text[:150]}")
+        posts_text = "\n".join(sample_posts[:10]) if sample_posts else "No detailed data available"
+        prompt = f"""Analyze this competitive intelligence data and generate:
+1. A strategic 3-sentence executive summary
+2. Up to 5 unique business intelligence insights
+Data Overview:
+- Total intelligence: {summary_data['total_results']} items
+- Competitors tracked: {', '.join(summary_data['competitors_tracked']) or 'None'}
+- Products analyzed: {', '.join(summary_data['products_analyzed']) or 'None'}
+Sample Data:
+{posts_text}
+Respond in this exact JSON format:
+{{
+    "executive_summary": "Strategic 3-sentence summary of competitive landscape",
+    "insights": [
+        {{"summary": "Unique competitive insight #1", "severity": "low/medium/high", "impact_type": "risk/opportunity"}},
+        {{"summary": "Unique competitive insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}}
+    ]
+}}
+Rules:
+- Generate actionable business intelligence, not just data descriptions
+- Identify competitive threats as "risk", business opportunities as "opportunity"
+- Severity: high=urgent action needed, medium=monitor closely, low=informational
+JSON only:"""
         try:
             response = self.llm.invoke(prompt)
+            content = response.content if hasattr(response, 'content') else str(response)
+            # Parse JSON response
+            import re
+            content = content.strip()
+            if content.startswith("```"):
+                content = re.sub(r'^```\w*\n?', '', content)
+                content = re.sub(r'\n?```$', '', content)
+            result = json.loads(content)
+            llm_summary = result.get("executive_summary", llm_summary)
+            llm_insights = result.get("insights", [])
+            print(f"  ✓ LLM generated {len(llm_insights)} competitive insights")
+        except json.JSONDecodeError as e:
+            print(f"  ⚠️ JSON parse error: {e}")
+            # Fallback to simple summary
+            try:
+                fallback_prompt = f"Summarize this competitive intelligence in 3 sentences:\n{posts_text[:1500]}"
+                response = self.llm.invoke(fallback_prompt)
+                llm_summary = response.content if hasattr(response, 'content') else str(response)
+            except:
+                pass
         except Exception as e:
             print(f"  ⚠️ LLM error: {e}")
         return {
             "llm_summary": llm_summary,
+            "llm_insights": llm_insights,
             "structured_output": summary_data
         }
     def format_final_output(self, state: IntelligenceAgentState) -> Dict[str, Any]:
         """
+        Module 3C: Format final competitive intelligence feed with LLM-enhanced insights
         """
         print("[MODULE 3C] Formatting Final Output")
         competitor_feeds = state.get("competitor_feeds", {})
         product_feeds = state.get("product_review_feeds", {})
         llm_summary = state.get("llm_summary", "No summary available")
+        llm_insights = state.get("llm_insights", [])  # NEW: Get LLM-generated insights
         local_intel = state.get("local_intel", [])
         global_intel = state.get("global_intel", [])
 """
         # Create integration output with structured data
         structured_feeds = {
+            "profiles": profile_feeds,
+            "competitors": competitor_feeds,
+            "products": product_feeds,
             "local_intel": local_intel,
             "global_intel": global_intel
         }
+        # Create list for domain_insights (FRONTEND COMPATIBLE)
         domain_insights = []
         timestamp = datetime.utcnow().isoformat()
+        # PRIORITY 1: Add LLM-generated unique insights (curated and actionable)
+        for insight in llm_insights:
+            if isinstance(insight, dict) and insight.get("summary"):
                 domain_insights.append({
                     "source_event_id": str(uuid.uuid4()),
                     "domain": "intelligence",
+                    "summary": f"🎯 {insight.get('summary', '')}",  # Mark as AI-analyzed
+                    "severity": insight.get("severity", "medium"),
+                    "impact_type": insight.get("impact_type", "risk"),
+                    "timestamp": timestamp,
+                    "is_llm_generated": True
                 })
+        print(f"  ✓ Added {len(llm_insights)} LLM-generated competitive insights")
+        # PRIORITY 2: Add raw data only as fallback if LLM didn't generate enough
+        if len(domain_insights) < 5:
+            # Add competitor insights as fallback
+            for competitor, posts in competitor_feeds.items():
+                if not isinstance(posts, list):
                     continue
+                for post in posts[:3]:
+                    post_text = post.get("text", "") or post.get("title", "")
+                    if not post_text or len(post_text) < 20:
+                        continue
+                    severity = "high" if any(kw in post_text.lower() for kw in ["launch", "expansion", "acquisition"]) else "medium"
+                    domain_insights.append({
+                        "source_event_id": str(uuid.uuid4()),
+                        "domain": "intelligence",
+                        "summary": f"Competitor ({competitor}): {post_text[:200]}",
+                        "severity": severity,
+                        "impact_type": "risk",
+                        "timestamp": timestamp,
+                        "is_llm_generated": False
+                    })
+        # Add executive summary insight
         domain_insights.append({
             "source_event_id": str(uuid.uuid4()),
             "structured_data": structured_feeds,
             "domain": "intelligence",
+            "summary": f"📊 Business Intelligence Summary: {llm_summary[:300]}",
             "severity": "medium",
+            "impact_type": "risk",
+            "is_llm_generated": True
         })
+        print(f"  ✓ Created {len(domain_insights)} total intelligence insights")
         return {
             "final_feed": bulletin,

src/nodes/socialAgentNode.py CHANGED Viewed

@@ -404,45 +404,94 @@ class SocialAgentNode:
     def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
         """
-        Module 3B: Use Groq LLM to generate executive summary
         """
-        print("[MODULE 3B] Generating LLM Summary")
         structured_feeds = state.get("structured_output", {})
         try:
-            summary_prompt = f"""Analyze the following social intelligence data and create a concise executive summary of trending topics, events, and people.
-Data Summary:
-- Sri Lanka Trending: {len(structured_feeds.get('sri lanka', []))} items
-- Asia Trending: {len(structured_feeds.get('asia', []))} items
-- World Trending: {len(structured_feeds.get('world', []))} items
-Sample Data:
-{json.dumps(structured_feeds, indent=2)[:2000]}
-Generate a brief (3-5 sentences) executive summary highlighting the most important trending topics, events, and social developments."""
-            llm_response = self.llm.invoke(summary_prompt)
-            llm_summary = llm_response.content if hasattr(llm_response, 'content') else str(llm_response)
-            print("  ✓ LLM Summary Generated")
         except Exception as e:
             print(f"  ⚠️ LLM Error: {e}")
-            llm_summary = "AI summary currently unavailable."
         return {
-            "llm_summary": llm_summary
         }
     def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
         """
-        Module 3C: Format final feed output
         """
         print("[MODULE 3C] Formatting Final Output")
         llm_summary = state.get("llm_summary", "No summary available")
         structured_feeds = state.get("structured_output", {})
         trending_count = len([r for r in state.get("worker_results", []) if r.get("category") == "trending"])
@@ -483,91 +532,81 @@ Monitoring social sentiment, trending topics, events, and people across:
 Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
 """
-        # Create list for per-region/topic domain_insights (FRONTEND COMPATIBLE)
         domain_insights = []
         timestamp = datetime.utcnow().isoformat()
-        # Sri Lankan districts for geographic tagging
-        districts = [
-            "colombo", "gampaha", "kalutara", "kandy", "matale",
-            "nuwara eliya", "galle", "matara", "hambantota",
-            "jaffna", "kilinochchi", "mannar", "mullaitivu", "vavuniya",
-            "puttalam", "kurunegala", "anuradhapura", "polonnaruwa",
-            "badulla", "monaragala", "ratnapura", "kegalle",
-            "ampara", "batticaloa", "trincomalee"
-        ]
-        # 1. Create per-item Sri Lanka social insights
-        sri_lanka_data = structured_feeds.get("sri lanka", [])
-        for post in sri_lanka_data[:15]:
-            post_text = post.get("text", "") or post.get("title", "")
-            if not post_text or len(post_text) < 10:
-                continue
-            # Try to detect district from post text
-            detected_district = "Sri Lanka"
-            for district in districts:
-                if district.lower() in post_text.lower():
-                    detected_district = district.title()
-                    break
-            # Determine severity based on keywords
-            severity = "low"
-            if any(kw in post_text.lower() for kw in ["protest", "riot", "emergency", "violence", "crisis"]):
-                severity = "high"
-            elif any(kw in post_text.lower() for kw in ["trending", "viral", "breaking", "update"]):
-                severity = "medium"
-            domain_insights.append({
-                "source_event_id": str(uuid.uuid4()),
-                "domain": "social",
-                "summary": f"{detected_district}: {post_text[:200]}",
-                "severity": severity,
-                "impact_type": "risk" if severity in ["high", "medium"] else "opportunity",
-                "timestamp": timestamp
-            })
-        # 2. Create Asia regional insights
-        asia_data = structured_feeds.get("asia", [])
-        for post in asia_data[:5]:
-            post_text = post.get("text", "") or post.get("title", "")
-            if not post_text or len(post_text) < 10:
-                continue
-            domain_insights.append({
-                "source_event_id": str(uuid.uuid4()),
-                "domain": "social",
-                "summary": f"Asia Regional: {post_text[:200]}",
-                "severity": "medium",
-                "impact_type": "risk",
-                "timestamp": timestamp
-            })
-        # 3. Create World insights
-        world_data = structured_feeds.get("world", [])
-        for post in world_data[:5]:
-            post_text = post.get("text", "") or post.get("title", "")
-            if not post_text or len(post_text) < 10:
-                continue
-            domain_insights.append({
-                "source_event_id": str(uuid.uuid4()),
-                "domain": "social",
-                "summary": f"Global: {post_text[:200]}",
-                "severity": "low",
-                "impact_type": "opportunity",
-                "timestamp": timestamp
-            })
-        # 4. Add executive summary insight
         domain_insights.append({
             "source_event_id": str(uuid.uuid4()),
             "structured_data": structured_feeds,
             "domain": "social",
-            "summary": f"Sri Lanka Social Intelligence Summary: {llm_summary[:300]}",
             "severity": "medium",
-            "impact_type": "risk"
         })
-        print(f"  ✓ Created {len(domain_insights)} social intelligence insights")
         return {
             "final_feed": bulletin,

     def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
         """
         Module 3B: Use Groq LLM to generate executive summary AND structured insights

         Samples up to five posts per region from ``state["structured_output"]``,
         sends them to ``self.llm`` with a prompt requesting a JSON object, and
         extracts the ``executive_summary`` and ``insights`` fields from the reply.
         If the reply is not a JSON object, a second plain-text prompt is sent and
         its answer is used as the summary.

         Args:
             state: Agent state; only the ``structured_output`` key is read here.

         Returns:
             Dict with ``llm_summary`` (always a string) and ``llm_insights``
             (always a list, empty when the LLM produced nothing usable).
             LLM and parsing failures are logged and never raised.
         """
         # Local import keeps the method self-contained regardless of module imports.
         import re

         print("[MODULE 3B] Generating LLM Summary + Structured Insights")
         structured_feeds = state.get("structured_output", {}) or {}
         llm_summary = "AI summary currently unavailable."
         llm_insights = []
         posts_text = ""
         try:
             # Collect sample posts for analysis
             all_posts = []
             for region, posts in structured_feeds.items():
                 # One malformed region must not abort the whole analysis.
                 if not isinstance(posts, list):
                     continue
                 for p in posts[:5]:  # Top 5 per region
                     if not isinstance(p, dict):
                         continue
                     text = p.get("text", "") or p.get("title", "")
                     if isinstance(text, str) and len(text) > 20:
                         all_posts.append(f"[{str(region).upper()}] {text[:200]}")
             if not all_posts:
                 return {"llm_summary": llm_summary, "llm_insights": []}
             posts_text = "\n".join(all_posts[:15])
             # Generate summary AND structured insights
             analysis_prompt = (
                 "Analyze these social media posts from Sri Lanka and the region. Generate:\n"
                 "1. A 3-sentence executive summary of key trends\n"
                 "2. Up to 5 unique intelligence insights\n"
                 "Posts:\n"
                 f"{posts_text}\n"
                 "Respond in this exact JSON format:\n"
                 "{\n"
                 '    "executive_summary": "Brief 3-sentence summary of key social trends and developments",\n'
                 '    "insights": [\n'
                 '        {"summary": "Unique insight #1 (not copying post text)", "severity": "low/medium/high", "impact_type": "risk/opportunity"},\n'
                 '        {"summary": "Unique insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}\n'
                 "    ]\n"
                 "}\n"
                 "Rules:\n"
                 "- Generate NEW insights, don't just copy post text\n"
                 "- Identify patterns and emerging trends\n"
                 "- Classify severity based on potential impact\n"
                 '- Mark positive developments as "opportunity", concerning ones as "risk"\n'
                 "JSON only, no explanation:"
             )
             llm_response = self.llm.invoke(analysis_prompt)
             content = llm_response.content if hasattr(llm_response, 'content') else str(llm_response)
             # Parse JSON response; models often wrap it in a Markdown code fence.
             content = content.strip()
             if content.startswith("```"):
                 content = re.sub(r'^```\w*\n?', '', content)
                 content = re.sub(r'\n?```$', '', content)
             result = json.loads(content)
             if not isinstance(result, dict):
                 # Valid JSON but not the requested object: use the same fallback path.
                 raise json.JSONDecodeError("Expected a JSON object", content, 0)
             # Only accept well-typed fields so downstream slicing/iteration is safe.
             summary = result.get("executive_summary")
             if isinstance(summary, str) and summary.strip():
                 llm_summary = summary
             insights = result.get("insights")
             if isinstance(insights, list):
                 llm_insights = insights
             print(f"  ✓ LLM generated {len(llm_insights)} unique insights")
         except json.JSONDecodeError as e:
             print(f"  ⚠️ JSON parse error: {e}")
             # Fallback to simple summary
             try:
                 fallback_prompt = f"Summarize these social media trends in 3 sentences:\n{posts_text[:1500]}"
                 response = self.llm.invoke(fallback_prompt)
                 fallback_text = response.content if hasattr(response, 'content') else str(response)
                 if isinstance(fallback_text, str) and fallback_text.strip():
                     llm_summary = fallback_text
             except Exception as fallback_error:
                 # Best effort: keep the default summary, but do not hide the failure.
                 print(f"  ⚠️ LLM fallback error: {fallback_error}")
         except Exception as e:
             print(f"  ⚠️ LLM Error: {e}")
         return {
             "llm_summary": llm_summary,
             "llm_insights": llm_insights
         }
     def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
         """
+        Module 3C: Format final feed output with LLM-enhanced insights
         """
         print("[MODULE 3C] Formatting Final Output")
         llm_summary = state.get("llm_summary", "No summary available")
+        llm_insights = state.get("llm_insights", [])  # NEW: Get LLM-generated insights
         structured_feeds = state.get("structured_output", {})
         trending_count = len([r for r in state.get("worker_results", []) if r.get("category") == "trending"])
 Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
 """
+        # Create list for domain_insights (FRONTEND COMPATIBLE)
         domain_insights = []
         timestamp = datetime.utcnow().isoformat()
+        # PRIORITY 1: Add LLM-generated unique insights (these are curated and unique)
+        for insight in llm_insights:
+            if isinstance(insight, dict) and insight.get("summary"):
+                domain_insights.append({
+                    "source_event_id": str(uuid.uuid4()),
+                    "domain": "social",
+                    "summary": f"🔍 {insight.get('summary', '')}",  # Mark as AI-analyzed
+                    "severity": insight.get("severity", "medium"),
+                    "impact_type": insight.get("impact_type", "risk"),
+                    "timestamp": timestamp,
+                    "is_llm_generated": True  # Flag for frontend
+                })
+        print(f"  ✓ Added {len(llm_insights)} LLM-generated insights")
+        # PRIORITY 2: Add top raw posts only if we need more (fallback)
+        # Only add raw posts if LLM didn't generate enough insights
+        if len(domain_insights) < 5:
+            # Sri Lankan districts for geographic tagging
+            districts = [
+                "colombo", "gampaha", "kalutara", "kandy", "matale",
+                "nuwara eliya", "galle", "matara", "hambantota",
+                "jaffna", "kilinochchi", "mannar", "mullaitivu", "vavuniya",
+                "puttalam", "kurunegala", "anuradhapura", "polonnaruwa",
+                "badulla", "monaragala", "ratnapura", "kegalle",
+                "ampara", "batticaloa", "trincomalee"
+            ]
+            # Add Sri Lanka posts as fallback
+            sri_lanka_data = structured_feeds.get("sri lanka", [])
+            for post in sri_lanka_data[:5]:
+                post_text = post.get("text", "") or post.get("title", "")
+                if not post_text or len(post_text) < 20:
+                    continue
+                # Detect district
+                detected_district = "Sri Lanka"
+                for district in districts:
+                    if district.lower() in post_text.lower():
+                        detected_district = district.title()
+                        break
+                # Determine severity
+                severity = "low"
+                if any(kw in post_text.lower() for kw in ["protest", "riot", "emergency", "violence", "crisis"]):
+                    severity = "high"
+                elif any(kw in post_text.lower() for kw in ["trending", "viral", "breaking", "update"]):
+                    severity = "medium"
+                domain_insights.append({
+                    "source_event_id": str(uuid.uuid4()),
+                    "domain": "social",
+                    "summary": f"{detected_district}: {post_text[:200]}",
+                    "severity": severity,
+                    "impact_type": "risk" if severity in ["high", "medium"] else "opportunity",
+                    "timestamp": timestamp,
+                    "is_llm_generated": False
+                })
+        # Add executive summary insight
         domain_insights.append({
             "source_event_id": str(uuid.uuid4()),
             "structured_data": structured_feeds,
             "domain": "social",
+            "summary": f"📊 Social Intelligence Summary: {llm_summary[:300]}",
             "severity": "medium",
+            "impact_type": "risk",
+            "is_llm_generated": True
         })
+        print(f"  ✓ Created {len(domain_insights)} total social intelligence insights")
         return {
             "final_feed": bulletin,

src/utils/utils.py CHANGED Viewed

@@ -439,21 +439,15 @@ def scrape_rivernet_impl(
                 main_html = page.content()
                 main_soup = BeautifulSoup(main_html, "html.parser")
-                # Look for any alert/warning text on main page
-                page_text = main_soup.get_text(separator=" ", strip=True)
-                alert_keywords = ["warning", "alert", "flood", "danger", "high", "rising"]
-                for keyword in alert_keywords:
-                    if keyword.lower() in page_text.lower():
-                        # Extract context around keyword
-                        match = re.search(rf'.{{0,100}}{keyword}.{{0,100}}', page_text, re.I)
-                        if match:
-                            alert_text = match.group(0).strip()
-                            if len(alert_text) > 20 and alert_text not in [a.get("text") for a in results["alerts"]]:
-                                results["alerts"].append({
-                                    "text": alert_text,
-                                    "severity": "high" if keyword in ["danger", "flood"] else "medium",
-                                    "source": "rivernet.lk main page"
-                                })
                 logger.info("[RIVERNET] Main page loaded successfully")
@@ -518,26 +512,43 @@ def scrape_rivernet_impl(
                             except (ValueError, IndexError):
                                 continue
-                    # Determine status based on keywords (refined to avoid false positives)
                     text_lower = page_text.lower()
-                    # Default to normal
                     river_data["status"] = "normal"
-                    # DANGER / CRITICAL
-                    if any(w in text_lower for w in ["major flood", "danger level", "critical level", "red alert", "evacuate", "extreme flood"]):
                         river_data["status"] = "danger"
-                    # WARNING (Stricter: removed generic "high", "alert")
-                    elif any(w in text_lower for w in ["minor flood", "warning level", "flood alert", "amber alert", "high risk", "flood warning"]):
                         river_data["status"] = "warning"
-                    # RISING
-                    elif any(w in text_lower for w in ["water level rising", "rising trend", "level is rising"]):
                         river_data["status"] = "rising"
-                    # explicitly check for normal keywords to confirm (optional, as we default to normal)
-                    elif any(w in text_lower for w in ["normal", "safe", "stable", "low", "green", "decreasing"]):
                         river_data["status"] = "normal"
                     results["rivers"].append(river_data)

                 main_html = page.content()
                 main_soup = BeautifulSoup(main_html, "html.parser")
+                # NOTE: Disabled loose keyword extraction - was causing false positives
+                # Real flood alerts will be determined from individual river page status
+                # The previous alert_keywords approach matched generic site text like
+                # "warning: javascript required" causing fake alerts
+                # If we need main page alerts, look for specific alert banner elements
+                # alert_banners = main_soup.select(".alert-banner, .flood-warning, .critical-notice")
+                # for banner in alert_banners:
+                #     results["alerts"].append({...})
                 logger.info("[RIVERNET] Main page loaded successfully")
                             except (ValueError, IndexError):
                                 continue
+                    # Determine status based on keywords (STRICTER to avoid false positives)
                     text_lower = page_text.lower()
+                    # Default to normal - only escalate if clear flood indicators
                     river_data["status"] = "normal"
+                    # IMPORTANT: only consider keywords that appear in a flood context
+                    # Look for phrases, not just words, to avoid false positives
+                    # DANGER / CRITICAL - Very specific phrases only
+                    danger_phrases = [
+                        "major flood", "danger level exceeded", "critical flood",
+                        "red alert", "evacuate immediately", "extreme flood",
+                        "water level exceeds danger", "above danger level"
+                    ]
+                    if any(phrase in text_lower for phrase in danger_phrases):
                         river_data["status"] = "danger"
+                    # WARNING - Specific flood warning phrases
+                    elif any(phrase in text_lower for phrase in [
+                        "minor flood", "warning level exceeded", "flood alert issued",
+                        "amber alert", "approaching warning level",
+                        "water level exceeds warning", "above warning level"
+                    ]):
                         river_data["status"] = "warning"
+                    # RISING - Only if explicitly rising
+                    elif any(phrase in text_lower for phrase in [
+                        "water level rising", "rising trend detected",
+                        "level is rising rapidly", "increasing water level"
+                    ]):
                         river_data["status"] = "rising"
+                    # NORMAL indicators (optional: status already defaults to "normal" above)
+                    elif any(phrase in text_lower for phrase in [
+                        "normal level", "stable", "safe level", "decreasing", "below warning"
+                    ]):
                         river_data["status"] = "normal"
                     results["rivers"].append(river_data)