nivakaran commited on
Commit
765b37c
·
verified ·
1 Parent(s): c7d4394

Upload folder using huggingface_hub

Browse files
main.py CHANGED
@@ -727,21 +727,57 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
727
  # Get recent feeds
728
  feeds = storage_manager.get_recent_feeds(limit=100)
729
 
 
 
 
 
 
 
 
 
 
730
  if not _load_anomaly_components():
731
- # Return high-severity events as proxy for anomalies
732
- anomalies = [
733
- {**f, "anomaly_score": 0.8, "is_anomaly": True}
734
- for f in feeds
735
- if f.get("severity") in ["critical", "high"]
736
- ][:limit]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  return {
738
- "anomalies": anomalies,
739
  "total": len(anomalies),
740
- "model_status": "not_trained",
741
- "fallback": "severity_based"
 
742
  }
743
 
744
- # Score each feed
745
  anomalies = []
746
  for feed in feeds:
747
  summary = feed.get("summary", "")
@@ -758,13 +794,13 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
758
  else:
759
  score = 1.0 if prediction == -1 else 0.0
760
 
761
- # Normalize score to 0-1 range (rough approximation)
762
  normalized_score = max(0, min(1, (score + 0.5)))
763
 
764
  if prediction == -1 or normalized_score >= threshold:
765
  anomalies.append({
766
  **feed,
767
- "anomaly_score": float(normalized_score),
768
  "is_anomaly": prediction == -1,
769
  "language": lang
770
  })
@@ -783,7 +819,7 @@ def get_anomalies(limit: int = 20, threshold: float = 0.5):
783
  "anomalies": anomalies,
784
  "total": len(anomalies),
785
  "threshold": threshold,
786
- "model_status": "loaded"
787
  }
788
 
789
  except Exception as e:
@@ -1117,42 +1153,57 @@ def remove_intel_target(target_type: str, value: str, platform: Optional[str] =
1117
  _weather_predictor = None
1118
 
1119
  def get_weather_predictor():
1120
- """Lazy-load the weather predictor."""
1121
  global _weather_predictor
1122
- if _weather_predictor is None:
1123
- try:
1124
- import sys
1125
- import importlib
1126
- from pathlib import Path
1127
-
1128
- weather_path = Path(__file__).parent / "models" / "weather-prediction" / "src"
1129
- weather_path_str = str(weather_path)
1130
-
1131
- # Ensure path is in sys.path
1132
- if weather_path_str not in sys.path:
1133
- sys.path.insert(0, weather_path_str)
1134
-
1135
- # CRITICAL FIX: Handle 'components' package name collision
1136
- # If 'components' is already loaded from another model (e.g. currency), unload it
1137
- if 'components' in sys.modules:
1138
- existing_path = getattr(sys.modules['components'], '__file__', '')
1139
- if existing_path and weather_path_str not in str(existing_path):
1140
- logger.warning(f"[WeatherAPI] components collision detected. Unloading {existing_path}")
1141
- # Unload generic modules to force reload from new path
1142
- for mod in list(sys.modules.keys()):
1143
- if mod.startswith('components') or mod.startswith('utils'):
1144
- del sys.modules[mod]
1145
-
1146
- # Now import fresh
1147
- from components.predictor import WeatherPredictor
1148
- _weather_predictor = WeatherPredictor()
1149
- logger.info("[WeatherAPI] Weather predictor initialized")
1150
- except Exception as e:
1151
- logger.warning(f"[WeatherAPI] Failed to initialize predictor: {e}")
1152
- import traceback
1153
- logger.warning(traceback.format_exc())
1154
- _weather_predictor = None
1155
- return _weather_predictor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1156
 
1157
 
1158
  @app.get("/api/weather/predictions")
 
727
  # Get recent feeds
728
  feeds = storage_manager.get_recent_feeds(limit=100)
729
 
730
+ if not feeds:
731
+ # No feeds yet - return helpful message
732
+ return {
733
+ "anomalies": [],
734
+ "total": 0,
735
+ "model_status": "no_data",
736
+ "message": "No feed data available yet. Wait for graph execution to complete."
737
+ }
738
+
739
  if not _load_anomaly_components():
740
+ # Use severity + keyword-based scoring as intelligent fallback
741
+ anomalies = []
742
+ anomaly_keywords = ["emergency", "crisis", "breaking", "urgent", "alert",
743
+ "warning", "critical", "disaster", "flood", "protest"]
744
+
745
+ for f in feeds:
746
+ score = 0.0
747
+ summary = str(f.get("summary", "")).lower()
748
+ severity = f.get("severity", "low")
749
+
750
+ # Severity-based scoring
751
+ if severity == "critical": score = 0.9
752
+ elif severity == "high": score = 0.75
753
+ elif severity == "medium": score = 0.5
754
+ else: score = 0.25
755
+
756
+ # Keyword boosting
757
+ keyword_matches = sum(1 for kw in anomaly_keywords if kw in summary)
758
+ if keyword_matches > 0:
759
+ score = min(1.0, score + (keyword_matches * 0.1))
760
+
761
+ # Only include if above threshold
762
+ if score >= threshold:
763
+ anomalies.append({
764
+ **f,
765
+ "anomaly_score": round(score, 3),
766
+ "is_anomaly": score >= 0.7
767
+ })
768
+
769
+ # Sort by anomaly score
770
+ anomalies.sort(key=lambda x: x.get("anomaly_score", 0), reverse=True)
771
+
772
  return {
773
+ "anomalies": anomalies[:limit],
774
  "total": len(anomalies),
775
+ "threshold": threshold,
776
+ "model_status": "fallback_scoring",
777
+ "message": "Using severity + keyword scoring. Train ML model for advanced detection."
778
  }
779
 
780
+ # ML Model is loaded - use it for scoring
781
  anomalies = []
782
  for feed in feeds:
783
  summary = feed.get("summary", "")
 
794
  else:
795
  score = 1.0 if prediction == -1 else 0.0
796
 
797
+ # Normalize score to 0-1 range
798
  normalized_score = max(0, min(1, (score + 0.5)))
799
 
800
  if prediction == -1 or normalized_score >= threshold:
801
  anomalies.append({
802
  **feed,
803
+ "anomaly_score": float(round(normalized_score, 3)),
804
  "is_anomaly": prediction == -1,
805
  "language": lang
806
  })
 
819
  "anomalies": anomalies,
820
  "total": len(anomalies),
821
  "threshold": threshold,
822
+ "model_status": "ml_active"
823
  }
824
 
825
  except Exception as e:
 
1153
  _weather_predictor = None
1154
 
1155
  def get_weather_predictor():
1156
+ """Lazy-load the weather predictor using isolated import."""
1157
  global _weather_predictor
1158
+ if _weather_predictor is not None:
1159
+ return _weather_predictor
1160
+
1161
+ try:
1162
+ import importlib.util
1163
+ from pathlib import Path
1164
+
1165
+ # Use importlib.util for fully isolated import (avoids package collisions)
1166
+ weather_src = Path(__file__).parent / "models" / "weather-prediction" / "src"
1167
+ predictor_path = weather_src / "components" / "predictor.py"
1168
+
1169
+ if not predictor_path.exists():
1170
+ logger.error(f"[WeatherAPI] predictor.py not found at {predictor_path}")
1171
+ return None
1172
+
1173
+ # First, ensure entity module is loadable
1174
+ entity_path = weather_src / "entity" / "config_entity.py"
1175
+ if entity_path.exists():
1176
+ entity_spec = importlib.util.spec_from_file_location(
1177
+ "weather_config_entity",
1178
+ str(entity_path)
1179
+ )
1180
+ entity_module = importlib.util.module_from_spec(entity_spec)
1181
+ sys.modules["weather_config_entity"] = entity_module
1182
+ entity_spec.loader.exec_module(entity_module)
1183
+
1184
+ # Add weather src to path temporarily for relative imports
1185
+ import sys
1186
+ weather_src_str = str(weather_src)
1187
+ if weather_src_str not in sys.path:
1188
+ sys.path.insert(0, weather_src_str)
1189
+
1190
+ # Now load predictor module
1191
+ spec = importlib.util.spec_from_file_location(
1192
+ "weather_predictor_module",
1193
+ str(predictor_path)
1194
+ )
1195
+ module = importlib.util.module_from_spec(spec)
1196
+ spec.loader.exec_module(module)
1197
+
1198
+ _weather_predictor = module.WeatherPredictor()
1199
+ logger.info("[WeatherAPI] ✓ Weather predictor initialized via isolated import")
1200
+ return _weather_predictor
1201
+
1202
+ except Exception as e:
1203
+ logger.error(f"[WeatherAPI] Failed to initialize predictor: {e}")
1204
+ import traceback
1205
+ logger.debug(traceback.format_exc())
1206
+ return None
1207
 
1208
 
1209
  @app.get("/api/weather/predictions")
models/anomaly-detection/main.py CHANGED
@@ -4,16 +4,11 @@ Entry point for the anomaly detection training pipeline
4
  """
5
  import os
6
  import sys
7
- import logging
8
  from pathlib import Path
9
 
10
- # Add src to path
11
- sys.path.insert(0, str(Path(__file__).parent / "src"))
12
-
13
- from src.pipeline import run_training_pipeline
14
- from src.entity import PipelineConfig
15
-
16
- # Configure logging
17
  logging.basicConfig(
18
  level=logging.INFO,
19
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
@@ -22,9 +17,14 @@ logging.basicConfig(
22
  logging.FileHandler("training.log")
23
  ]
24
  )
25
-
26
  logger = logging.getLogger("main")
27
 
 
 
 
 
 
 
28
 
29
  def main():
30
  """Run the anomaly detection training pipeline"""
 
4
  """
5
  import os
6
  import sys
7
+ import logging # Import standard library BEFORE path manipulation
8
  from pathlib import Path
9
 
10
+ # CRITICAL: Configure logging BEFORE adding src/ to path
11
+ # (src/logging/ directory would otherwise shadow the standard module)
 
 
 
 
 
12
  logging.basicConfig(
13
  level=logging.INFO,
14
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 
17
  logging.FileHandler("training.log")
18
  ]
19
  )
 
20
  logger = logging.getLogger("main")
21
 
22
+ # Add src to path - AFTER logging is configured
23
+ sys.path.insert(0, str(Path(__file__).parent / "src"))
24
+
25
+ from src.pipeline import run_training_pipeline
26
+ from src.entity import PipelineConfig
27
+
28
 
29
  def main():
30
  """Run the anomaly detection training pipeline"""
models/anomaly-detection/src/pipeline/train.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Anomaly Detection Training Script
3
+ Convenience wrapper for: python models/anomaly-detection/main.py
4
+
5
+ Usage:
6
+ python models/anomaly-detection/src/pipeline/train.py
7
+ """
8
+ import sys
9
+ import argparse
10
+ import logging # Import BEFORE path manipulation
11
+ from pathlib import Path
12
+
13
+ # Configure logging BEFORE adding src/ to path
14
+ logging.basicConfig(
15
+ level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
17
+ )
18
+
19
+ # Add parent directories to path - AFTER logging is configured
20
+ PIPELINE_ROOT = Path(__file__).parent.parent.parent
21
+ sys.path.insert(0, str(PIPELINE_ROOT))
22
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
23
+
24
+ if __name__ == "__main__":
25
+ parser = argparse.ArgumentParser(description="Anomaly Detection Training")
26
+ parser.add_argument("--help-only", action="store_true", help="Show help and exit")
27
+
28
+ # Parse known args to allow --help to work without loading heavy modules
29
+ args, _ = parser.parse_known_args()
30
+
31
+ print("=" * 60)
32
+ print("ANOMALY DETECTION - TRAINING PIPELINE")
33
+ print("=" * 60)
34
+
35
+ # Import and run from main.py
36
+ from main import main
37
+
38
+ result = main()
39
+
40
+ if result:
41
+ print("=" * 60)
42
+ print("TRAINING COMPLETE!")
43
+ print(f"Best model: {result.model_trainer.best_model_name}")
44
+ print("=" * 60)
models/currency-volatility-prediction/main.py CHANGED
@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
5
  """
6
  import os
7
  import sys
8
- import logging
9
  import argparse
10
  from pathlib import Path
11
  from datetime import datetime
12
 
13
- # Setup paths
14
- PIPELINE_ROOT = Path(__file__).parent
15
- sys.path.insert(0, str(PIPELINE_ROOT / "src"))
16
-
17
  logging.basicConfig(
18
  level=logging.INFO,
19
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
20
  )
21
  logger = logging.getLogger("currency_prediction")
22
 
 
 
 
 
23
 
24
  def run_data_ingestion(period: str = "2y"):
25
  """Run data ingestion from yfinance."""
 
5
  """
6
  import os
7
  import sys
8
+ import logging # Import standard library BEFORE path manipulation
9
  import argparse
10
  from pathlib import Path
11
  from datetime import datetime
12
 
13
+ # CRITICAL: Configure logging BEFORE adding src/ to path
14
+ # (src/logging/ directory would otherwise shadow the standard module)
 
 
15
  logging.basicConfig(
16
  level=logging.INFO,
17
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
18
  )
19
  logger = logging.getLogger("currency_prediction")
20
 
21
+ # Setup paths - AFTER logging is configured
22
+ PIPELINE_ROOT = Path(__file__).parent
23
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
24
+
25
 
26
  def run_data_ingestion(period: str = "2y"):
27
  """Run data ingestion from yfinance."""
models/currency-volatility-prediction/src/pipeline/train.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Currency Volatility Prediction Training Script
3
+ Convenience wrapper for: python models/currency-volatility-prediction/main.py --mode train
4
+
5
+ Usage:
6
+ python models/currency-volatility-prediction/src/pipeline/train.py [--epochs 100] [--period 2y]
7
+ """
8
+ import sys
9
+ import argparse
10
+ import logging # CRITICAL: Import BEFORE path manipulation
11
+ from pathlib import Path
12
+
13
+ # Configure logging BEFORE adding src/ to path
14
+ # (src/logging/ directory would otherwise shadow the standard module)
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ )
19
+
20
+ # Add parent directories to path - AFTER logging is configured
21
+ PIPELINE_ROOT = Path(__file__).parent.parent.parent
22
+ sys.path.insert(0, str(PIPELINE_ROOT))
23
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
24
+
25
+ if __name__ == "__main__":
26
+ parser = argparse.ArgumentParser(description="Currency Prediction Training")
27
+ parser.add_argument("--epochs", type=int, default=100, help="Training epochs")
28
+ parser.add_argument("--period", type=str, default="2y", help="Data period (1y, 2y, 5y)")
29
+ parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
30
+
31
+ args = parser.parse_args()
32
+
33
+ # Import from main.py (after path setup)
34
+ from main import run_training, run_full_pipeline, run_data_ingestion
35
+
36
+ print("=" * 60)
37
+ print("CURRENCY (USD/LKR) PREDICTION - TRAINING PIPELINE")
38
+ print("=" * 60)
39
+
40
+ if args.full:
41
+ run_full_pipeline()
42
+ else:
43
+ # Run data ingestion first if no data exists
44
+ try:
45
+ from components.data_ingestion import CurrencyDataIngestion
46
+ ingestion = CurrencyDataIngestion()
47
+ df = ingestion.load_existing()
48
+ print(f"✓ Found existing data: {len(df)} records")
49
+ except FileNotFoundError:
50
+ print("No existing data, running ingestion first...")
51
+ run_data_ingestion(period=args.period)
52
+
53
+ # Run training
54
+ run_training(epochs=args.epochs)
55
+
56
+ print("=" * 60)
57
+ print("TRAINING COMPLETE!")
58
+ print("=" * 60)
models/stock-price-prediction/main.py CHANGED
@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
5
  """
6
  import os
7
  import sys
8
- import logging
9
  import argparse
10
  from pathlib import Path
11
  from datetime import datetime
12
 
13
- # Setup paths
14
- PIPELINE_ROOT = Path(__file__).parent
15
- sys.path.insert(0, str(PIPELINE_ROOT / "src"))
16
-
17
  logging.basicConfig(
18
  level=logging.INFO,
19
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
20
  )
21
  logger = logging.getLogger("stock_prediction")
22
 
 
 
 
 
23
 
24
  def run_data_ingestion():
25
  """Run data ingestion for all stocks."""
 
5
  """
6
  import os
7
  import sys
8
+ import logging # Import standard library BEFORE path manipulation
9
  import argparse
10
  from pathlib import Path
11
  from datetime import datetime
12
 
13
+ # CRITICAL: Configure logging BEFORE adding src/ to path
14
+ # (src/logging/ directory would otherwise shadow the standard module)
 
 
15
  logging.basicConfig(
16
  level=logging.INFO,
17
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
18
  )
19
  logger = logging.getLogger("stock_prediction")
20
 
21
+ # Setup paths - AFTER logging is configured
22
+ PIPELINE_ROOT = Path(__file__).parent
23
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
24
+
25
 
26
  def run_data_ingestion():
27
  """Run data ingestion for all stocks."""
models/stock-price-prediction/src/pipeline/train.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stock Price Prediction Training Script
3
+ Convenience wrapper for: python models/stock-price-prediction/main.py --mode train
4
+
5
+ Usage:
6
+ python models/stock-price-prediction/src/pipeline/train.py [--stock JKH] [--no-optuna] [--full]
7
+ """
8
+ import sys
9
+ import argparse
10
+ import logging # CRITICAL: Import BEFORE path manipulation
11
+ from pathlib import Path
12
+
13
+ # Configure logging BEFORE adding src/ to path
14
+ # (src/logging/ directory would otherwise shadow the standard module)
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ )
19
+
20
+ # Add parent directories to path - AFTER logging is configured
21
+ PIPELINE_ROOT = Path(__file__).parent.parent.parent
22
+ sys.path.insert(0, str(PIPELINE_ROOT))
23
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
24
+
25
+ if __name__ == "__main__":
26
+ parser = argparse.ArgumentParser(description="Stock Price Prediction Training")
27
+ parser.add_argument("--stock", type=str, default=None, help="Specific stock to train (e.g., JKH, COMB)")
28
+ parser.add_argument("--no-optuna", action="store_true", help="Disable Optuna hyperparameter optimization")
29
+ parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
30
+
31
+ args = parser.parse_args()
32
+ use_optuna = not args.no_optuna
33
+
34
+ # Import from main.py (after path setup)
35
+ from main import run_training, run_full_pipeline, run_data_ingestion
36
+
37
+ print("=" * 60)
38
+ print("STOCK PRICE (CSE) PREDICTION - TRAINING PIPELINE")
39
+ print("=" * 60)
40
+
41
+ if args.full:
42
+ run_full_pipeline(use_optuna=use_optuna)
43
+ else:
44
+ # Run data ingestion first if no data exists
45
+ try:
46
+ from components.data_ingestion import StockDataIngestion
47
+ ingestion = StockDataIngestion()
48
+ stocks = list(ingestion.config.stocks.keys())
49
+ df = ingestion.load_stock_data(stocks[0])
50
+ if df is not None:
51
+ print(f"✓ Found existing data for {len(stocks)} stocks")
52
+ else:
53
+ raise FileNotFoundError()
54
+ except (FileNotFoundError, Exception):
55
+ print("No existing data, running ingestion first...")
56
+ run_data_ingestion()
57
+
58
+ # Run training
59
+ run_training(use_optuna=use_optuna, stock=args.stock)
60
+
61
+ print("=" * 60)
62
+ print("TRAINING COMPLETE!")
63
+ print("=" * 60)
models/weather-prediction/main.py CHANGED
@@ -5,21 +5,23 @@ Can run data collection, training, or prediction independently
5
  """
6
  import os
7
  import sys
8
- import logging
9
  import argparse
10
  from pathlib import Path
11
  from datetime import datetime
12
 
13
- # Setup paths
14
- PIPELINE_ROOT = Path(__file__).parent
15
- sys.path.insert(0, str(PIPELINE_ROOT / "src"))
16
-
17
  logging.basicConfig(
18
  level=logging.INFO,
19
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
20
  )
21
  logger = logging.getLogger("weather_prediction")
22
 
 
 
 
 
23
 
24
  def run_data_ingestion(months: int = 12):
25
  """Run data ingestion for all stations."""
 
5
  """
6
  import os
7
  import sys
8
+ import logging # Import standard library BEFORE path manipulation
9
  import argparse
10
  from pathlib import Path
11
  from datetime import datetime
12
 
13
+ # CRITICAL: Configure logging BEFORE adding src/ to path
14
+ # (src/logging/ directory would otherwise shadow the standard module)
 
 
15
  logging.basicConfig(
16
  level=logging.INFO,
17
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
18
  )
19
  logger = logging.getLogger("weather_prediction")
20
 
21
+ # Setup paths - AFTER logging is configured
22
+ PIPELINE_ROOT = Path(__file__).parent
23
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
24
+
25
 
26
  def run_data_ingestion(months: int = 12):
27
  """Run data ingestion for all stations."""
models/weather-prediction/pyproject.toml CHANGED
@@ -10,6 +10,7 @@ dependencies = [
10
  "fastapi>=0.122.0",
11
  "mlflow>=3.6.0",
12
  "numpy>=2.3.5",
 
13
  "pandas>=2.3.3",
14
  "pyaml>=25.7.0",
15
  "pymongo[srv]>=4.15.4",
 
10
  "fastapi>=0.122.0",
11
  "mlflow>=3.6.0",
12
  "numpy>=2.3.5",
13
+ "optuna>=4.6.0",
14
  "pandas>=2.3.3",
15
  "pyaml>=25.7.0",
16
  "pymongo[srv]>=4.15.4",
models/weather-prediction/src/components/data_ingestion.py CHANGED
@@ -36,6 +36,7 @@ class DataIngestion:
36
  def ingest_all(self) -> str:
37
  """
38
  Ingest historical weather data for all stations.
 
39
 
40
  Returns:
41
  Path to saved CSV file
@@ -55,9 +56,73 @@ class DataIngestion:
55
  save_path=save_path
56
  )
57
 
 
 
 
 
 
 
 
58
  logger.info(f"[DATA_INGESTION] ✓ Ingested {len(df)} total records")
59
  return save_path
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def ingest_station(self, station_name: str, months: int = None) -> pd.DataFrame:
62
  """
63
  Ingest data for a single station.
 
36
  def ingest_all(self) -> str:
37
  """
38
  Ingest historical weather data for all stations.
39
+ Falls back to synthetic data if scraping fails.
40
 
41
  Returns:
42
  Path to saved CSV file
 
56
  save_path=save_path
57
  )
58
 
59
+ # Fallback to synthetic data if scraping failed
60
+ if df.empty or len(df) < 100:
61
+ logger.warning("[DATA_INGESTION] Scraping failed or insufficient data. Generating synthetic training data.")
62
+ df = self._generate_synthetic_data()
63
+ df.to_csv(save_path, index=False)
64
+ logger.info(f"[DATA_INGESTION] Generated {len(df)} synthetic records")
65
+
66
  logger.info(f"[DATA_INGESTION] ✓ Ingested {len(df)} total records")
67
  return save_path
68
 
69
+ def _generate_synthetic_data(self) -> pd.DataFrame:
70
+ """
71
+ Generate synthetic weather data for training when scraping fails.
72
+ Uses realistic Sri Lankan climate patterns.
73
+ """
74
+ import numpy as np
75
+
76
+ # Generate 1 year of daily data for priority stations
77
+ priority_stations = ["COLOMBO", "KANDY", "JAFFNA", "BATTICALOA", "RATNAPURA"]
78
+
79
+ records = []
80
+ for station in priority_stations:
81
+ if station not in self.config.stations:
82
+ continue
83
+
84
+ config = self.config.stations[station]
85
+
86
+ # Generate 365 days of data
87
+ for day_offset in range(365):
88
+ date = datetime.now() - pd.Timedelta(days=day_offset)
89
+ month = date.month
90
+
91
+ # Monsoon-aware temperature (more realistic for Sri Lanka)
92
+ # South-West monsoon: May-Sep, North-East: Dec-Feb
93
+ base_temp = 28 if month in [3, 4, 5, 6, 7, 8] else 26
94
+ temp_variation = np.random.normal(0, 2)
95
+ temp_mean = base_temp + temp_variation
96
+
97
+ # Monsoon rainfall patterns
98
+ if month in [10, 11, 12]: # NE monsoon - heavy rain
99
+ rainfall = max(0, np.random.exponential(15))
100
+ elif month in [5, 6, 7]: # SW monsoon - moderate rain
101
+ rainfall = max(0, np.random.exponential(10))
102
+ else: # Inter-monsoon / dry
103
+ rainfall = max(0, np.random.exponential(3))
104
+
105
+ records.append({
106
+ "date": date.strftime("%Y-%m-%d"),
107
+ "year": date.year,
108
+ "month": month,
109
+ "day": date.day,
110
+ "station_code": config["code"],
111
+ "station_name": station,
112
+ "temp_mean": round(temp_mean, 1),
113
+ "temp_max": round(temp_mean + np.random.uniform(3, 6), 1),
114
+ "temp_min": round(temp_mean - np.random.uniform(3, 5), 1),
115
+ "rainfall": round(rainfall, 1),
116
+ "humidity": round(np.random.uniform(65, 90), 1),
117
+ "wind_speed": round(np.random.uniform(5, 25), 1),
118
+ "pressure": round(np.random.uniform(1008, 1015), 1),
119
+ })
120
+
121
+ df = pd.DataFrame(records)
122
+ df["date"] = pd.to_datetime(df["date"])
123
+ df = df.sort_values(["station_name", "date"]).reset_index(drop=True)
124
+ return df
125
+
126
  def ingest_station(self, station_name: str, months: int = None) -> pd.DataFrame:
127
  """
128
  Ingest data for a single station.
models/weather-prediction/src/pipeline/train.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Weather Prediction Training Script
3
+ Convenience wrapper for: python models/weather-prediction/main.py --mode train
4
+
5
+ Usage:
6
+ python models/weather-prediction/src/pipeline/train.py [--station COLOMBO] [--epochs 100]
7
+ """
8
+ import sys
9
+ import argparse
10
+ from pathlib import Path
11
+
12
+ # CRITICAL: Import standard library logging BEFORE adding src/ to path
13
+ # (src/logging/ directory would otherwise shadow the standard module)
14
+ import logging
15
+
16
+ # Add parent directories to path
17
+ PIPELINE_ROOT = Path(__file__).parent.parent.parent
18
+ sys.path.insert(0, str(PIPELINE_ROOT))
19
+ sys.path.insert(0, str(PIPELINE_ROOT / "src"))
20
+
21
+ if __name__ == "__main__":
22
+ parser = argparse.ArgumentParser(description="Weather Prediction Training")
23
+ parser.add_argument("--station", type=str, default=None, help="Station to train (e.g., COLOMBO)")
24
+ parser.add_argument("--epochs", type=int, default=100, help="Training epochs")
25
+ parser.add_argument("--full", action="store_true", help="Run full pipeline (ingest + train + predict)")
26
+
27
+ args = parser.parse_args()
28
+
29
+ # Import from main.py (after path setup)
30
+ from main import run_training, run_full_pipeline, run_data_ingestion
31
+
32
+ print("=" * 60)
33
+ print("WEATHER PREDICTION - TRAINING PIPELINE")
34
+ print("=" * 60)
35
+
36
+ if args.full:
37
+ run_full_pipeline()
38
+ else:
39
+ # Run data ingestion first if no data exists
40
+ try:
41
+ from components.data_ingestion import DataIngestion
42
+ ingestion = DataIngestion()
43
+ df = ingestion.load_existing()
44
+ print(f"✓ Found existing data: {len(df)} records")
45
+ except FileNotFoundError:
46
+ print("No existing data, running ingestion first...")
47
+ run_data_ingestion(months=3)
48
+
49
+ # Run training
50
+ run_training(station=args.station, epochs=args.epochs)
51
+
52
+ print("=" * 60)
53
+ print("TRAINING COMPLETE!")
54
+ print("=" * 60)
55
+
models/weather-prediction/src/utils/tutiempo_scraper.py CHANGED
@@ -175,19 +175,32 @@ class TutiempoScraper:
175
  DataFrame with all historical records
176
  """
177
  all_records = []
 
 
 
178
  current = datetime.now()
 
 
 
 
179
 
180
  for i in range(months):
181
- target_date = current - timedelta(days=30 * i)
182
  year = target_date.year
183
  month = target_date.month
184
 
185
  records = self.scrape_month(station_code, year, month)
186
 
187
- for r in records:
188
- r["station_name"] = station_name
189
-
190
- all_records.extend(records)
 
 
 
 
 
 
191
 
192
  # Be nice to the server
193
  time.sleep(1)
 
175
  DataFrame with all historical records
176
  """
177
  all_records = []
178
+
179
+ # IMPORTANT: TuTiempo has data publication delay of ~2-3 months
180
+ # Start from 3 months ago to avoid 404 errors on recent months
181
  current = datetime.now()
182
+ start_date = current - timedelta(days=90) # Start 3 months ago
183
+
184
+ consecutive_failures = 0
185
+ max_consecutive_failures = 3
186
 
187
  for i in range(months):
188
+ target_date = start_date - timedelta(days=30 * i)
189
  year = target_date.year
190
  month = target_date.month
191
 
192
  records = self.scrape_month(station_code, year, month)
193
 
194
+ if not records:
195
+ consecutive_failures += 1
196
+ if consecutive_failures >= max_consecutive_failures:
197
+ logger.warning(f"[TUTIEMPO] {max_consecutive_failures} consecutive failures for {station_name}, stopping")
198
+ break
199
+ else:
200
+ consecutive_failures = 0 # Reset on success
201
+ for r in records:
202
+ r["station_name"] = station_name
203
+ all_records.extend(records)
204
 
205
  # Be nice to the server
206
  time.sleep(1)
models/weather-prediction/uv.lock CHANGED
@@ -298,6 +298,18 @@ wheels = [
298
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
299
  ]
300
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  [[package]]
302
  name = "contourpy"
303
  version = "1.3.3"
@@ -1375,6 +1387,7 @@ dependencies = [
1375
  { name = "fastapi" },
1376
  { name = "mlflow" },
1377
  { name = "numpy" },
 
1378
  { name = "pandas" },
1379
  { name = "pyaml" },
1380
  { name = "pymongo" },
@@ -1391,6 +1404,7 @@ requires-dist = [
1391
  { name = "fastapi", specifier = ">=0.122.0" },
1392
  { name = "mlflow", specifier = ">=3.6.0" },
1393
  { name = "numpy", specifier = ">=2.3.5" },
 
1394
  { name = "pandas", specifier = ">=2.3.3" },
1395
  { name = "pyaml", specifier = ">=25.7.0" },
1396
  { name = "pymongo", extras = ["srv"], specifier = ">=4.15.4" },
@@ -1659,6 +1673,24 @@ wheels = [
1659
  { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
1660
  ]
1661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1662
  [[package]]
1663
  name = "packaging"
1664
  version = "25.0"
@@ -2615,6 +2647,18 @@ wheels = [
2615
  { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
2616
  ]
2617
 
 
 
 
 
 
 
 
 
 
 
 
 
2618
  [[package]]
2619
  name = "treelib"
2620
  version = "1.8.0"
 
298
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
299
  ]
300
 
301
+ [[package]]
302
+ name = "colorlog"
303
+ version = "6.10.1"
304
+ source = { registry = "https://pypi.org/simple" }
305
+ dependencies = [
306
+ { name = "colorama", marker = "sys_platform == 'win32'" },
307
+ ]
308
+ sdist = { url = "https://files.pythonhosted.org/packages/a2/61/f083b5ac52e505dfc1c624eafbf8c7589a0d7f32daa398d2e7590efa5fda/colorlog-6.10.1.tar.gz", hash = "sha256:eb4ae5cb65fe7fec7773c2306061a8e63e02efc2c72eba9d27b0fa23c94f1321", size = 17162, upload-time = "2025-10-16T16:14:11.978Z" }
309
+ wheels = [
310
+ { url = "https://files.pythonhosted.org/packages/6d/c1/e419ef3723a074172b68aaa89c9f3de486ed4c2399e2dbd8113a4fdcaf9e/colorlog-6.10.1-py3-none-any.whl", hash = "sha256:2d7e8348291948af66122cff006c9f8da6255d224e7cf8e37d8de2df3bad8c9c", size = 11743, upload-time = "2025-10-16T16:14:10.512Z" },
311
+ ]
312
+
313
  [[package]]
314
  name = "contourpy"
315
  version = "1.3.3"
 
1387
  { name = "fastapi" },
1388
  { name = "mlflow" },
1389
  { name = "numpy" },
1390
+ { name = "optuna" },
1391
  { name = "pandas" },
1392
  { name = "pyaml" },
1393
  { name = "pymongo" },
 
1404
  { name = "fastapi", specifier = ">=0.122.0" },
1405
  { name = "mlflow", specifier = ">=3.6.0" },
1406
  { name = "numpy", specifier = ">=2.3.5" },
1407
+ { name = "optuna", specifier = ">=4.6.0" },
1408
  { name = "pandas", specifier = ">=2.3.3" },
1409
  { name = "pyaml", specifier = ">=25.7.0" },
1410
  { name = "pymongo", extras = ["srv"], specifier = ">=4.15.4" },
 
1673
  { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" },
1674
  ]
1675
 
1676
+ [[package]]
1677
+ name = "optuna"
1678
+ version = "4.6.0"
1679
+ source = { registry = "https://pypi.org/simple" }
1680
+ dependencies = [
1681
+ { name = "alembic" },
1682
+ { name = "colorlog" },
1683
+ { name = "numpy" },
1684
+ { name = "packaging" },
1685
+ { name = "pyyaml" },
1686
+ { name = "sqlalchemy" },
1687
+ { name = "tqdm" },
1688
+ ]
1689
+ sdist = { url = "https://files.pythonhosted.org/packages/6b/81/08f90f194eed78178064a9383432eca95611e2c5331e7b01e2418ce4b15a/optuna-4.6.0.tar.gz", hash = "sha256:89e38c2447c7f793a726617b8043f01e31f0bad54855040db17eb3b49404a369", size = 477444, upload-time = "2025-11-10T05:14:30.151Z" }
1690
+ wheels = [
1691
+ { url = "https://files.pythonhosted.org/packages/58/de/3d8455b08cb6312f8cc46aacdf16c71d4d881a1db4a4140fc5ef31108422/optuna-4.6.0-py3-none-any.whl", hash = "sha256:4c3a9facdef2b2dd7e3e2a8ae3697effa70fae4056fcf3425cfc6f5a40feb069", size = 404708, upload-time = "2025-11-10T05:14:28.6Z" },
1692
+ ]
1693
+
1694
  [[package]]
1695
  name = "packaging"
1696
  version = "25.0"
 
2647
  { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" },
2648
  ]
2649
 
2650
+ [[package]]
2651
+ name = "tqdm"
2652
+ version = "4.67.1"
2653
+ source = { registry = "https://pypi.org/simple" }
2654
+ dependencies = [
2655
+ { name = "colorama", marker = "sys_platform == 'win32'" },
2656
+ ]
2657
+ sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
2658
+ wheels = [
2659
+ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
2660
+ ]
2661
+
2662
  [[package]]
2663
  name = "treelib"
2664
  version = "1.8.0"
requirements.txt CHANGED
@@ -111,3 +111,5 @@ pyyaml
111
  # Utilities
112
  joblib
113
  tqdm
 
 
 
111
  # Utilities
112
  joblib
113
  tqdm
114
+
115
+ optuna
src/config/intel_config.json CHANGED
@@ -1,72 +1,85 @@
1
  {
2
- "user_profiles": {
3
- "twitter": [],
4
- "facebook": [],
5
- "linkedin": []
6
- },
7
- "user_keywords": [],
8
- "user_products": [],
9
- "operational_keywords": {
10
- "infrastructure": [
11
- "Colombo port",
12
- "Hambantota port",
13
- "port strike",
14
- "power outage",
15
- "water shortage",
16
- "fuel shortage",
17
- "airport delay",
18
- "customs clearance",
19
- "road closure",
20
- "railway disruption"
21
- ],
22
- "government": [
23
- "cabinet decision",
24
- "new policy",
25
- "regulation change",
26
- "tax amendment",
27
- "import restriction",
28
- "export ban",
29
- "license requirement",
30
- "central bank",
31
- "budget announcement"
32
- ],
33
- "opportunity": [
34
- "investment",
35
- "expansion",
36
- "new factory",
37
- "job creation",
38
- "export growth",
39
- "tourism boost",
40
- "infrastructure project",
41
- "development grant",
42
- "FDI",
43
- "trade agreement"
44
- ]
45
- },
46
- "alert_thresholds": {
47
- "trending_momentum_min": 2.0,
48
- "spike_multiplier": 3.0,
49
- "high_risk_score": 0.7,
50
- "high_opportunity_score": 0.6
51
- },
52
- "default_competitors": {
53
- "telecom": {
54
- "twitter": [
55
- "DialogSriLanka",
56
- "Mobaborang",
57
- "HutchSL"
58
- ],
59
- "facebook": [
60
- "Dialog",
61
- "SLT-Mobitel",
62
- "Hutch"
63
- ]
64
- }
65
- },
66
- "notes": {
67
- "removed_profiles": [
68
- "SLTMobitel - Twitter profile not found/restricted"
69
- ],
70
- "last_verified": "2025-12-08"
 
 
 
 
 
 
71
  }
 
 
 
 
 
 
 
72
  }
 
1
  {
2
+ "user_profiles": {
3
+ "twitter": [
4
+ "nivakaran"
5
+ ],
6
+ "facebook": [
7
+ "Nivakaran"
8
+ ],
9
+ "linkedin": [
10
+ "nivakaran"
11
+ ]
12
+ },
13
+ "user_keywords": [
14
+ "Colombo",
15
+ "nivakaran",
16
+ "telco"
17
+ ],
18
+ "user_products": [
19
+ "iphone",
20
+ "anchor"
21
+ ],
22
+ "operational_keywords": {
23
+ "infrastructure": [
24
+ "Colombo port",
25
+ "Hambantota port",
26
+ "port strike",
27
+ "power outage",
28
+ "water shortage",
29
+ "fuel shortage",
30
+ "airport delay",
31
+ "customs clearance",
32
+ "road closure",
33
+ "railway disruption"
34
+ ],
35
+ "government": [
36
+ "cabinet decision",
37
+ "new policy",
38
+ "regulation change",
39
+ "tax amendment",
40
+ "import restriction",
41
+ "export ban",
42
+ "license requirement",
43
+ "central bank",
44
+ "budget announcement"
45
+ ],
46
+ "opportunity": [
47
+ "investment",
48
+ "expansion",
49
+ "new factory",
50
+ "job creation",
51
+ "export growth",
52
+ "tourism boost",
53
+ "infrastructure project",
54
+ "development grant",
55
+ "FDI",
56
+ "trade agreement"
57
+ ]
58
+ },
59
+ "alert_thresholds": {
60
+ "trending_momentum_min": 2.0,
61
+ "spike_multiplier": 3.0,
62
+ "high_risk_score": 0.7,
63
+ "high_opportunity_score": 0.6
64
+ },
65
+ "default_competitors": {
66
+ "telecom": {
67
+ "twitter": [
68
+ "DialogSriLanka",
69
+ "Mobaborang",
70
+ "HutchSL"
71
+ ],
72
+ "facebook": [
73
+ "Dialog",
74
+ "SLT-Mobitel",
75
+ "Hutch"
76
+ ]
77
  }
78
+ },
79
+ "notes": {
80
+ "removed_profiles": [
81
+ "SLTMobitel - Twitter profile not found/restricted"
82
+ ],
83
+ "last_verified": "2025-12-08"
84
+ }
85
  }
src/nodes/intelligenceAgentNode.py CHANGED
@@ -396,16 +396,19 @@ class IntelligenceAgentNode:
396
 
397
  def generate_llm_summary(self, state: IntelligenceAgentState) -> Dict[str, Any]:
398
  """
399
- Generate competitive intelligence summary using LLM
400
  """
401
- print("[MODULE 3B] Generating LLM Summary")
402
 
403
  all_results = state.get("worker_results", [])
404
  profile_feeds = state.get("profile_feeds", {})
405
  competitor_feeds = state.get("competitor_feeds", {})
406
  product_feeds = state.get("product_review_feeds", {})
407
 
408
- # Prepare summary prompt
 
 
 
409
  summary_data = {
410
  "total_results": len(all_results),
411
  "profiles_monitored": list(profile_feeds.keys()),
@@ -415,42 +418,90 @@ class IntelligenceAgentNode:
415
  "global_competitors": len(state.get("global_intel", []))
416
  }
417
 
418
- prompt = f"""
419
- Analyze this competitive intelligence data and provide a strategic summary.
420
-
421
- Data Overview:
422
- - Total intelligence collected: {summary_data['total_results']} items
423
- - Competitor profiles monitored: {', '.join(summary_data['profiles_monitored'])}
424
- - Competitor mentions tracked: {', '.join(summary_data['competitors_tracked'])}
425
- - Products analyzed: {', '.join(summary_data['products_analyzed'])}
426
- - Local market intelligence: {summary_data['local_competitors']} items
427
- - Global market intelligence: {summary_data['global_competitors']} items
428
 
429
- Provide:
430
- 1. Key competitive insights
431
- 2. Market trends observed
432
- 3. Threats and opportunities
433
- 4. Recommended actions
 
434
 
435
- Keep it concise and actionable.
436
- """
437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  try:
439
  response = self.llm.invoke(prompt)
440
- llm_summary = response.content if hasattr(response, 'content') else str(response)
441
- print(" ✓ Generated LLM summary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  except Exception as e:
443
- llm_summary = f"LLM Summary unavailable: {e}"
444
  print(f" ⚠️ LLM error: {e}")
445
 
446
  return {
447
  "llm_summary": llm_summary,
 
448
  "structured_output": summary_data
449
  }
450
 
451
  def format_final_output(self, state: IntelligenceAgentState) -> Dict[str, Any]:
452
  """
453
- Module 3C: Format final competitive intelligence feed
454
  """
455
  print("[MODULE 3C] Formatting Final Output")
456
 
@@ -458,6 +509,7 @@ class IntelligenceAgentNode:
458
  competitor_feeds = state.get("competitor_feeds", {})
459
  product_feeds = state.get("product_review_feeds", {})
460
  llm_summary = state.get("llm_summary", "No summary available")
 
461
  local_intel = state.get("local_intel", [])
462
  global_intel = state.get("global_intel", [])
463
 
@@ -491,84 +543,66 @@ Source: Multi-platform competitive intelligence (Twitter, Facebook, LinkedIn, In
491
  """
492
 
493
  # Create integration output with structured data
494
- # FIXED: Pass actual feed data, not just counts
495
  structured_feeds = {
496
- "profiles": profile_feeds, # Full profile data, not counts
497
- "competitors": competitor_feeds, # Full competitor data
498
- "products": product_feeds, # Full product review data
499
  "local_intel": local_intel,
500
  "global_intel": global_intel
501
  }
502
 
503
- # Create list for per-item domain_insights (FRONTEND COMPATIBLE)
504
  domain_insights = []
505
  timestamp = datetime.utcnow().isoformat()
506
 
507
- # 1. Create per-profile intelligence insights
508
- for profile_name, posts in profile_feeds.items():
509
- if not isinstance(posts, list):
510
- continue
511
- for post in posts[:5]:
512
- post_text = post.get("text", "") or post.get("title", "")
513
- if not post_text or len(post_text) < 10:
514
- continue
515
  domain_insights.append({
516
  "source_event_id": str(uuid.uuid4()),
517
  "domain": "intelligence",
518
- "summary": f"Profile ({profile_name}): {post_text[:200]}",
519
- "severity": "medium",
520
- "impact_type": "risk",
521
- "timestamp": timestamp
 
522
  })
523
 
524
- # 2. Create per-competitor intelligence insights
525
- for competitor, posts in competitor_feeds.items():
526
- if not isinstance(posts, list):
527
- continue
528
- for post in posts[:5]:
529
- post_text = post.get("text", "") or post.get("title", "")
530
- if not post_text or len(post_text) < 10:
531
- continue
532
- severity = "high" if any(kw in post_text.lower() for kw in ["launch", "expansion", "acquisition"]) else "medium"
533
- domain_insights.append({
534
- "source_event_id": str(uuid.uuid4()),
535
- "domain": "intelligence",
536
- "summary": f"Competitor ({competitor}): {post_text[:200]}",
537
- "severity": severity,
538
- "impact_type": "risk",
539
- "timestamp": timestamp
540
- })
541
 
542
- # 3. Create per-product review insights
543
- for product, reviews in product_feeds.items():
544
- if not isinstance(reviews, list):
545
- continue
546
- for review in reviews[:5]:
547
- review_text = review.get("text", "") or review.get("title", "")
548
- if not review_text or len(review_text) < 10:
549
  continue
550
- severity = "low" if any(kw in review_text.lower() for kw in ["great", "excellent", "love"]) else "medium"
551
- impact = "opportunity" if severity == "low" else "risk"
552
- domain_insights.append({
553
- "source_event_id": str(uuid.uuid4()),
554
- "domain": "intelligence",
555
- "summary": f"Product Review ({product}): {review_text[:200]}",
556
- "severity": severity,
557
- "impact_type": impact,
558
- "timestamp": timestamp
559
- })
560
-
561
- # 4. Add executive summary insight
 
 
 
 
562
  domain_insights.append({
563
  "source_event_id": str(uuid.uuid4()),
564
  "structured_data": structured_feeds,
565
  "domain": "intelligence",
566
- "summary": f"Business Intelligence Summary: {llm_summary[:300]}",
567
  "severity": "medium",
568
- "impact_type": "risk"
 
569
  })
570
 
571
- print(f" ✓ Created {len(domain_insights)} intelligence insights")
572
 
573
  return {
574
  "final_feed": bulletin,
 
396
 
397
  def generate_llm_summary(self, state: IntelligenceAgentState) -> Dict[str, Any]:
398
  """
399
+ Generate competitive intelligence summary AND structured insights using LLM
400
  """
401
+ print("[MODULE 3B] Generating LLM Summary + Competitive Insights")
402
 
403
  all_results = state.get("worker_results", [])
404
  profile_feeds = state.get("profile_feeds", {})
405
  competitor_feeds = state.get("competitor_feeds", {})
406
  product_feeds = state.get("product_review_feeds", {})
407
 
408
+ llm_summary = "Competitive intelligence summary unavailable."
409
+ llm_insights = []
410
+
411
+ # Prepare summary data
412
  summary_data = {
413
  "total_results": len(all_results),
414
  "profiles_monitored": list(profile_feeds.keys()),
 
418
  "global_competitors": len(state.get("global_intel", []))
419
  }
420
 
421
+ # Collect sample data for LLM analysis
422
+ sample_posts = []
423
+ for profile, posts in profile_feeds.items():
424
+ if isinstance(posts, list):
425
+ for p in posts[:2]:
426
+ text = p.get("text", "") or p.get("title", "") or p.get("raw_content", "")[:200]
427
+ if text:
428
+ sample_posts.append(f"[PROFILE: {profile}] {text[:150]}")
 
 
429
 
430
+ for competitor, posts in competitor_feeds.items():
431
+ if isinstance(posts, list):
432
+ for p in posts[:2]:
433
+ text = p.get("text", "") or p.get("title", "") or p.get("raw_content", "")[:200]
434
+ if text:
435
+ sample_posts.append(f"[COMPETITOR: {competitor}] {text[:150]}")
436
 
437
+ posts_text = "\n".join(sample_posts[:10]) if sample_posts else "No detailed data available"
 
438
 
439
+ prompt = f"""Analyze this competitive intelligence data and generate:
440
+ 1. A strategic 3-sentence executive summary
441
+ 2. Up to 5 unique business intelligence insights
442
+
443
+ Data Overview:
444
+ - Total intelligence: {summary_data['total_results']} items
445
+ - Competitors tracked: {', '.join(summary_data['competitors_tracked']) or 'None'}
446
+ - Products analyzed: {', '.join(summary_data['products_analyzed']) or 'None'}
447
+
448
+ Sample Data:
449
+ {posts_text}
450
+
451
+ Respond in this exact JSON format:
452
+ {{
453
+ "executive_summary": "Strategic 3-sentence summary of competitive landscape",
454
+ "insights": [
455
+ {{"summary": "Unique competitive insight #1", "severity": "low/medium/high", "impact_type": "risk/opportunity"}},
456
+ {{"summary": "Unique competitive insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}}
457
+ ]
458
+ }}
459
+
460
+ Rules:
461
+ - Generate actionable business intelligence, not just data descriptions
462
+ - Identify competitive threats as "risk", business opportunities as "opportunity"
463
+ - Severity: high=urgent action needed, medium=monitor closely, low=informational
464
+
465
+ JSON only:"""
466
+
467
  try:
468
  response = self.llm.invoke(prompt)
469
+ content = response.content if hasattr(response, 'content') else str(response)
470
+
471
+ # Parse JSON response
472
+ import re
473
+ content = content.strip()
474
+ if content.startswith("```"):
475
+ content = re.sub(r'^```\w*\n?', '', content)
476
+ content = re.sub(r'\n?```$', '', content)
477
+
478
+ result = json.loads(content)
479
+ llm_summary = result.get("executive_summary", llm_summary)
480
+ llm_insights = result.get("insights", [])
481
+
482
+ print(f" ✓ LLM generated {len(llm_insights)} competitive insights")
483
+
484
+ except json.JSONDecodeError as e:
485
+ print(f" ⚠️ JSON parse error: {e}")
486
+ # Fallback to simple summary
487
+ try:
488
+ fallback_prompt = f"Summarize this competitive intelligence in 3 sentences:\n{posts_text[:1500]}"
489
+ response = self.llm.invoke(fallback_prompt)
490
+ llm_summary = response.content if hasattr(response, 'content') else str(response)
491
+ except:
492
+ pass
493
  except Exception as e:
 
494
  print(f" ⚠️ LLM error: {e}")
495
 
496
  return {
497
  "llm_summary": llm_summary,
498
+ "llm_insights": llm_insights,
499
  "structured_output": summary_data
500
  }
501
 
502
  def format_final_output(self, state: IntelligenceAgentState) -> Dict[str, Any]:
503
  """
504
+ Module 3C: Format final competitive intelligence feed with LLM-enhanced insights
505
  """
506
  print("[MODULE 3C] Formatting Final Output")
507
 
 
509
  competitor_feeds = state.get("competitor_feeds", {})
510
  product_feeds = state.get("product_review_feeds", {})
511
  llm_summary = state.get("llm_summary", "No summary available")
512
+ llm_insights = state.get("llm_insights", []) # NEW: Get LLM-generated insights
513
  local_intel = state.get("local_intel", [])
514
  global_intel = state.get("global_intel", [])
515
 
 
543
  """
544
 
545
  # Create integration output with structured data
 
546
  structured_feeds = {
547
+ "profiles": profile_feeds,
548
+ "competitors": competitor_feeds,
549
+ "products": product_feeds,
550
  "local_intel": local_intel,
551
  "global_intel": global_intel
552
  }
553
 
554
+ # Create list for domain_insights (FRONTEND COMPATIBLE)
555
  domain_insights = []
556
  timestamp = datetime.utcnow().isoformat()
557
 
558
+ # PRIORITY 1: Add LLM-generated unique insights (curated and actionable)
559
+ for insight in llm_insights:
560
+ if isinstance(insight, dict) and insight.get("summary"):
 
 
 
 
 
561
  domain_insights.append({
562
  "source_event_id": str(uuid.uuid4()),
563
  "domain": "intelligence",
564
+ "summary": f"🎯 {insight.get('summary', '')}", # Mark as AI-analyzed
565
+ "severity": insight.get("severity", "medium"),
566
+ "impact_type": insight.get("impact_type", "risk"),
567
+ "timestamp": timestamp,
568
+ "is_llm_generated": True
569
  })
570
 
571
+ print(f" ✓ Added {len(llm_insights)} LLM-generated competitive insights")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
 
573
+ # PRIORITY 2: Add raw data only as fallback if LLM didn't generate enough
574
+ if len(domain_insights) < 5:
575
+ # Add competitor insights as fallback
576
+ for competitor, posts in competitor_feeds.items():
577
+ if not isinstance(posts, list):
 
 
578
  continue
579
+ for post in posts[:3]:
580
+ post_text = post.get("text", "") or post.get("title", "")
581
+ if not post_text or len(post_text) < 20:
582
+ continue
583
+ severity = "high" if any(kw in post_text.lower() for kw in ["launch", "expansion", "acquisition"]) else "medium"
584
+ domain_insights.append({
585
+ "source_event_id": str(uuid.uuid4()),
586
+ "domain": "intelligence",
587
+ "summary": f"Competitor ({competitor}): {post_text[:200]}",
588
+ "severity": severity,
589
+ "impact_type": "risk",
590
+ "timestamp": timestamp,
591
+ "is_llm_generated": False
592
+ })
593
+
594
+ # Add executive summary insight
595
  domain_insights.append({
596
  "source_event_id": str(uuid.uuid4()),
597
  "structured_data": structured_feeds,
598
  "domain": "intelligence",
599
+ "summary": f"📊 Business Intelligence Summary: {llm_summary[:300]}",
600
  "severity": "medium",
601
+ "impact_type": "risk",
602
+ "is_llm_generated": True
603
  })
604
 
605
+ print(f" ✓ Created {len(domain_insights)} total intelligence insights")
606
 
607
  return {
608
  "final_feed": bulletin,
src/nodes/socialAgentNode.py CHANGED
@@ -404,45 +404,94 @@ class SocialAgentNode:
404
 
405
  def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
406
  """
407
- Module 3B: Use Groq LLM to generate executive summary
408
  """
409
- print("[MODULE 3B] Generating LLM Summary")
410
 
411
  structured_feeds = state.get("structured_output", {})
 
 
412
 
413
  try:
414
- summary_prompt = f"""Analyze the following social intelligence data and create a concise executive summary of trending topics, events, and people.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
- Data Summary:
417
- - Sri Lanka Trending: {len(structured_feeds.get('sri lanka', []))} items
418
- - Asia Trending: {len(structured_feeds.get('asia', []))} items
419
- - World Trending: {len(structured_feeds.get('world', []))} items
420
 
421
- Sample Data:
422
- {json.dumps(structured_feeds, indent=2)[:2000]}
 
 
 
 
 
 
423
 
424
- Generate a brief (3-5 sentences) executive summary highlighting the most important trending topics, events, and social developments."""
 
 
 
 
425
 
426
- llm_response = self.llm.invoke(summary_prompt)
427
- llm_summary = llm_response.content if hasattr(llm_response, 'content') else str(llm_response)
 
 
 
 
 
 
 
 
 
428
 
429
- print(" ✓ LLM Summary Generated")
 
 
430
 
 
 
 
 
 
 
 
 
 
 
 
431
  except Exception as e:
432
  print(f" ⚠️ LLM Error: {e}")
433
- llm_summary = "AI summary currently unavailable."
434
 
435
  return {
436
- "llm_summary": llm_summary
 
437
  }
438
 
439
  def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
440
  """
441
- Module 3C: Format final feed output
442
  """
443
  print("[MODULE 3C] Formatting Final Output")
444
 
445
  llm_summary = state.get("llm_summary", "No summary available")
 
446
  structured_feeds = state.get("structured_output", {})
447
 
448
  trending_count = len([r for r in state.get("worker_results", []) if r.get("category") == "trending"])
@@ -483,91 +532,81 @@ Monitoring social sentiment, trending topics, events, and people across:
483
  Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
484
  """
485
 
486
- # Create list for per-region/topic domain_insights (FRONTEND COMPATIBLE)
487
  domain_insights = []
488
  timestamp = datetime.utcnow().isoformat()
489
 
490
- # Sri Lankan districts for geographic tagging
491
- districts = [
492
- "colombo", "gampaha", "kalutara", "kandy", "matale",
493
- "nuwara eliya", "galle", "matara", "hambantota",
494
- "jaffna", "kilinochchi", "mannar", "mullaitivu", "vavuniya",
495
- "puttalam", "kurunegala", "anuradhapura", "polonnaruwa",
496
- "badulla", "monaragala", "ratnapura", "kegalle",
497
- "ampara", "batticaloa", "trincomalee"
498
- ]
 
 
 
499
 
500
- # 1. Create per-item Sri Lanka social insights
501
- sri_lanka_data = structured_feeds.get("sri lanka", [])
502
- for post in sri_lanka_data[:15]:
503
- post_text = post.get("text", "") or post.get("title", "")
504
- if not post_text or len(post_text) < 10:
505
- continue
 
 
 
 
 
 
 
 
506
 
507
- # Try to detect district from post text
508
- detected_district = "Sri Lanka"
509
- for district in districts:
510
- if district.lower() in post_text.lower():
511
- detected_district = district.title()
512
- break
513
-
514
- # Determine severity based on keywords
515
- severity = "low"
516
- if any(kw in post_text.lower() for kw in ["protest", "riot", "emergency", "violence", "crisis"]):
517
- severity = "high"
518
- elif any(kw in post_text.lower() for kw in ["trending", "viral", "breaking", "update"]):
519
- severity = "medium"
520
-
521
- domain_insights.append({
522
- "source_event_id": str(uuid.uuid4()),
523
- "domain": "social",
524
- "summary": f"{detected_district}: {post_text[:200]}",
525
- "severity": severity,
526
- "impact_type": "risk" if severity in ["high", "medium"] else "opportunity",
527
- "timestamp": timestamp
528
- })
529
-
530
- # 2. Create Asia regional insights
531
- asia_data = structured_feeds.get("asia", [])
532
- for post in asia_data[:5]:
533
- post_text = post.get("text", "") or post.get("title", "")
534
- if not post_text or len(post_text) < 10:
535
- continue
536
- domain_insights.append({
537
- "source_event_id": str(uuid.uuid4()),
538
- "domain": "social",
539
- "summary": f"Asia Regional: {post_text[:200]}",
540
- "severity": "medium",
541
- "impact_type": "risk",
542
- "timestamp": timestamp
543
- })
544
-
545
- # 3. Create World insights
546
- world_data = structured_feeds.get("world", [])
547
- for post in world_data[:5]:
548
- post_text = post.get("text", "") or post.get("title", "")
549
- if not post_text or len(post_text) < 10:
550
- continue
551
- domain_insights.append({
552
- "source_event_id": str(uuid.uuid4()),
553
- "domain": "social",
554
- "summary": f"Global: {post_text[:200]}",
555
- "severity": "low",
556
- "impact_type": "opportunity",
557
- "timestamp": timestamp
558
- })
559
-
560
- # 4. Add executive summary insight
561
  domain_insights.append({
562
  "source_event_id": str(uuid.uuid4()),
563
  "structured_data": structured_feeds,
564
  "domain": "social",
565
- "summary": f"Sri Lanka Social Intelligence Summary: {llm_summary[:300]}",
566
  "severity": "medium",
567
- "impact_type": "risk"
 
568
  })
569
 
570
- print(f" ✓ Created {len(domain_insights)} social intelligence insights")
571
 
572
  return {
573
  "final_feed": bulletin,
 
404
 
405
  def generate_llm_summary(self, state: SocialAgentState) -> Dict[str, Any]:
406
  """
407
+ Module 3B: Use Groq LLM to generate executive summary AND structured insights
408
  """
409
+ print("[MODULE 3B] Generating LLM Summary + Structured Insights")
410
 
411
  structured_feeds = state.get("structured_output", {})
412
+ llm_summary = "AI summary currently unavailable."
413
+ llm_insights = []
414
 
415
  try:
416
+ # Collect sample posts for analysis
417
+ all_posts = []
418
+ for region, posts in structured_feeds.items():
419
+ for p in posts[:5]: # Top 5 per region
420
+ text = p.get("text", "") or p.get("title", "")
421
+ if text and len(text) > 20:
422
+ all_posts.append(f"[{region.upper()}] {text[:200]}")
423
+
424
+ if not all_posts:
425
+ return {"llm_summary": llm_summary, "llm_insights": []}
426
+
427
+ posts_text = "\n".join(all_posts[:15])
428
+
429
+ # Generate summary AND structured insights
430
+ analysis_prompt = f"""Analyze these social media posts from Sri Lanka and the region. Generate:
431
+ 1. A 3-sentence executive summary of key trends
432
+ 2. Up to 5 unique intelligence insights
433
 
434
+ Posts:
435
+ {posts_text}
 
 
436
 
437
+ Respond in this exact JSON format:
438
+ {{
439
+ "executive_summary": "Brief 3-sentence summary of key social trends and developments",
440
+ "insights": [
441
+ {{"summary": "Unique insight #1 (not copying post text)", "severity": "low/medium/high", "impact_type": "risk/opportunity"}},
442
+ {{"summary": "Unique insight #2", "severity": "low/medium/high", "impact_type": "risk/opportunity"}}
443
+ ]
444
+ }}
445
 
446
+ Rules:
447
+ - Generate NEW insights, don't just copy post text
448
+ - Identify patterns and emerging trends
449
+ - Classify severity based on potential impact
450
+ - Mark positive developments as "opportunity", concerning ones as "risk"
451
 
452
+ JSON only, no explanation:"""
453
+
454
+ llm_response = self.llm.invoke(analysis_prompt)
455
+ content = llm_response.content if hasattr(llm_response, 'content') else str(llm_response)
456
+
457
+ # Parse JSON response
458
+ import re
459
+ content = content.strip()
460
+ if content.startswith("```"):
461
+ content = re.sub(r'^```\w*\n?', '', content)
462
+ content = re.sub(r'\n?```$', '', content)
463
 
464
+ result = json.loads(content)
465
+ llm_summary = result.get("executive_summary", llm_summary)
466
+ llm_insights = result.get("insights", [])
467
 
468
+ print(f" ✓ LLM generated {len(llm_insights)} unique insights")
469
+
470
+ except json.JSONDecodeError as e:
471
+ print(f" ⚠️ JSON parse error: {e}")
472
+ # Fallback to simple summary
473
+ try:
474
+ fallback_prompt = f"Summarize these social media trends in 3 sentences:\n{posts_text[:1500]}"
475
+ response = self.llm.invoke(fallback_prompt)
476
+ llm_summary = response.content if hasattr(response, 'content') else str(response)
477
+ except:
478
+ pass
479
  except Exception as e:
480
  print(f" ⚠️ LLM Error: {e}")
 
481
 
482
  return {
483
+ "llm_summary": llm_summary,
484
+ "llm_insights": llm_insights
485
  }
486
 
487
  def format_final_output(self, state: SocialAgentState) -> Dict[str, Any]:
488
  """
489
+ Module 3C: Format final feed output with LLM-enhanced insights
490
  """
491
  print("[MODULE 3C] Formatting Final Output")
492
 
493
  llm_summary = state.get("llm_summary", "No summary available")
494
+ llm_insights = state.get("llm_insights", []) # NEW: Get LLM-generated insights
495
  structured_feeds = state.get("structured_output", {})
496
 
497
  trending_count = len([r for r in state.get("worker_results", []) if r.get("category") == "trending"])
 
532
  Source: Multi-platform aggregation (Twitter, Facebook, LinkedIn, Instagram, Reddit)
533
  """
534
 
535
+ # Create list for domain_insights (FRONTEND COMPATIBLE)
536
  domain_insights = []
537
  timestamp = datetime.utcnow().isoformat()
538
 
539
+ # PRIORITY 1: Add LLM-generated unique insights (these are curated and unique)
540
+ for insight in llm_insights:
541
+ if isinstance(insight, dict) and insight.get("summary"):
542
+ domain_insights.append({
543
+ "source_event_id": str(uuid.uuid4()),
544
+ "domain": "social",
545
+ "summary": f"🔍 {insight.get('summary', '')}", # Mark as AI-analyzed
546
+ "severity": insight.get("severity", "medium"),
547
+ "impact_type": insight.get("impact_type", "risk"),
548
+ "timestamp": timestamp,
549
+ "is_llm_generated": True # Flag for frontend
550
+ })
551
 
552
+ print(f" ✓ Added {len(llm_insights)} LLM-generated insights")
553
+
554
+ # PRIORITY 2: Add top raw posts only if we need more (fallback)
555
+ # Only add raw posts if LLM didn't generate enough insights
556
+ if len(domain_insights) < 5:
557
+ # Sri Lankan districts for geographic tagging
558
+ districts = [
559
+ "colombo", "gampaha", "kalutara", "kandy", "matale",
560
+ "nuwara eliya", "galle", "matara", "hambantota",
561
+ "jaffna", "kilinochchi", "mannar", "mullaitivu", "vavuniya",
562
+ "puttalam", "kurunegala", "anuradhapura", "polonnaruwa",
563
+ "badulla", "monaragala", "ratnapura", "kegalle",
564
+ "ampara", "batticaloa", "trincomalee"
565
+ ]
566
 
567
+ # Add Sri Lanka posts as fallback
568
+ sri_lanka_data = structured_feeds.get("sri lanka", [])
569
+ for post in sri_lanka_data[:5]:
570
+ post_text = post.get("text", "") or post.get("title", "")
571
+ if not post_text or len(post_text) < 20:
572
+ continue
573
+
574
+ # Detect district
575
+ detected_district = "Sri Lanka"
576
+ for district in districts:
577
+ if district.lower() in post_text.lower():
578
+ detected_district = district.title()
579
+ break
580
+
581
+ # Determine severity
582
+ severity = "low"
583
+ if any(kw in post_text.lower() for kw in ["protest", "riot", "emergency", "violence", "crisis"]):
584
+ severity = "high"
585
+ elif any(kw in post_text.lower() for kw in ["trending", "viral", "breaking", "update"]):
586
+ severity = "medium"
587
+
588
+ domain_insights.append({
589
+ "source_event_id": str(uuid.uuid4()),
590
+ "domain": "social",
591
+ "summary": f"{detected_district}: {post_text[:200]}",
592
+ "severity": severity,
593
+ "impact_type": "risk" if severity in ["high", "medium"] else "opportunity",
594
+ "timestamp": timestamp,
595
+ "is_llm_generated": False
596
+ })
597
+
598
+ # Add executive summary insight
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  domain_insights.append({
600
  "source_event_id": str(uuid.uuid4()),
601
  "structured_data": structured_feeds,
602
  "domain": "social",
603
+ "summary": f"📊 Social Intelligence Summary: {llm_summary[:300]}",
604
  "severity": "medium",
605
+ "impact_type": "risk",
606
+ "is_llm_generated": True
607
  })
608
 
609
+ print(f" ✓ Created {len(domain_insights)} total social intelligence insights")
610
 
611
  return {
612
  "final_feed": bulletin,
src/utils/utils.py CHANGED
@@ -439,21 +439,15 @@ def scrape_rivernet_impl(
439
  main_html = page.content()
440
  main_soup = BeautifulSoup(main_html, "html.parser")
441
 
442
- # Look for any alert/warning text on main page
443
- page_text = main_soup.get_text(separator=" ", strip=True)
444
- alert_keywords = ["warning", "alert", "flood", "danger", "high", "rising"]
445
- for keyword in alert_keywords:
446
- if keyword.lower() in page_text.lower():
447
- # Extract context around keyword
448
- match = re.search(rf'.{{0,100}}{keyword}.{{0,100}}', page_text, re.I)
449
- if match:
450
- alert_text = match.group(0).strip()
451
- if len(alert_text) > 20 and alert_text not in [a.get("text") for a in results["alerts"]]:
452
- results["alerts"].append({
453
- "text": alert_text,
454
- "severity": "high" if keyword in ["danger", "flood"] else "medium",
455
- "source": "rivernet.lk main page"
456
- })
457
 
458
  logger.info("[RIVERNET] Main page loaded successfully")
459
 
@@ -518,26 +512,43 @@ def scrape_rivernet_impl(
518
  except (ValueError, IndexError):
519
  continue
520
 
521
- # Determine status based on keywords (refined to avoid false positives)
522
  text_lower = page_text.lower()
523
 
524
- # Default to normal
525
  river_data["status"] = "normal"
526
 
527
- # DANGER / CRITICAL
528
- if any(w in text_lower for w in ["major flood", "danger level", "critical level", "red alert", "evacuate", "extreme flood"]):
 
 
 
 
 
 
 
 
529
  river_data["status"] = "danger"
530
 
531
- # WARNING (Stricter: removed generic "high", "alert")
532
- elif any(w in text_lower for w in ["minor flood", "warning level", "flood alert", "amber alert", "high risk", "flood warning"]):
 
 
 
 
533
  river_data["status"] = "warning"
534
 
535
- # RISING
536
- elif any(w in text_lower for w in ["water level rising", "rising trend", "level is rising"]):
 
 
 
537
  river_data["status"] = "rising"
538
 
539
- # explicitly check for normal keywords to confirm (optional, as we default to normal)
540
- elif any(w in text_lower for w in ["normal", "safe", "stable", "low", "green", "decreasing"]):
 
 
541
  river_data["status"] = "normal"
542
 
543
  results["rivers"].append(river_data)
 
439
  main_html = page.content()
440
  main_soup = BeautifulSoup(main_html, "html.parser")
441
 
442
+ # NOTE: Disabled loose keyword extraction - was causing false positives
443
+ # Real flood alerts will be determined from individual river page status
444
+ # The previous alert_keywords approach matched generic site text like
445
+ # "warning: javascript required" causing fake alerts
446
+
447
+ # If we need main page alerts, look for specific alert banner elements
448
+ # alert_banners = main_soup.select(".alert-banner, .flood-warning, .critical-notice")
449
+ # for banner in alert_banners:
450
+ # results["alerts"].append({...})
 
 
 
 
 
 
451
 
452
  logger.info("[RIVERNET] Main page loaded successfully")
453
 
 
512
  except (ValueError, IndexError):
513
  continue
514
 
515
+ # Determine status based on keywords (STRICTER to avoid false positives)
516
  text_lower = page_text.lower()
517
 
518
+ # Default to normal - only escalate if clear flood indicators
519
  river_data["status"] = "normal"
520
 
521
+ # CRITICAL: Only consider keywords in FLOOD CONTEXT
522
+ # Look for phrases, not just words, to avoid false positives
523
+
524
+ # DANGER / CRITICAL - Very specific phrases only
525
+ danger_phrases = [
526
+ "major flood", "danger level exceeded", "critical flood",
527
+ "red alert", "evacuate immediately", "extreme flood",
528
+ "water level exceeds danger", "above danger level"
529
+ ]
530
+ if any(phrase in text_lower for phrase in danger_phrases):
531
  river_data["status"] = "danger"
532
 
533
+ # WARNING - Specific flood warning phrases
534
+ elif any(phrase in text_lower for phrase in [
535
+ "minor flood", "warning level exceeded", "flood alert issued",
536
+ "amber alert", "approaching warning level",
537
+ "water level exceeds warning", "above warning level"
538
+ ]):
539
  river_data["status"] = "warning"
540
 
541
+ # RISING - Only if explicitly rising
542
+ elif any(phrase in text_lower for phrase in [
543
+ "water level rising", "rising trend detected",
544
+ "level is rising rapidly", "increasing water level"
545
+ ]):
546
  river_data["status"] = "rising"
547
 
548
+ # NORMAL indicators (optional, just for logging)
549
+ elif any(phrase in text_lower for phrase in [
550
+ "normal level", "stable", "safe level", "decreasing", "below warning"
551
+ ]):
552
  river_data["status"] = "normal"
553
 
554
  results["rivers"].append(river_data)